Merge branch 'kvm-updates/2.6.38' of git://git.kernel.org/pub/scm/virt/kvm/kvm

* 'kvm-updates/2.6.38' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (142 commits)
  KVM: Initialize fpu state in preemptible context
  KVM: VMX: when entering real mode align segment base to 16 bytes
  KVM: MMU: handle 'map_writable' in set_spte() function
  KVM: MMU: audit: allow audit more guests at the same time
  KVM: Fetch guest cr3 from hardware on demand
  KVM: Replace reads of vcpu->arch.cr3 by an accessor
  KVM: MMU: only write protect mappings at pagetable level
  KVM: VMX: Correct asm constraint in vmcs_load()/vmcs_clear()
  KVM: MMU: Initialize base_role for tdp mmus
  KVM: VMX: Optimize atomic EFER load
  KVM: VMX: Add definitions for more vm entry/exit control bits
  KVM: SVM: copy instruction bytes from VMCB
  KVM: SVM: implement enhanced INVLPG intercept
  KVM: SVM: enhance mov DR intercept handler
  KVM: SVM: enhance MOV CR intercept handler
  KVM: SVM: add new SVM feature bit names
  KVM: cleanup emulate_instruction
  KVM: move complete_insn_gp() into x86.c
  KVM: x86: fix CR8 handling
  KVM guest: Fix kvm clock initialization when it's configured out
  ...
Linus Torvalds 2011-01-13 10:14:24 -08:00
commit 55065bc527
43 changed files with 3073 additions and 1180 deletions

View File

@ -1705,6 +1705,9 @@ and is between 256 and 4096 characters. It is defined in the file
no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page
fault handling.
nolapic [X86-32,APIC] Do not enable or use the local APIC.
nolapic_timer [X86-32,APIC] Do not use the local APIC timer.

View File

@ -1085,6 +1085,184 @@ of 4 instructions that make up a hypercall.
If any additional field gets added to this structure later on, a bit for that
additional piece of information will be set in the flags bitmap.
4.47 KVM_ASSIGN_PCI_DEVICE
Capability: KVM_CAP_DEVICE_ASSIGNMENT
Architectures: x86 ia64
Type: vm ioctl
Parameters: struct kvm_assigned_pci_dev (in)
Returns: 0 on success, -1 on error
Assigns a host PCI device to the VM.
struct kvm_assigned_pci_dev {
__u32 assigned_dev_id;
__u32 busnr;
__u32 devfn;
__u32 flags;
__u32 segnr;
union {
__u32 reserved[11];
};
};
The PCI device is specified by the triple segnr, busnr, and devfn.
Identification in succeeding service requests is done via assigned_dev_id. The
following flags are specified:
/* Depends on KVM_CAP_IOMMU */
#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
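As a rough illustration (not part of the API text above), a userspace VMM could invoke this ioctl as in the sketch below; the vm file descriptor, the assigned_dev_id value and the PCI address 0000:01:00.0 are made-up examples:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: assign host PCI device 0000:01:00.0 to the VM behind vm_fd. */
static int assign_device(int vm_fd)
{
	struct kvm_assigned_pci_dev dev;

	memset(&dev, 0, sizeof(dev));
	dev.assigned_dev_id = 1;               /* handle chosen by userspace */
	dev.segnr = 0;                         /* PCI domain/segment */
	dev.busnr = 1;
	dev.devfn = (0 << 3) | 0;              /* slot 0, function 0 */
	dev.flags = KVM_DEV_ASSIGN_ENABLE_IOMMU;

	return ioctl(vm_fd, KVM_ASSIGN_PCI_DEVICE, &dev);
}

The same assigned_dev_id is what KVM_DEASSIGN_PCI_DEVICE and the IRQ assignment ioctls below use to refer to the device.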
4.48 KVM_DEASSIGN_PCI_DEVICE
Capability: KVM_CAP_DEVICE_DEASSIGNMENT
Architectures: x86 ia64
Type: vm ioctl
Parameters: struct kvm_assigned_pci_dev (in)
Returns: 0 on success, -1 on error
Ends PCI device assignment, releasing all associated resources.
See KVM_CAP_DEVICE_ASSIGNMENT for the data structure. Only assigned_dev_id is
used in kvm_assigned_pci_dev to identify the device.
4.49 KVM_ASSIGN_DEV_IRQ
Capability: KVM_CAP_ASSIGN_DEV_IRQ
Architectures: x86 ia64
Type: vm ioctl
Parameters: struct kvm_assigned_irq (in)
Returns: 0 on success, -1 on error
Assigns an IRQ to a passed-through device.
struct kvm_assigned_irq {
__u32 assigned_dev_id;
__u32 host_irq;
__u32 guest_irq;
__u32 flags;
union {
struct {
__u32 addr_lo;
__u32 addr_hi;
__u32 data;
} guest_msi;
__u32 reserved[12];
};
};
The following flags are defined:
#define KVM_DEV_IRQ_HOST_INTX (1 << 0)
#define KVM_DEV_IRQ_HOST_MSI (1 << 1)
#define KVM_DEV_IRQ_HOST_MSIX (1 << 2)
#define KVM_DEV_IRQ_GUEST_INTX (1 << 8)
#define KVM_DEV_IRQ_GUEST_MSI (1 << 9)
#define KVM_DEV_IRQ_GUEST_MSIX (1 << 10)
It is not valid to specify multiple types per host or guest IRQ. However, the
IRQ type of host and guest can differ or can even be null.
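For illustration only, routing the device's host MSI so that it is delivered as a guest-visible MSI could look like the sketch below; the helper name and the assumption that userspace already knows the host IRQ number and the guest's MSI address/data are not part of the API text:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: deliver the host MSI of an assigned device as a guest MSI. */
static int assign_msi(int vm_fd, __u32 dev_id, __u32 host_irq,
		      __u32 addr_lo, __u32 addr_hi, __u32 data)
{
	struct kvm_assigned_irq irq;

	memset(&irq, 0, sizeof(irq));
	irq.assigned_dev_id = dev_id;      /* id used with KVM_ASSIGN_PCI_DEVICE */
	irq.host_irq = host_irq;
	irq.flags = KVM_DEV_IRQ_HOST_MSI | KVM_DEV_IRQ_GUEST_MSI;
	irq.guest_msi.addr_lo = addr_lo;   /* MSI address/data as programmed by the guest */
	irq.guest_msi.addr_hi = addr_hi;
	irq.guest_msi.data = data;

	return ioctl(vm_fd, KVM_ASSIGN_DEV_IRQ, &irq);
}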
4.50 KVM_DEASSIGN_DEV_IRQ
Capability: KVM_CAP_ASSIGN_DEV_IRQ
Architectures: x86 ia64
Type: vm ioctl
Parameters: struct kvm_assigned_irq (in)
Returns: 0 on success, -1 on error
Ends an IRQ assignment to a passed-through device.
See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified
by assigned_dev_id; flags must correspond to the IRQ type specified on
KVM_ASSIGN_DEV_IRQ. Partial deassignment of host or guest IRQ is allowed.
4.51 KVM_SET_GSI_ROUTING
Capability: KVM_CAP_IRQ_ROUTING
Architectures: x86 ia64
Type: vm ioctl
Parameters: struct kvm_irq_routing (in)
Returns: 0 on success, -1 on error
Sets the GSI routing table entries, overwriting any previously set entries.
struct kvm_irq_routing {
__u32 nr;
__u32 flags;
struct kvm_irq_routing_entry entries[0];
};
No flags are specified so far; the corresponding field must be set to zero.
struct kvm_irq_routing_entry {
__u32 gsi;
__u32 type;
__u32 flags;
__u32 pad;
union {
struct kvm_irq_routing_irqchip irqchip;
struct kvm_irq_routing_msi msi;
__u32 pad[8];
} u;
};
/* gsi routing entry types */
#define KVM_IRQ_ROUTING_IRQCHIP 1
#define KVM_IRQ_ROUTING_MSI 2
No flags are specified so far; the corresponding field must be set to zero.
struct kvm_irq_routing_irqchip {
__u32 irqchip;
__u32 pin;
};
struct kvm_irq_routing_msi {
__u32 address_lo;
__u32 address_hi;
__u32 data;
__u32 pad;
};
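As a minimal sketch of how the table is built (the GSI and pin numbers below are arbitrary, and KVM_IRQCHIP_IOAPIC comes from the arch KVM headers rather than the excerpt above):

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: replace the routing table with a single IOAPIC entry for GSI 5. */
static int set_one_route(int vm_fd)
{
	struct kvm_irq_routing *table;
	int ret;

	table = calloc(1, sizeof(*table) + sizeof(struct kvm_irq_routing_entry));
	if (!table)
		return -1;

	table->nr = 1;                     /* flags stays zero, as required */
	table->entries[0].gsi = 5;
	table->entries[0].type = KVM_IRQ_ROUTING_IRQCHIP;
	table->entries[0].u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC;
	table->entries[0].u.irqchip.pin = 5;

	ret = ioctl(vm_fd, KVM_SET_GSI_ROUTING, table);
	free(table);
	return ret;
}

Because the ioctl overwrites any previously set entries, callers normally rebuild and resubmit the whole table whenever a route changes.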
4.52 KVM_ASSIGN_SET_MSIX_NR
Capability: KVM_CAP_DEVICE_MSIX
Architectures: x86 ia64
Type: vm ioctl
Parameters: struct kvm_assigned_msix_nr (in)
Returns: 0 on success, -1 on error
Set the number of MSI-X interrupts for an assigned device. This service can
only be called once in the lifetime of an assigned device.
struct kvm_assigned_msix_nr {
__u32 assigned_dev_id;
__u16 entry_nr;
__u16 padding;
};
#define KVM_MAX_MSIX_PER_DEV 256
4.53 KVM_ASSIGN_SET_MSIX_ENTRY
Capability: KVM_CAP_DEVICE_MSIX
Architectures: x86 ia64
Type: vm ioctl
Parameters: struct kvm_assigned_msix_entry (in)
Returns: 0 on success, -1 on error
Specifies the routing of an MSI-X assigned device interrupt to a GSI. Setting
the GSI vector to zero means disabling the interrupt.
struct kvm_assigned_msix_entry {
__u32 assigned_dev_id;
__u32 gsi;
__u16 entry; /* The index of entry in the MSI-X table */
__u16 padding[3];
};
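Putting the two MSI-X ioctls together, a hedged sketch of the expected call sequence follows; the vector count and GSI numbers are illustrative, and gsi_base is assumed to have been wired up beforehand via KVM_SET_GSI_ROUTING:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: declare nr_vectors MSI-X entries, then map entry i to gsi_base + i. */
static int setup_msix(int vm_fd, __u32 dev_id, __u16 nr_vectors, __u32 gsi_base)
{
	struct kvm_assigned_msix_nr nr;
	struct kvm_assigned_msix_entry entry;
	__u16 i;

	memset(&nr, 0, sizeof(nr));
	nr.assigned_dev_id = dev_id;
	nr.entry_nr = nr_vectors;          /* must not exceed KVM_MAX_MSIX_PER_DEV */

	if (ioctl(vm_fd, KVM_ASSIGN_SET_MSIX_NR, &nr) < 0)
		return -1;

	for (i = 0; i < nr_vectors; i++) {
		memset(&entry, 0, sizeof(entry));
		entry.assigned_dev_id = dev_id;
		entry.entry = i;           /* index into the device's MSI-X table */
		entry.gsi = gsi_base + i;  /* a GSI of zero would disable the entry */
		if (ioctl(vm_fd, KVM_ASSIGN_SET_MSIX_ENTRY, &entry) < 0)
			return -1;
	}
	return 0;
}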
5. The kvm_run structure
Application code obtains a pointer to the kvm_run structure by

View File

@ -36,6 +36,9 @@ KVM_FEATURE_MMU_OP || 2 || deprecated.
KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs
|| || 0x4b564d00 and 0x4b564d01
------------------------------------------------------------------------------
KVM_FEATURE_ASYNC_PF || 4 || async pf can be enabled by
|| || writing to msr 0x4b564d02
------------------------------------------------------------------------------
KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side
|| || per-cpu warps are expected in
|| || kvmclock.
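Inside a Linux guest these feature bits are normally tested through the paravirt helpers rather than by issuing CPUID directly; a minimal sketch using the kvm_para_available()/kvm_para_has_feature() helpers that the guest code later in this merge also relies on:

#include <linux/types.h>
#include <linux/kvm_para.h>   /* kvm_para_available(), kvm_para_has_feature() */

/* Sketch: decide at boot whether the async PF protocol may be used. */
static bool guest_has_async_pf(void)
{
	return kvm_para_available() &&
	       kvm_para_has_feature(KVM_FEATURE_ASYNC_PF);
}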

View File

@ -3,7 +3,6 @@ Glauber Costa <glommer@redhat.com>, Red Hat Inc, 2010
=====================================================
KVM makes use of some custom MSRs to service some requests.
At present, this facility is only used by kvmclock.
Custom MSRs have a range reserved for them, that goes from
0x4b564d00 to 0x4b564dff. There are MSRs outside this area,
@ -151,3 +150,38 @@ MSR_KVM_SYSTEM_TIME: 0x12
return PRESENT;
} else
return NON_PRESENT;
MSR_KVM_ASYNC_PF_EN: 0x4b564d02
data: Bits 63-6 hold the 64-byte aligned physical address of a
64-byte memory area which must be in guest RAM and must be
zeroed. Bits 5-2 are reserved and should be zero. Bit 0 is 1
when asynchronous page faults are enabled on the vcpu, 0 when
disabled. Bit 1 is 1 if asynchronous page faults can be injected
when the vcpu is in cpl == 0.
The first 4 bytes of the 64-byte memory location will be written to
by the hypervisor at the time of asynchronous page fault (APF)
injection to indicate the type of asynchronous page fault. A value
of 1 means that the page referred to by the page fault is not
present. A value of 2 means that the page is now available.
Disabling interrupts inhibits APFs. The guest must not enable
interrupts before the reason is read, or it may be overwritten by
another APF. Since APF uses the same exception vector as a regular
page fault, the guest must reset the reason to 0 before it does
anything that can generate a normal page fault. If the APF reason
is 0 during a page fault, it is a regular page fault.
During delivery of a type 1 APF, cr2 contains a token that will
be used to notify the guest when the missing page becomes
available. When the page becomes available, a type 2 APF is sent
with cr2 set to the token associated with the page. There is a
special kind of token, 0xffffffff, which tells the vcpu that it
should wake up all processes waiting for APFs; no individual
type 2 APFs will be sent.
If APF is disabled while there are outstanding APFs, they will
not be delivered.
Currently a type 2 APF will always be delivered on the same vcpu as
the type 1 was, but the guest should not rely on that.
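Tying the bit layout back to code, the guest enables the protocol by writing the physical address of its zeroed, 64-byte aligned per-cpu area plus the flag bits to the MSR. The sketch below mirrors the kvm_guest_cpu_init() change further down in this merge; the names apf_area and enable_async_pf are made up here:

#include <linux/types.h>
#include <linux/percpu.h>
#include <asm/kvm_para.h>     /* MSR_KVM_ASYNC_PF_EN, KVM_ASYNC_PF_* flags */
#include <asm/msr.h>          /* wrmsrl() */
#include <asm/page.h>         /* __pa() */

static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_area) __aligned(64);

/* Sketch: enable async page faults for the current cpu. */
static void enable_async_pf(void)
{
	u64 val = __pa(&__get_cpu_var(apf_area)) | KVM_ASYNC_PF_ENABLED;

#ifdef CONFIG_PREEMPT
	val |= KVM_ASYNC_PF_SEND_ALWAYS;   /* allow delivery while the vcpu runs at cpl 0 */
#endif
	wrmsrl(MSR_KVM_ASYNC_PF_EN, val);
}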

View File

@ -590,6 +590,10 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu);
int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
void kvm_sal_emul(struct kvm_vcpu *vcpu);
#define __KVM_HAVE_ARCH_VM_ALLOC 1
struct kvm *kvm_arch_alloc_vm(void);
void kvm_arch_free_vm(struct kvm *kvm);
#endif /* __ASSEMBLY__*/
#endif

View File

@ -749,7 +749,7 @@ out:
return r;
}
static struct kvm *kvm_alloc_kvm(void)
struct kvm *kvm_arch_alloc_vm(void)
{
struct kvm *kvm;
@ -760,7 +760,7 @@ static struct kvm *kvm_alloc_kvm(void)
vm_base = __get_free_pages(GFP_KERNEL, get_order(KVM_VM_DATA_SIZE));
if (!vm_base)
return ERR_PTR(-ENOMEM);
return NULL;
memset((void *)vm_base, 0, KVM_VM_DATA_SIZE);
kvm = (struct kvm *)(vm_base +
@ -806,10 +806,12 @@ static void kvm_build_io_pmt(struct kvm *kvm)
#define GUEST_PHYSICAL_RR4 0x2739
#define VMM_INIT_RR 0x1660
static void kvm_init_vm(struct kvm *kvm)
int kvm_arch_init_vm(struct kvm *kvm)
{
BUG_ON(!kvm);
kvm->arch.is_sn2 = ia64_platform_is("sn2");
kvm->arch.metaphysical_rr0 = GUEST_PHYSICAL_RR0;
kvm->arch.metaphysical_rr4 = GUEST_PHYSICAL_RR4;
kvm->arch.vmm_init_rr = VMM_INIT_RR;
@ -823,21 +825,8 @@ static void kvm_init_vm(struct kvm *kvm)
/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
}
struct kvm *kvm_arch_create_vm(void)
{
struct kvm *kvm = kvm_alloc_kvm();
if (IS_ERR(kvm))
return ERR_PTR(-ENOMEM);
kvm->arch.is_sn2 = ia64_platform_is("sn2");
kvm_init_vm(kvm);
return kvm;
return 0;
}
static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm,
@ -962,7 +951,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out;
r = kvm_setup_default_irq_routing(kvm);
if (r) {
mutex_lock(&kvm->slots_lock);
kvm_ioapic_destroy(kvm);
mutex_unlock(&kvm->slots_lock);
goto out;
}
break;
@ -1357,7 +1348,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
return -EINVAL;
}
static void free_kvm(struct kvm *kvm)
void kvm_arch_free_vm(struct kvm *kvm)
{
unsigned long vm_base = kvm->arch.vm_base;
@ -1399,9 +1390,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
#endif
kfree(kvm->arch.vioapic);
kvm_release_vm_pages(kvm);
kvm_free_physmem(kvm);
cleanup_srcu_struct(&kvm->srcu);
free_kvm(kvm);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)

View File

@ -1307,12 +1307,10 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
int err = -ENOMEM;
unsigned long p;
vcpu_book3s = vmalloc(sizeof(struct kvmppc_vcpu_book3s));
vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s));
if (!vcpu_book3s)
goto out;
memset(vcpu_book3s, 0, sizeof(struct kvmppc_vcpu_book3s));
vcpu_book3s->shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *)
kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL);
if (!vcpu_book3s->shadow_vcpu)

View File

@ -145,18 +145,12 @@ void kvm_arch_check_processor_compat(void *rtn)
*(int *)rtn = kvmppc_core_check_processor_compat();
}
struct kvm *kvm_arch_create_vm(void)
int kvm_arch_init_vm(struct kvm *kvm)
{
struct kvm *kvm;
kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
if (!kvm)
return ERR_PTR(-ENOMEM);
return kvm;
return 0;
}
static void kvmppc_free_vcpus(struct kvm *kvm)
void kvm_arch_destroy_vm(struct kvm *kvm)
{
unsigned int i;
struct kvm_vcpu *vcpu;
@ -176,14 +170,6 @@ void kvm_arch_sync_events(struct kvm *kvm)
{
}
void kvm_arch_destroy_vm(struct kvm *kvm)
{
kvmppc_free_vcpus(kvm);
kvm_free_physmem(kvm);
cleanup_srcu_struct(&kvm->srcu);
kfree(kvm);
}
int kvm_dev_ioctl_check_extension(long ext)
{
int r;

View File

@ -164,24 +164,18 @@ long kvm_arch_vm_ioctl(struct file *filp,
return r;
}
struct kvm *kvm_arch_create_vm(void)
int kvm_arch_init_vm(struct kvm *kvm)
{
struct kvm *kvm;
int rc;
char debug_name[16];
rc = s390_enable_sie();
if (rc)
goto out_nokvm;
rc = -ENOMEM;
kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
if (!kvm)
goto out_nokvm;
goto out_err;
kvm->arch.sca = (struct sca_block *) get_zeroed_page(GFP_KERNEL);
if (!kvm->arch.sca)
goto out_nosca;
goto out_err;
sprintf(debug_name, "kvm-%u", current->pid);
@ -195,13 +189,11 @@ struct kvm *kvm_arch_create_vm(void)
debug_register_view(kvm->arch.dbf, &debug_sprintf_view);
VM_EVENT(kvm, 3, "%s", "vm created");
return kvm;
return 0;
out_nodbf:
free_page((unsigned long)(kvm->arch.sca));
out_nosca:
kfree(kvm);
out_nokvm:
return ERR_PTR(rc);
out_err:
return rc;
}
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
@ -240,11 +232,8 @@ void kvm_arch_sync_events(struct kvm *kvm)
void kvm_arch_destroy_vm(struct kvm *kvm)
{
kvm_free_vcpus(kvm);
kvm_free_physmem(kvm);
free_page((unsigned long)(kvm->arch.sca));
debug_unregister(kvm->arch.dbf);
cleanup_srcu_struct(&kvm->srcu);
kfree(kvm);
}
/* Section: vcpu related */

View File

@ -15,6 +15,14 @@
struct x86_emulate_ctxt;
struct x86_exception {
u8 vector;
bool error_code_valid;
u16 error_code;
bool nested_page_fault;
u64 address; /* cr2 or nested page fault gpa */
};
/*
* x86_emulate_ops:
*
@ -64,7 +72,8 @@ struct x86_emulate_ops {
* @bytes: [IN ] Number of bytes to read from memory.
*/
int (*read_std)(unsigned long addr, void *val,
unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
unsigned int bytes, struct kvm_vcpu *vcpu,
struct x86_exception *fault);
/*
* write_std: Write bytes of standard (non-emulated/special) memory.
@ -74,7 +83,8 @@ struct x86_emulate_ops {
* @bytes: [IN ] Number of bytes to write to memory.
*/
int (*write_std)(unsigned long addr, void *val,
unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
unsigned int bytes, struct kvm_vcpu *vcpu,
struct x86_exception *fault);
/*
* fetch: Read bytes of standard (non-emulated/special) memory.
* Used for instruction fetch.
@ -83,7 +93,8 @@ struct x86_emulate_ops {
* @bytes: [IN ] Number of bytes to read from memory.
*/
int (*fetch)(unsigned long addr, void *val,
unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
unsigned int bytes, struct kvm_vcpu *vcpu,
struct x86_exception *fault);
/*
* read_emulated: Read bytes from emulated/special memory area.
@ -94,7 +105,7 @@ struct x86_emulate_ops {
int (*read_emulated)(unsigned long addr,
void *val,
unsigned int bytes,
unsigned int *error,
struct x86_exception *fault,
struct kvm_vcpu *vcpu);
/*
@ -107,7 +118,7 @@ struct x86_emulate_ops {
int (*write_emulated)(unsigned long addr,
const void *val,
unsigned int bytes,
unsigned int *error,
struct x86_exception *fault,
struct kvm_vcpu *vcpu);
/*
@ -122,7 +133,7 @@ struct x86_emulate_ops {
const void *old,
const void *new,
unsigned int bytes,
unsigned int *error,
struct x86_exception *fault,
struct kvm_vcpu *vcpu);
int (*pio_in_emulated)(int size, unsigned short port, void *val,
@ -159,7 +170,10 @@ struct operand {
};
union {
unsigned long *reg;
unsigned long mem;
struct segmented_address {
ulong ea;
unsigned seg;
} mem;
} addr;
union {
unsigned long val;
@ -226,9 +240,8 @@ struct x86_emulate_ctxt {
bool perm_ok; /* do not check permissions if true */
int exception; /* exception that happens during emulation or -1 */
u32 error_code; /* error code for exception */
bool error_code_valid;
bool have_exception;
struct x86_exception exception;
/* decode cache */
struct decode_cache decode;
@ -252,7 +265,7 @@ struct x86_emulate_ctxt {
#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
#endif
int x86_decode_insn(struct x86_emulate_ctxt *ctxt);
int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
#define EMULATION_FAILED -1
#define EMULATION_OK 0
#define EMULATION_RESTART 1

View File

@ -83,11 +83,14 @@
#define KVM_NR_FIXED_MTRR_REGION 88
#define KVM_NR_VAR_MTRR 8
#define ASYNC_PF_PER_VCPU 64
extern spinlock_t kvm_lock;
extern struct list_head vm_list;
struct kvm_vcpu;
struct kvm;
struct kvm_async_pf;
enum kvm_reg {
VCPU_REGS_RAX = 0,
@ -114,6 +117,7 @@ enum kvm_reg {
enum kvm_reg_ex {
VCPU_EXREG_PDPTR = NR_VCPU_REGS,
VCPU_EXREG_CR3,
};
enum {
@ -238,16 +242,18 @@ struct kvm_mmu {
void (*new_cr3)(struct kvm_vcpu *vcpu);
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
void (*inject_page_fault)(struct kvm_vcpu *vcpu);
int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err,
bool prefault);
void (*inject_page_fault)(struct kvm_vcpu *vcpu,
struct x86_exception *fault);
void (*free)(struct kvm_vcpu *vcpu);
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
u32 *error);
struct x86_exception *exception);
gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
void (*prefetch_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page);
int (*sync_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, bool clear_unsync);
struct kvm_mmu_page *sp);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
hpa_t root_hpa;
int root_level;
@ -315,16 +321,6 @@ struct kvm_vcpu_arch {
*/
struct kvm_mmu *walk_mmu;
/*
* This struct is filled with the necessary information to propagate a
* page fault into the guest
*/
struct {
u64 address;
unsigned error_code;
bool nested;
} fault;
/* only needed in kvm_pv_mmu_op() path, but it's hot so
* put it here to avoid allocation */
struct kvm_pv_mmu_op_buffer mmu_op_buffer;
@ -412,6 +408,15 @@ struct kvm_vcpu_arch {
u64 hv_vapic;
cpumask_var_t wbinvd_dirty_mask;
struct {
bool halted;
gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
struct gfn_to_hva_cache data;
u64 msr_val;
u32 id;
bool send_user_only;
} apf;
};
struct kvm_arch {
@ -456,6 +461,10 @@ struct kvm_arch {
/* fields used by HYPER-V emulation */
u64 hv_guest_os_id;
u64 hv_hypercall;
#ifdef CONFIG_KVM_MMU_AUDIT
int audit_point;
#endif
};
struct kvm_vm_stat {
@ -529,6 +538,7 @@ struct kvm_x86_ops {
struct kvm_segment *var, int seg);
void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
void (*decache_cr3)(struct kvm_vcpu *vcpu);
void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@ -582,9 +592,17 @@ struct kvm_x86_ops {
void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
const struct trace_print_flags *exit_reasons_str;
};
struct kvm_arch_async_pf {
u32 token;
gfn_t gfn;
unsigned long cr3;
bool direct_map;
};
extern struct kvm_x86_ops *kvm_x86_ops;
int kvm_mmu_module_init(void);
@ -594,7 +612,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
int kvm_mmu_setup(struct kvm_vcpu *vcpu);
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
void kvm_mmu_set_base_ptes(u64 base_pte);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask);
@ -623,8 +640,15 @@ enum emulation_result {
#define EMULTYPE_NO_DECODE (1 << 0)
#define EMULTYPE_TRAP_UD (1 << 1)
#define EMULTYPE_SKIP (1 << 2)
int emulate_instruction(struct kvm_vcpu *vcpu,
unsigned long cr2, u16 error_code, int emulation_type);
int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
int emulation_type, void *insn, int insn_len);
static inline int emulate_instruction(struct kvm_vcpu *vcpu,
int emulation_type)
{
return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
}
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
@ -650,7 +674,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
@ -668,11 +692,11 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
void kvm_inject_page_fault(struct kvm_vcpu *vcpu);
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gfn_t gfn, void *data, int offset, int len,
u32 access);
void kvm_propagate_fault(struct kvm_vcpu *vcpu);
void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
int kvm_pic_set_irq(void *opaque, int irq, int level);
@ -690,16 +714,21 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception);
gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception);
gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception);
gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception);
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
void *insn, int insn_len);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
void kvm_enable_tdp(void);
@ -766,20 +795,25 @@ enum {
#define HF_VINTR_MASK (1 << 2)
#define HF_NMI_MASK (1 << 3)
#define HF_IRET_MASK (1 << 4)
#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
/*
* Hardware virtualization extension instructions may fault if a
* reboot turns off virtualization while processes are running.
* Trap the fault and ignore the instruction if that happens.
*/
asmlinkage void kvm_handle_fault_on_reboot(void);
asmlinkage void kvm_spurious_fault(void);
extern bool kvm_rebooting;
#define __kvm_handle_fault_on_reboot(insn) \
"666: " insn "\n\t" \
"668: \n\t" \
".pushsection .fixup, \"ax\" \n" \
"667: \n\t" \
"cmpb $0, kvm_rebooting \n\t" \
"jne 668b \n\t" \
__ASM_SIZE(push) " $666b \n\t" \
"jmp kvm_handle_fault_on_reboot \n\t" \
"call kvm_spurious_fault \n\t" \
".popsection \n\t" \
".pushsection __ex_table, \"a\" \n\t" \
_ASM_PTR " 666b, 667b \n\t" \
@ -799,4 +833,15 @@ void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
#endif /* _ASM_X86_KVM_HOST_H */

View File

@ -20,6 +20,7 @@
* are available. The use of 0x11 and 0x12 is deprecated
*/
#define KVM_FEATURE_CLOCKSOURCE2 3
#define KVM_FEATURE_ASYNC_PF 4
/* The last 8 bits are used to indicate how to interpret the flags field
* in pvclock structure. If no bits are set, all flags are ignored.
@ -32,9 +33,13 @@
/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
#define KVM_MAX_MMU_OP_BATCH 32
#define KVM_ASYNC_PF_ENABLED (1 << 0)
#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1)
/* Operations for KVM_HC_MMU_OP */
#define KVM_MMU_OP_WRITE_PTE 1
#define KVM_MMU_OP_FLUSH_TLB 2
@ -61,10 +66,20 @@ struct kvm_mmu_op_release_pt {
__u64 pt_phys;
};
#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
#define KVM_PV_REASON_PAGE_READY 2
struct kvm_vcpu_pv_apf_data {
__u32 reason;
__u8 pad[60];
__u32 enabled;
};
#ifdef __KERNEL__
#include <asm/processor.h>
extern void kvmclock_init(void);
extern int kvm_register_clock(char *txt);
/* This instruction is vmcall. On non-VT architectures, it will generate a
@ -160,8 +175,17 @@ static inline unsigned int kvm_arch_para_features(void)
#ifdef CONFIG_KVM_GUEST
void __init kvm_guest_init(void);
void kvm_async_pf_task_wait(u32 token);
void kvm_async_pf_task_wake(u32 token);
u32 kvm_read_and_reset_pf_reason(void);
#else
#define kvm_guest_init() do { } while (0)
#define kvm_async_pf_task_wait(T) do {} while(0)
#define kvm_async_pf_task_wake(T) do {} while(0)
static inline u32 kvm_read_and_reset_pf_reason(void)
{
return 0;
}
#endif
#endif /* __KERNEL__ */

View File

@ -47,14 +47,13 @@ enum {
INTERCEPT_MONITOR,
INTERCEPT_MWAIT,
INTERCEPT_MWAIT_COND,
INTERCEPT_XSETBV,
};
struct __attribute__ ((__packed__)) vmcb_control_area {
u16 intercept_cr_read;
u16 intercept_cr_write;
u16 intercept_dr_read;
u16 intercept_dr_write;
u32 intercept_cr;
u32 intercept_dr;
u32 intercept_exceptions;
u64 intercept;
u8 reserved_1[42];
@ -81,14 +80,19 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u32 event_inj_err;
u64 nested_cr3;
u64 lbr_ctl;
u64 reserved_5;
u32 clean;
u32 reserved_5;
u64 next_rip;
u8 reserved_6[816];
u8 insn_len;
u8 insn_bytes[15];
u8 reserved_6[800];
};
#define TLB_CONTROL_DO_NOTHING 0
#define TLB_CONTROL_FLUSH_ALL_ASID 1
#define TLB_CONTROL_FLUSH_ASID 3
#define TLB_CONTROL_FLUSH_ASID_LOCAL 7
#define V_TPR_MASK 0x0f
@ -204,19 +208,31 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
#define SVM_SELECTOR_CODE_MASK (1 << 3)
#define INTERCEPT_CR0_MASK 1
#define INTERCEPT_CR3_MASK (1 << 3)
#define INTERCEPT_CR4_MASK (1 << 4)
#define INTERCEPT_CR8_MASK (1 << 8)
#define INTERCEPT_CR0_READ 0
#define INTERCEPT_CR3_READ 3
#define INTERCEPT_CR4_READ 4
#define INTERCEPT_CR8_READ 8
#define INTERCEPT_CR0_WRITE (16 + 0)
#define INTERCEPT_CR3_WRITE (16 + 3)
#define INTERCEPT_CR4_WRITE (16 + 4)
#define INTERCEPT_CR8_WRITE (16 + 8)
#define INTERCEPT_DR0_MASK 1
#define INTERCEPT_DR1_MASK (1 << 1)
#define INTERCEPT_DR2_MASK (1 << 2)
#define INTERCEPT_DR3_MASK (1 << 3)
#define INTERCEPT_DR4_MASK (1 << 4)
#define INTERCEPT_DR5_MASK (1 << 5)
#define INTERCEPT_DR6_MASK (1 << 6)
#define INTERCEPT_DR7_MASK (1 << 7)
#define INTERCEPT_DR0_READ 0
#define INTERCEPT_DR1_READ 1
#define INTERCEPT_DR2_READ 2
#define INTERCEPT_DR3_READ 3
#define INTERCEPT_DR4_READ 4
#define INTERCEPT_DR5_READ 5
#define INTERCEPT_DR6_READ 6
#define INTERCEPT_DR7_READ 7
#define INTERCEPT_DR0_WRITE (16 + 0)
#define INTERCEPT_DR1_WRITE (16 + 1)
#define INTERCEPT_DR2_WRITE (16 + 2)
#define INTERCEPT_DR3_WRITE (16 + 3)
#define INTERCEPT_DR4_WRITE (16 + 4)
#define INTERCEPT_DR5_WRITE (16 + 5)
#define INTERCEPT_DR6_WRITE (16 + 6)
#define INTERCEPT_DR7_WRITE (16 + 7)
#define SVM_EVTINJ_VEC_MASK 0xff
@ -246,6 +262,8 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
#define SVM_EXITINFO_REG_MASK 0x0F
#define SVM_EXIT_READ_CR0 0x000
#define SVM_EXIT_READ_CR3 0x003
#define SVM_EXIT_READ_CR4 0x004
@ -316,6 +334,7 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_EXIT_MONITOR 0x08a
#define SVM_EXIT_MWAIT 0x08b
#define SVM_EXIT_MWAIT_COND 0x08c
#define SVM_EXIT_XSETBV 0x08d
#define SVM_EXIT_NPF 0x400
#define SVM_EXIT_ERR -1

View File

@ -30,6 +30,7 @@ asmlinkage void segment_not_present(void);
asmlinkage void stack_segment(void);
asmlinkage void general_protection(void);
asmlinkage void page_fault(void);
asmlinkage void async_page_fault(void);
asmlinkage void spurious_interrupt_bug(void);
asmlinkage void coprocessor_error(void);
asmlinkage void alignment_check(void);

View File

@ -66,15 +66,23 @@
#define PIN_BASED_NMI_EXITING 0x00000008
#define PIN_BASED_VIRTUAL_NMIS 0x00000020
#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002
#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000
#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
#define VM_EXIT_SAVE_IA32_PAT 0x00040000
#define VM_EXIT_LOAD_IA32_PAT 0x00080000
#define VM_EXIT_SAVE_IA32_EFER 0x00100000
#define VM_EXIT_LOAD_IA32_EFER 0x00200000
#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000
#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002
#define VM_ENTRY_IA32E_MODE 0x00000200
#define VM_ENTRY_SMM 0x00000400
#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000
#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
#define VM_ENTRY_LOAD_IA32_EFER 0x00008000
/* VMCS Encodings */
enum vmcs_field {
@ -239,6 +247,7 @@ enum vmcs_field {
#define EXIT_REASON_TASK_SWITCH 9
#define EXIT_REASON_CPUID 10
#define EXIT_REASON_HLT 12
#define EXIT_REASON_INVD 13
#define EXIT_REASON_INVLPG 14
#define EXIT_REASON_RDPMC 15
#define EXIT_REASON_RDTSC 16
@ -296,6 +305,12 @@ enum vmcs_field {
#define GUEST_INTR_STATE_SMI 0x00000004
#define GUEST_INTR_STATE_NMI 0x00000008
/* GUEST_ACTIVITY_STATE flags */
#define GUEST_ACTIVITY_ACTIVE 0
#define GUEST_ACTIVITY_HLT 1
#define GUEST_ACTIVITY_SHUTDOWN 2
#define GUEST_ACTIVITY_WAIT_SIPI 3
/*
* Exit Qualifications for MOV for Control Register Access
*/

View File

@ -1406,6 +1406,16 @@ ENTRY(general_protection)
CFI_ENDPROC
END(general_protection)
#ifdef CONFIG_KVM_GUEST
ENTRY(async_page_fault)
RING0_EC_FRAME
pushl $do_async_page_fault
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
END(async_page_fault)
#endif
/*
* End of kprobes section
*/

View File

@ -1329,6 +1329,9 @@ errorentry xen_stack_segment do_stack_segment
#endif
errorentry general_protection do_general_protection
errorentry page_fault do_page_fault
#ifdef CONFIG_KVM_GUEST
errorentry async_page_fault do_async_page_fault
#endif
#ifdef CONFIG_X86_MCE
paranoidzeroentry machine_check *machine_check_vector(%rip)
#endif

View File

@ -169,6 +169,7 @@ int init_fpu(struct task_struct *tsk)
set_stopped_child_used_math(tsk);
return 0;
}
EXPORT_SYMBOL_GPL(init_fpu);
/*
* The xstateregs_active() routine is the same as the fpregs_active() routine,

View File

@ -27,16 +27,37 @@
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/hash.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/kprobes.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/tlbflush.h>
#define MMU_QUEUE_SIZE 1024
static int kvmapf = 1;
static int parse_no_kvmapf(char *arg)
{
kvmapf = 0;
return 0;
}
early_param("no-kvmapf", parse_no_kvmapf);
struct kvm_para_state {
u8 mmu_queue[MMU_QUEUE_SIZE];
int mmu_queue_len;
};
static DEFINE_PER_CPU(struct kvm_para_state, para_state);
static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
static struct kvm_para_state *kvm_para_state(void)
{
@ -50,6 +71,195 @@ static void kvm_io_delay(void)
{
}
#define KVM_TASK_SLEEP_HASHBITS 8
#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
struct kvm_task_sleep_node {
struct hlist_node link;
wait_queue_head_t wq;
u32 token;
int cpu;
bool halted;
struct mm_struct *mm;
};
static struct kvm_task_sleep_head {
spinlock_t lock;
struct hlist_head list;
} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
u32 token)
{
struct hlist_node *p;
hlist_for_each(p, &b->list) {
struct kvm_task_sleep_node *n =
hlist_entry(p, typeof(*n), link);
if (n->token == token)
return n;
}
return NULL;
}
void kvm_async_pf_task_wait(u32 token)
{
u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
struct kvm_task_sleep_node n, *e;
DEFINE_WAIT(wait);
int cpu, idle;
cpu = get_cpu();
idle = idle_cpu(cpu);
put_cpu();
spin_lock(&b->lock);
e = _find_apf_task(b, token);
if (e) {
/* dummy entry exists -> wake up was delivered ahead of PF */
hlist_del(&e->link);
kfree(e);
spin_unlock(&b->lock);
return;
}
n.token = token;
n.cpu = smp_processor_id();
n.mm = current->active_mm;
n.halted = idle || preempt_count() > 1;
atomic_inc(&n.mm->mm_count);
init_waitqueue_head(&n.wq);
hlist_add_head(&n.link, &b->list);
spin_unlock(&b->lock);
for (;;) {
if (!n.halted)
prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
if (hlist_unhashed(&n.link))
break;
if (!n.halted) {
local_irq_enable();
schedule();
local_irq_disable();
} else {
/*
* We cannot reschedule. So halt.
*/
native_safe_halt();
local_irq_disable();
}
}
if (!n.halted)
finish_wait(&n.wq, &wait);
return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
hlist_del_init(&n->link);
if (!n->mm)
return;
mmdrop(n->mm);
if (n->halted)
smp_send_reschedule(n->cpu);
else if (waitqueue_active(&n->wq))
wake_up(&n->wq);
}
static void apf_task_wake_all(void)
{
int i;
for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
struct hlist_node *p, *next;
struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
spin_lock(&b->lock);
hlist_for_each_safe(p, next, &b->list) {
struct kvm_task_sleep_node *n =
hlist_entry(p, typeof(*n), link);
if (n->cpu == smp_processor_id())
apf_task_wake_one(n);
}
spin_unlock(&b->lock);
}
}
void kvm_async_pf_task_wake(u32 token)
{
u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
struct kvm_task_sleep_node *n;
if (token == ~0) {
apf_task_wake_all();
return;
}
again:
spin_lock(&b->lock);
n = _find_apf_task(b, token);
if (!n) {
/*
* async PF was not yet handled.
* Add dummy entry for the token.
*/
n = kmalloc(sizeof(*n), GFP_ATOMIC);
if (!n) {
/*
* Allocation failed! Busy wait while other cpu
* handles async PF.
*/
spin_unlock(&b->lock);
cpu_relax();
goto again;
}
n->token = token;
n->cpu = smp_processor_id();
n->mm = NULL;
init_waitqueue_head(&n->wq);
hlist_add_head(&n->link, &b->list);
} else
apf_task_wake_one(n);
spin_unlock(&b->lock);
return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
u32 kvm_read_and_reset_pf_reason(void)
{
u32 reason = 0;
if (__get_cpu_var(apf_reason).enabled) {
reason = __get_cpu_var(apf_reason).reason;
__get_cpu_var(apf_reason).reason = 0;
}
return reason;
}
EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
dotraplinkage void __kprobes
do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
{
switch (kvm_read_and_reset_pf_reason()) {
default:
do_page_fault(regs, error_code);
break;
case KVM_PV_REASON_PAGE_NOT_PRESENT:
/* page is swapped out by the host. */
kvm_async_pf_task_wait((u32)read_cr2());
break;
case KVM_PV_REASON_PAGE_READY:
kvm_async_pf_task_wake((u32)read_cr2());
break;
}
}
static void kvm_mmu_op(void *buffer, unsigned len)
{
int r;
@ -231,10 +441,117 @@ static void __init paravirt_ops_setup(void)
#endif
}
void __init kvm_guest_init(void)
void __cpuinit kvm_guest_cpu_init(void)
{
if (!kvm_para_available())
return;
paravirt_ops_setup();
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
u64 pa = __pa(&__get_cpu_var(apf_reason));
#ifdef CONFIG_PREEMPT
pa |= KVM_ASYNC_PF_SEND_ALWAYS;
#endif
wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
__get_cpu_var(apf_reason).enabled = 1;
printk(KERN_INFO"KVM setup async PF for cpu %d\n",
smp_processor_id());
}
}
static void kvm_pv_disable_apf(void *unused)
{
if (!__get_cpu_var(apf_reason).enabled)
return;
wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
__get_cpu_var(apf_reason).enabled = 0;
printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
smp_processor_id());
}
static int kvm_pv_reboot_notify(struct notifier_block *nb,
unsigned long code, void *unused)
{
if (code == SYS_RESTART)
on_each_cpu(kvm_pv_disable_apf, NULL, 1);
return NOTIFY_DONE;
}
static struct notifier_block kvm_pv_reboot_nb = {
.notifier_call = kvm_pv_reboot_notify,
};
#ifdef CONFIG_SMP
static void __init kvm_smp_prepare_boot_cpu(void)
{
#ifdef CONFIG_KVM_CLOCK
WARN_ON(kvm_register_clock("primary cpu clock"));
#endif
kvm_guest_cpu_init();
native_smp_prepare_boot_cpu();
}
static void kvm_guest_cpu_online(void *dummy)
{
kvm_guest_cpu_init();
}
static void kvm_guest_cpu_offline(void *dummy)
{
kvm_pv_disable_apf(NULL);
apf_task_wake_all();
}
static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
int cpu = (unsigned long)hcpu;
switch (action) {
case CPU_ONLINE:
case CPU_DOWN_FAILED:
case CPU_ONLINE_FROZEN:
smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
break;
default:
break;
}
return NOTIFY_OK;
}
static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
.notifier_call = kvm_cpu_notify,
};
#endif
static void __init kvm_apf_trap_init(void)
{
set_intr_gate(14, &async_page_fault);
}
void __init kvm_guest_init(void)
{
int i;
if (!kvm_para_available())
return;
paravirt_ops_setup();
register_reboot_notifier(&kvm_pv_reboot_nb);
for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
spin_lock_init(&async_pf_sleepers[i].lock);
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
x86_init.irqs.trap_init = kvm_apf_trap_init;
#ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
register_cpu_notifier(&kvm_cpu_notifier);
#else
kvm_guest_cpu_init();
#endif
}

View File

@ -125,7 +125,7 @@ static struct clocksource kvm_clock = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
static int kvm_register_clock(char *txt)
int kvm_register_clock(char *txt)
{
int cpu = smp_processor_id();
int low, high, ret;
@ -152,14 +152,6 @@ static void __cpuinit kvm_setup_secondary_clock(void)
}
#endif
#ifdef CONFIG_SMP
static void __init kvm_smp_prepare_boot_cpu(void)
{
WARN_ON(kvm_register_clock("primary cpu clock"));
native_smp_prepare_boot_cpu();
}
#endif
/*
* After the clock is registered, the host will keep writing to the
* registered memory location. If the guest happens to shutdown, this memory
@ -205,9 +197,6 @@ void __init kvmclock_init(void)
#ifdef CONFIG_X86_LOCAL_APIC
x86_cpuinit.setup_percpu_clockev =
kvm_setup_secondary_clock;
#endif
#ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
#endif
machine_ops.shutdown = kvm_shutdown;
#ifdef CONFIG_KEXEC

View File

@ -28,6 +28,7 @@ config KVM
select HAVE_KVM_IRQCHIP
select HAVE_KVM_EVENTFD
select KVM_APIC_ARCHITECTURE
select KVM_ASYNC_PF
select USER_RETURN_NOTIFIER
select KVM_MMIO
---help---

View File

@ -1,5 +1,5 @@
EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
ccflags-y += -Ivirt/kvm -Iarch/x86/kvm
CFLAGS_x86.o := -I.
CFLAGS_svm.o := -I.
@ -9,6 +9,7 @@ kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
coalesced_mmio.o irq_comm.o eventfd.o \
assigned-dev.o)
kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
i8254.o timer.o

View File

@ -20,16 +20,8 @@
* From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
*/
#ifndef __KERNEL__
#include <stdio.h>
#include <stdint.h>
#include <public/xen.h>
#define DPRINTF(_f, _a ...) printf(_f , ## _a)
#else
#include <linux/kvm_host.h>
#include "kvm_cache_regs.h"
#define DPRINTF(x...) do {} while (0)
#endif
#include <linux/module.h>
#include <asm/kvm_emulate.h>
@ -418,9 +410,9 @@ address_mask(struct decode_cache *c, unsigned long reg)
}
static inline unsigned long
register_address(struct decode_cache *c, unsigned long base, unsigned long reg)
register_address(struct decode_cache *c, unsigned long reg)
{
return base + address_mask(c, reg);
return address_mask(c, reg);
}
static inline void
@ -452,60 +444,55 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
return ops->get_cached_segment_base(seg, ctxt->vcpu);
}
static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
struct decode_cache *c)
static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
struct decode_cache *c)
{
if (!c->has_seg_override)
return 0;
return seg_base(ctxt, ops, c->seg_override);
return c->seg_override;
}
static unsigned long es_base(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
static ulong linear(struct x86_emulate_ctxt *ctxt,
struct segmented_address addr)
{
return seg_base(ctxt, ops, VCPU_SREG_ES);
struct decode_cache *c = &ctxt->decode;
ulong la;
la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
if (c->ad_bytes != 8)
la &= (u32)-1;
return la;
}
static unsigned long ss_base(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
u32 error, bool valid)
{
return seg_base(ctxt, ops, VCPU_SREG_SS);
ctxt->exception.vector = vec;
ctxt->exception.error_code = error;
ctxt->exception.error_code_valid = valid;
return X86EMUL_PROPAGATE_FAULT;
}
static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
u32 error, bool valid)
static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
{
ctxt->exception = vec;
ctxt->error_code = error;
ctxt->error_code_valid = valid;
return emulate_exception(ctxt, GP_VECTOR, err, true);
}
static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
static int emulate_ud(struct x86_emulate_ctxt *ctxt)
{
emulate_exception(ctxt, GP_VECTOR, err, true);
return emulate_exception(ctxt, UD_VECTOR, 0, false);
}
static void emulate_pf(struct x86_emulate_ctxt *ctxt)
static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
{
emulate_exception(ctxt, PF_VECTOR, 0, true);
}
static void emulate_ud(struct x86_emulate_ctxt *ctxt)
{
emulate_exception(ctxt, UD_VECTOR, 0, false);
}
static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
{
emulate_exception(ctxt, TS_VECTOR, err, true);
return emulate_exception(ctxt, TS_VECTOR, err, true);
}
static int emulate_de(struct x86_emulate_ctxt *ctxt)
{
emulate_exception(ctxt, DE_VECTOR, 0, false);
return X86EMUL_PROPAGATE_FAULT;
return emulate_exception(ctxt, DE_VECTOR, 0, false);
}
static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
@ -520,7 +507,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
cur_size = fc->end - fc->start;
size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
size, ctxt->vcpu, NULL);
size, ctxt->vcpu, &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
fc->end += size;
@ -564,7 +551,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
static int read_descriptor(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
ulong addr,
struct segmented_address addr,
u16 *size, unsigned long *address, int op_bytes)
{
int rc;
@ -572,10 +559,13 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
if (op_bytes == 2)
op_bytes = 3;
*address = 0;
rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL);
rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2,
ctxt->vcpu, &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL);
addr.ea += 2;
rc = ops->read_std(linear(ctxt, addr), address, op_bytes,
ctxt->vcpu, &ctxt->exception);
return rc;
}
@ -768,7 +758,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
break;
}
}
op->addr.mem = modrm_ea;
op->addr.mem.ea = modrm_ea;
done:
return rc;
}
@ -783,13 +773,13 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
op->type = OP_MEM;
switch (c->ad_bytes) {
case 2:
op->addr.mem = insn_fetch(u16, 2, c->eip);
op->addr.mem.ea = insn_fetch(u16, 2, c->eip);
break;
case 4:
op->addr.mem = insn_fetch(u32, 4, c->eip);
op->addr.mem.ea = insn_fetch(u32, 4, c->eip);
break;
case 8:
op->addr.mem = insn_fetch(u64, 8, c->eip);
op->addr.mem.ea = insn_fetch(u64, 8, c->eip);
break;
}
done:
@ -808,7 +798,7 @@ static void fetch_bit_operand(struct decode_cache *c)
else if (c->src.bytes == 4)
sv = (s32)c->src.val & (s32)mask;
c->dst.addr.mem += (sv >> 3);
c->dst.addr.mem.ea += (sv >> 3);
}
/* only subword offset */
@ -821,7 +811,6 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
{
int rc;
struct read_cache *mc = &ctxt->decode.mem_read;
u32 err;
while (size) {
int n = min(size, 8u);
@ -829,10 +818,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
if (mc->pos < mc->end)
goto read_cached;
rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
ctxt->vcpu);
if (rc == X86EMUL_PROPAGATE_FAULT)
emulate_pf(ctxt);
rc = ops->read_emulated(addr, mc->data + mc->end, n,
&ctxt->exception, ctxt->vcpu);
if (rc != X86EMUL_CONTINUE)
return rc;
mc->end += n;
@ -907,19 +894,15 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
struct desc_ptr dt;
u16 index = selector >> 3;
int ret;
u32 err;
ulong addr;
get_descriptor_table_ptr(ctxt, ops, selector, &dt);
if (dt.size < index * 8 + 7) {
emulate_gp(ctxt, selector & 0xfffc);
return X86EMUL_PROPAGATE_FAULT;
}
if (dt.size < index * 8 + 7)
return emulate_gp(ctxt, selector & 0xfffc);
addr = dt.address + index * 8;
ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
if (ret == X86EMUL_PROPAGATE_FAULT)
emulate_pf(ctxt);
ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,
&ctxt->exception);
return ret;
}
@ -931,21 +914,17 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
{
struct desc_ptr dt;
u16 index = selector >> 3;
u32 err;
ulong addr;
int ret;
get_descriptor_table_ptr(ctxt, ops, selector, &dt);
if (dt.size < index * 8 + 7) {
emulate_gp(ctxt, selector & 0xfffc);
return X86EMUL_PROPAGATE_FAULT;
}
if (dt.size < index * 8 + 7)
return emulate_gp(ctxt, selector & 0xfffc);
addr = dt.address + index * 8;
ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
if (ret == X86EMUL_PROPAGATE_FAULT)
emulate_pf(ctxt);
ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu,
&ctxt->exception);
return ret;
}
@ -1092,7 +1071,6 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
{
int rc;
struct decode_cache *c = &ctxt->decode;
u32 err;
switch (c->dst.type) {
case OP_REG:
@ -1101,21 +1079,19 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
case OP_MEM:
if (c->lock_prefix)
rc = ops->cmpxchg_emulated(
c->dst.addr.mem,
linear(ctxt, c->dst.addr.mem),
&c->dst.orig_val,
&c->dst.val,
c->dst.bytes,
&err,
&ctxt->exception,
ctxt->vcpu);
else
rc = ops->write_emulated(
c->dst.addr.mem,
linear(ctxt, c->dst.addr.mem),
&c->dst.val,
c->dst.bytes,
&err,
&ctxt->exception,
ctxt->vcpu);
if (rc == X86EMUL_PROPAGATE_FAULT)
emulate_pf(ctxt);
if (rc != X86EMUL_CONTINUE)
return rc;
break;
@ -1137,8 +1113,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
c->dst.bytes = c->op_bytes;
c->dst.val = c->src.val;
register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
c->dst.addr.mem = register_address(c, ss_base(ctxt, ops),
c->regs[VCPU_REGS_RSP]);
c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
c->dst.addr.mem.seg = VCPU_SREG_SS;
}
static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@ -1147,10 +1123,11 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
{
struct decode_cache *c = &ctxt->decode;
int rc;
struct segmented_address addr;
rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops),
c->regs[VCPU_REGS_RSP]),
dest, len);
addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
addr.seg = VCPU_SREG_SS;
rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len);
if (rc != X86EMUL_CONTINUE)
return rc;
@ -1184,10 +1161,8 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
change_mask |= EFLG_IF;
break;
case X86EMUL_MODE_VM86:
if (iopl < 3) {
emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
if (iopl < 3)
return emulate_gp(ctxt, 0);
change_mask |= EFLG_IF;
break;
default: /* real mode */
@ -1198,9 +1173,6 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
*(unsigned long *)dest =
(ctxt->eflags & ~change_mask) | (val & change_mask);
if (rc == X86EMUL_PROPAGATE_FAULT)
emulate_pf(ctxt);
return rc;
}
@ -1287,7 +1259,6 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
gva_t cs_addr;
gva_t eip_addr;
u16 cs, eip;
u32 err;
/* TODO: Add limit checks */
c->src.val = ctxt->eflags;
@ -1317,11 +1288,11 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
eip_addr = dt.address + (irq << 2);
cs_addr = dt.address + (irq << 2) + 2;
rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err);
rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err);
rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
@ -1370,10 +1341,8 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
if (rc != X86EMUL_CONTINUE)
return rc;
if (temp_eip & ~0xffff) {
emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
if (temp_eip & ~0xffff)
return emulate_gp(ctxt, 0);
rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
@ -1624,10 +1593,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
/* syscall is not available in real mode */
if (ctxt->mode == X86EMUL_MODE_REAL ||
ctxt->mode == X86EMUL_MODE_VM86) {
emulate_ud(ctxt);
return X86EMUL_PROPAGATE_FAULT;
}
ctxt->mode == X86EMUL_MODE_VM86)
return emulate_ud(ctxt);
setup_syscalls_segments(ctxt, ops, &cs, &ss);
@ -1678,34 +1645,26 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
u16 cs_sel, ss_sel;
/* inject #GP if in real mode */
if (ctxt->mode == X86EMUL_MODE_REAL) {
emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
if (ctxt->mode == X86EMUL_MODE_REAL)
return emulate_gp(ctxt, 0);
/* XXX sysenter/sysexit have not been tested in 64bit mode.
* Therefore, we inject an #UD.
*/
if (ctxt->mode == X86EMUL_MODE_PROT64) {
emulate_ud(ctxt);
return X86EMUL_PROPAGATE_FAULT;
}
if (ctxt->mode == X86EMUL_MODE_PROT64)
return emulate_ud(ctxt);
setup_syscalls_segments(ctxt, ops, &cs, &ss);
ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
switch (ctxt->mode) {
case X86EMUL_MODE_PROT32:
if ((msr_data & 0xfffc) == 0x0) {
emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
if ((msr_data & 0xfffc) == 0x0)
return emulate_gp(ctxt, 0);
break;
case X86EMUL_MODE_PROT64:
if (msr_data == 0x0) {
emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
if (msr_data == 0x0)
return emulate_gp(ctxt, 0);
break;
}
@ -1745,10 +1704,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
/* inject #GP if in real mode or Virtual 8086 mode */
if (ctxt->mode == X86EMUL_MODE_REAL ||
ctxt->mode == X86EMUL_MODE_VM86) {
emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
ctxt->mode == X86EMUL_MODE_VM86)
return emulate_gp(ctxt, 0);
setup_syscalls_segments(ctxt, ops, &cs, &ss);
@ -1763,18 +1720,14 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
switch (usermode) {
case X86EMUL_MODE_PROT32:
cs_sel = (u16)(msr_data + 16);
if ((msr_data & 0xfffc) == 0x0) {
emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
if ((msr_data & 0xfffc) == 0x0)
return emulate_gp(ctxt, 0);
ss_sel = (u16)(msr_data + 24);
break;
case X86EMUL_MODE_PROT64:
cs_sel = (u16)(msr_data + 32);
if (msr_data == 0x0) {
emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
if (msr_data == 0x0)
return emulate_gp(ctxt, 0);
ss_sel = cs_sel + 8;
cs.d = 0;
cs.l = 1;
@ -1934,33 +1887,27 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
{
struct tss_segment_16 tss_seg;
int ret;
u32 err, new_tss_base = get_desc_base(new_desc);
u32 new_tss_base = get_desc_base(new_desc);
ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
emulate_pf(ctxt);
return ret;
}
save_state_to_tss16(ctxt, ops, &tss_seg);
ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
emulate_pf(ctxt);
return ret;
}
ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
emulate_pf(ctxt);
return ret;
}
if (old_tss_sel != 0xffff) {
tss_seg.prev_task_link = old_tss_sel;
@ -1968,12 +1915,10 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
ret = ops->write_std(new_tss_base,
&tss_seg.prev_task_link,
sizeof tss_seg.prev_task_link,
ctxt->vcpu, &err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
ctxt->vcpu, &ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
emulate_pf(ctxt);
return ret;
}
}
return load_state_from_tss16(ctxt, ops, &tss_seg);
@ -2013,10 +1958,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
struct decode_cache *c = &ctxt->decode;
int ret;
if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) {
emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
if (ops->set_cr(3, tss->cr3, ctxt->vcpu))
return emulate_gp(ctxt, 0);
c->eip = tss->eip;
ctxt->eflags = tss->eflags | 2;
c->regs[VCPU_REGS_RAX] = tss->eax;
@ -2076,33 +2019,27 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
{
struct tss_segment_32 tss_seg;
int ret;
u32 err, new_tss_base = get_desc_base(new_desc);
u32 new_tss_base = get_desc_base(new_desc);
ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
emulate_pf(ctxt);
return ret;
}
save_state_to_tss32(ctxt, ops, &tss_seg);
ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
emulate_pf(ctxt);
return ret;
}
ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
emulate_pf(ctxt);
return ret;
}
if (old_tss_sel != 0xffff) {
tss_seg.prev_task_link = old_tss_sel;
@ -2110,12 +2047,10 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
ret = ops->write_std(new_tss_base,
&tss_seg.prev_task_link,
sizeof tss_seg.prev_task_link,
ctxt->vcpu, &err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
ctxt->vcpu, &ctxt->exception);
if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
emulate_pf(ctxt);
return ret;
}
}
return load_state_from_tss32(ctxt, ops, &tss_seg);
@ -2146,10 +2081,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
if (reason != TASK_SWITCH_IRET) {
if ((tss_selector & 3) > next_tss_desc.dpl ||
ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
ops->cpl(ctxt->vcpu) > next_tss_desc.dpl)
return emulate_gp(ctxt, 0);
}
desc_limit = desc_limit_scaled(&next_tss_desc);
@ -2231,14 +2164,15 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
}
static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
int reg, struct operand *op)
{
struct decode_cache *c = &ctxt->decode;
int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
register_address_increment(c, &c->regs[reg], df * op->bytes);
op->addr.mem = register_address(c, base, c->regs[reg]);
op->addr.mem.ea = register_address(c, c->regs[reg]);
op->addr.mem.seg = seg;
}
static int em_push(struct x86_emulate_ctxt *ctxt)
@ -2369,10 +2303,8 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
struct decode_cache *c = &ctxt->decode;
u64 tsc = 0;
if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) {
emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD))
return emulate_gp(ctxt, 0);
ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc);
c->regs[VCPU_REGS_RAX] = (u32)tsc;
c->regs[VCPU_REGS_RDX] = tsc >> 32;
@ -2647,7 +2579,7 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
op->type = OP_IMM;
op->bytes = size;
op->addr.mem = c->eip;
op->addr.mem.ea = c->eip;
/* NB. Immediates are sign-extended as necessary. */
switch (op->bytes) {
case 1:
@ -2678,7 +2610,7 @@ done:
}
int
x86_decode_insn(struct x86_emulate_ctxt *ctxt)
x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
{
struct x86_emulate_ops *ops = ctxt->ops;
struct decode_cache *c = &ctxt->decode;
@ -2689,7 +2621,10 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt)
struct operand memop = { .type = OP_NONE };
c->eip = ctxt->eip;
c->fetch.start = c->fetch.end = c->eip;
c->fetch.start = c->eip;
c->fetch.end = c->fetch.start + insn_len;
if (insn_len > 0)
memcpy(c->fetch.data, insn, insn_len);
ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
switch (mode) {
@ -2803,10 +2738,8 @@ done_prefixes:
c->execute = opcode.u.execute;
/* Unrecognised? */
if (c->d == 0 || (c->d & Undefined)) {
DPRINTF("Cannot emulate %02x\n", c->b);
if (c->d == 0 || (c->d & Undefined))
return -1;
}
if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
c->op_bytes = 8;
@ -2831,14 +2764,13 @@ done_prefixes:
if (!c->has_seg_override)
set_seg_override(c, VCPU_SREG_DS);
if (memop.type == OP_MEM && !(!c->twobyte && c->b == 0x8d))
memop.addr.mem += seg_override_base(ctxt, ops, c);
memop.addr.mem.seg = seg_override(ctxt, ops, c);
if (memop.type == OP_MEM && c->ad_bytes != 8)
memop.addr.mem = (u32)memop.addr.mem;
memop.addr.mem.ea = (u32)memop.addr.mem.ea;
if (memop.type == OP_MEM && c->rip_relative)
memop.addr.mem += c->eip;
memop.addr.mem.ea += c->eip;
/*
* Decode and fetch the source operand: register, memory
@ -2890,14 +2822,14 @@ done_prefixes:
case SrcSI:
c->src.type = OP_MEM;
c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
c->src.addr.mem =
register_address(c, seg_override_base(ctxt, ops, c),
c->regs[VCPU_REGS_RSI]);
c->src.addr.mem.ea =
register_address(c, c->regs[VCPU_REGS_RSI]);
c->src.addr.mem.seg = seg_override(ctxt, ops, c);
c->src.val = 0;
break;
case SrcImmFAddr:
c->src.type = OP_IMM;
c->src.addr.mem = c->eip;
c->src.addr.mem.ea = c->eip;
c->src.bytes = c->op_bytes + 2;
insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
break;
@ -2944,7 +2876,7 @@ done_prefixes:
break;
case DstImmUByte:
c->dst.type = OP_IMM;
c->dst.addr.mem = c->eip;
c->dst.addr.mem.ea = c->eip;
c->dst.bytes = 1;
c->dst.val = insn_fetch(u8, 1, c->eip);
break;
@ -2969,9 +2901,9 @@ done_prefixes:
case DstDI:
c->dst.type = OP_MEM;
c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
c->dst.addr.mem =
register_address(c, es_base(ctxt, ops),
c->regs[VCPU_REGS_RDI]);
c->dst.addr.mem.ea =
register_address(c, c->regs[VCPU_REGS_RDI]);
c->dst.addr.mem.seg = VCPU_SREG_ES;
c->dst.val = 0;
break;
case ImplicitOps:
@ -3020,24 +2952,24 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
ctxt->decode.mem_read.pos = 0;
if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
emulate_ud(ctxt);
rc = emulate_ud(ctxt);
goto done;
}
/* LOCK prefix is allowed only with some instructions */
if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
emulate_ud(ctxt);
rc = emulate_ud(ctxt);
goto done;
}
if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
emulate_ud(ctxt);
rc = emulate_ud(ctxt);
goto done;
}
/* Privileged instruction can be executed only in CPL=0 */
if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
emulate_gp(ctxt, 0);
rc = emulate_gp(ctxt, 0);
goto done;
}
@ -3050,7 +2982,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
}
if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
rc = read_emulated(ctxt, ops, c->src.addr.mem,
rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem),
c->src.valptr, c->src.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
@ -3058,7 +2990,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
}
if (c->src2.type == OP_MEM) {
rc = read_emulated(ctxt, ops, c->src2.addr.mem,
rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem),
&c->src2.val, c->src2.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
@ -3070,7 +3002,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
/* optimisation - avoid slow emulated read if Mov */
rc = read_emulated(ctxt, ops, c->dst.addr.mem,
rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem),
&c->dst.val, c->dst.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
@ -3215,13 +3147,13 @@ special_insn:
break;
case 0x8c: /* mov r/m, sreg */
if (c->modrm_reg > VCPU_SREG_GS) {
emulate_ud(ctxt);
rc = emulate_ud(ctxt);
goto done;
}
c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
break;
case 0x8d: /* lea r16/r32, m */
c->dst.val = c->src.addr.mem;
c->dst.val = c->src.addr.mem.ea;
break;
case 0x8e: { /* mov seg, r/m16 */
uint16_t sel;
@ -3230,7 +3162,7 @@ special_insn:
if (c->modrm_reg == VCPU_SREG_CS ||
c->modrm_reg > VCPU_SREG_GS) {
emulate_ud(ctxt);
rc = emulate_ud(ctxt);
goto done;
}
@ -3268,7 +3200,6 @@ special_insn:
break;
case 0xa6 ... 0xa7: /* cmps */
c->dst.type = OP_NONE; /* Disable writeback. */
DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem);
goto cmp;
case 0xa8 ... 0xa9: /* test ax, imm */
goto test;
@ -3363,7 +3294,7 @@ special_insn:
do_io_in:
c->dst.bytes = min(c->dst.bytes, 4u);
if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
emulate_gp(ctxt, 0);
rc = emulate_gp(ctxt, 0);
goto done;
}
if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
@ -3377,7 +3308,7 @@ special_insn:
c->src.bytes = min(c->src.bytes, 4u);
if (!emulator_io_permited(ctxt, ops, c->dst.val,
c->src.bytes)) {
emulate_gp(ctxt, 0);
rc = emulate_gp(ctxt, 0);
goto done;
}
ops->pio_out_emulated(c->src.bytes, c->dst.val,
@ -3402,14 +3333,14 @@ special_insn:
break;
case 0xfa: /* cli */
if (emulator_bad_iopl(ctxt, ops)) {
emulate_gp(ctxt, 0);
rc = emulate_gp(ctxt, 0);
goto done;
} else
ctxt->eflags &= ~X86_EFLAGS_IF;
break;
case 0xfb: /* sti */
if (emulator_bad_iopl(ctxt, ops)) {
emulate_gp(ctxt, 0);
rc = emulate_gp(ctxt, 0);
goto done;
} else {
ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
@ -3449,11 +3380,11 @@ writeback:
c->dst.type = saved_dst_type;
if ((c->d & SrcMask) == SrcSI)
string_addr_inc(ctxt, seg_override_base(ctxt, ops, c),
string_addr_inc(ctxt, seg_override(ctxt, ops, c),
VCPU_REGS_RSI, &c->src);
if ((c->d & DstMask) == DstDI)
string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI,
string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
&c->dst);
if (c->rep_prefix && (c->d & String)) {
@ -3482,6 +3413,8 @@ writeback:
ctxt->eip = c->eip;
done:
if (rc == X86EMUL_PROPAGATE_FAULT)
ctxt->have_exception = true;
return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
twobyte_insn:
@ -3544,9 +3477,11 @@ twobyte_insn:
break;
case 5: /* not defined */
emulate_ud(ctxt);
rc = X86EMUL_PROPAGATE_FAULT;
goto done;
case 7: /* invlpg*/
emulate_invlpg(ctxt->vcpu, c->src.addr.mem);
emulate_invlpg(ctxt->vcpu,
linear(ctxt, c->src.addr.mem));
/* Disable writeback. */
c->dst.type = OP_NONE;
break;
@ -3573,6 +3508,7 @@ twobyte_insn:
case 5 ... 7:
case 9 ... 15:
emulate_ud(ctxt);
rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu);
@ -3581,6 +3517,7 @@ twobyte_insn:
if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
(c->modrm_reg == 4 || c->modrm_reg == 5)) {
emulate_ud(ctxt);
rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu);
@ -3588,6 +3525,7 @@ twobyte_insn:
case 0x22: /* mov reg, cr */
if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) {
emulate_gp(ctxt, 0);
rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
c->dst.type = OP_NONE;
@ -3596,6 +3534,7 @@ twobyte_insn:
if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
(c->modrm_reg == 4 || c->modrm_reg == 5)) {
emulate_ud(ctxt);
rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
@ -3604,6 +3543,7 @@ twobyte_insn:
~0ULL : ~0U), ctxt->vcpu) < 0) {
/* #UD condition is already handled by the code above */
emulate_gp(ctxt, 0);
rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
@ -3615,6 +3555,7 @@ twobyte_insn:
| ((u64)c->regs[VCPU_REGS_RDX] << 32);
if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
emulate_gp(ctxt, 0);
rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
rc = X86EMUL_CONTINUE;
@ -3623,6 +3564,7 @@ twobyte_insn:
/* rdmsr */
if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
emulate_gp(ctxt, 0);
rc = X86EMUL_PROPAGATE_FAULT;
goto done;
} else {
c->regs[VCPU_REGS_RAX] = (u32)msr_data;
@ -3785,6 +3727,5 @@ twobyte_insn:
goto writeback;
cannot_emulate:
DPRINTF("Cannot emulate %02x\n", c->b);
return -1;
}
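The hunks above stop storing a pre-computed linear address in op->addr.mem and instead keep a segmented pair, op->addr.mem.ea plus op->addr.mem.seg, that is only folded into a linear address when memory is actually touched (the linear(ctxt, ...) calls in x86_emulate_insn). The header side of that change is not in this excerpt; the sketch below shows roughly what the address type and helper look like, inferred from how the hunks use them, so the struct name, the masking and the exact seg_base() plumbing are assumptions rather than the verbatim patch.

struct segmented_address {
	ulong ea;	/* effective address inside the segment */
	unsigned seg;	/* VCPU_SREG_* index, e.g. VCPU_SREG_ES */
};

/* Sketch: combine the segment base with the effective address. */
static ulong linear(struct x86_emulate_ctxt *ctxt,
		    struct segmented_address addr)
{
	struct decode_cache *c = &ctxt->decode;
	ulong la;

	la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
	if (c->ad_bytes != 8)
		la &= (u32)-1;	/* truncate outside 64-bit mode */
	return la;
}

Keeping the segment association alive until the access happens is what lets string_addr_inc() and the SrcSI/DstDI cases above record only a segment index instead of repeatedly resolving es_base()/seg_override_base().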


@ -73,6 +73,13 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
return vcpu->arch.cr4 & mask;
}
static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
{
if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
kvm_x86_ops->decache_cr3(vcpu);
return vcpu->arch.cr3;
}
static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
{
return kvm_read_cr4_bits(vcpu, ~0UL);
@ -84,4 +91,19 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
| ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
}
static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
{
vcpu->arch.hflags |= HF_GUEST_MASK;
}
static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
{
vcpu->arch.hflags &= ~HF_GUEST_MASK;
}
static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
{
return vcpu->arch.hflags & HF_GUEST_MASK;
}
#endif
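enter_guest_mode(), leave_guest_mode() and is_guest_mode() give the rest of the code a single flag, HF_GUEST_MASK in vcpu->arch.hflags, that says whether a nested (L2) guest is currently running. A minimal sketch of the intended usage in a nested-capable backend follows; the example_* function names are placeholders of mine, not functions added by this series.

/* On an emulated VMRUN/VMLAUNCH: L1 hands the CPU to L2. */
static void example_start_nested_guest(struct kvm_vcpu *vcpu)
{
	enter_guest_mode(vcpu);
	/* ... load L2 control and guest state ... */
}

/* On an emulated #VMEXIT: control returns to L1. */
static void example_finish_nested_exit(struct kvm_vcpu *vcpu)
{
	leave_guest_mode(vcpu);
	/* ... restore L1 state ... */
}

static bool example_exit_reflects_to_l1(struct kvm_vcpu *vcpu)
{
	/* Intercept and injection paths simply branch on the flag. */
	return is_guest_mode(vcpu);
}

In the same header, kvm_read_cr3() becomes the sanctioned way to read the guest CR3: the value is pulled out of hardware lazily through the new decache_cr3 hook the first time it is needed after VCPU_EXREG_CR3 has been cleared from regs_avail.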


@ -277,7 +277,8 @@ static void apic_update_ppr(struct kvm_lapic *apic)
if (old_ppr != ppr) {
apic_set_reg(apic, APIC_PROCPRI, ppr);
kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
if (ppr < old_ppr)
kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
}
}


@ -18,9 +18,11 @@
*
*/
#include "irq.h"
#include "mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
#include "x86.h"
#include <linux/kvm_host.h>
#include <linux/types.h>
@ -194,7 +196,6 @@ static struct percpu_counter kvm_total_used_mmu_pages;
static u64 __read_mostly shadow_trap_nonpresent_pte;
static u64 __read_mostly shadow_notrap_nonpresent_pte;
static u64 __read_mostly shadow_base_present_pte;
static u64 __read_mostly shadow_nx_mask;
static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
@ -213,12 +214,6 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
void kvm_mmu_set_base_ptes(u64 base_pte)
{
shadow_base_present_pte = base_pte;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask)
{
@ -482,46 +477,46 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
}
/*
* Return the pointer to the largepage write count for a given
* gfn, handling slots that are not large page aligned.
* Return the pointer to the large page information for a given gfn,
* handling slots that are not large page aligned.
*/
static int *slot_largepage_idx(gfn_t gfn,
struct kvm_memory_slot *slot,
int level)
static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
struct kvm_memory_slot *slot,
int level)
{
unsigned long idx;
idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
(slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
return &slot->lpage_info[level - 2][idx].write_count;
return &slot->lpage_info[level - 2][idx];
}
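A quick worked example of the index computation in lpage_info_slot(), assuming the usual x86 value KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL) == 9 (512 small pages per 2M page); the slot numbers are invented for illustration:

/*
 * slot->base_gfn = 0x12000, gfn = 0x12345, level = PT_DIRECTORY_LEVEL:
 *   idx = (0x12345 >> 9) - (0x12000 >> 9) = 0x91 - 0x90 = 1
 * so lpage_info_slot() returns &slot->lpage_info[0][1], the second
 * 2M-aligned chunk of the slot, and callers bump write_count or use
 * rmap_pde there without re-deriving the offset themselves.
 */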
static void account_shadowed(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *slot;
int *write_count;
struct kvm_lpage_info *linfo;
int i;
slot = gfn_to_memslot(kvm, gfn);
for (i = PT_DIRECTORY_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
write_count = slot_largepage_idx(gfn, slot, i);
*write_count += 1;
linfo = lpage_info_slot(gfn, slot, i);
linfo->write_count += 1;
}
}
static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *slot;
int *write_count;
struct kvm_lpage_info *linfo;
int i;
slot = gfn_to_memslot(kvm, gfn);
for (i = PT_DIRECTORY_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
write_count = slot_largepage_idx(gfn, slot, i);
*write_count -= 1;
WARN_ON(*write_count < 0);
linfo = lpage_info_slot(gfn, slot, i);
linfo->write_count -= 1;
WARN_ON(linfo->write_count < 0);
}
}
@ -530,12 +525,12 @@ static int has_wrprotected_page(struct kvm *kvm,
int level)
{
struct kvm_memory_slot *slot;
int *largepage_idx;
struct kvm_lpage_info *linfo;
slot = gfn_to_memslot(kvm, gfn);
if (slot) {
largepage_idx = slot_largepage_idx(gfn, slot, level);
return *largepage_idx;
linfo = lpage_info_slot(gfn, slot, level);
return linfo->write_count;
}
return 1;
@ -590,16 +585,15 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
{
struct kvm_memory_slot *slot;
unsigned long idx;
struct kvm_lpage_info *linfo;
slot = gfn_to_memslot(kvm, gfn);
if (likely(level == PT_PAGE_TABLE_LEVEL))
return &slot->rmap[gfn - slot->base_gfn];
idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
(slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
linfo = lpage_info_slot(gfn, slot, level);
return &slot->lpage_info[level - 2][idx].rmap_pde;
return &linfo->rmap_pde;
}
/*
@ -887,19 +881,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
end = start + (memslot->npages << PAGE_SHIFT);
if (hva >= start && hva < end) {
gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
gfn_t gfn = memslot->base_gfn + gfn_offset;
ret = handler(kvm, &memslot->rmap[gfn_offset], data);
for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
unsigned long idx;
int sh;
struct kvm_lpage_info *linfo;
sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j);
idx = ((memslot->base_gfn+gfn_offset) >> sh) -
(memslot->base_gfn >> sh);
ret |= handler(kvm,
&memslot->lpage_info[j][idx].rmap_pde,
data);
linfo = lpage_info_slot(gfn, memslot,
PT_DIRECTORY_LEVEL + j);
ret |= handler(kvm, &linfo->rmap_pde, data);
}
trace_kvm_age_page(hva, memslot, ret);
retval |= ret;
@ -1161,7 +1152,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
}
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, bool clear_unsync)
struct kvm_mmu_page *sp)
{
return 1;
}
@ -1291,7 +1282,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (clear_unsync)
kvm_unlink_unsync_page(vcpu->kvm, sp);
if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return 1;
}
@ -1332,12 +1323,12 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
continue;
WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
kvm_unlink_unsync_page(vcpu->kvm, s);
if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
(vcpu->arch.mmu.sync_page(vcpu, s, true))) {
(vcpu->arch.mmu.sync_page(vcpu, s))) {
kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
continue;
}
kvm_unlink_unsync_page(vcpu->kvm, s);
flush = true;
}
@ -1963,9 +1954,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int user_fault,
int write_fault, int dirty, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
bool can_unsync, bool reset_host_protection)
bool can_unsync, bool host_writable)
{
u64 spte;
u64 spte, entry = *sptep;
int ret = 0;
/*
@ -1973,7 +1964,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
* whether the guest actually used the pte (in order to detect
* demand paging).
*/
spte = shadow_base_present_pte;
spte = PT_PRESENT_MASK;
if (!speculative)
spte |= shadow_accessed_mask;
if (!dirty)
@ -1990,8 +1981,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
kvm_is_mmio_pfn(pfn));
if (reset_host_protection)
if (host_writable)
spte |= SPTE_HOST_WRITEABLE;
else
pte_access &= ~ACC_WRITE_MASK;
spte |= (u64)pfn << PAGE_SHIFT;
@ -2036,6 +2029,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
set_pte:
update_spte(sptep, spte);
/*
* If we overwrite a writable spte with a read-only one we
* should flush remote TLBs. Otherwise rmap_write_protect
* will find a read-only spte, even though the writable spte
* might be cached on a CPU's TLB.
*/
if (is_writable_pte(entry) && !is_writable_pte(*sptep))
kvm_flush_remote_tlbs(vcpu->kvm);
done:
return ret;
}
@ -2045,7 +2046,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
int user_fault, int write_fault, int dirty,
int *ptwrite, int level, gfn_t gfn,
pfn_t pfn, bool speculative,
bool reset_host_protection)
bool host_writable)
{
int was_rmapped = 0;
int rmap_count;
@ -2080,7 +2081,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
dirty, level, gfn, pfn, speculative, true,
reset_host_protection)) {
host_writable)) {
if (write_fault)
*ptwrite = 1;
kvm_mmu_flush_tlb(vcpu);
@ -2211,7 +2212,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
}
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
int level, gfn_t gfn, pfn_t pfn)
int map_writable, int level, gfn_t gfn, pfn_t pfn,
bool prefault)
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
@ -2220,9 +2222,11 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
if (iterator.level == level) {
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
unsigned pte_access = ACC_ALL;
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
0, write, 1, &pt_write,
level, gfn, pfn, false, true);
level, gfn, pfn, prefault, map_writable);
direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu->stat.pf_fixed;
break;
@ -2277,12 +2281,17 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
return 1;
}
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gva_t gva, pfn_t *pfn, bool write, bool *writable);
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
bool prefault)
{
int r;
int level;
pfn_t pfn;
unsigned long mmu_seq;
bool map_writable;
level = mapping_level(vcpu, gfn);
@ -2297,7 +2306,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, gfn);
if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
return 0;
/* mmio */
if (is_error_pfn(pfn))
@ -2307,7 +2318,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
r = __direct_map(vcpu, v, write, level, gfn, pfn);
r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
prefault);
spin_unlock(&vcpu->kvm->mmu_lock);
@ -2530,6 +2542,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
hpa_t root = vcpu->arch.mmu.root_hpa;
sp = page_header(root);
mmu_sync_children(vcpu, sp);
trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
return;
}
for (i = 0; i < 4; ++i) {
@ -2552,23 +2565,24 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
}
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
u32 access, u32 *error)
u32 access, struct x86_exception *exception)
{
if (error)
*error = 0;
if (exception)
exception->error_code = 0;
return vaddr;
}
static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
u32 access, u32 *error)
u32 access,
struct x86_exception *exception)
{
if (error)
*error = 0;
if (exception)
exception->error_code = 0;
return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
}
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code)
u32 error_code, bool prefault)
{
gfn_t gfn;
int r;
@ -2584,17 +2598,67 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
gfn = gva >> PAGE_SHIFT;
return nonpaging_map(vcpu, gva & PAGE_MASK,
error_code & PFERR_WRITE_MASK, gfn);
error_code & PFERR_WRITE_MASK, gfn, prefault);
}
static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
u32 error_code)
static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
{
struct kvm_arch_async_pf arch;
arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
arch.gfn = gfn;
arch.direct_map = vcpu->arch.mmu.direct_map;
arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
}
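The token built in kvm_arch_setup_async_pf() is what later lets the completion side tell the guest which outstanding fault finished. A worked example of the encoding above, with invented numbers:

/*
 * token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id
 * e.g. vcpu_id = 3 and apf.id = 7 gives token = (7 << 12) | 3 = 0x7003.
 * The low 12 bits carry the vcpu id and the upper bits a per-vcpu
 * sequence number, so each in-flight async page fault gets a distinct
 * token.
 */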
static bool can_do_async_pf(struct kvm_vcpu *vcpu)
{
if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
kvm_event_needs_reinjection(vcpu)))
return false;
return kvm_x86_ops->interrupt_allowed(vcpu);
}
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gva_t gva, pfn_t *pfn, bool write, bool *writable)
{
bool async;
*pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
if (!async)
return false; /* *pfn has correct page already */
put_page(pfn_to_page(*pfn));
if (!prefault && can_do_async_pf(vcpu)) {
trace_kvm_try_async_get_page(gva, gfn);
if (kvm_find_async_pf_gfn(vcpu, gfn)) {
trace_kvm_async_pf_doublefault(gva, gfn);
kvm_make_request(KVM_REQ_APF_HALT, vcpu);
return true;
} else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
return true;
}
*pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
return false;
}
static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
bool prefault)
{
pfn_t pfn;
int r;
int level;
gfn_t gfn = gpa >> PAGE_SHIFT;
unsigned long mmu_seq;
int write = error_code & PFERR_WRITE_MASK;
bool map_writable;
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@ -2609,15 +2673,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, gfn);
if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
return 0;
/* mmio */
if (is_error_pfn(pfn))
return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
level, gfn, pfn);
r = __direct_map(vcpu, gpa, write, map_writable,
level, gfn, pfn, prefault);
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
@ -2659,18 +2727,19 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
mmu_free_roots(vcpu);
}
static unsigned long get_cr3(struct kvm_vcpu *vcpu)
{
return vcpu->arch.cr3;
return kvm_read_cr3(vcpu);
}
static void inject_page_fault(struct kvm_vcpu *vcpu)
static void inject_page_fault(struct kvm_vcpu *vcpu,
struct x86_exception *fault)
{
vcpu->arch.mmu.inject_page_fault(vcpu);
vcpu->arch.mmu.inject_page_fault(vcpu, fault);
}
static void paging_free(struct kvm_vcpu *vcpu)
@ -2816,6 +2885,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = vcpu->arch.walk_mmu;
context->base_role.word = 0;
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = tdp_page_fault;
context->free = nonpaging_free;
@ -3008,9 +3078,6 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
return;
}
if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
return;
++vcpu->kvm->stat.mmu_pte_updated;
if (!sp->role.cr4_pae)
paging32_update_pte(vcpu, sp, spte, new);
@ -3264,12 +3331,13 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
}
}
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
void *insn, int insn_len)
{
int r;
enum emulation_result er;
r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
if (r < 0)
goto out;
@ -3282,7 +3350,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
if (r)
goto out;
er = emulate_instruction(vcpu, cr2, error_code, 0);
er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);
switch (er) {
case EMULATE_DONE:
@ -3377,11 +3445,14 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
if (!test_bit(slot, sp->slot_bitmap))
continue;
if (sp->role.level != PT_PAGE_TABLE_LEVEL)
continue;
pt = sp->spt;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
/* avoid RMW */
if (is_writable_pte(pt[i]))
pt[i] &= ~PT_WRITABLE_MASK;
update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
}
kvm_flush_remote_tlbs(kvm);
}
@ -3463,13 +3534,6 @@ static void mmu_destroy_caches(void)
kmem_cache_destroy(mmu_page_header_cache);
}
void kvm_mmu_module_exit(void)
{
mmu_destroy_caches();
percpu_counter_destroy(&kvm_total_used_mmu_pages);
unregister_shrinker(&mmu_shrinker);
}
int kvm_mmu_module_init(void)
{
pte_chain_cache = kmem_cache_create("kvm_pte_chain",
@ -3566,7 +3630,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
(void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
(void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
return 1;
}
@ -3662,12 +3726,6 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
}
EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
#ifdef CONFIG_KVM_MMU_AUDIT
#include "mmu_audit.c"
#else
static void mmu_audit_disable(void) { }
#endif
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
@ -3675,5 +3733,18 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
destroy_kvm_mmu(vcpu);
free_mmu_pages(vcpu);
mmu_free_memory_caches(vcpu);
}
#ifdef CONFIG_KVM_MMU_AUDIT
#include "mmu_audit.c"
#else
static void mmu_audit_disable(void) { }
#endif
void kvm_mmu_module_exit(void)
{
mmu_destroy_caches();
percpu_counter_destroy(&kvm_total_used_mmu_pages);
unregister_shrinker(&mmu_shrinker);
mmu_audit_disable();
}
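Looking back at the kvm_mmu_page_fault() change above: it now carries the faulting instruction bytes and their length straight through to x86_emulate_instruction(), and together with the x86_decode_insn() change earlier in this diff that pre-seeds the emulator's fetch cache so the bytes do not have to be re-read from guest memory. The VMX call sites in this excerpt pass NULL and 0. A backend that has already captured the bytes at exit time could forward them as sketched below; the function and variable names are hypothetical, only the kvm_mmu_page_fault() signature comes from the patch.

/* Sketch: an exit handler that captured the faulting instruction bytes. */
static int example_handle_fault(struct kvm_vcpu *vcpu, gva_t cr2,
				u32 error_code, u8 *insn_bytes, int insn_len)
{
	/* insn_bytes may be NULL (with insn_len == 0) if nothing was saved. */
	return kvm_mmu_page_fault(vcpu, cr2, error_code, insn_bytes, insn_len);
}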


@ -19,11 +19,9 @@
#include <linux/ratelimit.h>
static int audit_point;
#define audit_printk(fmt, args...) \
#define audit_printk(kvm, fmt, args...) \
printk(KERN_ERR "audit: (%s) error: " \
fmt, audit_point_name[audit_point], ##args)
fmt, audit_point_name[kvm->arch.audit_point], ##args)
typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
@ -97,18 +95,21 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
if (sp->unsync) {
if (level != PT_PAGE_TABLE_LEVEL) {
audit_printk("unsync sp: %p level = %d\n", sp, level);
audit_printk(vcpu->kvm, "unsync sp: %p "
"level = %d\n", sp, level);
return;
}
if (*sptep == shadow_notrap_nonpresent_pte) {
audit_printk("notrap spte in unsync sp: %p\n", sp);
audit_printk(vcpu->kvm, "notrap spte in unsync "
"sp: %p\n", sp);
return;
}
}
if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
audit_printk("notrap spte in direct sp: %p\n", sp);
audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n",
sp);
return;
}
@ -125,8 +126,9 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
hpa = pfn << PAGE_SHIFT;
if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
audit_printk("levels %d pfn %llx hpa %llx ent %llxn",
vcpu->arch.mmu.root_level, pfn, hpa, *sptep);
audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
"ent %llxn", vcpu->arch.mmu.root_level, pfn,
hpa, *sptep);
}
static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
@ -142,8 +144,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
if (!gfn_to_memslot(kvm, gfn)) {
if (!printk_ratelimit())
return;
audit_printk("no memslot for gfn %llx\n", gfn);
audit_printk("index %ld of sp (gfn=%llx)\n",
audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
(long int)(sptep - rev_sp->spt), rev_sp->gfn);
dump_stack();
return;
@ -153,7 +155,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
if (!*rmapp) {
if (!printk_ratelimit())
return;
audit_printk("no rmap for writable spte %llx\n", *sptep);
audit_printk(kvm, "no rmap for writable spte %llx\n",
*sptep);
dump_stack();
}
}
@ -168,8 +171,9 @@ static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
{
struct kvm_mmu_page *sp = page_header(__pa(sptep));
if (audit_point == AUDIT_POST_SYNC && sp->unsync)
audit_printk("meet unsync sp(%p) after sync root.\n", sp);
if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync)
audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync "
"root.\n", sp);
}
static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
@ -202,8 +206,9 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
spte = rmap_next(kvm, rmapp, NULL);
while (spte) {
if (is_writable_pte(*spte))
audit_printk("shadow page has writable mappings: gfn "
"%llx role %x\n", sp->gfn, sp->role.word);
audit_printk(kvm, "shadow page has writable "
"mappings: gfn %llx role %x\n",
sp->gfn, sp->role.word);
spte = rmap_next(kvm, rmapp, spte);
}
}
@ -238,7 +243,7 @@ static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
if (!__ratelimit(&ratelimit_state))
return;
audit_point = point;
vcpu->kvm->arch.audit_point = point;
audit_all_active_sps(vcpu->kvm);
audit_vcpu_spte(vcpu);
}


@ -72,7 +72,7 @@ struct guest_walker {
unsigned pt_access;
unsigned pte_access;
gfn_t gfn;
u32 error_code;
struct x86_exception fault;
};
static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
@ -266,21 +266,23 @@ walk:
return 1;
error:
walker->error_code = 0;
walker->fault.vector = PF_VECTOR;
walker->fault.error_code_valid = true;
walker->fault.error_code = 0;
if (present)
walker->error_code |= PFERR_PRESENT_MASK;
walker->fault.error_code |= PFERR_PRESENT_MASK;
walker->error_code |= write_fault | user_fault;
walker->fault.error_code |= write_fault | user_fault;
if (fetch_fault && mmu->nx)
walker->error_code |= PFERR_FETCH_MASK;
walker->fault.error_code |= PFERR_FETCH_MASK;
if (rsvd_fault)
walker->error_code |= PFERR_RSVD_MASK;
walker->fault.error_code |= PFERR_RSVD_MASK;
vcpu->arch.fault.address = addr;
vcpu->arch.fault.error_code = walker->error_code;
walker->fault.address = addr;
walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
trace_kvm_mmu_walker_error(walker->error_code);
trace_kvm_mmu_walker_error(walker->fault.error_code);
return 0;
}
@ -299,25 +301,42 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
addr, access);
}
static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, u64 *spte,
pt_element_t gpte)
{
u64 nonpresent = shadow_trap_nonpresent_pte;
if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
goto no_present;
if (!is_present_gpte(gpte)) {
if (!sp->unsync)
nonpresent = shadow_notrap_nonpresent_pte;
goto no_present;
}
if (!(gpte & PT_ACCESSED_MASK))
goto no_present;
return false;
no_present:
drop_spte(vcpu->kvm, spte, nonpresent);
return true;
}
static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 *spte, const void *pte)
{
pt_element_t gpte;
unsigned pte_access;
pfn_t pfn;
u64 new_spte;
gpte = *(const pt_element_t *)pte;
if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
if (!is_present_gpte(gpte)) {
if (sp->unsync)
new_spte = shadow_trap_nonpresent_pte;
else
new_spte = shadow_notrap_nonpresent_pte;
__set_spte(spte, new_spte);
}
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
return;
}
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
@ -329,7 +348,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return;
kvm_get_pfn(pfn);
/*
* we call mmu_set_spte() with reset_host_protection = true because
* we call mmu_set_spte() with host_writable = true because
* vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
*/
mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
@ -364,7 +383,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
u64 *sptep)
{
struct kvm_mmu_page *sp;
struct kvm_mmu *mmu = &vcpu->arch.mmu;
pt_element_t *gptep = gw->prefetch_ptes;
u64 *spte;
int i;
@ -395,14 +413,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
gpte = gptep[i];
if (!is_present_gpte(gpte) ||
is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) {
if (!sp->unsync)
__set_spte(spte, shadow_notrap_nonpresent_pte);
continue;
}
if (!(gpte & PT_ACCESSED_MASK))
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
continue;
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
@ -427,7 +438,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
int user_fault, int write_fault, int hlevel,
int *ptwrite, pfn_t pfn)
int *ptwrite, pfn_t pfn, bool map_writable,
bool prefault)
{
unsigned access = gw->pt_access;
struct kvm_mmu_page *sp = NULL;
@ -501,7 +513,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
user_fault, write_fault, dirty, ptwrite, it.level,
gw->gfn, pfn, false, true);
gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
return it.sptep;
@ -527,8 +539,8 @@ out_gpte_changed:
* Returns: 1 if we need to emulate the instruction, 0 otherwise, or
* a negative value on error.
*/
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
u32 error_code)
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
bool prefault)
{
int write_fault = error_code & PFERR_WRITE_MASK;
int user_fault = error_code & PFERR_USER_MASK;
@ -539,6 +551,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
unsigned long mmu_seq;
bool map_writable;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
@ -556,8 +569,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
*/
if (!r) {
pgprintk("%s: guest page fault\n", __func__);
inject_page_fault(vcpu);
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
if (!prefault) {
inject_page_fault(vcpu, &walker.fault);
/* reset fork detector */
vcpu->arch.last_pt_write_count = 0;
}
return 0;
}
@ -568,7 +584,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
&map_writable))
return 0;
/* mmio */
if (is_error_pfn(pfn))
@ -581,7 +600,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
kvm_mmu_free_some_pages(vcpu);
sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
level, &write_pt, pfn);
level, &write_pt, pfn, map_writable, prefault);
(void)sptep;
pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
sptep, *sptep, write_pt);
@ -661,7 +680,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
}
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
u32 *error)
struct x86_exception *exception)
{
struct guest_walker walker;
gpa_t gpa = UNMAPPED_GVA;
@ -672,14 +691,15 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
if (r) {
gpa = gfn_to_gpa(walker.gfn);
gpa |= vaddr & ~PAGE_MASK;
} else if (error)
*error = walker.error_code;
} else if (exception)
*exception = walker.fault;
return gpa;
}
static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
u32 access, u32 *error)
u32 access,
struct x86_exception *exception)
{
struct guest_walker walker;
gpa_t gpa = UNMAPPED_GVA;
@ -690,8 +710,8 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
if (r) {
gpa = gfn_to_gpa(walker.gfn);
gpa |= vaddr & ~PAGE_MASK;
} else if (error)
*error = walker.error_code;
} else if (exception)
*exception = walker.fault;
return gpa;
}
@ -730,12 +750,19 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
* Using the cached information from sp->gfns is safe because:
* - The spte has a reference to the struct page, so the pfn for a given gfn
* can't change unless all sptes pointing to it are nuked first.
*
* Note:
* We should flush all TLBs if a spte is dropped, even though the guest is
* responsible for it. If we don't, kvm_mmu_notifier_invalidate_page and
* kvm_mmu_notifier_invalidate_range_start will see that the page is no
* longer mapped for the guest, skip the flush, and the guest could keep
* accessing the freed pages through stale TLB entries.
* We increase kvm->tlbs_dirty to delay the TLB flush in this case.
*/
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
bool clear_unsync)
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
int i, offset, nr_present;
bool reset_host_protection;
bool host_writable;
gpa_t first_pte_gpa;
offset = nr_present = 0;
@ -764,31 +791,27 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return -EINVAL;
gfn = gpte_to_gfn(gpte);
if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)
|| gfn != sp->gfns[i] || !is_present_gpte(gpte)
|| !(gpte & PT_ACCESSED_MASK)) {
u64 nonpresent;
if (is_present_gpte(gpte) || !clear_unsync)
nonpresent = shadow_trap_nonpresent_pte;
else
nonpresent = shadow_notrap_nonpresent_pte;
drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
vcpu->kvm->tlbs_dirty++;
continue;
}
if (gfn != sp->gfns[i]) {
drop_spte(vcpu->kvm, &sp->spt[i],
shadow_trap_nonpresent_pte);
vcpu->kvm->tlbs_dirty++;
continue;
}
nr_present++;
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) {
pte_access &= ~ACC_WRITE_MASK;
reset_host_protection = 0;
} else {
reset_host_protection = 1;
}
host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
spte_to_pfn(sp->spt[i]), true, false,
reset_host_protection);
host_writable);
}
return !nr_present;

File diff suppressed because it is too large.


@ -178,27 +178,36 @@ TRACE_EVENT(kvm_apic,
#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val)
#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val)
#define KVM_ISA_VMX 1
#define KVM_ISA_SVM 2
/*
* Tracepoint for kvm guest exit:
*/
TRACE_EVENT(kvm_exit,
TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu),
TP_ARGS(exit_reason, vcpu),
TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa),
TP_ARGS(exit_reason, vcpu, isa),
TP_STRUCT__entry(
__field( unsigned int, exit_reason )
__field( unsigned long, guest_rip )
__field( u32, isa )
__field( u64, info1 )
__field( u64, info2 )
),
TP_fast_assign(
__entry->exit_reason = exit_reason;
__entry->guest_rip = kvm_rip_read(vcpu);
__entry->isa = isa;
kvm_x86_ops->get_exit_info(vcpu, &__entry->info1,
&__entry->info2);
),
TP_printk("reason %s rip 0x%lx",
TP_printk("reason %s rip 0x%lx info %llx %llx",
ftrace_print_symbols_seq(p, __entry->exit_reason,
kvm_x86_ops->exit_reasons_str),
__entry->guest_rip)
__entry->guest_rip, __entry->info1, __entry->info2)
);
/*


@ -69,6 +69,9 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
static int __read_mostly vmm_exclusive = 1;
module_param(vmm_exclusive, bool, S_IRUGO);
static int __read_mostly yield_on_hlt = 1;
module_param(yield_on_hlt, bool, S_IRUGO);
#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
#define KVM_GUEST_CR0_MASK \
@ -177,6 +180,7 @@ static int init_rmode(struct kvm *kvm);
static u64 construct_eptp(unsigned long root_hpa);
static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@ -188,6 +192,8 @@ static unsigned long *vmx_io_bitmap_b;
static unsigned long *vmx_msr_bitmap_legacy;
static unsigned long *vmx_msr_bitmap_longmode;
static bool cpu_has_load_ia32_efer;
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);
@ -472,7 +478,7 @@ static void vmcs_clear(struct vmcs *vmcs)
u8 error;
asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
: "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
: "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
: "cc", "memory");
if (error)
printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
@ -485,7 +491,7 @@ static void vmcs_load(struct vmcs *vmcs)
u8 error;
asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
: "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
: "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
: "cc", "memory");
if (error)
printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
@ -565,10 +571,10 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
static unsigned long vmcs_readl(unsigned long field)
{
unsigned long value;
unsigned long value = 0;
asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
: "=a"(value) : "d"(field) : "cc");
: "+a"(value) : "d"(field) : "cc");
return value;
}
@ -661,6 +667,12 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
unsigned i;
struct msr_autoload *m = &vmx->msr_autoload;
if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
return;
}
for (i = 0; i < m->nr; ++i)
if (m->guest[i].index == msr)
break;
@ -680,6 +692,14 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
unsigned i;
struct msr_autoload *m = &vmx->msr_autoload;
if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
vmcs_write64(GUEST_IA32_EFER, guest_val);
vmcs_write64(HOST_IA32_EFER, host_val);
vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
return;
}
for (i = 0; i < m->nr; ++i)
if (m->guest[i].index == msr)
break;
@ -1009,6 +1029,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
vmx_set_interrupt_shadow(vcpu, 0);
}
static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
{
/* Ensure that we clear the HLT state in the VMCS. We don't need to
* explicitly skip the instruction because if the HLT state is set, then
* the instruction is already executing and RIP has already been
* advanced. */
if (!yield_on_hlt &&
vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
}
static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code,
bool reinject)
@ -1035,6 +1066,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
intr_info |= INTR_TYPE_HARD_EXCEPTION;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
vmx_clear_hlt(vcpu);
}
static bool vmx_rdtscp_supported(void)
@ -1305,8 +1337,11 @@ static __init int vmx_disabled_by_bios(void)
&& tboot_enabled())
return 1;
if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
&& !tboot_enabled())
&& !tboot_enabled()) {
printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
" activate TXT before enabling KVM\n");
return 1;
}
}
return 0;
@ -1400,6 +1435,14 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
return 0;
}
static __init bool allow_1_setting(u32 msr, u32 ctl)
{
u32 vmx_msr_low, vmx_msr_high;
rdmsr(msr, vmx_msr_low, vmx_msr_high);
return vmx_msr_high & ctl;
}
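allow_1_setting() relies on the layout of the VMX capability MSRs as described in the SDM: in MSR_IA32_VMX_ENTRY_CTLS and its siblings the low 32 bits report the allowed 0-settings (controls that must be 1) and the high 32 bits the allowed 1-settings (controls that may be 1), so testing the high half answers whether a control may be turned on. That is exactly how cpu_has_load_ia32_efer is computed a little further down; expressed as a standalone check it reads:

	/* May EFER be switched through the dedicated VM entry/exit controls? */
	bool can_switch_efer =
		allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, VM_ENTRY_LOAD_IA32_EFER) &&
		allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, VM_EXIT_LOAD_IA32_EFER);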
static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
{
u32 vmx_msr_low, vmx_msr_high;
@ -1416,7 +1459,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
&_pin_based_exec_control) < 0)
return -EIO;
min = CPU_BASED_HLT_EXITING |
min =
#ifdef CONFIG_X86_64
CPU_BASED_CR8_LOAD_EXITING |
CPU_BASED_CR8_STORE_EXITING |
@ -1429,6 +1472,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
CPU_BASED_MWAIT_EXITING |
CPU_BASED_MONITOR_EXITING |
CPU_BASED_INVLPG_EXITING;
if (yield_on_hlt)
min |= CPU_BASED_HLT_EXITING;
opt = CPU_BASED_TPR_SHADOW |
CPU_BASED_USE_MSR_BITMAPS |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@ -1510,6 +1557,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
vmcs_conf->vmexit_ctrl = _vmexit_control;
vmcs_conf->vmentry_ctrl = _vmentry_control;
cpu_has_load_ia32_efer =
allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
VM_ENTRY_LOAD_IA32_EFER)
&& allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
VM_EXIT_LOAD_IA32_EFER);
return 0;
}
@ -1683,9 +1736,13 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
save->limit = vmcs_read32(sf->limit);
save->ar = vmcs_read32(sf->ar_bytes);
vmcs_write16(sf->selector, save->base >> 4);
vmcs_write32(sf->base, save->base & 0xfffff);
vmcs_write32(sf->base, save->base & 0xffff0);
vmcs_write32(sf->limit, 0xffff);
vmcs_write32(sf->ar_bytes, 0xf3);
if (save->base & 0xf)
printk_once(KERN_WARNING "kvm: segment base is not paragraph"
" aligned when entering protected mode (seg=%d)",
seg);
}
static void enter_rmode(struct kvm_vcpu *vcpu)
@ -1814,6 +1871,13 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
}
static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
{
if (enable_ept && is_paging(vcpu))
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
}
static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
@ -1857,6 +1921,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
unsigned long cr0,
struct kvm_vcpu *vcpu)
{
vmx_decache_cr3(vcpu);
if (!(cr0 & X86_CR0_PG)) {
/* From paging/starting to nonpaging */
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@ -1937,7 +2002,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
if (enable_ept) {
eptp = construct_eptp(cr3);
vmcs_write64(EPT_POINTER, eptp);
guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) :
vcpu->kvm->arch.ept_identity_map_addr;
ept_load_pdptrs(vcpu);
}
@ -2725,7 +2790,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_IDTR_BASE, 0);
vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
vmcs_write32(GUEST_ACTIVITY_STATE, 0);
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
@ -2787,6 +2852,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
return;
}
if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
enable_irq_window(vcpu);
return;
}
cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
@ -2814,6 +2883,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
} else
intr |= INTR_TYPE_EXT_INTR;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
vmx_clear_hlt(vcpu);
}
static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@ -2841,6 +2911,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
}
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
vmx_clear_hlt(vcpu);
}
static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@ -2849,7 +2920,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
return 0;
return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
(GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI));
(GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
| GUEST_INTR_STATE_NMI));
}
static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
@ -2910,7 +2982,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
* Cause the #SS fault with 0 error code in VM86 mode.
*/
if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
if (emulate_instruction(vcpu, 0) == EMULATE_DONE)
return 1;
/*
* Forward all other exceptions that are valid in real mode.
@ -3007,7 +3079,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
}
if (is_invalid_opcode(intr_info)) {
er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD);
er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
if (er != EMULATE_DONE)
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
@ -3026,7 +3098,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
if (kvm_event_needs_reinjection(vcpu))
kvm_mmu_unprotect_page_virt(vcpu, cr2);
return kvm_mmu_page_fault(vcpu, cr2, error_code);
return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
}
if (vmx->rmode.vm86_active &&
@ -3098,7 +3170,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
++vcpu->stat.io_exits;
if (string || in)
return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
return emulate_instruction(vcpu, 0) == EMULATE_DONE;
port = exit_qualification >> 16;
size = (exit_qualification & 7) + 1;
@ -3118,14 +3190,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[2] = 0xc1;
}
static void complete_insn_gp(struct kvm_vcpu *vcpu, int err)
{
if (err)
kvm_inject_gp(vcpu, 0);
else
skip_emulated_instruction(vcpu);
}
static int handle_cr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification, val;
@ -3143,21 +3207,21 @@ static int handle_cr(struct kvm_vcpu *vcpu)
switch (cr) {
case 0:
err = kvm_set_cr0(vcpu, val);
complete_insn_gp(vcpu, err);
kvm_complete_insn_gp(vcpu, err);
return 1;
case 3:
err = kvm_set_cr3(vcpu, val);
complete_insn_gp(vcpu, err);
kvm_complete_insn_gp(vcpu, err);
return 1;
case 4:
err = kvm_set_cr4(vcpu, val);
complete_insn_gp(vcpu, err);
kvm_complete_insn_gp(vcpu, err);
return 1;
case 8: {
u8 cr8_prev = kvm_get_cr8(vcpu);
u8 cr8 = kvm_register_read(vcpu, reg);
kvm_set_cr8(vcpu, cr8);
skip_emulated_instruction(vcpu);
err = kvm_set_cr8(vcpu, cr8);
kvm_complete_insn_gp(vcpu, err);
if (irqchip_in_kernel(vcpu->kvm))
return 1;
if (cr8_prev <= cr8)
@ -3176,8 +3240,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
case 1: /*mov from cr*/
switch (cr) {
case 3:
kvm_register_write(vcpu, reg, vcpu->arch.cr3);
trace_kvm_cr_read(cr, vcpu->arch.cr3);
val = kvm_read_cr3(vcpu);
kvm_register_write(vcpu, reg, val);
trace_kvm_cr_read(cr, val);
skip_emulated_instruction(vcpu);
return 1;
case 8:
@ -3349,6 +3414,11 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu)
return 1;
}
static int handle_invd(struct kvm_vcpu *vcpu)
{
return emulate_instruction(vcpu, 0) == EMULATE_DONE;
}
static int handle_invlpg(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@ -3377,7 +3447,7 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
static int handle_apic_access(struct kvm_vcpu *vcpu)
{
return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
return emulate_instruction(vcpu, 0) == EMULATE_DONE;
}
static int handle_task_switch(struct kvm_vcpu *vcpu)
@ -3476,7 +3546,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
trace_kvm_page_fault(gpa, exit_qualification);
return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
}
static u64 ept_rsvd_mask(u64 spte, int level)
@ -3592,7 +3662,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
&& (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
return handle_interrupt_window(&vmx->vcpu);
err = emulate_instruction(vcpu, 0, 0, 0);
err = emulate_instruction(vcpu, 0);
if (err == EMULATE_DO_MMIO) {
ret = 0;
@ -3649,6 +3719,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_MSR_WRITE] = handle_wrmsr,
[EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
[EXIT_REASON_HLT] = handle_halt,
[EXIT_REASON_INVD] = handle_invd,
[EXIT_REASON_INVLPG] = handle_invlpg,
[EXIT_REASON_VMCALL] = handle_vmcall,
[EXIT_REASON_VMCLEAR] = handle_vmx_insn,
@ -3676,6 +3747,12 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
static const int kvm_vmx_max_exit_handlers =
ARRAY_SIZE(kvm_vmx_exit_handlers);
static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
{
*info1 = vmcs_readl(EXIT_QUALIFICATION);
*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
}
/*
* The guest has exited. See if we can fix it or if we need userspace
* assistance.
@ -3686,17 +3763,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
u32 exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
trace_kvm_exit(exit_reason, vcpu);
trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
/* If guest state is invalid, start emulating */
if (vmx->emulation_required && emulate_invalid_guest_state)
return handle_invalid_guest_state(vcpu);
/* Access CR3 don't cause VMExit in paging mode, so we need
* to sync with guest real CR3. */
if (enable_ept && is_paging(vcpu))
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
@ -4013,7 +4085,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
);
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
| (1 << VCPU_EXREG_PDPTR));
| (1 << VCPU_EXREG_PDPTR)
| (1 << VCPU_EXREG_CR3));
vcpu->arch.regs_dirty = 0;
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
@ -4280,6 +4353,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.get_cpl = vmx_get_cpl,
.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
.decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
.decache_cr3 = vmx_decache_cr3,
.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
.set_cr0 = vmx_set_cr0,
.set_cr3 = vmx_set_cr3,
@ -4320,7 +4394,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
.get_tdp_level = get_ept_level,
.get_mt_mask = vmx_get_mt_mask,
.get_exit_info = vmx_get_exit_info,
.exit_reasons_str = vmx_exit_reasons_str,
.get_lpage_level = vmx_get_lpage_level,
.cpuid_update = vmx_cpuid_update,
@ -4396,8 +4472,6 @@ static int __init vmx_init(void)
if (enable_ept) {
bypass_guest_pf = 0;
kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
VMX_EPT_WRITABLE_MASK);
kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
VMX_EPT_EXECUTABLE_MASK);
kvm_enable_tdp();


@ -43,6 +43,7 @@
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <trace/events/kvm.h>
#define CREATE_TRACE_POINTS
@ -155,6 +156,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
u64 __read_mostly host_xcr0;
static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
{
int i;
for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
vcpu->arch.apf.gfns[i] = ~0;
}
static void kvm_on_user_return(struct user_return_notifier *urn)
{
unsigned slot;
@ -326,23 +334,28 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception);
void kvm_inject_page_fault(struct kvm_vcpu *vcpu)
void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
{
unsigned error_code = vcpu->arch.fault.error_code;
if (err)
kvm_inject_gp(vcpu, 0);
else
kvm_x86_ops->skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
++vcpu->stat.pf_guest;
vcpu->arch.cr2 = vcpu->arch.fault.address;
kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
vcpu->arch.cr2 = fault->address;
kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
}
void kvm_propagate_fault(struct kvm_vcpu *vcpu)
void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested)
vcpu->arch.nested_mmu.inject_page_fault(vcpu);
if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
else
vcpu->arch.mmu.inject_page_fault(vcpu);
vcpu->arch.fault.nested = false;
vcpu->arch.mmu.inject_page_fault(vcpu, fault);
}
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
@ -460,8 +473,8 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
(unsigned long *)&vcpu->arch.regs_avail))
return true;
gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT;
offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1);
gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
PFERR_USER_MASK | PFERR_WRITE_MASK);
if (r < 0)
@ -506,12 +519,15 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
} else
#endif
if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
vcpu->arch.cr3))
kvm_read_cr3(vcpu)))
return 1;
}
kvm_x86_ops->set_cr0(vcpu, cr0);
if ((cr0 ^ old_cr0) & X86_CR0_PG)
kvm_clear_async_pf_completion_queue(vcpu);
if ((cr0 ^ old_cr0) & update_bits)
kvm_mmu_reset_context(vcpu);
return 0;
@ -595,7 +611,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
return 1;
} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
&& ((cr4 ^ old_cr4) & pdptr_bits)
&& !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))
&& !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
kvm_read_cr3(vcpu)))
return 1;
if (cr4 & X86_CR4_VMXE)
@ -615,7 +632,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
kvm_mmu_sync_roots(vcpu);
kvm_mmu_flush_tlb(vcpu);
return 0;
@ -650,12 +667,13 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
return 1;
vcpu->arch.cr3 = cr3;
__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
vcpu->arch.mmu.new_cr3(vcpu);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);
int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
if (cr8 & CR8_RESERVED_BITS)
return 1;
@ -665,12 +683,6 @@ int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
vcpu->arch.cr8 = cr8;
return 0;
}
void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
if (__kvm_set_cr8(vcpu, cr8))
kvm_inject_gp(vcpu, 0);
}
EXPORT_SYMBOL_GPL(kvm_set_cr8);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
@ -775,12 +787,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
* kvm-specific. Those are put in the beginning of the list.
*/
#define KVM_SAVE_MSRS_BEGIN 7
#define KVM_SAVE_MSRS_BEGIN 8
static u32 msrs_to_save[] = {
MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
HV_X64_MSR_APIC_ASSIST_PAGE,
HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_STAR,
#ifdef CONFIG_X86_64
@ -830,7 +842,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
kvm_x86_ops->set_efer(vcpu, efer);
vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
kvm_mmu_reset_context(vcpu);
/* Update reserved bits */
if ((efer ^ old_efer) & EFER_NX)
@ -1418,6 +1429,30 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
return 0;
}
static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
{
gpa_t gpa = data & ~0x3f;
/* Bits 2:5 are reserved and should be zero */
if (data & 0x3c)
return 1;
vcpu->arch.apf.msr_val = data;
if (!(data & KVM_ASYNC_PF_ENABLED)) {
kvm_clear_async_pf_completion_queue(vcpu);
kvm_async_pf_hash_reset(vcpu);
return 0;
}
if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
return 1;
vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
kvm_async_pf_wakeup_all(vcpu);
return 0;
}
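For reference, the guest side of MSR_KVM_ASYNC_PF_EN is expected to look roughly like the sketch below: the low six bits of the written value carry flags (hence the ~0x3f mask and the 0x3c reserved-bits check above), and the remaining bits give the guest-physical address of a 64-byte-aligned per-CPU status word. This is a hedged sketch only; the apf_reason variable is an assumption, and the actual guest enabling code is not part of this diff.

#include <linux/percpu.h>
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/kvm_para.h>	/* MSR_KVM_ASYNC_PF_EN, KVM_ASYNC_PF_* flags */

/* hypothetical per-CPU status word shared with the host */
static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);

static void guest_enable_async_pf(void)
{
	u64 pa = __pa(&__get_cpu_var(apf_reason));

#ifdef CONFIG_PREEMPT
	/* also request notifications while the guest runs in kernel mode */
	pa |= KVM_ASYNC_PF_SEND_ALWAYS;
#endif
	wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
}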
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
switch (msr) {
@ -1499,6 +1534,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
}
break;
}
case MSR_KVM_ASYNC_PF_EN:
if (kvm_pv_enable_async_pf(vcpu, data))
return 1;
break;
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@ -1775,6 +1814,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_KVM_SYSTEM_TIME_NEW:
data = vcpu->arch.time;
break;
case MSR_KVM_ASYNC_PF_EN:
data = vcpu->arch.apf.msr_val;
break;
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
case MSR_IA32_MCG_CAP:
@ -1904,6 +1946,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_NOP_IO_DELAY:
case KVM_CAP_MP_STATE:
case KVM_CAP_SYNC_MMU:
case KVM_CAP_USER_NMI:
case KVM_CAP_REINJECT_CONTROL:
case KVM_CAP_IRQ_INJECT_STATUS:
case KVM_CAP_ASSIGN_DEV_IRQ:
@ -1922,6 +1965,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
case KVM_CAP_XSAVE:
case KVM_CAP_ASYNC_PF:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@ -2185,6 +2229,11 @@ out:
return r;
}
static void cpuid_mask(u32 *word, int wordnum)
{
*word &= boot_cpu_data.x86_capability[wordnum];
}
static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
u32 index)
{
@ -2259,7 +2308,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
break;
case 1:
entry->edx &= kvm_supported_word0_x86_features;
cpuid_mask(&entry->edx, 0);
entry->ecx &= kvm_supported_word4_x86_features;
cpuid_mask(&entry->ecx, 4);
/* we support x2apic emulation even if host does not support
* it since we emulate x2apic in software */
entry->ecx |= F(X2APIC);
@ -2350,7 +2401,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
break;
case 0x80000001:
entry->edx &= kvm_supported_word1_x86_features;
cpuid_mask(&entry->edx, 1);
entry->ecx &= kvm_supported_word6_x86_features;
cpuid_mask(&entry->ecx, 6);
break;
}
@ -3169,20 +3222,18 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_memslots *slots, *old_slots;
unsigned long *dirty_bitmap;
r = -ENOMEM;
dirty_bitmap = vmalloc(n);
if (!dirty_bitmap)
goto out;
dirty_bitmap = memslot->dirty_bitmap_head;
if (memslot->dirty_bitmap == dirty_bitmap)
dirty_bitmap += n / sizeof(long);
memset(dirty_bitmap, 0, n);
r = -ENOMEM;
slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
if (!slots) {
vfree(dirty_bitmap);
if (!slots)
goto out;
}
memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
slots->generation++;
old_slots = kvm->memslots;
rcu_assign_pointer(kvm->memslots, slots);
@ -3195,11 +3246,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
spin_unlock(&kvm->mmu_lock);
r = -EFAULT;
if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) {
vfree(dirty_bitmap);
if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
goto out;
}
vfree(dirty_bitmap);
} else {
r = -EFAULT;
if (clear_user(log->dirty_bitmap, n))
@ -3266,8 +3314,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (vpic) {
r = kvm_ioapic_init(kvm);
if (r) {
mutex_lock(&kvm->slots_lock);
kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
&vpic->dev);
mutex_unlock(&kvm->slots_lock);
kfree(vpic);
goto create_irqchip_unlock;
}
@ -3278,10 +3328,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
smp_wmb();
r = kvm_setup_default_irq_routing(kvm);
if (r) {
mutex_lock(&kvm->slots_lock);
mutex_lock(&kvm->irq_lock);
kvm_ioapic_destroy(kvm);
kvm_destroy_pic(kvm);
mutex_unlock(&kvm->irq_lock);
mutex_unlock(&kvm->slots_lock);
}
create_irqchip_unlock:
mutex_unlock(&kvm->lock);
@ -3557,63 +3609,63 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
{
gpa_t t_gpa;
u32 error;
struct x86_exception exception;
BUG_ON(!mmu_is_nested(vcpu));
/* NPT walks are always user-walks */
access |= PFERR_USER_MASK;
t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error);
if (t_gpa == UNMAPPED_GVA)
vcpu->arch.fault.nested = true;
t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
return t_gpa;
}
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_FETCH_MASK;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_WRITE_MASK;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
/* used to access any guest's mapped memory without checking CPL */
gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error);
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
}
static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
struct kvm_vcpu *vcpu, u32 access,
u32 *error)
struct x86_exception *exception)
{
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
error);
exception);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
if (gpa == UNMAPPED_GVA) {
r = X86EMUL_PROPAGATE_FAULT;
goto out;
}
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
if (ret < 0) {
r = X86EMUL_IO_NEEDED;
@ -3630,31 +3682,35 @@ out:
/* used for instruction fetching */
static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
struct kvm_vcpu *vcpu, u32 *error)
struct kvm_vcpu *vcpu,
struct x86_exception *exception)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
access | PFERR_FETCH_MASK, error);
access | PFERR_FETCH_MASK,
exception);
}
static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
struct kvm_vcpu *vcpu, u32 *error)
struct kvm_vcpu *vcpu,
struct x86_exception *exception)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
error);
exception);
}
static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
struct kvm_vcpu *vcpu, u32 *error)
struct kvm_vcpu *vcpu,
struct x86_exception *exception)
{
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
}
static int kvm_write_guest_virt_system(gva_t addr, void *val,
unsigned int bytes,
struct kvm_vcpu *vcpu,
u32 *error)
struct x86_exception *exception)
{
void *data = val;
int r = X86EMUL_CONTINUE;
@ -3662,15 +3718,13 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
while (bytes) {
gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
PFERR_WRITE_MASK,
error);
exception);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
if (gpa == UNMAPPED_GVA) {
r = X86EMUL_PROPAGATE_FAULT;
goto out;
}
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
if (ret < 0) {
r = X86EMUL_IO_NEEDED;
@ -3688,7 +3742,7 @@ out:
static int emulator_read_emulated(unsigned long addr,
void *val,
unsigned int bytes,
unsigned int *error_code,
struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
@ -3701,7 +3755,7 @@ static int emulator_read_emulated(unsigned long addr,
return X86EMUL_CONTINUE;
}
gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code);
gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception);
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
@ -3710,8 +3764,8 @@ static int emulator_read_emulated(unsigned long addr,
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
goto mmio;
if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL)
== X86EMUL_CONTINUE)
if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception)
== X86EMUL_CONTINUE)
return X86EMUL_CONTINUE;
mmio:
@ -3735,7 +3789,7 @@ mmio:
}
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes)
const void *val, int bytes)
{
int ret;
@ -3749,12 +3803,12 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
static int emulator_write_emulated_onepage(unsigned long addr,
const void *val,
unsigned int bytes,
unsigned int *error_code,
struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code);
gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
@ -3787,7 +3841,7 @@ mmio:
int emulator_write_emulated(unsigned long addr,
const void *val,
unsigned int bytes,
unsigned int *error_code,
struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
/* Crossing a page boundary? */
@ -3795,7 +3849,7 @@ int emulator_write_emulated(unsigned long addr,
int rc, now;
now = -addr & ~PAGE_MASK;
rc = emulator_write_emulated_onepage(addr, val, now, error_code,
rc = emulator_write_emulated_onepage(addr, val, now, exception,
vcpu);
if (rc != X86EMUL_CONTINUE)
return rc;
@ -3803,7 +3857,7 @@ int emulator_write_emulated(unsigned long addr,
val += now;
bytes -= now;
}
return emulator_write_emulated_onepage(addr, val, bytes, error_code,
return emulator_write_emulated_onepage(addr, val, bytes, exception,
vcpu);
}
@ -3821,7 +3875,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
const void *old,
const void *new,
unsigned int bytes,
unsigned int *error_code,
struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
@ -3879,7 +3933,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
emul_write:
printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
return emulator_write_emulated(addr, new, bytes, error_code, vcpu);
return emulator_write_emulated(addr, new, bytes, exception, vcpu);
}
static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@ -3904,7 +3958,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
if (vcpu->arch.pio.count)
goto data_avail;
trace_kvm_pio(0, port, size, 1);
trace_kvm_pio(0, port, size, count);
vcpu->arch.pio.port = port;
vcpu->arch.pio.in = 1;
@ -3932,7 +3986,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port,
const void *val, unsigned int count,
struct kvm_vcpu *vcpu)
{
trace_kvm_pio(1, port, size, 1);
trace_kvm_pio(1, port, size, count);
vcpu->arch.pio.port = port;
vcpu->arch.pio.in = 0;
@ -3973,13 +4027,15 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
return X86EMUL_CONTINUE;
if (kvm_x86_ops->has_wbinvd_exit()) {
preempt_disable();
int cpu = get_cpu();
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
wbinvd_ipi, NULL, 1);
preempt_enable();
put_cpu();
cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
}
wbinvd();
} else
wbinvd();
return X86EMUL_CONTINUE;
}
EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
@ -4019,7 +4075,7 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
value = vcpu->arch.cr2;
break;
case 3:
value = vcpu->arch.cr3;
value = kvm_read_cr3(vcpu);
break;
case 4:
value = kvm_read_cr4(vcpu);
@ -4053,7 +4109,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
break;
case 8:
res = __kvm_set_cr8(vcpu, val & 0xfUL);
res = kvm_set_cr8(vcpu, val);
break;
default:
vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
@ -4206,12 +4262,13 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
static void inject_emulated_exception(struct kvm_vcpu *vcpu)
{
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
if (ctxt->exception == PF_VECTOR)
kvm_propagate_fault(vcpu);
else if (ctxt->error_code_valid)
kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
if (ctxt->exception.vector == PF_VECTOR)
kvm_propagate_fault(vcpu, &ctxt->exception);
else if (ctxt->exception.error_code_valid)
kvm_queue_exception_e(vcpu, ctxt->exception.vector,
ctxt->exception.error_code);
else
kvm_queue_exception(vcpu, ctxt->exception);
kvm_queue_exception(vcpu, ctxt->exception.vector);
}
static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
@ -4267,13 +4324,19 @@ EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
static int handle_emulation_failure(struct kvm_vcpu *vcpu)
{
int r = EMULATE_DONE;
++vcpu->stat.insn_emulation_fail;
trace_kvm_emulate_insn_failed(vcpu);
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
if (!is_guest_mode(vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
r = EMULATE_FAIL;
}
kvm_queue_exception(vcpu, UD_VECTOR);
return EMULATE_FAIL;
return r;
}
static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
@ -4302,10 +4365,11 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
return false;
}
int emulate_instruction(struct kvm_vcpu *vcpu,
unsigned long cr2,
u16 error_code,
int emulation_type)
int x86_emulate_instruction(struct kvm_vcpu *vcpu,
unsigned long cr2,
int emulation_type,
void *insn,
int insn_len)
{
int r;
struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
@ -4323,10 +4387,10 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
init_emulate_ctxt(vcpu);
vcpu->arch.emulate_ctxt.interruptibility = 0;
vcpu->arch.emulate_ctxt.exception = -1;
vcpu->arch.emulate_ctxt.have_exception = false;
vcpu->arch.emulate_ctxt.perm_ok = false;
r = x86_decode_insn(&vcpu->arch.emulate_ctxt);
r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
if (r == X86EMUL_PROPAGATE_FAULT)
goto done;
@ -4389,7 +4453,7 @@ restart:
}
done:
if (vcpu->arch.emulate_ctxt.exception >= 0) {
if (vcpu->arch.emulate_ctxt.have_exception) {
inject_emulated_exception(vcpu);
r = EMULATE_DONE;
} else if (vcpu->arch.pio.count) {
@ -4413,7 +4477,7 @@ done:
return r;
}
EXPORT_SYMBOL_GPL(emulate_instruction);
EXPORT_SYMBOL_GPL(x86_emulate_instruction);
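The two-argument emulate_instruction() call that remains below in kvm_arch_vcpu_ioctl_run() is presumably a thin static-inline wrapper over the renamed x86_emulate_instruction(); the wrapper itself lives in a header outside this excerpt, but it would look roughly like:

static inline int emulate_instruction(struct kvm_vcpu *vcpu,
				      int emulation_type)
{
	/* no pre-fetched instruction bytes: let the emulator fetch them */
	return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
}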
int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
{
@ -4653,7 +4717,6 @@ int kvm_arch_init(void *opaque)
kvm_x86_ops = ops;
kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0);
@ -5116,6 +5179,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->fpu_active = 0;
kvm_x86_ops->fpu_deactivate(vcpu);
}
if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
/* Page is swapped out. Do synthetic halt */
vcpu->arch.apf.halted = true;
r = 1;
goto out;
}
}
r = kvm_mmu_reload(vcpu);
@ -5244,7 +5313,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
r = 1;
while (r > 0) {
if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted)
r = vcpu_enter_guest(vcpu);
else {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
@ -5257,6 +5327,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.mp_state =
KVM_MP_STATE_RUNNABLE;
case KVM_MP_STATE_RUNNABLE:
vcpu->arch.apf.halted = false;
break;
case KVM_MP_STATE_SIPI_RECEIVED:
default:
@ -5278,6 +5349,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
vcpu->run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.request_irq_exits;
}
kvm_check_async_pf_completion(vcpu);
if (signal_pending(current)) {
r = -EINTR;
vcpu->run->exit_reason = KVM_EXIT_INTR;
@ -5302,6 +5376,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
int r;
sigset_t sigsaved;
if (!tsk_used_math(current) && init_fpu(current))
return -ENOMEM;
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
@ -5313,8 +5390,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
/* re-sync apic's tpr */
if (!irqchip_in_kernel(vcpu->kvm))
kvm_set_cr8(vcpu, kvm_run->cr8);
if (!irqchip_in_kernel(vcpu->kvm)) {
if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
r = -EINVAL;
goto out;
}
}
if (vcpu->arch.pio.count || vcpu->mmio_needed) {
if (vcpu->mmio_needed) {
@ -5323,7 +5404,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu->mmio_needed = 0;
}
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
if (r != EMULATE_DONE) {
r = 0;
@ -5436,7 +5517,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
sregs->cr0 = kvm_read_cr0(vcpu);
sregs->cr2 = vcpu->arch.cr2;
sregs->cr3 = vcpu->arch.cr3;
sregs->cr3 = kvm_read_cr3(vcpu);
sregs->cr4 = kvm_read_cr4(vcpu);
sregs->cr8 = kvm_get_cr8(vcpu);
sregs->efer = vcpu->arch.efer;
@ -5504,8 +5585,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
kvm_x86_ops->set_gdt(vcpu, &dt);
vcpu->arch.cr2 = sregs->cr2;
mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
vcpu->arch.cr3 = sregs->cr3;
__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
kvm_set_cr8(vcpu, sregs->cr8);
@ -5522,7 +5604,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
if (sregs->cr4 & X86_CR4_OSXSAVE)
update_cpuid(vcpu);
if (!is_long_mode(vcpu) && is_pae(vcpu)) {
load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
mmu_reset_needed = 1;
}
@ -5773,6 +5855,8 @@ free_vcpu:
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
vcpu->arch.apf.msr_val = 0;
vcpu_load(vcpu);
kvm_mmu_unload(vcpu);
vcpu_put(vcpu);
@ -5792,6 +5876,11 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
vcpu->arch.dr7 = DR7_FIXED_1;
kvm_make_request(KVM_REQ_EVENT, vcpu);
vcpu->arch.apf.msr_val = 0;
kvm_clear_async_pf_completion_queue(vcpu);
kvm_async_pf_hash_reset(vcpu);
vcpu->arch.apf.halted = false;
return kvm_x86_ops->vcpu_reset(vcpu);
}
@ -5881,6 +5970,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
goto fail_free_mce_banks;
kvm_async_pf_hash_reset(vcpu);
return 0;
fail_free_mce_banks:
kfree(vcpu->arch.mce_banks);
@ -5906,13 +5997,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
free_page((unsigned long)vcpu->arch.pio_data);
}
struct kvm *kvm_arch_create_vm(void)
int kvm_arch_init_vm(struct kvm *kvm)
{
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
if (!kvm)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
@ -5921,7 +6007,7 @@ struct kvm *kvm_arch_create_vm(void)
spin_lock_init(&kvm->arch.tsc_write_lock);
return kvm;
return 0;
}
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
@ -5939,8 +6025,10 @@ static void kvm_free_vcpus(struct kvm *kvm)
/*
* Unpin any mmu pages first.
*/
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_for_each_vcpu(i, vcpu, kvm) {
kvm_clear_async_pf_completion_queue(vcpu);
kvm_unload_vcpu_mmu(vcpu);
}
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_arch_vcpu_free(vcpu);
@ -5964,13 +6052,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
kvm_free_vcpus(kvm);
kvm_free_physmem(kvm);
if (kvm->arch.apic_access_page)
put_page(kvm->arch.apic_access_page);
if (kvm->arch.ept_identity_pagetable)
put_page(kvm->arch.ept_identity_pagetable);
cleanup_srcu_struct(&kvm->srcu);
kfree(kvm);
}
int kvm_arch_prepare_memory_region(struct kvm *kvm,
@ -6051,7 +6136,9 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted)
|| !list_empty_careful(&vcpu->async_pf.done)
|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
|| vcpu->arch.nmi_pending ||
(kvm_arch_interrupt_allowed(vcpu) &&
@ -6110,6 +6197,147 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
}
EXPORT_SYMBOL_GPL(kvm_set_rflags);
void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
{
int r;
if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
is_error_page(work->page))
return;
r = kvm_mmu_reload(vcpu);
if (unlikely(r))
return;
if (!vcpu->arch.mmu.direct_map &&
work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
return;
vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
}
static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
{
return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
}
static inline u32 kvm_async_pf_next_probe(u32 key)
{
return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
}
static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
{
u32 key = kvm_async_pf_hash_fn(gfn);
while (vcpu->arch.apf.gfns[key] != ~0)
key = kvm_async_pf_next_probe(key);
vcpu->arch.apf.gfns[key] = gfn;
}
static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
{
int i;
u32 key = kvm_async_pf_hash_fn(gfn);
for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
(vcpu->arch.apf.gfns[key] != gfn &&
vcpu->arch.apf.gfns[key] != ~0); i++)
key = kvm_async_pf_next_probe(key);
return key;
}
bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
{
return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
}
static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
{
u32 i, j, k;
i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
while (true) {
vcpu->arch.apf.gfns[i] = ~0;
do {
j = kvm_async_pf_next_probe(j);
if (vcpu->arch.apf.gfns[j] == ~0)
return;
k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
/*
* k lies cyclically in ]i,j]
* | i.k.j |
* |....j i.k.| or |.k..j i...|
*/
} while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
i = j;
}
}
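The apf.gfns[] array is a small open-addressed hash table with linear probing, using ~0 as the empty marker. Deletion cannot simply clear a slot, because that would break the probe chain for entries whose home slot lies before the hole; the loop above therefore does a backward-shift delete, and the cyclic-interval test decides whether the entry at j is still reachable from its home slot k once slot i has been emptied. A standalone model of the same loop, with a hypothetical table size and a trivial stand-in hash, may make the invariant easier to follow:

/* Standalone model of the deletion above (hypothetical 64-entry table,
 * EMPTY == ~0ull): after emptying slot i, walk forward and pull back any
 * entry whose home slot k does NOT lie cyclically in (i, j], so every
 * remaining entry stays reachable from its home slot by linear probing. */
#define APF_TABLE_SIZE	64
#define APF_EMPTY	(~0ull)

static unsigned int apf_home(unsigned long long gfn)
{
	return gfn % APF_TABLE_SIZE;	/* stand-in for hash_32() */
}

static void apf_delete(unsigned long long *table, unsigned int i)
{
	unsigned int j = i, k;

	for (;;) {
		table[i] = APF_EMPTY;
		do {
			j = (j + 1) % APF_TABLE_SIZE;
			if (table[j] == APF_EMPTY)
				return;
			k = apf_home(table[j]);
		} while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
		table[i] = table[j];
		i = j;
	}
}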
static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
{
return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
sizeof(val));
}
void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work)
{
struct x86_exception fault;
trace_kvm_async_pf_not_present(work->arch.token, work->gva);
kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
(vcpu->arch.apf.send_user_only &&
kvm_x86_ops->get_cpl(vcpu) == 0))
kvm_make_request(KVM_REQ_APF_HALT, vcpu);
else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
fault.vector = PF_VECTOR;
fault.error_code_valid = true;
fault.error_code = 0;
fault.nested_page_fault = false;
fault.address = work->arch.token;
kvm_inject_page_fault(vcpu, &fault);
}
}
void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work)
{
struct x86_exception fault;
trace_kvm_async_pf_ready(work->arch.token, work->gva);
if (is_error_page(work->page))
work->arch.token = ~0; /* broadcast wakeup */
else
kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
fault.vector = PF_VECTOR;
fault.error_code_valid = true;
fault.error_code = 0;
fault.nested_page_fault = false;
fault.address = work->arch.token;
kvm_inject_page_fault(vcpu, &fault);
}
vcpu->arch.apf.halted = false;
}
bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
{
if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
return true;
else
return !kvm_event_needs_reinjection(vcpu) &&
kvm_x86_ops->interrupt_allowed(vcpu);
}
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);

View File

@ -540,6 +540,7 @@ struct kvm_ppc_pvinfo {
#endif
#define KVM_CAP_PPC_GET_PVINFO 57
#define KVM_CAP_PPC_IRQ_LEVEL 58
#define KVM_CAP_ASYNC_PF 59
#ifdef KVM_CAP_IRQ_ROUTING

View File

@ -16,6 +16,8 @@
#include <linux/mm.h>
#include <linux/preempt.h>
#include <linux/msi.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <asm/signal.h>
#include <linux/kvm.h>
@ -40,6 +42,7 @@
#define KVM_REQ_KICK 9
#define KVM_REQ_DEACTIVATE_FPU 10
#define KVM_REQ_EVENT 11
#define KVM_REQ_APF_HALT 12
#define KVM_USERSPACE_IRQ_SOURCE_ID 0
@ -74,6 +77,27 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
struct kvm_io_device *dev);
#ifdef CONFIG_KVM_ASYNC_PF
struct kvm_async_pf {
struct work_struct work;
struct list_head link;
struct list_head queue;
struct kvm_vcpu *vcpu;
struct mm_struct *mm;
gva_t gva;
unsigned long addr;
struct kvm_arch_async_pf arch;
struct page *page;
bool done;
};
void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
struct kvm_arch_async_pf *arch);
int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
#endif
struct kvm_vcpu {
struct kvm *kvm;
#ifdef CONFIG_PREEMPT_NOTIFIERS
@ -104,6 +128,15 @@ struct kvm_vcpu {
gpa_t mmio_phys_addr;
#endif
#ifdef CONFIG_KVM_ASYNC_PF
struct {
u32 queued;
struct list_head queue;
struct list_head done;
spinlock_t lock;
} async_pf;
#endif
struct kvm_vcpu_arch arch;
};
@ -113,16 +146,19 @@ struct kvm_vcpu {
*/
#define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)
struct kvm_lpage_info {
unsigned long rmap_pde;
int write_count;
};
struct kvm_memory_slot {
gfn_t base_gfn;
unsigned long npages;
unsigned long flags;
unsigned long *rmap;
unsigned long *dirty_bitmap;
struct {
unsigned long rmap_pde;
int write_count;
} *lpage_info[KVM_NR_PAGE_SIZES - 1];
unsigned long *dirty_bitmap_head;
struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
unsigned long userspace_addr;
int user_alloc;
int id;
@ -169,6 +205,7 @@ struct kvm_irq_routing_table {};
struct kvm_memslots {
int nmemslots;
u64 generation;
struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
KVM_PRIVATE_MEM_SLOTS];
};
@ -206,6 +243,10 @@ struct kvm {
struct mutex irq_lock;
#ifdef CONFIG_HAVE_KVM_IRQCHIP
/*
* Update side is protected by irq_lock and,
* if configured, irqfds.lock.
*/
struct kvm_irq_routing_table __rcu *irq_routing;
struct hlist_head mask_notifier_list;
struct hlist_head irq_ack_notifier_list;
@ -216,6 +257,7 @@ struct kvm {
unsigned long mmu_notifier_seq;
long mmu_notifier_count;
#endif
long tlbs_dirty;
};
/* The guest did something we don't support. */
@ -302,7 +344,11 @@ void kvm_set_page_accessed(struct page *page);
pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr);
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
bool write_fault, bool *writable);
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
bool *writable);
pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn);
int memslot_id(struct kvm *kvm, gfn_t gfn);
@ -321,18 +367,25 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
int offset, int len);
int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
unsigned long len);
int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
void *data, unsigned long len);
int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
gpa_t gpa);
int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn);
void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
gfn_t gfn);
void kvm_vcpu_block(struct kvm_vcpu *vcpu);
void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
void kvm_resched(struct kvm_vcpu *vcpu);
void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
void kvm_flush_remote_tlbs(struct kvm *kvm);
void kvm_reload_remote_mmus(struct kvm *kvm);
@ -398,7 +451,19 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
void kvm_free_physmem(struct kvm *kvm);
struct kvm *kvm_arch_create_vm(void);
#ifndef __KVM_HAVE_ARCH_VM_ALLOC
static inline struct kvm *kvm_arch_alloc_vm(void)
{
return kzalloc(sizeof(struct kvm), GFP_KERNEL);
}
static inline void kvm_arch_free_vm(struct kvm *kvm)
{
kfree(kvm);
}
#endif
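kvm_arch_create_vm() is being split into a generic allocation hook pair (kvm_arch_alloc_vm()/kvm_arch_free_vm(), overridable by defining __KVM_HAVE_ARCH_VM_ALLOC) plus a separate kvm_arch_init_vm(). An architecture that wanted to embed struct kvm in a larger per-VM structure could override the hooks roughly as sketched here; myarch_kvm is a hypothetical name, and nothing in this diff actually does this yet:

/* hypothetical arch header */
#define __KVM_HAVE_ARCH_VM_ALLOC

struct myarch_kvm {
	struct kvm kvm;			/* must be the first member */
	unsigned long arch_private;	/* extra arch-wide VM state */
};

static inline struct kvm *kvm_arch_alloc_vm(void)
{
	struct myarch_kvm *m = kzalloc(sizeof(*m), GFP_KERNEL);

	return m ? &m->kvm : NULL;
}

static inline void kvm_arch_free_vm(struct kvm *kvm)
{
	kfree(container_of(kvm, struct myarch_kvm, kvm));
}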
int kvm_arch_init_vm(struct kvm *kvm);
void kvm_arch_destroy_vm(struct kvm *kvm);
void kvm_free_all_assigned_devices(struct kvm *kvm);
void kvm_arch_sync_events(struct kvm *kvm);
@ -414,16 +479,8 @@ struct kvm_irq_ack_notifier {
void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
};
#define KVM_ASSIGNED_MSIX_PENDING 0x1
struct kvm_guest_msix_entry {
u32 vector;
u16 entry;
u16 flags;
};
struct kvm_assigned_dev_kernel {
struct kvm_irq_ack_notifier ack_notifier;
struct work_struct interrupt_work;
struct list_head list;
int assigned_dev_id;
int host_segnr;
@ -434,13 +491,14 @@ struct kvm_assigned_dev_kernel {
bool host_irq_disabled;
struct msix_entry *host_msix_entries;
int guest_irq;
struct kvm_guest_msix_entry *guest_msix_entries;
struct msix_entry *guest_msix_entries;
unsigned long irq_requested_type;
int irq_source_id;
int flags;
struct pci_dev *dev;
struct kvm *kvm;
spinlock_t assigned_dev_lock;
spinlock_t intx_lock;
char irq_name[32];
};
struct kvm_irq_mask_notifier {
@ -462,6 +520,8 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
unsigned long *deliver_bitmask);
#endif
int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
int irq_source_id, int level);
void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
void kvm_register_irq_ack_notifier(struct kvm *kvm,
struct kvm_irq_ack_notifier *kian);
@ -603,17 +663,28 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {}
void kvm_eventfd_init(struct kvm *kvm);
int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags);
void kvm_irqfd_release(struct kvm *kvm);
void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *);
int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
#else
static inline void kvm_eventfd_init(struct kvm *kvm) {}
static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
{
return -EINVAL;
}
static inline void kvm_irqfd_release(struct kvm *kvm) {}
#ifdef CONFIG_HAVE_KVM_IRQCHIP
static inline void kvm_irq_routing_update(struct kvm *kvm,
struct kvm_irq_routing_table *irq_rt)
{
rcu_assign_pointer(kvm->irq_routing, irq_rt);
}
#endif
static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
return -ENOSYS;

View File

@ -67,4 +67,11 @@ struct kvm_lapic_irq {
u32 dest_id;
};
struct gfn_to_hva_cache {
u64 generation;
gpa_t gpa;
unsigned long hva;
struct kvm_memory_slot *memslot;
};
#endif /* __KVM_TYPES_H__ */

View File

@ -6,6 +6,36 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvm
#define ERSN(x) { KVM_EXIT_##x, "KVM_EXIT_" #x }
#define kvm_trace_exit_reason \
ERSN(UNKNOWN), ERSN(EXCEPTION), ERSN(IO), ERSN(HYPERCALL), \
ERSN(DEBUG), ERSN(HLT), ERSN(MMIO), ERSN(IRQ_WINDOW_OPEN), \
ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR), \
ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\
ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI)
TRACE_EVENT(kvm_userspace_exit,
TP_PROTO(__u32 reason, int errno),
TP_ARGS(reason, errno),
TP_STRUCT__entry(
__field( __u32, reason )
__field( int, errno )
),
TP_fast_assign(
__entry->reason = reason;
__entry->errno = errno;
),
TP_printk("reason %s (%d)",
__entry->errno < 0 ?
(__entry->errno == -EINTR ? "restart" : "error") :
__print_symbolic(__entry->reason, kvm_trace_exit_reason),
__entry->errno < 0 ? -__entry->errno : __entry->reason)
);
#if defined(__KVM_HAVE_IOAPIC)
TRACE_EVENT(kvm_set_irq,
TP_PROTO(unsigned int gsi, int level, int irq_source_id),
@ -185,6 +215,97 @@ TRACE_EVENT(kvm_age_page,
__entry->referenced ? "YOUNG" : "OLD")
);
#ifdef CONFIG_KVM_ASYNC_PF
DECLARE_EVENT_CLASS(kvm_async_get_page_class,
TP_PROTO(u64 gva, u64 gfn),
TP_ARGS(gva, gfn),
TP_STRUCT__entry(
__field(__u64, gva)
__field(u64, gfn)
),
TP_fast_assign(
__entry->gva = gva;
__entry->gfn = gfn;
),
TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn)
);
DEFINE_EVENT(kvm_async_get_page_class, kvm_try_async_get_page,
TP_PROTO(u64 gva, u64 gfn),
TP_ARGS(gva, gfn)
);
DEFINE_EVENT(kvm_async_get_page_class, kvm_async_pf_doublefault,
TP_PROTO(u64 gva, u64 gfn),
TP_ARGS(gva, gfn)
);
DECLARE_EVENT_CLASS(kvm_async_pf_nopresent_ready,
TP_PROTO(u64 token, u64 gva),
TP_ARGS(token, gva),
TP_STRUCT__entry(
__field(__u64, token)
__field(__u64, gva)
),
TP_fast_assign(
__entry->token = token;
__entry->gva = gva;
),
TP_printk("token %#llx gva %#llx", __entry->token, __entry->gva)
);
DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_not_present,
TP_PROTO(u64 token, u64 gva),
TP_ARGS(token, gva)
);
DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_ready,
TP_PROTO(u64 token, u64 gva),
TP_ARGS(token, gva)
);
TRACE_EVENT(
kvm_async_pf_completed,
TP_PROTO(unsigned long address, struct page *page, u64 gva),
TP_ARGS(address, page, gva),
TP_STRUCT__entry(
__field(unsigned long, address)
__field(pfn_t, pfn)
__field(u64, gva)
),
TP_fast_assign(
__entry->address = address;
__entry->pfn = page ? page_to_pfn(page) : 0;
__entry->gva = gva;
),
TP_printk("gva %#llx address %#lx pfn %#llx", __entry->gva,
__entry->address, __entry->pfn)
);
#endif
#endif /* _TRACE_KVM_MAIN_H */
/* This part must be outside protection */

View File

@ -15,3 +15,6 @@ config KVM_APIC_ARCHITECTURE
config KVM_MMIO
bool
config KVM_ASYNC_PF
bool

View File

@ -55,58 +55,31 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
return index;
}
static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
{
struct kvm_assigned_dev_kernel *assigned_dev;
int i;
struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
u32 vector;
int index;
assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
interrupt_work);
if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
spin_lock(&assigned_dev->intx_lock);
disable_irq_nosync(irq);
assigned_dev->host_irq_disabled = true;
spin_unlock(&assigned_dev->intx_lock);
}
spin_lock_irq(&assigned_dev->assigned_dev_lock);
if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
struct kvm_guest_msix_entry *guest_entries =
assigned_dev->guest_msix_entries;
for (i = 0; i < assigned_dev->entries_nr; i++) {
if (!(guest_entries[i].flags &
KVM_ASSIGNED_MSIX_PENDING))
continue;
guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING;
index = find_index_from_host_irq(assigned_dev, irq);
if (index >= 0) {
vector = assigned_dev->
guest_msix_entries[index].vector;
kvm_set_irq(assigned_dev->kvm,
assigned_dev->irq_source_id,
guest_entries[i].vector, 1);
assigned_dev->irq_source_id, vector, 1);
}
} else
kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
assigned_dev->guest_irq, 1);
spin_unlock_irq(&assigned_dev->assigned_dev_lock);
}
static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
{
unsigned long flags;
struct kvm_assigned_dev_kernel *assigned_dev =
(struct kvm_assigned_dev_kernel *) dev_id;
spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags);
if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
int index = find_index_from_host_irq(assigned_dev, irq);
if (index < 0)
goto out;
assigned_dev->guest_msix_entries[index].flags |=
KVM_ASSIGNED_MSIX_PENDING;
}
schedule_work(&assigned_dev->interrupt_work);
if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
disable_irq_nosync(irq);
assigned_dev->host_irq_disabled = true;
}
out:
spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
return IRQ_HANDLED;
}
@ -114,7 +87,6 @@ out:
static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
{
struct kvm_assigned_dev_kernel *dev;
unsigned long flags;
if (kian->gsi == -1)
return;
@ -127,12 +99,12 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
/* The guest irq may be shared so this ack may be
* from another device.
*/
spin_lock_irqsave(&dev->assigned_dev_lock, flags);
spin_lock(&dev->intx_lock);
if (dev->host_irq_disabled) {
enable_irq(dev->host_irq);
dev->host_irq_disabled = false;
}
spin_unlock_irqrestore(&dev->assigned_dev_lock, flags);
spin_unlock(&dev->intx_lock);
}
static void deassign_guest_irq(struct kvm *kvm,
@ -141,6 +113,9 @@ static void deassign_guest_irq(struct kvm *kvm,
kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
assigned_dev->ack_notifier.gsi = -1;
kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
assigned_dev->guest_irq, 0);
if (assigned_dev->irq_source_id != -1)
kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
assigned_dev->irq_source_id = -1;
@ -152,28 +127,19 @@ static void deassign_host_irq(struct kvm *kvm,
struct kvm_assigned_dev_kernel *assigned_dev)
{
/*
* In kvm_free_device_irq, cancel_work_sync return true if:
* 1. work is scheduled, and then cancelled.
* 2. work callback is executed.
*
* The first one ensured that the irq is disabled and no more events
* would happen. But for the second one, the irq may be enabled (e.g.
* for MSI). So we disable irq here to prevent further events.
* We disable irq here to prevent further events.
*
* Notice this may result in a nested disable if the interrupt type is
* INTx, but that's OK since we are going to free it.
*
* If this function is a part of VM destroy, please ensure that till
* now, the kvm state is still legal for probably we also have to wait
* interrupt_work done.
* on a currently running IRQ handler.
*/
if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
int i;
for (i = 0; i < assigned_dev->entries_nr; i++)
disable_irq_nosync(assigned_dev->
host_msix_entries[i].vector);
cancel_work_sync(&assigned_dev->interrupt_work);
disable_irq(assigned_dev->host_msix_entries[i].vector);
for (i = 0; i < assigned_dev->entries_nr; i++)
free_irq(assigned_dev->host_msix_entries[i].vector,
@ -185,8 +151,7 @@ static void deassign_host_irq(struct kvm *kvm,
pci_disable_msix(assigned_dev->dev);
} else {
/* Deal with MSI and INTx */
disable_irq_nosync(assigned_dev->host_irq);
cancel_work_sync(&assigned_dev->interrupt_work);
disable_irq(assigned_dev->host_irq);
free_irq(assigned_dev->host_irq, (void *)assigned_dev);
@ -232,7 +197,8 @@ static void kvm_free_assigned_device(struct kvm *kvm,
{
kvm_free_assigned_irq(kvm, assigned_dev);
pci_reset_function(assigned_dev->dev);
__pci_reset_function(assigned_dev->dev);
pci_restore_state(assigned_dev->dev);
pci_release_regions(assigned_dev->dev);
pci_disable_device(assigned_dev->dev);
@ -265,8 +231,8 @@ static int assigned_device_enable_host_intx(struct kvm *kvm,
* on the same interrupt line is not a happy situation: there
* are going to be long delays in accepting, acking, etc.
*/
if (request_irq(dev->host_irq, kvm_assigned_dev_intr,
0, "kvm_assigned_intx_device", (void *)dev))
if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
IRQF_ONESHOT, dev->irq_name, (void *)dev))
return -EIO;
return 0;
}
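The old pairing of a hard-IRQ handler with a work-queue bottom half is replaced by request_threaded_irq() with a NULL primary handler: the IRQ core acks and masks the line (IRQF_ONESHOT keeps a level-triggered INTx line masked until the thread returns) and runs kvm_assigned_dev_thread() in sleepable process context, which is why assigned_dev_lock, the MSI-X pending flags and the interrupt_work item can all be dropped. Outside of KVM the same pattern, with hypothetical driver names, looks like this:

#include <linux/interrupt.h>

struct mydev {				/* hypothetical device */
	int irq;
	/* ... */
};

static void mydev_handle_event(struct mydev *dev);	/* hypothetical */

static irqreturn_t mydev_irq_thread(int irq, void *dev_id)
{
	struct mydev *dev = dev_id;

	mydev_handle_event(dev);	/* may sleep, take mutexes, etc. */
	return IRQ_HANDLED;
}

static int mydev_setup_irq(struct mydev *dev)
{
	/* NULL primary handler; IRQF_ONESHOT keeps a level-triggered line
	 * masked until mydev_irq_thread() returns, as in the INTx case above */
	return request_threaded_irq(dev->irq, NULL, mydev_irq_thread,
				    IRQF_ONESHOT, "mydev", dev);
}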
@ -284,8 +250,8 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
}
dev->host_irq = dev->dev->irq;
if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0,
"kvm_assigned_msi_device", (void *)dev)) {
if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
0, dev->irq_name, (void *)dev)) {
pci_disable_msi(dev->dev);
return -EIO;
}
@ -310,10 +276,9 @@ static int assigned_device_enable_host_msix(struct kvm *kvm,
return r;
for (i = 0; i < dev->entries_nr; i++) {
r = request_irq(dev->host_msix_entries[i].vector,
kvm_assigned_dev_intr, 0,
"kvm_assigned_msix_device",
(void *)dev);
r = request_threaded_irq(dev->host_msix_entries[i].vector,
NULL, kvm_assigned_dev_thread,
0, dev->irq_name, (void *)dev);
if (r)
goto err;
}
@ -370,6 +335,9 @@ static int assign_host_irq(struct kvm *kvm,
if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
return r;
snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s",
pci_name(dev->dev));
switch (host_irq_type) {
case KVM_DEV_IRQ_HOST_INTX:
r = assigned_device_enable_host_intx(kvm, dev);
@ -547,6 +515,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
}
pci_reset_function(dev);
pci_save_state(dev);
match->assigned_dev_id = assigned_dev->assigned_dev_id;
match->host_segnr = assigned_dev->segnr;
@ -554,12 +523,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
match->host_devfn = assigned_dev->devfn;
match->flags = assigned_dev->flags;
match->dev = dev;
spin_lock_init(&match->assigned_dev_lock);
spin_lock_init(&match->intx_lock);
match->irq_source_id = -1;
match->kvm = kvm;
match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
INIT_WORK(&match->interrupt_work,
kvm_assigned_dev_interrupt_work_handler);
list_add(&match->list, &kvm->arch.assigned_dev_head);
@ -579,6 +546,7 @@ out:
mutex_unlock(&kvm->lock);
return r;
out_list_del:
pci_restore_state(dev);
list_del(&match->list);
pci_release_regions(dev);
out_disable:
@ -651,9 +619,9 @@ static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
r = -ENOMEM;
goto msix_nr_out;
}
adev->guest_msix_entries = kzalloc(
sizeof(struct kvm_guest_msix_entry) *
entry_nr->entry_nr, GFP_KERNEL);
adev->guest_msix_entries =
kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr,
GFP_KERNEL);
if (!adev->guest_msix_entries) {
kfree(adev->host_msix_entries);
r = -ENOMEM;
@ -706,7 +674,7 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
unsigned long arg)
{
void __user *argp = (void __user *)arg;
int r = -ENOTTY;
int r;
switch (ioctl) {
case KVM_ASSIGN_PCI_DEVICE: {
@ -724,7 +692,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
r = -EOPNOTSUPP;
break;
}
#ifdef KVM_CAP_ASSIGN_DEV_IRQ
case KVM_ASSIGN_DEV_IRQ: {
struct kvm_assigned_irq assigned_irq;
@ -747,8 +714,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
goto out;
break;
}
#endif
#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
case KVM_DEASSIGN_PCI_DEVICE: {
struct kvm_assigned_pci_dev assigned_dev;
@ -760,7 +725,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
goto out;
break;
}
#endif
#ifdef KVM_CAP_IRQ_ROUTING
case KVM_SET_GSI_ROUTING: {
struct kvm_irq_routing routing;
@ -813,6 +777,9 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
break;
}
#endif
default:
r = -ENOTTY;
break;
}
out:
return r;

virt/kvm/async_pf.c (new file, 216 lines added)
View File

@ -0,0 +1,216 @@
/*
* kvm asynchronous fault support
*
* Copyright 2010 Red Hat, Inc.
*
* Author:
* Gleb Natapov <gleb@redhat.com>
*
* This file is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#include <linux/kvm_host.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/mmu_context.h>
#include "async_pf.h"
#include <trace/events/kvm.h>
static struct kmem_cache *async_pf_cache;
int kvm_async_pf_init(void)
{
async_pf_cache = KMEM_CACHE(kvm_async_pf, 0);
if (!async_pf_cache)
return -ENOMEM;
return 0;
}
void kvm_async_pf_deinit(void)
{
if (async_pf_cache)
kmem_cache_destroy(async_pf_cache);
async_pf_cache = NULL;
}
void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu)
{
INIT_LIST_HEAD(&vcpu->async_pf.done);
INIT_LIST_HEAD(&vcpu->async_pf.queue);
spin_lock_init(&vcpu->async_pf.lock);
}
static void async_pf_execute(struct work_struct *work)
{
struct page *page = NULL;
struct kvm_async_pf *apf =
container_of(work, struct kvm_async_pf, work);
struct mm_struct *mm = apf->mm;
struct kvm_vcpu *vcpu = apf->vcpu;
unsigned long addr = apf->addr;
gva_t gva = apf->gva;
might_sleep();
use_mm(mm);
down_read(&mm->mmap_sem);
get_user_pages(current, mm, addr, 1, 1, 0, &page, NULL);
up_read(&mm->mmap_sem);
unuse_mm(mm);
spin_lock(&vcpu->async_pf.lock);
list_add_tail(&apf->link, &vcpu->async_pf.done);
apf->page = page;
apf->done = true;
spin_unlock(&vcpu->async_pf.lock);
/*
* apf may be freed by kvm_check_async_pf_completion() after
* this point
*/
trace_kvm_async_pf_completed(addr, page, gva);
if (waitqueue_active(&vcpu->wq))
wake_up_interruptible(&vcpu->wq);
mmdrop(mm);
kvm_put_kvm(vcpu->kvm);
}
void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
{
/* cancel outstanding work queue item */
while (!list_empty(&vcpu->async_pf.queue)) {
struct kvm_async_pf *work =
list_entry(vcpu->async_pf.queue.next,
typeof(*work), queue);
cancel_work_sync(&work->work);
list_del(&work->queue);
if (!work->done) /* work was canceled */
kmem_cache_free(async_pf_cache, work);
}
spin_lock(&vcpu->async_pf.lock);
while (!list_empty(&vcpu->async_pf.done)) {
struct kvm_async_pf *work =
list_entry(vcpu->async_pf.done.next,
typeof(*work), link);
list_del(&work->link);
if (work->page)
put_page(work->page);
kmem_cache_free(async_pf_cache, work);
}
spin_unlock(&vcpu->async_pf.lock);
vcpu->async_pf.queued = 0;
}
void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
{
struct kvm_async_pf *work;
while (!list_empty_careful(&vcpu->async_pf.done) &&
kvm_arch_can_inject_async_page_present(vcpu)) {
spin_lock(&vcpu->async_pf.lock);
work = list_first_entry(&vcpu->async_pf.done, typeof(*work),
link);
list_del(&work->link);
spin_unlock(&vcpu->async_pf.lock);
if (work->page)
kvm_arch_async_page_ready(vcpu, work);
kvm_arch_async_page_present(vcpu, work);
list_del(&work->queue);
vcpu->async_pf.queued--;
if (work->page)
put_page(work->page);
kmem_cache_free(async_pf_cache, work);
}
}
int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
struct kvm_arch_async_pf *arch)
{
struct kvm_async_pf *work;
if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU)
return 0;
/* setup delayed work */
/*
* do a nowait allocation: if we are going to sleep anyway, we
* may as well sleep while faulting the page in
*/
work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT);
if (!work)
return 0;
work->page = NULL;
work->done = false;
work->vcpu = vcpu;
work->gva = gva;
work->addr = gfn_to_hva(vcpu->kvm, gfn);
work->arch = *arch;
work->mm = current->mm;
atomic_inc(&work->mm->mm_count);
kvm_get_kvm(work->vcpu->kvm);
/* this can't really happen; otherwise gfn_to_pfn_async
would have succeeded */
if (unlikely(kvm_is_error_hva(work->addr)))
goto retry_sync;
INIT_WORK(&work->work, async_pf_execute);
if (!schedule_work(&work->work))
goto retry_sync;
list_add_tail(&work->queue, &vcpu->async_pf.queue);
vcpu->async_pf.queued++;
kvm_arch_async_page_not_present(vcpu, work);
return 1;
retry_sync:
kvm_put_kvm(work->vcpu->kvm);
mmdrop(work->mm);
kmem_cache_free(async_pf_cache, work);
return 0;
}
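A hedged sketch of how an arch fault path is expected to consume this API (the real caller lives in the x86 MMU code, which is not part of this excerpt): try the non-blocking lookup via gfn_to_pfn_async() first; if the page is not resident, either queue an async fault with kvm_setup_async_pf() or, on a repeated fault for the same gfn, fall back to the synthetic-halt request. Reference handling of the placeholder pfn returned in the async case is omitted for brevity.

#include <linux/kvm_host.h>

static bool fault_try_async(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
			    struct kvm_arch_async_pf *arch, pfn_t *pfn)
{
	bool async = false;

	*pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, true, NULL);
	if (!async)
		return false;		/* page is already resident */

	if (kvm_find_async_pf_gfn(vcpu, gfn)) {
		/* second fault on the same gfn: just halt the vcpu */
		kvm_make_request(KVM_REQ_APF_HALT, vcpu);
		return true;
	}
	if (kvm_setup_async_pf(vcpu, gva, gfn, arch))
		return true;		/* "page not present" will be injected */

	*pfn = gfn_to_pfn(vcpu->kvm, gfn);	/* fall back to a blocking gup */
	return false;
}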
int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
{
struct kvm_async_pf *work;
if (!list_empty_careful(&vcpu->async_pf.done))
return 0;
work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC);
if (!work)
return -ENOMEM;
work->page = bad_page;
get_page(bad_page);
INIT_LIST_HEAD(&work->queue); /* for list_del to work */
spin_lock(&vcpu->async_pf.lock);
list_add_tail(&work->link, &vcpu->async_pf.done);
spin_unlock(&vcpu->async_pf.lock);
vcpu->async_pf.queued++;
return 0;
}

virt/kvm/async_pf.h (new file, 36 lines added)
View File

@ -0,0 +1,36 @@
/*
* kvm asynchronous fault support
*
* Copyright 2010 Red Hat, Inc.
*
* Author:
* Gleb Natapov <gleb@redhat.com>
*
* This file is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#ifndef __KVM_ASYNC_PF_H__
#define __KVM_ASYNC_PF_H__
#ifdef CONFIG_KVM_ASYNC_PF
int kvm_async_pf_init(void);
void kvm_async_pf_deinit(void);
void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu);
#else
#define kvm_async_pf_init() (0)
#define kvm_async_pf_deinit() do{}while(0)
#define kvm_async_pf_vcpu_init(C) do{}while(0)
#endif
#endif

View File

@ -44,14 +44,19 @@
*/
struct _irqfd {
struct kvm *kvm;
struct eventfd_ctx *eventfd;
int gsi;
struct list_head list;
poll_table pt;
wait_queue_t wait;
struct work_struct inject;
struct work_struct shutdown;
/* Used for MSI fast-path */
struct kvm *kvm;
wait_queue_t wait;
/* Update side is protected by irqfds.lock */
struct kvm_kernel_irq_routing_entry __rcu *irq_entry;
/* Used for level IRQ fast-path */
int gsi;
struct work_struct inject;
/* Used for setup/shutdown */
struct eventfd_ctx *eventfd;
struct list_head list;
poll_table pt;
struct work_struct shutdown;
};
static struct workqueue_struct *irqfd_cleanup_wq;
@ -125,14 +130,22 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
unsigned long flags = (unsigned long)key;
struct kvm_kernel_irq_routing_entry *irq;
struct kvm *kvm = irqfd->kvm;
if (flags & POLLIN)
if (flags & POLLIN) {
rcu_read_lock();
irq = rcu_dereference(irqfd->irq_entry);
/* An event has been signaled, inject an interrupt */
schedule_work(&irqfd->inject);
if (irq)
kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1);
else
schedule_work(&irqfd->inject);
rcu_read_unlock();
}
if (flags & POLLHUP) {
/* The eventfd is closing, detach from KVM */
struct kvm *kvm = irqfd->kvm;
unsigned long flags;
spin_lock_irqsave(&kvm->irqfds.lock, flags);
@ -163,9 +176,31 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
add_wait_queue(wqh, &irqfd->wait);
}
/* Must be called under irqfds.lock */
static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd,
struct kvm_irq_routing_table *irq_rt)
{
struct kvm_kernel_irq_routing_entry *e;
struct hlist_node *n;
if (irqfd->gsi >= irq_rt->nr_rt_entries) {
rcu_assign_pointer(irqfd->irq_entry, NULL);
return;
}
hlist_for_each_entry(e, n, &irq_rt->map[irqfd->gsi], link) {
/* Only fast-path MSI. */
if (e->type == KVM_IRQ_ROUTING_MSI)
rcu_assign_pointer(irqfd->irq_entry, e);
else
rcu_assign_pointer(irqfd->irq_entry, NULL);
}
}
static int
kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
{
struct kvm_irq_routing_table *irq_rt;
struct _irqfd *irqfd, *tmp;
struct file *file = NULL;
struct eventfd_ctx *eventfd = NULL;
@ -215,6 +250,10 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
goto fail;
}
irq_rt = rcu_dereference_protected(kvm->irq_routing,
lockdep_is_held(&kvm->irqfds.lock));
irqfd_update(kvm, irqfd, irq_rt);
events = file->f_op->poll(file, &irqfd->pt);
list_add_tail(&irqfd->list, &kvm->irqfds.items);
@ -271,8 +310,17 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
spin_lock_irq(&kvm->irqfds.lock);
list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
if (irqfd->eventfd == eventfd && irqfd->gsi == gsi)
if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) {
/*
* This rcu_assign_pointer is needed for when
* another thread calls kvm_irqfd_update before
* we flush the workqueue below.
* It is paired with the synchronize_rcu done by the
* caller of that function.
*/
rcu_assign_pointer(irqfd->irq_entry, NULL);
irqfd_deactivate(irqfd);
}
}
spin_unlock_irq(&kvm->irqfds.lock);
@ -321,6 +369,25 @@ kvm_irqfd_release(struct kvm *kvm)
}
/*
* Change irq_routing and irqfd.
* Caller must invoke synchronize_rcu afterwards.
*/
void kvm_irq_routing_update(struct kvm *kvm,
struct kvm_irq_routing_table *irq_rt)
{
struct _irqfd *irqfd;
spin_lock_irq(&kvm->irqfds.lock);
rcu_assign_pointer(kvm->irq_routing, irq_rt);
list_for_each_entry(irqfd, &kvm->irqfds.items, list)
irqfd_update(kvm, irqfd, irq_rt);
spin_unlock_irq(&kvm->irqfds.lock);
}
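kvm_irq_routing_update() publishes the new routing table and refreshes every irqfd's cached MSI entry under irqfds.lock, while the reader in irqfd_wakeup() only dereferences irq_entry inside an RCU read-side section; the synchronize_rcu() that the comment requires from the caller (see kvm_set_irq_routing() further down) is what makes it safe to free the old table afterwards. A minimal model of that publish/read pairing, with a hypothetical cfg object, is:

#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/slab.h>

struct cfg { int value; };		/* hypothetical shared object */
static struct cfg __rcu *cur_cfg;
static DEFINE_SPINLOCK(cfg_lock);

static void cfg_update(struct cfg *newc)
{
	struct cfg *old;

	spin_lock(&cfg_lock);
	old = rcu_dereference_protected(cur_cfg, lockdep_is_held(&cfg_lock));
	rcu_assign_pointer(cur_cfg, newc);
	spin_unlock(&cfg_lock);

	synchronize_rcu();		/* no reader can still see "old" */
	kfree(old);
}

static int cfg_read(void)
{
	struct cfg *c;
	int v = -1;

	rcu_read_lock();
	c = rcu_dereference(cur_cfg);
	if (c)
		v = c->value;
	rcu_read_unlock();
	return v;
}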
/*
* create a host-wide workqueue for issuing deferred shutdown requests
* aggregated from all vm* instances. We need our own isolated single-thread

View File

@ -114,8 +114,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
return r;
}
static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level)
int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level)
{
struct kvm_lapic_irq irq;
@ -409,8 +409,9 @@ int kvm_set_irq_routing(struct kvm *kvm,
mutex_lock(&kvm->irq_lock);
old = kvm->irq_routing;
rcu_assign_pointer(kvm->irq_routing, new);
kvm_irq_routing_update(kvm, new);
mutex_unlock(&kvm->irq_lock);
synchronize_rcu();
new = old;

View File

@ -55,6 +55,7 @@
#include <asm-generic/bitops/le.h>
#include "coalesced_mmio.h"
#include "async_pf.h"
#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>
@ -89,7 +90,8 @@ static void hardware_disable_all(void);
static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
static bool kvm_rebooting;
bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);
static bool largepages_enabled = true;
@ -167,8 +169,12 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
int dirty_count = kvm->tlbs_dirty;
smp_mb();
if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
++kvm->stat.remote_tlb_flush;
cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
}
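
The tlbs_dirty handling here is what lets MMU paths defer a remote flush: they bump kvm->tlbs_dirty instead of flushing immediately, and kvm_flush_remote_tlbs() samples the counter before sending the flush request and clears it with cmpxchg() only if no new deferral raced in. A hedged sketch of the producer side (example_zap_deferred is illustrative; the intended users in this series are shadow-paging paths such as sync_page):

static void example_zap_deferred(struct kvm *kvm)
{
	/* ... drop or downgrade a shadow pte here ... */

	/*
	 * Do not flush now; kvm_flush_remote_tlbs() will see the counter,
	 * flush, and only reset it if it has not moved in the meantime.
	 */
	kvm->tlbs_dirty++;
}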
void kvm_reload_remote_mmus(struct kvm *kvm)
@ -186,6 +192,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
vcpu->kvm = kvm;
vcpu->vcpu_id = id;
init_waitqueue_head(&vcpu->wq);
kvm_async_pf_vcpu_init(vcpu);
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!page) {
@ -247,7 +254,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
kvm->mmu_notifier_seq++;
need_tlb_flush = kvm_unmap_hva(kvm, address);
need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
@ -291,6 +298,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
kvm->mmu_notifier_count++;
for (; start < end; start += PAGE_SIZE)
need_tlb_flush |= kvm_unmap_hva(kvm, start);
need_tlb_flush |= kvm->tlbs_dirty;
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
@ -381,11 +389,15 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
static struct kvm *kvm_create_vm(void)
{
int r = 0, i;
struct kvm *kvm = kvm_arch_create_vm();
int r, i;
struct kvm *kvm = kvm_arch_alloc_vm();
if (IS_ERR(kvm))
goto out;
if (!kvm)
return ERR_PTR(-ENOMEM);
r = kvm_arch_init_vm(kvm);
if (r)
goto out_err_nodisable;
r = hardware_enable_all();
if (r)
@ -399,23 +411,19 @@ static struct kvm *kvm_create_vm(void)
r = -ENOMEM;
kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
if (!kvm->memslots)
goto out_err;
goto out_err_nosrcu;
if (init_srcu_struct(&kvm->srcu))
goto out_err;
goto out_err_nosrcu;
for (i = 0; i < KVM_NR_BUSES; i++) {
kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
GFP_KERNEL);
if (!kvm->buses[i]) {
cleanup_srcu_struct(&kvm->srcu);
if (!kvm->buses[i])
goto out_err;
}
}
r = kvm_init_mmu_notifier(kvm);
if (r) {
cleanup_srcu_struct(&kvm->srcu);
if (r)
goto out_err;
}
kvm->mm = current->mm;
atomic_inc(&kvm->mm->mm_count);
@ -429,19 +437,35 @@ static struct kvm *kvm_create_vm(void)
spin_lock(&kvm_lock);
list_add(&kvm->vm_list, &vm_list);
spin_unlock(&kvm_lock);
out:
return kvm;
out_err:
cleanup_srcu_struct(&kvm->srcu);
out_err_nosrcu:
hardware_disable_all();
out_err_nodisable:
for (i = 0; i < KVM_NR_BUSES; i++)
kfree(kvm->buses[i]);
kfree(kvm->memslots);
kfree(kvm);
kvm_arch_free_vm(kvm);
return ERR_PTR(r);
}
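
kvm_create_vm() now splits VM construction into an arch allocation hook and an arch init hook, with kvm_arch_free_vm() as the single teardown point used by both the error path above and kvm_destroy_vm(). A schematic of what an architecture is expected to provide (not the actual arch code; the generic default amounts to a kzalloc()/kfree() pair):

static inline struct kvm *example_arch_alloc_vm(void)
{
	/* may allocate a larger arch struct that embeds struct kvm */
	return kzalloc(sizeof(struct kvm), GFP_KERNEL);
}

static int example_arch_init_vm(struct kvm *kvm)
{
	/* set up arch-private lists, locks, statistics, ... */
	return 0;
}

static inline void example_arch_free_vm(struct kvm *kvm)
{
	kfree(kvm);
}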
static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
{
if (!memslot->dirty_bitmap)
return;
if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE)
vfree(memslot->dirty_bitmap_head);
else
kfree(memslot->dirty_bitmap_head);
memslot->dirty_bitmap = NULL;
memslot->dirty_bitmap_head = NULL;
}
/*
* Free any memory in @free but not in @dont.
*/
@ -454,7 +478,7 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
vfree(free->rmap);
if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
vfree(free->dirty_bitmap);
kvm_destroy_dirty_bitmap(free);
for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
@ -465,7 +489,6 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
}
free->npages = 0;
free->dirty_bitmap = NULL;
free->rmap = NULL;
}
@ -499,6 +522,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
kvm_arch_flush_shadow(kvm);
#endif
kvm_arch_destroy_vm(kvm);
kvm_free_physmem(kvm);
cleanup_srcu_struct(&kvm->srcu);
kvm_arch_free_vm(kvm);
hardware_disable_all();
mmdrop(mm);
}
@ -527,6 +553,27 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
return 0;
}
/*
* Allocation size is twice as large as the actual dirty bitmap size.
* This makes it possible to do double buffering: see x86's
* kvm_vm_ioctl_get_dirty_log().
*/
static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
{
unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
if (dirty_bytes > PAGE_SIZE)
memslot->dirty_bitmap = vzalloc(dirty_bytes);
else
memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL);
if (!memslot->dirty_bitmap)
return -ENOMEM;
memslot->dirty_bitmap_head = memslot->dirty_bitmap;
return 0;
}
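
Because the allocation is twice the bitmap size and dirty_bitmap_head records its start, a dirty-log reader can hand out one half while clearing the other. A simplified sketch of the switch (not the actual x86 code; kvm_vm_ioctl_get_dirty_log() there does the real sequence, which also republishes memslots under SRCU):

static unsigned long *example_switch_half(struct kvm_memory_slot *memslot)
{
	unsigned long n = kvm_dirty_bitmap_bytes(memslot);
	unsigned long *next = memslot->dirty_bitmap_head;

	/* pick whichever half is not currently published */
	if (next == memslot->dirty_bitmap)
		next += n / sizeof(long);

	memset(next, 0, n);
	return next;	/* caller installs this as the new dirty_bitmap and copies out the old half */
}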
/*
* Allocate some memory and give it an address in the guest physical address
* space.
@ -604,13 +651,11 @@ int __kvm_set_memory_region(struct kvm *kvm,
/* Allocate if a slot is being created */
#ifndef CONFIG_S390
if (npages && !new.rmap) {
new.rmap = vmalloc(npages * sizeof(*new.rmap));
new.rmap = vzalloc(npages * sizeof(*new.rmap));
if (!new.rmap)
goto out_free;
memset(new.rmap, 0, npages * sizeof(*new.rmap));
new.user_alloc = user_alloc;
new.userspace_addr = mem->userspace_addr;
}
@ -633,14 +678,11 @@ int __kvm_set_memory_region(struct kvm *kvm,
>> KVM_HPAGE_GFN_SHIFT(level));
lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i]));
new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i]));
if (!new.lpage_info[i])
goto out_free;
memset(new.lpage_info[i], 0,
lpages * sizeof(*new.lpage_info[i]));
if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
new.lpage_info[i][0].write_count = 1;
if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
@ -661,12 +703,8 @@ skip_lpage:
/* Allocate page dirty bitmap if needed */
if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(&new);
new.dirty_bitmap = vmalloc(dirty_bytes);
if (!new.dirty_bitmap)
if (kvm_create_dirty_bitmap(&new) < 0)
goto out_free;
memset(new.dirty_bitmap, 0, dirty_bytes);
/* destroy any largepage mappings for dirty tracking */
if (old.npages)
flush_shadow = 1;
@ -685,6 +723,7 @@ skip_lpage:
memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
if (mem->slot >= slots->nmemslots)
slots->nmemslots = mem->slot + 1;
slots->generation++;
slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;
old_memslots = kvm->memslots;
@ -719,6 +758,7 @@ skip_lpage:
memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
if (mem->slot >= slots->nmemslots)
slots->nmemslots = mem->slot + 1;
slots->generation++;
/* actual memory is freed via old in kvm_free_physmem_slot below */
if (!npages) {
@ -849,10 +889,10 @@ int kvm_is_error_hva(unsigned long addr)
}
EXPORT_SYMBOL_GPL(kvm_is_error_hva);
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots,
gfn_t gfn)
{
int i;
struct kvm_memslots *slots = kvm_memslots(kvm);
for (i = 0; i < slots->nmemslots; ++i) {
struct kvm_memory_slot *memslot = &slots->memslots[i];
@ -863,6 +903,11 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
}
return NULL;
}
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
return __gfn_to_memslot(kvm_memslots(kvm), gfn);
}
EXPORT_SYMBOL_GPL(gfn_to_memslot);
int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
@ -925,12 +970,9 @@ int memslot_id(struct kvm *kvm, gfn_t gfn)
return memslot - slots->memslots;
}
static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn,
static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
gfn_t *nr_pages)
{
struct kvm_memory_slot *slot;
slot = gfn_to_memslot(kvm, gfn);
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
return bad_hva();
@ -942,28 +984,61 @@ static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn,
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
return gfn_to_hva_many(kvm, gfn, NULL);
return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);
static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic)
static pfn_t get_fault_pfn(void)
{
get_page(fault_page);
return fault_pfn;
}
static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
bool *async, bool write_fault, bool *writable)
{
struct page *page[1];
int npages;
int npages = 0;
pfn_t pfn;
if (atomic)
/* we can do it either atomically or asynchronously, not both */
BUG_ON(atomic && async);
BUG_ON(!write_fault && !writable);
if (writable)
*writable = true;
if (atomic || async)
npages = __get_user_pages_fast(addr, 1, 1, page);
else {
if (unlikely(npages != 1) && !atomic) {
might_sleep();
npages = get_user_pages_fast(addr, 1, 1, page);
if (writable)
*writable = write_fault;
npages = get_user_pages_fast(addr, 1, write_fault, page);
/* map read fault as writable if possible */
if (unlikely(!write_fault) && npages == 1) {
struct page *wpage[1];
npages = __get_user_pages_fast(addr, 1, 1, wpage);
if (npages == 1) {
*writable = true;
put_page(page[0]);
page[0] = wpage[0];
}
npages = 1;
}
}
if (unlikely(npages != 1)) {
struct vm_area_struct *vma;
if (atomic)
goto return_fault_page;
return get_fault_pfn();
down_read(&current->mm->mmap_sem);
if (is_hwpoison_address(addr)) {
@ -972,19 +1047,20 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic)
return page_to_pfn(hwpoison_page);
}
vma = find_vma(current->mm, addr);
vma = find_vma_intersection(current->mm, addr, addr+1);
if (vma == NULL || addr < vma->vm_start ||
!(vma->vm_flags & VM_PFNMAP)) {
up_read(&current->mm->mmap_sem);
return_fault_page:
get_page(fault_page);
return page_to_pfn(fault_page);
if (vma == NULL)
pfn = get_fault_pfn();
else if ((vma->vm_flags & VM_PFNMAP)) {
pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
vma->vm_pgoff;
BUG_ON(!kvm_is_mmio_pfn(pfn));
} else {
if (async && (vma->vm_flags & VM_WRITE))
*async = true;
pfn = get_fault_pfn();
}
pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
up_read(&current->mm->mmap_sem);
BUG_ON(!kvm_is_mmio_pfn(pfn));
} else
pfn = page_to_pfn(page[0]);
@ -993,40 +1069,58 @@ return_fault_page:
pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
{
return hva_to_pfn(kvm, addr, true);
return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
}
EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic)
static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
bool write_fault, bool *writable)
{
unsigned long addr;
if (async)
*async = false;
addr = gfn_to_hva(kvm, gfn);
if (kvm_is_error_hva(addr)) {
get_page(bad_page);
return page_to_pfn(bad_page);
}
return hva_to_pfn(kvm, addr, atomic);
return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
}
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
{
return __gfn_to_pfn(kvm, gfn, true);
return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
bool write_fault, bool *writable)
{
return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
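
The async variant lets a vcpu avoid sleeping in get_user_pages(): when the page is not resident, hva_to_pfn() sets *async and returns the placeholder fault_pfn instead of blocking. A rough sketch of the intended calling pattern, loosely modelled on the x86 MMU's try_async_pf() (example_get_pfn is an illustrative name):

static pfn_t example_get_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
			     bool write_fault, bool *writable)
{
	bool async;
	pfn_t pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async,
				     write_fault, writable);

	if (!async)
		return pfn;			/* resident, usable immediately */

	kvm_release_pfn_clean(pfn);		/* drop the placeholder */

	/* queue an async page fault here if the guest can take one, else ... */
	return gfn_to_pfn(vcpu->kvm, gfn);	/* blocking fallback */
}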
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
{
return __gfn_to_pfn(kvm, gfn, false);
return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn);
pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
bool *writable)
{
return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
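
gfn_to_pfn_prot() exposes the new writable out-parameter: even on a read fault the helper opportunistically tries a writable mapping ("map read fault as writable if possible" above), and the caller can use the result to install a writable spte and save a later write fault. A small usage sketch (example_map_gfn and the set-spte comment are illustrative):

static void example_map_gfn(struct kvm *kvm, gfn_t gfn, bool write_fault)
{
	bool writable;
	pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);

	if (is_error_pfn(pfn))
		return;

	/* the MMU would pass 'writable' into its set-spte path here */
	kvm_release_pfn_clean(pfn);
}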
pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn)
{
unsigned long addr = gfn_to_hva_memslot(slot, gfn);
return hva_to_pfn(kvm, addr, false);
return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
}
int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
@ -1035,7 +1129,7 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
unsigned long addr;
gfn_t entry;
addr = gfn_to_hva_many(kvm, gfn, &entry);
addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry);
if (kvm_is_error_hva(addr))
return -1;
@ -1219,9 +1313,51 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
return 0;
}
int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
gpa_t gpa)
{
struct kvm_memslots *slots = kvm_memslots(kvm);
int offset = offset_in_page(gpa);
gfn_t gfn = gpa >> PAGE_SHIFT;
ghc->gpa = gpa;
ghc->generation = slots->generation;
ghc->memslot = __gfn_to_memslot(slots, gfn);
ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
if (!kvm_is_error_hva(ghc->hva))
ghc->hva += offset;
else
return -EFAULT;
return 0;
}
EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
void *data, unsigned long len)
{
struct kvm_memslots *slots = kvm_memslots(kvm);
int r;
if (slots->generation != ghc->generation)
kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
if (kvm_is_error_hva(ghc->hva))
return -EFAULT;
r = copy_to_user((void __user *)ghc->hva, data, len);
if (r)
return -EFAULT;
mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
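
The gfn_to_hva_cache pair avoids redoing the gfn->hva walk on every write: the translation is cached together with the memslot generation, and kvm_write_guest_cached() re-initializes it only when the generation has moved on. A usage sketch (the record layout and example_* names are made up; the concrete user in this series is the x86 async-pf path writing a small record into guest memory):

struct example_record {
	u32 token;
	u32 flags;
};

static struct gfn_to_hva_cache example_ghc;

static int example_setup(struct kvm *kvm, gpa_t gpa)
{
	/* translate once, remember slots->generation */
	return kvm_gfn_to_hva_cache_init(kvm, &example_ghc, gpa);
}

static int example_notify(struct kvm *kvm, u32 token)
{
	struct example_record rec = { .token = token, .flags = 0 };

	/* cheap in the common case; revalidates only after a memslot change */
	return kvm_write_guest_cached(kvm, &example_ghc, &rec, sizeof(rec));
}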
int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
{
return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
offset, len);
}
EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
@ -1244,11 +1380,9 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
}
EXPORT_SYMBOL_GPL(kvm_clear_guest);
void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
gfn_t gfn)
{
struct kvm_memory_slot *memslot;
memslot = gfn_to_memslot(kvm, gfn);
if (memslot && memslot->dirty_bitmap) {
unsigned long rel_gfn = gfn - memslot->base_gfn;
@ -1256,6 +1390,14 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
}
}
void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *memslot;
memslot = gfn_to_memslot(kvm, gfn);
mark_page_dirty_in_slot(kvm, memslot, gfn);
}
/*
* The vCPU has executed a HLT instruction with in-kernel mode enabled.
*/
@ -1457,6 +1599,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
if (arg)
goto out;
r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
break;
case KVM_GET_REGS: {
struct kvm_regs *kvm_regs;
@ -1824,7 +1967,7 @@ static struct file_operations kvm_vm_fops = {
static int kvm_dev_ioctl_create_vm(void)
{
int fd, r;
int r;
struct kvm *kvm;
kvm = kvm_create_vm();
@ -1837,11 +1980,11 @@ static int kvm_dev_ioctl_create_vm(void)
return r;
}
#endif
fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
if (fd < 0)
r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
if (r < 0)
kvm_put_kvm(kvm);
return fd;
return r;
}
static long kvm_dev_ioctl_check_extension_generic(long arg)
@ -1922,7 +2065,7 @@ static struct miscdevice kvm_dev = {
&kvm_chardev_ops,
};
static void hardware_enable(void *junk)
static void hardware_enable_nolock(void *junk)
{
int cpu = raw_smp_processor_id();
int r;
@ -1942,7 +2085,14 @@ static void hardware_enable(void *junk)
}
}
static void hardware_disable(void *junk)
static void hardware_enable(void *junk)
{
spin_lock(&kvm_lock);
hardware_enable_nolock(junk);
spin_unlock(&kvm_lock);
}
static void hardware_disable_nolock(void *junk)
{
int cpu = raw_smp_processor_id();
@ -1952,13 +2102,20 @@ static void hardware_disable(void *junk)
kvm_arch_hardware_disable(NULL);
}
static void hardware_disable(void *junk)
{
spin_lock(&kvm_lock);
hardware_disable_nolock(junk);
spin_unlock(&kvm_lock);
}
static void hardware_disable_all_nolock(void)
{
BUG_ON(!kvm_usage_count);
kvm_usage_count--;
if (!kvm_usage_count)
on_each_cpu(hardware_disable, NULL, 1);
on_each_cpu(hardware_disable_nolock, NULL, 1);
}
static void hardware_disable_all(void)
@ -1977,7 +2134,7 @@ static int hardware_enable_all(void)
kvm_usage_count++;
if (kvm_usage_count == 1) {
atomic_set(&hardware_enable_failed, 0);
on_each_cpu(hardware_enable, NULL, 1);
on_each_cpu(hardware_enable_nolock, NULL, 1);
if (atomic_read(&hardware_enable_failed)) {
hardware_disable_all_nolock();
@ -2008,27 +2165,19 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
case CPU_STARTING:
printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
cpu);
spin_lock(&kvm_lock);
hardware_enable(NULL);
spin_unlock(&kvm_lock);
break;
}
return NOTIFY_OK;
}
asmlinkage void kvm_handle_fault_on_reboot(void)
asmlinkage void kvm_spurious_fault(void)
{
if (kvm_rebooting) {
/* spin while reset goes on */
local_irq_enable();
while (true)
cpu_relax();
}
/* Fault while not rebooting. We want the trace. */
BUG();
}
EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot);
EXPORT_SYMBOL_GPL(kvm_spurious_fault);
static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
void *v)
@ -2041,7 +2190,7 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
*/
printk(KERN_INFO "kvm: exiting hardware virtualization\n");
kvm_rebooting = true;
on_each_cpu(hardware_disable, NULL, 1);
on_each_cpu(hardware_disable_nolock, NULL, 1);
return NOTIFY_OK;
}
@ -2211,7 +2360,7 @@ static void kvm_exit_debug(void)
static int kvm_suspend(struct sys_device *dev, pm_message_t state)
{
if (kvm_usage_count)
hardware_disable(NULL);
hardware_disable_nolock(NULL);
return 0;
}
@ -2219,7 +2368,7 @@ static int kvm_resume(struct sys_device *dev)
{
if (kvm_usage_count) {
WARN_ON(spin_is_locked(&kvm_lock));
hardware_enable(NULL);
hardware_enable_nolock(NULL);
}
return 0;
}
@ -2336,6 +2485,10 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
goto out_free_5;
}
r = kvm_async_pf_init();
if (r)
goto out_free;
kvm_chardev_ops.owner = module;
kvm_vm_fops.owner = module;
kvm_vcpu_fops.owner = module;
@ -2343,7 +2496,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
r = misc_register(&kvm_dev);
if (r) {
printk(KERN_ERR "kvm: misc device register failed\n");
goto out_free;
goto out_unreg;
}
kvm_preempt_ops.sched_in = kvm_sched_in;
@ -2353,6 +2506,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
return 0;
out_unreg:
kvm_async_pf_deinit();
out_free:
kmem_cache_destroy(kvm_vcpu_cache);
out_free_5:
@ -2385,11 +2540,12 @@ void kvm_exit(void)
kvm_exit_debug();
misc_deregister(&kvm_dev);
kmem_cache_destroy(kvm_vcpu_cache);
kvm_async_pf_deinit();
sysdev_unregister(&kvm_sysdev);
sysdev_class_unregister(&kvm_sysdev_class);
unregister_reboot_notifier(&kvm_reboot_notifier);
unregister_cpu_notifier(&kvm_cpu_notifier);
on_each_cpu(hardware_disable, NULL, 1);
on_each_cpu(hardware_disable_nolock, NULL, 1);
kvm_arch_hardware_unsetup();
kvm_arch_exit();
free_cpumask_var(cpus_hardware_enabled);