From 908a7bdd6adba3dfd35d8a74a48aed90593de178 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 18 Feb 2013 11:21:16 +0100 Subject: [PATCH 001/204] KVM: nVMX: Improve I/O exit handling This prevents trapping L2 I/O exits if L1 has neither unconditional nor bitmap-based exiting enabled. Furthermore, it implements I/O bitmap handling. Reviewed-by: Gleb Natapov Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 49 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6667042714cc..b4ce43c82748 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5908,6 +5908,52 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { static const int kvm_vmx_max_exit_handlers = ARRAY_SIZE(kvm_vmx_exit_handlers); +static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + unsigned long exit_qualification; + gpa_t bitmap, last_bitmap; + unsigned int port; + int size; + u8 b; + + if (nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING)) + return 1; + + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) + return 0; + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + port = exit_qualification >> 16; + size = (exit_qualification & 7) + 1; + + last_bitmap = (gpa_t)-1; + b = -1; + + while (size > 0) { + if (port < 0x8000) + bitmap = vmcs12->io_bitmap_a; + else if (port < 0x10000) + bitmap = vmcs12->io_bitmap_b; + else + return 1; + bitmap += (port & 0x7fff) / 8; + + if (last_bitmap != bitmap) + if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1)) + return 1; + if (b & (1 << (port & 7))) + return 1; + + port++; + size--; + last_bitmap = bitmap; + } + + return 0; +} + /* * Return 1 if we should exit from L2 to L1 to handle an MSR access access, * rather than handle it ourselves in L0. I.e., check whether L1 expressed @@ -6097,8 +6143,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_DR_ACCESS: return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); case EXIT_REASON_IO_INSTRUCTION: - /* TODO: support IO bitmaps */ - return 1; + return nested_vmx_exit_handled_io(vcpu, vmcs12); case EXIT_REASON_MSR_READ: case EXIT_REASON_MSR_WRITE: return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); From bd31a7f55777a46063ebe3d50e6536cfff51a757 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Thu, 14 Feb 2013 19:46:27 +0100 Subject: [PATCH 002/204] KVM: nVMX: Trap unconditionally if msr bitmap access fails This avoids basing decisions on uninitialized variables, potentially leaking kernel data to the L1 guest. 
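As an illustration of the failure mode (a minimal sketch mirroring the hunk below, not a literal call site): if kvm_read_guest() fails, b keeps whatever byte happened to be on the kernel stack, and that byte then steers the exit decision:

	unsigned char b;			/* may stay uninitialized */
	kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1);	/* return value ignored */
	return 1 & (b >> (msr_index & 7));	/* stack contents decide, and leak to L1 */

Falling back to an unconditional trap to L1 on a failed read is the conservative choice.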
Reviewed-by: Gleb Natapov Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b4ce43c82748..c1b3041a1f86 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5985,7 +5985,8 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, /* Then read the msr_index'th bit from this bitmap: */ if (msr_index < 1024*8) { unsigned char b; - kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1); + if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1)) + return 1; return 1 & (b >> (msr_index & 7)); } else return 1; /* let L1 handle the wrong parameter */ From fe1140cc369410a9c206fdb7aaabc644bd213dc2 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sat, 23 Feb 2013 17:05:29 +0100 Subject: [PATCH 003/204] x86: kvmclock: Do not setup kvmclock vsyscall in the absence of that clock This fixes boot lockups with "no-kvmclock", when the host is not exposing this particular feature (QEMU: -cpu ...,-kvmclock) or when the kvmclock initialization failed for whatever reason. Reviewed-by: Marcelo Tosatti Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/kernel/kvmclock.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 5bedbdddf1f2..b730efad6fe9 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -160,8 +160,12 @@ int kvm_register_clock(char *txt) { int cpu = smp_processor_id(); int low, high, ret; - struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti; + struct pvclock_vcpu_time_info *src; + if (!hv_clock) + return 0; + + src = &hv_clock[cpu].pvti; low = (int)__pa(src) | 1; high = ((u64)__pa(src) >> 32); ret = native_write_msr_safe(msr_kvm_system_time, low, high); @@ -276,6 +280,9 @@ int __init kvm_setup_vsyscall_timeinfo(void) struct pvclock_vcpu_time_info *vcpu_time; unsigned int size; + if (!hv_clock) + return 0; + size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); preempt_disable(); From 733568f9cecc061eca213ba0e877a1f820a40de5 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sat, 23 Feb 2013 15:07:47 +0100 Subject: [PATCH 004/204] KVM: VMX: Make prepare_vmcs12 and load_vmcs12_host_state static Both are only used locally. Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c1b3041a1f86..8a99a62d98a6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7330,7 +7330,7 @@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * exit-information fields only. Other fields are modified by L1 with VMWRITE, * which already writes to vmcs12 directly. */ -void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { /* update guest state fields: */ vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); @@ -7421,7 +7421,8 @@ void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * Failures During or After Loading Guest State"). * This function should be called when the active VMCS is L1's (vmcs01). 
*/ -void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) { if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->host_ia32_efer; From d6851fbeee3f79ac2629f823e15ac2a7f6f54e0e Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sat, 23 Feb 2013 22:34:39 +0100 Subject: [PATCH 005/204] KVM: nVMX: Advertise PAUSE and WBINVD exiting support These exits have no preconditions, and we already process the corresponding reasons in nested_vmx_exit_handled correctly. Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8a99a62d98a6..329836af3240 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2080,6 +2080,7 @@ static __init void nested_vmx_setup_ctls_msrs(void) CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING | + CPU_BASED_PAUSE_EXITING | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; /* * We can allow some features even when not supported by the @@ -2094,7 +2095,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high); nested_vmx_secondary_ctls_low = 0; nested_vmx_secondary_ctls_high &= - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_WBINVD_EXITING; } static inline bool vmx_control_verify(u32 control, u32 low, u32 high) From 36c3cc422b7c5d3cd84cbac769758b197e08f221 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sat, 23 Feb 2013 22:35:37 +0100 Subject: [PATCH 006/204] KVM: nVMX: Clear segment cache after switching between L1 and L2 Switching the VMCS obviously invalidates what may have been cached about the guest segments. Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 329836af3240..3a58c1b8764e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7271,6 +7271,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vcpu->cpu = cpu; put_cpu(); + vmx_segment_cache_clear(vmx); + vmcs12->launch_state = 1; prepare_vmcs02(vcpu, vmcs12); @@ -7517,6 +7519,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) vcpu->cpu = cpu; put_cpu(); + vmx_segment_cache_clear(vmx); + /* if no vmcs02 cache requested, remove the one we used */ if (VMCS02_POOL_SIZE == 0) nested_free_vmcs02(vmx, vmx->nested.current_vmptr); From 957c897e8cf5a26abbce1a1a38833251339d596d Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 24 Feb 2013 14:11:34 +0100 Subject: [PATCH 007/204] KVM: nVMX: Use cached exit reason No need to re-read what vmx_vcpu_run already picked up for us. 
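For reference, the cached value is filled in once per exit on the hot path, roughly as vmx_vcpu_run does in this tree:

	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);

so nested_vmx_exit_handled() and prepare_vmcs12() below can reuse the field instead of issuing another VMREAD.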
Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3a58c1b8764e..f7d2242e5d00 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6082,10 +6082,10 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, */ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) { - u32 exit_reason = vmcs_read32(VM_EXIT_REASON); u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + u32 exit_reason = vmx->exit_reason; if (vmx->nested.nested_run_pending) return 0; @@ -7399,7 +7399,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) /* update exit information fields: */ - vmcs12->vm_exit_reason = vmcs_read32(VM_EXIT_REASON); + vmcs12->vm_exit_reason = to_vmx(vcpu)->exit_reason; vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); From 02cdb50fd7e4b5ce1f6f70e27f74283ced0e1872 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Wed, 27 Feb 2013 11:33:25 +0800 Subject: [PATCH 008/204] arch/x86/kvm: beautify source code for __u32 irq which is never < 0 irp->irq is __u32 which is never < 0. Signed-off-by: Chen Gang Signed-off-by: Gleb Natapov --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3c5bb6fe5280..d0cf7371a558 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2696,7 +2696,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { - if (irq->irq < 0 || irq->irq >= KVM_NR_INTERRUPTS) + if (irq->irq >= KVM_NR_INTERRUPTS) return -EINVAL; if (irqchip_in_kernel(vcpu->kvm)) return -ENXIO; From 44ceb9d6653306f73fb40cbeca303b23937efd85 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 20 Feb 2013 14:02:23 +0100 Subject: [PATCH 009/204] KVM: nVMX: Avoid one redundant vmcs_read in prepare_vmcs12 IDT_VECTORING_INFO_FIELD was already read right after vmexit. Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f7d2242e5d00..238c59b73144 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7404,8 +7404,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); - vmcs12->idt_vectoring_info_field = - vmcs_read32(IDT_VECTORING_INFO_FIELD); + vmcs12->idt_vectoring_info_field = to_vmx(vcpu)->idt_vectoring_info; vmcs12->idt_vectoring_error_code = vmcs_read32(IDT_VECTORING_ERROR_CODE); vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); From 3ab66e8a455a4877889c65a848f2fb32be502f2c Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 20 Feb 2013 14:03:24 +0100 Subject: [PATCH 010/204] KVM: VMX: Pass vcpu to __vmx_complete_interrupts Cleanup: __vmx_complete_interrupts has no use for the vmx structure. 
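Nothing is lost by passing the vcpu: whenever the vmx structure is needed again, the reverse conversion is a zero-cost container_of(), as defined in vmx.c:

	static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
	{
		return container_of(vcpu, struct vcpu_vmx, vcpu);
	}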
Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 238c59b73144..7cc566b09ff2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6436,7 +6436,7 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); } -static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, +static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, u32 idt_vectoring_info, int instr_len_field, int error_code_field) @@ -6447,46 +6447,43 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; - vmx->vcpu.arch.nmi_injected = false; - kvm_clear_exception_queue(&vmx->vcpu); - kvm_clear_interrupt_queue(&vmx->vcpu); + vcpu->arch.nmi_injected = false; + kvm_clear_exception_queue(vcpu); + kvm_clear_interrupt_queue(vcpu); if (!idtv_info_valid) return; - kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); + kvm_make_request(KVM_REQ_EVENT, vcpu); vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; switch (type) { case INTR_TYPE_NMI_INTR: - vmx->vcpu.arch.nmi_injected = true; + vcpu->arch.nmi_injected = true; /* * SDM 3: 27.7.1.2 (September 2008) * Clear bit "block by NMI" before VM entry if a NMI * delivery faulted. */ - vmx_set_nmi_mask(&vmx->vcpu, false); + vmx_set_nmi_mask(vcpu, false); break; case INTR_TYPE_SOFT_EXCEPTION: - vmx->vcpu.arch.event_exit_inst_len = - vmcs_read32(instr_len_field); + vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); /* fall through */ case INTR_TYPE_HARD_EXCEPTION: if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { u32 err = vmcs_read32(error_code_field); - kvm_queue_exception_e(&vmx->vcpu, vector, err); + kvm_queue_exception_e(vcpu, vector, err); } else - kvm_queue_exception(&vmx->vcpu, vector); + kvm_queue_exception(vcpu, vector); break; case INTR_TYPE_SOFT_INTR: - vmx->vcpu.arch.event_exit_inst_len = - vmcs_read32(instr_len_field); + vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); /* fall through */ case INTR_TYPE_EXT_INTR: - kvm_queue_interrupt(&vmx->vcpu, vector, - type == INTR_TYPE_SOFT_INTR); + kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); break; default: break; @@ -6497,7 +6494,7 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { if (is_guest_mode(&vmx->vcpu)) return; - __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info, + __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, VM_EXIT_INSTRUCTION_LEN, IDT_VECTORING_ERROR_CODE); } @@ -6506,7 +6503,7 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu)) return; - __vmx_complete_interrupts(to_vmx(vcpu), + __vmx_complete_interrupts(vcpu, vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), VM_ENTRY_INSTRUCTION_LEN, VM_ENTRY_EXCEPTION_ERROR_CODE); From 462fce46065ec4b200c08619c047b9e5a8fd154a Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 27 Feb 2013 19:41:56 +0900 Subject: [PATCH 011/204] KVM: set_memory_region: Drop user_alloc from prepare/commit_memory_region() X86 does not use this any more. The remaining user, s390's !user_alloc check, can be simply removed since KVM_SET_MEMORY_REGION ioctl is no longer supported. Note: fixed powerpc's indentations with spaces to suppress checkpatch errors. 
Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/arm/kvm/arm.c | 6 ++---- arch/ia64/kvm/kvm-ia64.c | 6 ++---- arch/powerpc/kvm/powerpc.c | 12 +++++------- arch/s390/kvm/kvm-s390.c | 9 ++------- arch/x86/kvm/x86.c | 6 ++---- include/linux/kvm_host.h | 6 ++---- virt/kvm/kvm_main.c | 4 ++-- 7 files changed, 17 insertions(+), 32 deletions(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 5a936988eb24..24cb5f66787d 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -231,16 +231,14 @@ int kvm_arch_set_memory_region(struct kvm *kvm, int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem, - bool user_alloc) + struct kvm_userspace_memory_region *mem) { return 0; } void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - bool user_alloc) + struct kvm_memory_slot old) { } diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index ad3126a58644..cbc5b0417dab 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1579,8 +1579,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem, - bool user_alloc) + struct kvm_userspace_memory_region *mem) { unsigned long i; unsigned long pfn; @@ -1610,8 +1609,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - bool user_alloc) + struct kvm_memory_slot old) { return; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 934413cd3a1b..22b33159fbc4 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -411,18 +411,16 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) } int kvm_arch_prepare_memory_region(struct kvm *kvm, - struct kvm_memory_slot *memslot, - struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem, - bool user_alloc) + struct kvm_memory_slot *memslot, + struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem) { return kvmppc_core_prepare_memory_region(kvm, memslot, mem); } void kvm_arch_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - bool user_alloc) + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old) { kvmppc_core_commit_memory_region(kvm, mem, old); } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4cf35a0a79e7..07ac302ce246 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -975,8 +975,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem, - bool user_alloc) + struct kvm_userspace_memory_region *mem) { /* A few sanity checks. 
We can have exactly one memory slot which has to start at guest virtual zero and which has to be located at a @@ -997,16 +996,12 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, if (mem->memory_size & 0xffffful) return -EINVAL; - if (!user_alloc) - return -EINVAL; - return 0; } void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - bool user_alloc) + struct kvm_memory_slot old) { int rc; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 811c5c9c8880..26216bb4403f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6907,8 +6907,7 @@ out_free: int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem, - bool user_alloc) + struct kvm_userspace_memory_region *mem) { int npages = memslot->npages; @@ -6938,8 +6937,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - bool user_alloc) + struct kvm_memory_slot old) { int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index cad77fe09d77..b4757a1cc4c4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -464,12 +464,10 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages); int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem, - bool user_alloc); + struct kvm_userspace_memory_region *mem); void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - bool user_alloc); + struct kvm_memory_slot old); bool kvm_largepages_enabled(void); void kvm_disable_largepages(void); /* flush all memory translations */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index adc68feb5c5a..fd3037010e75 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -875,7 +875,7 @@ int __kvm_set_memory_region(struct kvm *kvm, slots = old_memslots; } - r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc); + r = kvm_arch_prepare_memory_region(kvm, &new, old, mem); if (r) goto out_slots; @@ -915,7 +915,7 @@ int __kvm_set_memory_region(struct kvm *kvm, old_memslots = install_new_memslots(kvm, slots, &new); - kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); + kvm_arch_commit_memory_region(kvm, mem, old); kvm_free_physmem_slot(&old, &new); kfree(old_memslots); From 47ae31e257c548abdb199e0d26723139a9a967ba Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 27 Feb 2013 19:43:00 +0900 Subject: [PATCH 012/204] KVM: set_memory_region: Drop user_alloc from set_memory_region() Except ia64's stale code, KVM_SET_MEMORY_REGION support, this is only used for sanity checks in __kvm_set_memory_region() which can easily be changed to use slot id instead. 
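The observation behind the sanity-check change (see the __kvm_set_memory_region() hunk below): the slot id now encodes what user_alloc used to say, since internal slots are the only non-user-allocated ones. Schematically (user_slot is an illustrative name):

	/* formerly: user_alloc was passed down from the (now removed) ioctl */
	bool user_slot = mem->slot < KVM_USER_MEM_SLOTS;	/* internal slots use ids >= KVM_USER_MEM_SLOTS */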
Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/ia64/kvm/kvm-ia64.c | 18 ------------------ arch/x86/kvm/vmx.c | 6 +++--- include/linux/kvm_host.h | 10 +++------- virt/kvm/kvm_main.c | 18 +++++++----------- 4 files changed, 13 insertions(+), 39 deletions(-) diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index cbc5b0417dab..43701f0c0f71 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -942,24 +942,6 @@ long kvm_arch_vm_ioctl(struct file *filp, int r = -ENOTTY; switch (ioctl) { - case KVM_SET_MEMORY_REGION: { - struct kvm_memory_region kvm_mem; - struct kvm_userspace_memory_region kvm_userspace_mem; - - r = -EFAULT; - if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) - goto out; - kvm_userspace_mem.slot = kvm_mem.slot; - kvm_userspace_mem.flags = kvm_mem.flags; - kvm_userspace_mem.guest_phys_addr = - kvm_mem.guest_phys_addr; - kvm_userspace_mem.memory_size = kvm_mem.memory_size; - r = kvm_vm_ioctl_set_memory_region(kvm, - &kvm_userspace_mem, false); - if (r) - goto out; - break; - } case KVM_CREATE_IRQCHIP: r = -EFAULT; r = kvm_ioapic_init(kvm); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7cc566b09ff2..58fb7c27e3b5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3694,7 +3694,7 @@ static int alloc_apic_access_page(struct kvm *kvm) kvm_userspace_mem.flags = 0; kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false); + r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); if (r) goto out; @@ -3724,7 +3724,7 @@ static int alloc_identity_pagetable(struct kvm *kvm) kvm_userspace_mem.guest_phys_addr = kvm->arch.ept_identity_map_addr; kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false); + r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); if (r) goto out; @@ -4364,7 +4364,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) .flags = 0, }; - ret = kvm_set_memory_region(kvm, &tss_mem, false); + ret = kvm_set_memory_region(kvm, &tss_mem); if (ret) return ret; kvm->arch.tss_addr = addr; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b4757a1cc4c4..84a994c7a5c5 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -453,11 +453,9 @@ id_to_memslot(struct kvm_memslots *slots, int id) } int kvm_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - bool user_alloc); + struct kvm_userspace_memory_region *mem); int __kvm_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - bool user_alloc); + struct kvm_userspace_memory_region *mem); void kvm_arch_free_memslot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont); int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages); @@ -553,9 +551,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log); int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, - struct - kvm_userspace_memory_region *mem, - bool user_alloc); + struct kvm_userspace_memory_region *mem); int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level); long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fd3037010e75..5b3e41b81f0d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -745,8 +745,7 @@ enum kvm_mr_change { * Must be called holding mmap_sem for write. 
*/ int __kvm_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - bool user_alloc) + struct kvm_userspace_memory_region *mem) { int r; gfn_t base_gfn; @@ -767,7 +766,7 @@ int __kvm_set_memory_region(struct kvm *kvm, if (mem->guest_phys_addr & (PAGE_SIZE - 1)) goto out; /* We can read the guest memory with __xxx_user() later on. */ - if (user_alloc && + if ((mem->slot < KVM_USER_MEM_SLOTS) && ((mem->userspace_addr & (PAGE_SIZE - 1)) || !access_ok(VERIFY_WRITE, (void __user *)(unsigned long)mem->userspace_addr, @@ -932,26 +931,23 @@ out: EXPORT_SYMBOL_GPL(__kvm_set_memory_region); int kvm_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - bool user_alloc) + struct kvm_userspace_memory_region *mem) { int r; mutex_lock(&kvm->slots_lock); - r = __kvm_set_memory_region(kvm, mem, user_alloc); + r = __kvm_set_memory_region(kvm, mem); mutex_unlock(&kvm->slots_lock); return r; } EXPORT_SYMBOL_GPL(kvm_set_memory_region); int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, - struct - kvm_userspace_memory_region *mem, - bool user_alloc) + struct kvm_userspace_memory_region *mem) { if (mem->slot >= KVM_USER_MEM_SLOTS) return -EINVAL; - return kvm_set_memory_region(kvm, mem, user_alloc); + return kvm_set_memory_region(kvm, mem); } int kvm_get_dirty_log(struct kvm *kvm, @@ -2198,7 +2194,7 @@ static long kvm_vm_ioctl(struct file *filp, sizeof kvm_userspace_mem)) goto out; - r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, true); + r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); break; } case KVM_GET_DIRTY_LOG: { From 74d0727cb7aaaea48a6353209093be26abc8d160 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 27 Feb 2013 19:43:44 +0900 Subject: [PATCH 013/204] KVM: set_memory_region: Make kvm_mr_change available to arch code This will be used for cleaning up prepare/commit_memory_region() later. 
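For context, a condensed sketch of how __kvm_set_memory_region() classifies a request into this enum (error handling omitted; npages and old refer to the requested and existing slot sizes):

	if (npages) {				/* requested slot is non-empty */
		if (!old.npages)
			change = KVM_MR_CREATE;
		else if (base_gfn != old.base_gfn)
			change = KVM_MR_MOVE;
		else
			change = KVM_MR_FLAGS_ONLY;
	} else if (old.npages) {
		change = KVM_MR_DELETE;
	}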
Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- include/linux/kvm_host.h | 18 ++++++++++++++++++ virt/kvm/kvm_main.c | 18 ------------------ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 84a994c7a5c5..8eaf61f7b02d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -452,6 +452,24 @@ id_to_memslot(struct kvm_memslots *slots, int id) return slot; } +/* + * KVM_SET_USER_MEMORY_REGION ioctl allows the following operations: + * - create a new memory slot + * - delete an existing memory slot + * - modify an existing memory slot + * -- move it in the guest physical memory space + * -- just change its flags + * + * Since flags can be changed by some of these operations, the following + * differentiation is the best we can do for __kvm_set_memory_region(): + */ +enum kvm_mr_change { + KVM_MR_CREATE, + KVM_MR_DELETE, + KVM_MR_MOVE, + KVM_MR_FLAGS_ONLY, +}; + int kvm_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem); int __kvm_set_memory_region(struct kvm *kvm, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5b3e41b81f0d..c7979ed41923 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -718,24 +718,6 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, return old_memslots; } -/* - * KVM_SET_USER_MEMORY_REGION ioctl allows the following operations: - * - create a new memory slot - * - delete an existing memory slot - * - modify an existing memory slot - * -- move it in the guest physical memory space - * -- just change its flags - * - * Since flags can be changed by some of these operations, the following - * differentiation is the best we can do for __kvm_set_memory_region(): - */ -enum kvm_mr_change { - KVM_MR_CREATE, - KVM_MR_DELETE, - KVM_MR_MOVE, - KVM_MR_FLAGS_ONLY, -}; - /* * Allocate some memory and give it an address in the guest physical address * space. From 7b6195a91d60909a2834ab7181e2b9476e6fe749 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 27 Feb 2013 19:44:34 +0900 Subject: [PATCH 014/204] KVM: set_memory_region: Refactor prepare_memory_region() This patch drops the parameter old, a copy of the old memory slot, and adds a new parameter named change to know the change being requested. This not only cleans up the code but also removes extra copying of the memory slot structure. 
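Concretely for x86 (the hunk below), the by-value copy of the old slot existed mainly to feed one predicate, which the precomputed enum now answers directly:

	/* before: if ((memslot->id >= KVM_USER_MEM_SLOTS) && npages && !old.npages) */
	if ((memslot->id >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_CREATE))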
Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/arm/kvm/arm.c | 4 ++-- arch/ia64/kvm/kvm-ia64.c | 4 ++-- arch/powerpc/kvm/powerpc.c | 4 ++-- arch/s390/kvm/kvm-s390.c | 4 ++-- arch/x86/kvm/x86.c | 10 ++++------ include/linux/kvm_host.h | 4 ++-- virt/kvm/kvm_main.c | 2 +- 7 files changed, 15 insertions(+), 17 deletions(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 24cb5f66787d..96ebab7a1959 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -230,8 +230,8 @@ int kvm_arch_set_memory_region(struct kvm *kvm, int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem) + struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change) { return 0; } diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 43701f0c0f71..5c2b07e8c3d6 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1560,8 +1560,8 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem) + struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change) { unsigned long i; unsigned long pfn; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 22b33159fbc4..8aa51cd67c28 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -412,8 +412,8 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem) + struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change) { return kvmppc_core_prepare_memory_region(kvm, memslot, mem); } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 07ac302ce246..4288780c86b8 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -974,8 +974,8 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) /* Section: memory related */ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem) + struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change) { /* A few sanity checks. We can have exactly one memory slot which has to start at guest virtual zero and which has to be located at a diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 26216bb4403f..7198234fa088 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6906,23 +6906,21 @@ out_free: int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem) + struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change) { - int npages = memslot->npages; - /* * Only private memory slots need to be mapped here since * KVM_SET_MEMORY_REGION ioctl is no longer supported. */ - if ((memslot->id >= KVM_USER_MEM_SLOTS) && npages && !old.npages) { + if ((memslot->id >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_CREATE)) { unsigned long userspace_addr; /* * MAP_SHARED to prevent internal slot pages from being moved * by fork()/COW. 
*/ - userspace_addr = vm_mmap(NULL, 0, npages * PAGE_SIZE, + userspace_addr = vm_mmap(NULL, 0, memslot->npages * PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8eaf61f7b02d..caa72cf7e8e7 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -479,8 +479,8 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free, int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages); int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem); + struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change); void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, struct kvm_memory_slot old); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c7979ed41923..8f85bae862c7 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -856,7 +856,7 @@ int __kvm_set_memory_region(struct kvm *kvm, slots = old_memslots; } - r = kvm_arch_prepare_memory_region(kvm, &new, old, mem); + r = kvm_arch_prepare_memory_region(kvm, &new, mem, change); if (r) goto out_slots; From 8482644aea11e0647867732319ccf35879a9acc2 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 27 Feb 2013 19:45:25 +0900 Subject: [PATCH 015/204] KVM: set_memory_region: Refactor commit_memory_region() This patch makes the parameter old a const pointer to the old memory slot and adds a new parameter named change to know the change being requested: the former is for removing extra copying and the latter is for cleaning up the code. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/arm/kvm/arm.c | 3 ++- arch/ia64/kvm/kvm-ia64.c | 3 ++- arch/powerpc/include/asm/kvm_ppc.h | 2 +- arch/powerpc/kvm/book3s_hv.c | 4 ++-- arch/powerpc/kvm/book3s_pr.c | 2 +- arch/powerpc/kvm/booke.c | 2 +- arch/powerpc/kvm/powerpc.c | 3 ++- arch/s390/kvm/kvm-s390.c | 3 ++- arch/x86/kvm/x86.c | 15 ++++++++------- include/linux/kvm_host.h | 3 ++- virt/kvm/kvm_main.c | 2 +- 11 files changed, 24 insertions(+), 18 deletions(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 96ebab7a1959..b32dc446e802 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -238,7 +238,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old) + const struct kvm_memory_slot *old, + enum kvm_mr_change change) { } diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 5c2b07e8c3d6..7a54455dde39 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1591,7 +1591,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old) + const struct kvm_memory_slot *old, + enum kvm_mr_change change) { return; } diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 44a657adf416..44fa9ad1d62c 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -152,7 +152,7 @@ extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem); extern void kvmppc_core_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old); + const struct kvm_memory_slot *old); extern int 
kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info); extern void kvmppc_core_flush_memslot(struct kvm *kvm, diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 80dcc53a1aba..1e521baf9a7d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1639,12 +1639,12 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, void kvmppc_core_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old) + const struct kvm_memory_slot *old) { unsigned long npages = mem->memory_size >> PAGE_SHIFT; struct kvm_memory_slot *memslot; - if (npages && old.npages) { + if (npages && old->npages) { /* * If modifying a memslot, reset all the rmap dirty bits. * If this is a new memslot, we don't need to do anything diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 5e93438afb06..286e23e6b92d 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1283,7 +1283,7 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, void kvmppc_core_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old) + const struct kvm_memory_slot *old) { } diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 020923e43134..eb88fa621073 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1531,7 +1531,7 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, void kvmppc_core_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old) + const struct kvm_memory_slot *old) { } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 8aa51cd67c28..7b5d4d20cdc5 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -420,7 +420,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old) + const struct kvm_memory_slot *old, + enum kvm_mr_change change) { kvmppc_core_commit_memory_region(kvm, mem, old); } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4288780c86b8..6cae4ad647a9 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1001,7 +1001,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old) + const struct kvm_memory_slot *old, + enum kvm_mr_change change) { int rc; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7198234fa088..35b491229c3a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6935,16 +6935,17 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old) + const struct kvm_memory_slot *old, + enum kvm_mr_change change) { - int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; + int nr_mmu_pages = 0; - if ((mem->slot >= KVM_USER_MEM_SLOTS) && old.npages && !npages) { + if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) { int ret; - ret = vm_munmap(old.userspace_addr, - old.npages * PAGE_SIZE); + ret = vm_munmap(old->userspace_addr, + old->npages * PAGE_SIZE); if (ret < 0) printk(KERN_WARNING "kvm_vm_ioctl_set_memory_region: " @@ -6961,13 +6962,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * Existing largepage mappings are 
destroyed here and new ones will * not be created until the end of the logging. */ - if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) + if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) kvm_mmu_slot_remove_write_access(kvm, mem->slot); /* * If memory slot is created, or moved, we need to clear all * mmio sptes. */ - if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) { + if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { kvm_mmu_zap_all(kvm); kvm_reload_remote_mmus(kvm); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index caa72cf7e8e7..ac584cc53581 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -483,7 +483,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, enum kvm_mr_change change); void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old); + const struct kvm_memory_slot *old, + enum kvm_mr_change change); bool kvm_largepages_enabled(void); void kvm_disable_largepages(void); /* flush all memory translations */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8f85bae862c7..0e919a1d4d56 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -896,7 +896,7 @@ int __kvm_set_memory_region(struct kvm *kvm, old_memslots = install_new_memslots(kvm, slots, &new); - kvm_arch_commit_memory_region(kvm, mem, old); + kvm_arch_commit_memory_region(kvm, mem, &old, change); kvm_free_physmem_slot(&old, &new); kfree(old_memslots); From 16014753b10b76385600cd59450a70b8650c72cb Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 27 Feb 2013 19:46:27 +0900 Subject: [PATCH 016/204] KVM: ARM: Remove kvm_arch_set_memory_region() This was replaced with prepare/commit long before: commit f7784b8ec9b6a041fa828cfbe9012fe51933f5ac KVM: split kvm_arch_set_memory_region into prepare and commit Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/arm/kvm/arm.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index b32dc446e802..e4ad0bb01843 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -220,14 +220,6 @@ long kvm_arch_dev_ioctl(struct file *filp, return -EINVAL; } -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - int user_alloc) -{ - return 0; -} - int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_userspace_memory_region *mem, From 503cd0c50ac7161eb5c3891b48b620cb0a5521cd Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 3 Mar 2013 13:05:44 +0100 Subject: [PATCH 017/204] KVM: nVMX: Fix switching of debug state First of all, do not blindly overwrite GUEST_DR7 on L2 entry. The host may have guest debugging enabled. Then properly reset DR7 and DEBUG_CTL on L2->L1 switch as specified in the SDM. 
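Both writes below restore the architectural reset values: DR7 resets to 0x400 (only the always-one bit 10 set, all breakpoints disabled) and IA32_DEBUGCTL resets to 0:

	kvm_set_dr(vcpu, 7, 0x400);		/* DR7 reset value */
	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);	/* DEBUGCTL reset value */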
Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 58fb7c27e3b5..097f5d662275 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6978,7 +6978,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_interruptibility_info); vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); - vmcs_writel(GUEST_DR7, vmcs12->guest_dr7); + kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, vmcs12->guest_pending_dbg_exceptions); @@ -7492,6 +7492,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl); + + kvm_set_dr(vcpu, 7, 0x400); + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); } /* From 07e169335ff0570c6e67b5ccf74d793f00ab0834 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 28 Feb 2013 12:33:16 +0100 Subject: [PATCH 018/204] virtio_ccw: pass a cookie value to kvm hypercall Lookups by channel/vq pair on host during virtio notifications might be expensive. Interpret hypercall return value as a cookie which host can use to do device lookups for the next notification more efficiently. [CH: Fix line > 80 chars] Tested-by: Christian Borntraeger Reviewed-by: Christian Borntraeger Signed-off-by: Michael S. Tsirkin Signed-off-by: Cornelia Huck Signed-off-by: Marcelo Tosatti --- drivers/s390/kvm/virtio_ccw.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/s390/kvm/virtio_ccw.c b/drivers/s390/kvm/virtio_ccw.c index 2029b6caa595..3d657522b3e2 100644 --- a/drivers/s390/kvm/virtio_ccw.c +++ b/drivers/s390/kvm/virtio_ccw.c @@ -77,6 +77,7 @@ struct virtio_ccw_vq_info { void *queue; struct vq_info_block *info_block; struct list_head node; + long cookie; }; #define KVM_VIRTIO_CCW_RING_ALIGN 4096 @@ -145,15 +146,18 @@ static int ccw_io_helper(struct virtio_ccw_device *vcdev, } static inline long do_kvm_notify(struct subchannel_id schid, - unsigned long queue_index) + unsigned long queue_index, + long cookie) { register unsigned long __nr asm("1") = KVM_S390_VIRTIO_CCW_NOTIFY; register struct subchannel_id __schid asm("2") = schid; register unsigned long __index asm("3") = queue_index; register long __rc asm("2"); + register long __cookie asm("4") = cookie; asm volatile ("diag 2,4,0x500\n" - : "=d" (__rc) : "d" (__nr), "d" (__schid), "d" (__index) + : "=d" (__rc) : "d" (__nr), "d" (__schid), "d" (__index), + "d"(__cookie) : "memory", "cc"); return __rc; } @@ -166,7 +170,8 @@ static void virtio_ccw_kvm_notify(struct virtqueue *vq) vcdev = to_vc_device(info->vq->vdev); ccw_device_get_schid(vcdev->cdev, &schid); - do_kvm_notify(schid, virtqueue_get_queue_index(vq)); + info->cookie = do_kvm_notify(schid, virtqueue_get_queue_index(vq), + info->cookie); } static int virtio_ccw_read_vq_conf(struct virtio_ccw_device *vcdev, From 6a773cb825afb74a600a08fe87fab55ee98ec2ac Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Thu, 28 Feb 2013 12:33:17 +0100 Subject: [PATCH 019/204] KVM: s390: Export virtio-ccw api. Export the virtio-ccw api in a header for usage by other code. 
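The exported subcode ties into the diagnose 0x500 register convention already visible in do_kvm_notify() above, and consumed by the host side later in this series:

	/* diagnose 0x500, virtio hypercall ABI:
	 *   gpr 1 = KVM_S390_VIRTIO_CCW_NOTIFY (subcode 3)
	 *   gpr 2 = subchannel id
	 *   gpr 3 = virtqueue index
	 *   gpr 4 = cookie (in), gpr 2 = return code / cookie (out)
	 */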
Signed-off-by: Cornelia Huck Signed-off-by: Marcelo Tosatti --- arch/s390/include/uapi/asm/Kbuild | 1 + arch/s390/include/uapi/asm/virtio-ccw.h | 21 +++++++++++++++++++++ drivers/s390/kvm/virtio_ccw.c | 5 +---- 3 files changed, 23 insertions(+), 4 deletions(-) create mode 100644 arch/s390/include/uapi/asm/virtio-ccw.h diff --git a/arch/s390/include/uapi/asm/Kbuild b/arch/s390/include/uapi/asm/Kbuild index 7bf68fff7c5d..9ccd1905bdad 100644 --- a/arch/s390/include/uapi/asm/Kbuild +++ b/arch/s390/include/uapi/asm/Kbuild @@ -44,5 +44,6 @@ header-y += termios.h header-y += types.h header-y += ucontext.h header-y += unistd.h +header-y += virtio-ccw.h header-y += vtoc.h header-y += zcrypt.h diff --git a/arch/s390/include/uapi/asm/virtio-ccw.h b/arch/s390/include/uapi/asm/virtio-ccw.h new file mode 100644 index 000000000000..a9a4ebf79fa7 --- /dev/null +++ b/arch/s390/include/uapi/asm/virtio-ccw.h @@ -0,0 +1,21 @@ +/* + * Definitions for virtio-ccw devices. + * + * Copyright IBM Corp. 2013 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2 only) + * as published by the Free Software Foundation. + * + * Author(s): Cornelia Huck + */ +#ifndef __KVM_VIRTIO_CCW_H +#define __KVM_VIRTIO_CCW_H + +/* Alignment of vring buffers. */ +#define KVM_VIRTIO_CCW_RING_ALIGN 4096 + +/* Subcode for diagnose 500 (virtio hypercall). */ +#define KVM_S390_VIRTIO_CCW_NOTIFY 3 + +#endif diff --git a/drivers/s390/kvm/virtio_ccw.c b/drivers/s390/kvm/virtio_ccw.c index 3d657522b3e2..42d507c4e06b 100644 --- a/drivers/s390/kvm/virtio_ccw.c +++ b/drivers/s390/kvm/virtio_ccw.c @@ -31,6 +31,7 @@ #include #include #include +#include /* * virtio related functions @@ -80,10 +81,6 @@ struct virtio_ccw_vq_info { long cookie; }; -#define KVM_VIRTIO_CCW_RING_ALIGN 4096 - -#define KVM_S390_VIRTIO_CCW_NOTIFY 3 - #define CCW_CMD_SET_VQ 0x13 #define CCW_CMD_VDEV_RESET 0x33 #define CCW_CMD_SET_IND 0x43 From a0f155e9646d5f1c263f6f9aae880151100243bb Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Thu, 28 Feb 2013 12:33:18 +0100 Subject: [PATCH 020/204] KVM: Initialize irqfd from kvm_init(). Currently, eventfd introduces module_init/module_exit functions to initialize/cleanup the irqfd workqueue. This only works, however, if no other module_init/module_exit functions are built into the same module. Let's just move the initialization and cleanup to kvm_init and kvm_exit. This way, it is also clearer where kvm startup may fail. 
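For background, module_init() in a modular build boils down to defining the module's single entry point, roughly (simplified from include/linux/init.h):

	#define module_init(initfn) \
		int init_module(void) __attribute__((alias(#initfn)));

so eventfd.c's initializer would collide with the module_init() already linked into the same kvm module. Explicit kvm_irqfd_init()/kvm_irqfd_exit() calls from kvm_init()/kvm_exit() avoid that, and the error unwinding in the kvm_main.c hunk below makes a failed irqfd setup visible instead of silent.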
Signed-off-by: Cornelia Huck Signed-off-by: Marcelo Tosatti --- include/linux/kvm_host.h | 13 +++++++++++++ virt/kvm/eventfd.c | 7 ++----- virt/kvm/kvm_main.c | 6 ++++++ 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ac584cc53581..d50fe173028b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -424,6 +424,19 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); int __must_check vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); +#ifdef __KVM_HAVE_IOAPIC +int kvm_irqfd_init(void); +void kvm_irqfd_exit(void); +#else +static inline int kvm_irqfd_init(void) +{ + return 0; +} + +static inline void kvm_irqfd_exit(void) +{ +} +#endif int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, struct module *module); void kvm_exit(void); diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index adb17f266b28..0b6fe69bb03d 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -543,7 +543,7 @@ void kvm_irq_routing_update(struct kvm *kvm, * aggregated from all vm* instances. We need our own isolated single-thread * queue to prevent deadlock against flushing the normal work-queue. */ -static int __init irqfd_module_init(void) +int kvm_irqfd_init(void) { irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup"); if (!irqfd_cleanup_wq) @@ -552,13 +552,10 @@ static int __init irqfd_module_init(void) return 0; } -static void __exit irqfd_module_exit(void) +void kvm_irqfd_exit(void) { destroy_workqueue(irqfd_cleanup_wq); } - -module_init(irqfd_module_init); -module_exit(irqfd_module_exit); #endif /* diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0e919a1d4d56..faf05bddd131 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2898,6 +2898,9 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, int r; int cpu; + r = kvm_irqfd_init(); + if (r) + goto out_irqfd; r = kvm_arch_init(opaque); if (r) goto out_fail; @@ -2978,6 +2981,8 @@ out_free_0a: out_free_0: kvm_arch_exit(); out_fail: + kvm_irqfd_exit(); +out_irqfd: return r; } EXPORT_SYMBOL_GPL(kvm_init); @@ -2994,6 +2999,7 @@ void kvm_exit(void) on_each_cpu(hardware_disable_nolock, NULL, 1); kvm_arch_hardware_unsetup(); kvm_arch_exit(); + kvm_irqfd_exit(); free_cpumask_var(cpus_hardware_enabled); } EXPORT_SYMBOL_GPL(kvm_exit); From 060f0ce6ff975decd1e0ee318c08e228bccbee1e Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Thu, 28 Feb 2013 12:33:19 +0100 Subject: [PATCH 021/204] KVM: Introduce KVM_VIRTIO_CCW_NOTIFY_BUS. Add a new bus type for virtio-ccw devices on s390. Signed-off-by: Cornelia Huck Signed-off-by: Marcelo Tosatti --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d50fe173028b..9fa13ebc3381 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -149,6 +149,7 @@ struct kvm_io_bus { enum kvm_bus { KVM_MMIO_BUS, KVM_PIO_BUS, + KVM_VIRTIO_CCW_NOTIFY_BUS, KVM_NR_BUSES }; From 2b83451b45d720ca38c03878ce42ff9139cad9e3 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Thu, 28 Feb 2013 12:33:20 +0100 Subject: [PATCH 022/204] KVM: ioeventfd for virtio-ccw devices. Enhance KVM_IOEVENTFD with a new flag that allows to attach to virtio-ccw devices on s390 via the KVM_VIRTIO_CCW_NOTIFY_BUS. 
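A hypothetical userspace registration for one virtqueue notifier, following the semantics documented in the api.txt hunk below (addr carries the subchannel id, datamatch the virtqueue index; schid, vq_index, event_fd and vm_fd are placeholder names):

	struct kvm_ioeventfd ioevent = {
		.flags     = KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY |
			     KVM_IOEVENTFD_FLAG_DATAMATCH,
		.addr      = schid,		/* subchannel id */
		.len       = 8,			/* matches the 8-byte write from the diagnose handler */
		.datamatch = vq_index,		/* virtqueue index */
		.fd        = event_fd,
	};
	ioctl(vm_fd, KVM_IOEVENTFD, &ioevent);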
Signed-off-by: Cornelia Huck Signed-off-by: Marcelo Tosatti --- Documentation/virtual/kvm/api.txt | 8 ++++++++ include/uapi/linux/kvm.h | 3 +++ virt/kvm/eventfd.c | 17 +++++++++++++---- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 119358dfb742..c16b442556e8 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1486,15 +1486,23 @@ struct kvm_ioeventfd { __u8 pad[36]; }; +For the special case of virtio-ccw devices on s390, the ioevent is matched +to a subchannel/virtqueue tuple instead. + The following flags are defined: #define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch) #define KVM_IOEVENTFD_FLAG_PIO (1 << kvm_ioeventfd_flag_nr_pio) #define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign) +#define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \ + (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify) If datamatch flag is set, the event will be signaled only if the written value to the registered address is equal to datamatch in struct kvm_ioeventfd. +For virtio-ccw devices, addr contains the subchannel id and datamatch the +virtqueue index. + 4.60 KVM_DIRTY_TLB diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 3c56ba3d80c1..74d0ff3dfd66 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -449,12 +449,15 @@ enum { kvm_ioeventfd_flag_nr_datamatch, kvm_ioeventfd_flag_nr_pio, kvm_ioeventfd_flag_nr_deassign, + kvm_ioeventfd_flag_nr_virtio_ccw_notify, kvm_ioeventfd_flag_nr_max, }; #define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch) #define KVM_IOEVENTFD_FLAG_PIO (1 << kvm_ioeventfd_flag_nr_pio) #define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign) +#define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \ + (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify) #define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 0b6fe69bb03d..020522ed9094 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -674,15 +674,24 @@ ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p) return false; } +static enum kvm_bus ioeventfd_bus_from_flags(__u32 flags) +{ + if (flags & KVM_IOEVENTFD_FLAG_PIO) + return KVM_PIO_BUS; + if (flags & KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY) + return KVM_VIRTIO_CCW_NOTIFY_BUS; + return KVM_MMIO_BUS; +} + static int kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) { - int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; - enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS; + enum kvm_bus bus_idx; struct _ioeventfd *p; struct eventfd_ctx *eventfd; int ret; + bus_idx = ioeventfd_bus_from_flags(args->flags); /* must be natural-word sized */ switch (args->len) { case 1: @@ -757,12 +766,12 @@ fail: static int kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) { - int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; - enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS; + enum kvm_bus bus_idx; struct _ioeventfd *p, *tmp; struct eventfd_ctx *eventfd; int ret = -ENOENT; + bus_idx = ioeventfd_bus_from_flags(args->flags); eventfd = eventfd_ctx_fdget(args->fd); if (IS_ERR(eventfd)) return PTR_ERR(eventfd); From 10ccaa1e7057d8a9dc3e9ce833af40ec8187b25e Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Thu, 28 Feb 2013 12:33:21 +0100 Subject: [PATCH 023/204] KVM: s390: Wire up ioeventfd. 
Enable ioeventfd support on s390 and hook up diagnose 500 virtio-ccw notifications. Signed-off-by: Cornelia Huck Signed-off-by: Marcelo Tosatti --- arch/s390/kvm/Kconfig | 1 + arch/s390/kvm/Makefile | 2 +- arch/s390/kvm/diag.c | 26 ++++++++++++++++++++++++++ arch/s390/kvm/kvm-s390.c | 1 + 4 files changed, 29 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 60f9f8ae0fc8..70b46eacf8e1 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -22,6 +22,7 @@ config KVM select PREEMPT_NOTIFIERS select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT + select HAVE_KVM_EVENTFD ---help--- Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. This should work diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 3975722bb19d..8fe9d65a4585 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -6,7 +6,7 @@ # it under the terms of the GNU General Public License (version 2 only) # as published by the Free Software Foundation. -common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o) +common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o eventfd.o) ccflags-y := -Ivirt/kvm -Iarch/s390/kvm diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index a390687feb13..1c01a9912989 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -13,6 +13,7 @@ #include #include +#include #include "kvm-s390.h" #include "trace.h" #include "trace-s390.h" @@ -104,6 +105,29 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu) return -EREMOTE; } +static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu) +{ + int ret, idx; + + /* No virtio-ccw notification? Get out quickly. */ + if (!vcpu->kvm->arch.css_support || + (vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY)) + return -EOPNOTSUPP; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + /* + * The layout is as follows: + * - gpr 2 contains the subchannel id (passed as addr) + * - gpr 3 contains the virtqueue index (passed as datamatch) + */ + ret = kvm_io_bus_write(vcpu->kvm, KVM_VIRTIO_CCW_NOTIFY_BUS, + vcpu->run->s.regs.gprs[2], + 8, &vcpu->run->s.regs.gprs[3]); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + /* kvm_io_bus_write returns -EOPNOTSUPP if it found no match. */ + return ret < 0 ? ret : 0; +} + int kvm_s390_handle_diag(struct kvm_vcpu *vcpu) { int code = (vcpu->arch.sie_block->ipb & 0xfff0000) >> 16; @@ -118,6 +142,8 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu) return __diag_time_slice_end_directed(vcpu); case 0x308: return __diag_ipl_functions(vcpu); + case 0x500: + return __diag_virtio_hypercall(vcpu); default: return -EOPNOTSUPP; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 6cae4ad647a9..33161b4a8280 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -142,6 +142,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_ONE_REG: case KVM_CAP_ENABLE_CAP: case KVM_CAP_S390_CSS_SUPPORT: + case KVM_CAP_IOEVENTFD: r = 1; break; case KVM_CAP_NR_VCPUS: From c4627c72e9c9e0fc35af2e9d612888fe4564377d Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 3 Mar 2013 20:47:11 +0100 Subject: [PATCH 024/204] KVM: nVMX: Reset RFLAGS on VM-exit Ouch, how could this work so well that far? We need to clear RFLAGS to the reset value as specified by the SDM. Particularly, IF must be off after VM-exit! 
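X86_EFLAGS_BIT1 is the always-one reserved flag bit, so the single line below sets RFLAGS to its architectural reset value of 0x00000002, which in particular clears IF and keeps interrupts masked until L1 re-enables them:

	vmx_set_rflags(vcpu, X86_EFLAGS_BIT1);	/* RFLAGS = 0x2: reserved bit 1 only */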
Reviewed-by: Gleb Natapov Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 097f5d662275..aacf6a458ae1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7434,6 +7434,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); + vmx_set_rflags(vcpu, X86_EFLAGS_BIT1); /* * Note that calling vmx_set_cr0 is important, even if cr0 hasn't * actually changed, because it depends on the current state of From 33fb20c39e98b90813b5ab2d9a0d6faa6300caca Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 6 Mar 2013 15:44:03 +0100 Subject: [PATCH 025/204] KVM: nVMX: Fix content of MSR_IA32_VMX_ENTRY/EXIT_CTLS Properly set those bits to 1 that the spec demands in case bit 55 of VMX_BASIC is 0 - like in our case. Reviewed-by: Paolo Bonzini Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/vmx.h | 4 ++++ arch/x86/kvm/vmx.c | 13 ++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index b6fbf860e398..5fb6e24f0649 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -81,6 +81,8 @@ #define VM_EXIT_LOAD_IA32_EFER 0x00200000 #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000 +#define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff + #define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002 #define VM_ENTRY_IA32E_MODE 0x00000200 #define VM_ENTRY_SMM 0x00000400 @@ -89,6 +91,8 @@ #define VM_ENTRY_LOAD_IA32_PAT 0x00004000 #define VM_ENTRY_LOAD_IA32_EFER 0x00008000 +#define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff + /* VMCS Encodings */ enum vmcs_field { VIRTUAL_PROCESSOR_ID = 0x00000000, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index aacf6a458ae1..a9d885353108 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2049,21 +2049,28 @@ static __init void nested_vmx_setup_ctls_msrs(void) PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; - /* exit controls */ - nested_vmx_exit_ctls_low = 0; + /* + * Exit controls + * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and + * 17 must be 1. + */ + nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ #ifdef CONFIG_X86_64 nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE; #else nested_vmx_exit_ctls_high = 0; #endif + nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); - nested_vmx_entry_ctls_low = 0; + /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */ + nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_entry_ctls_high &= VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; + nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, From 1a0d74e66405a795bb37a4a23ece50f8d8e5e81e Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Thu, 7 Mar 2013 14:08:07 +0100 Subject: [PATCH 026/204] KVM: nVMX: Fix setting of CR0 and CR4 in guest mode The logic for calculating the value with which we call kvm_set_cr0/4 was broken (will definitely be visible with nested unrestricted guest mode support). 
Also, we performed the check regarding CR0_ALWAYSON too early when in guest mode. What really needs to be done on both CR0 and CR4 is to mask out L1-owned bits and merge them in from L1's guest_cr0/4. In contrast, arch.cr0/4 and arch.cr0/4_guest_owned_bits contain the mangled L0+L1 state and, thus, are not suited as input. For both CRs, we can then apply the check against VMXON_CRx_ALWAYSON and refuse the update if it fails. To be fully consistent, we implement this check now also for CR4. For CR4, we move the check into vmx_set_cr4 while we keep it in handle_set_cr0. This is because the CR0 checks for vmxon vs. guest mode will diverge soon when adding unrestricted guest mode support. Finally, we have to set the shadow to the value L2 wanted to write originally. Reviewed-by: Gleb Natapov Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 48 +++++++++++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a9d885353108..260da9ac1678 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3223,7 +3223,9 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) */ if (!nested_vmx_allowed(vcpu)) return 1; - } else if (to_vmx(vcpu)->nested.vmxon) + } + if (to_vmx(vcpu)->nested.vmxon && + ((cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) return 1; vcpu->arch.cr4 = cr4; @@ -4612,34 +4614,50 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) { - if (to_vmx(vcpu)->nested.vmxon && - ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)) - return 1; - if (is_guest_mode(vcpu)) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned long orig_val = val; + /* * We get here when L2 changed cr0 in a way that did not change * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), - * but did change L0 shadowed bits. This can currently happen - * with the TS bit: L0 may want to leave TS on (for lazy fpu - * loading) while pretending to allow the guest to change it. + * but did change L0 shadowed bits. So we first calculate the + * effective cr0 value that L1 would like to write into the + * hardware. It consists of the L2-owned bits from the new + * value combined with the L1-owned bits from L1's guest_cr0. 
*/ - if (kvm_set_cr0(vcpu, (val & vcpu->arch.cr0_guest_owned_bits) | - (vcpu->arch.cr0 & ~vcpu->arch.cr0_guest_owned_bits))) + val = (val & ~vmcs12->cr0_guest_host_mask) | + (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); + + /* TODO: will have to take unrestricted guest mode into + * account */ + if ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) return 1; - vmcs_writel(CR0_READ_SHADOW, val); + + if (kvm_set_cr0(vcpu, val)) + return 1; + vmcs_writel(CR0_READ_SHADOW, orig_val); return 0; - } else + } else { + if (to_vmx(vcpu)->nested.vmxon && + ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)) + return 1; return kvm_set_cr0(vcpu, val); + } } static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) { if (is_guest_mode(vcpu)) { - if (kvm_set_cr4(vcpu, (val & vcpu->arch.cr4_guest_owned_bits) | - (vcpu->arch.cr4 & ~vcpu->arch.cr4_guest_owned_bits))) + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned long orig_val = val; + + /* analogously to handle_set_cr0 */ + val = (val & ~vmcs12->cr4_guest_host_mask) | + (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); + if (kvm_set_cr4(vcpu, val)) return 1; - vmcs_writel(CR4_READ_SHADOW, val); + vmcs_writel(CR4_READ_SHADOW, orig_val); return 0; } else return kvm_set_cr4(vcpu, val); From 744b37fb5a63d45e92e590967bae82d8ac62e950 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 5 Mar 2013 13:14:40 +0100 Subject: [PATCH 027/204] s390/kvm,gaccess: fix guest access return code handling Guest access functions like copy_to/from_guest() call __guestaddr_to_user(), which in turn calls gmap_fault() in order to translate a guest address to a user space address. In the error case __guestaddr_to_user() returns either -EFAULT or -ENOMEM. The copy_to/from_guest functions just pass these return values down to the callers. The -ENOMEM case however is problematic since there are several places which access guest memory like: rc = copy_to_guest(...); if (rc == -EFAULT) error_handling(); So in case of -ENOMEM the code assumes that the guest memory access succeeded even though it failed. This can cause guest data or state corruption. If __guestaddr_to_user() returns -ENOMEM the meaning is that a valid user space mapping exists, but there was not enough memory available when trying to build the guest mapping. In other words an out-of-memory situation occurred. For normal user space accesses an out-of-memory situation causes the page fault handler to map -ENOMEM to -EFAULT (see fixup code in do_no_context()). We need to do exactly the same for the kvm gaccess functions. So __guestaddr_to_user() should just map all error codes to -EFAULT.
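A minimal sketch of the hazard, with error_handling() standing in for whatever a caller does on failure (names are illustrative only):

	rc = copy_to_guest(vcpu, guestdest, from, n);
	if (rc == -EFAULT)	/* -ENOMEM slips through this check... */
		error_handling();
	/* ...and the caller continues as if the access had succeeded */

Once all error codes are folded into -EFAULT, the existing checks cover the out-of-memory case as well.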
Signed-off-by: Heiko Carstens Reviewed-by: Christian Borntraeger Signed-off-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti --- arch/s390/kvm/gaccess.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 4703f129e95e..84d01dd7a8e4 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -22,13 +22,16 @@ static inline void __user *__guestaddr_to_user(struct kvm_vcpu *vcpu, unsigned long guestaddr) { unsigned long prefix = vcpu->arch.sie_block->prefix; + unsigned long uaddress; if (guestaddr < 2 * PAGE_SIZE) guestaddr += prefix; else if ((guestaddr >= prefix) && (guestaddr < prefix + 2 * PAGE_SIZE)) guestaddr -= prefix; - - return (void __user *) gmap_fault(guestaddr, vcpu->arch.gmap); + uaddress = gmap_fault(guestaddr, vcpu->arch.gmap); + if (IS_ERR_VALUE(uaddress)) + uaddress = -EFAULT; + return (void __user *)uaddress; } static inline int get_guest_u64(struct kvm_vcpu *vcpu, unsigned long guestaddr, From 9e0fdb4145205bea95c2888a195c3ead2652f120 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 5 Mar 2013 13:14:41 +0100 Subject: [PATCH 028/204] s390/mm,gmap: implement gmap_translate() Implement gmap_translate() function which translates a guest absolute address to a user space process address without establishing the guest page table entries. This is useful for kvm guest address translations where no memory access is expected to happen soon (e.g. tprot exception handler). Signed-off-by: Heiko Carstens Reviewed-by: Christian Borntraeger Signed-off-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti --- arch/s390/include/asm/pgtable.h | 2 + arch/s390/mm/pgtable.c | 107 +++++++++++++++++++++++++------- 2 files changed, 87 insertions(+), 22 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 4a2930844d43..75b8750a16ff 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -759,6 +759,8 @@ void gmap_disable(struct gmap *gmap); int gmap_map_segment(struct gmap *gmap, unsigned long from, unsigned long to, unsigned long length); int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len); +unsigned long __gmap_translate(unsigned long address, struct gmap *); +unsigned long gmap_translate(unsigned long address, struct gmap *); unsigned long __gmap_fault(unsigned long address, struct gmap *); unsigned long gmap_fault(unsigned long address, struct gmap *); void gmap_discard(unsigned long from, unsigned long to, struct gmap *); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index ae44d2a34313..2accf7113d13 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -379,45 +379,108 @@ out_unmap: } EXPORT_SYMBOL_GPL(gmap_map_segment); +static unsigned long *gmap_table_walk(unsigned long address, struct gmap *gmap) +{ + unsigned long *table; + + table = gmap->table + ((address >> 53) & 0x7ff); + if (unlikely(*table & _REGION_ENTRY_INV)) + return ERR_PTR(-EFAULT); + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = table + ((address >> 42) & 0x7ff); + if (unlikely(*table & _REGION_ENTRY_INV)) + return ERR_PTR(-EFAULT); + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = table + ((address >> 31) & 0x7ff); + if (unlikely(*table & _REGION_ENTRY_INV)) + return ERR_PTR(-EFAULT); + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = table + ((address >> 20) & 0x7ff); 
+ return table; +} + +/** + * __gmap_translate - translate a guest address to a user space address + * @address: guest address + * @gmap: pointer to guest mapping meta data structure + * + * Returns user space address which corresponds to the guest address or + * -EFAULT if no such mapping exists. + * This function does not establish potentially missing page table entries. + * The mmap_sem of the mm that belongs to the address space must be held + * when this function gets called. + */ +unsigned long __gmap_translate(unsigned long address, struct gmap *gmap) +{ + unsigned long *segment_ptr, vmaddr, segment; + struct gmap_pgtable *mp; + struct page *page; + + current->thread.gmap_addr = address; + segment_ptr = gmap_table_walk(address, gmap); + if (IS_ERR(segment_ptr)) + return PTR_ERR(segment_ptr); + /* Convert the gmap address to an mm address. */ + segment = *segment_ptr; + if (!(segment & _SEGMENT_ENTRY_INV)) { + page = pfn_to_page(segment >> PAGE_SHIFT); + mp = (struct gmap_pgtable *) page->index; + return mp->vmaddr | (address & ~PMD_MASK); + } else if (segment & _SEGMENT_ENTRY_RO) { + vmaddr = segment & _SEGMENT_ENTRY_ORIGIN; + return vmaddr | (address & ~PMD_MASK); + } + return -EFAULT; +} +EXPORT_SYMBOL_GPL(__gmap_translate); + +/** + * gmap_translate - translate a guest address to a user space address + * @address: guest address + * @gmap: pointer to guest mapping meta data structure + * + * Returns user space address which corresponds to the guest address or + * -EFAULT if no such mapping exists. + * This function does not establish potentially missing page table entries. + */ +unsigned long gmap_translate(unsigned long address, struct gmap *gmap) +{ + unsigned long rc; + + down_read(&gmap->mm->mmap_sem); + rc = __gmap_translate(address, gmap); + up_read(&gmap->mm->mmap_sem); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_translate); + /* * this function is assumed to be called with mmap_sem held */ unsigned long __gmap_fault(unsigned long address, struct gmap *gmap) { - unsigned long *table, vmaddr, segment; - struct mm_struct *mm; + unsigned long *segment_ptr, vmaddr, segment; + struct vm_area_struct *vma; struct gmap_pgtable *mp; struct gmap_rmap *rmap; - struct vm_area_struct *vma; + struct mm_struct *mm; struct page *page; pgd_t *pgd; pud_t *pud; pmd_t *pmd; current->thread.gmap_addr = address; - mm = gmap->mm; - /* Walk the gmap address space page table */ - table = gmap->table + ((address >> 53) & 0x7ff); - if (unlikely(*table & _REGION_ENTRY_INV)) + segment_ptr = gmap_table_walk(address, gmap); + if (IS_ERR(segment_ptr)) return -EFAULT; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + ((address >> 42) & 0x7ff); - if (unlikely(*table & _REGION_ENTRY_INV)) - return -EFAULT; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + ((address >> 31) & 0x7ff); - if (unlikely(*table & _REGION_ENTRY_INV)) - return -EFAULT; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - table = table + ((address >> 20) & 0x7ff); - /* Convert the gmap address to an mm address. 
*/ - segment = *table; - if (likely(!(segment & _SEGMENT_ENTRY_INV))) { + segment = *segment_ptr; + if (!(segment & _SEGMENT_ENTRY_INV)) { page = pfn_to_page(segment >> PAGE_SHIFT); mp = (struct gmap_pgtable *) page->index; return mp->vmaddr | (address & ~PMD_MASK); } else if (segment & _SEGMENT_ENTRY_RO) { + mm = gmap->mm; vmaddr = segment & _SEGMENT_ENTRY_ORIGIN; vma = find_vma(mm, vmaddr); if (!vma || vma->vm_start > vmaddr) @@ -441,12 +504,12 @@ unsigned long __gmap_fault(unsigned long address, struct gmap *gmap) /* Link gmap segment table entry location to page table. */ page = pmd_page(*pmd); mp = (struct gmap_pgtable *) page->index; - rmap->entry = table; + rmap->entry = segment_ptr; spin_lock(&mm->page_table_lock); list_add(&rmap->list, &mp->mapper); spin_unlock(&mm->page_table_lock); /* Set gmap segment table entry to page table. */ - *table = pmd_val(*pmd) & PAGE_MASK; + *segment_ptr = pmd_val(*pmd) & PAGE_MASK; return vmaddr | (address & ~PMD_MASK); } return -EFAULT; From 59a1fa2d80c0d351755cb29273b2b256dc4b3a11 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 5 Mar 2013 13:14:42 +0100 Subject: [PATCH 029/204] s390/kvm,tprot: use new gmap_translate() function When out of memory, the tprot code incorrectly injected a program check for the guest, reporting an addressing exception even if the guest address was valid. Let's use the new gmap_translate() which translates a guest address to a user space address without the chance of running into an out-of-memory situation. Also make it more explicit that for -EFAULT we won't find a vma. Signed-off-by: Heiko Carstens Reviewed-by: Christian Borntraeger Signed-off-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti --- arch/s390/kvm/priv.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 0ef9894606e5..75ad91e38e8a 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -575,20 +575,13 @@ static int handle_tprot(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT) return -EOPNOTSUPP; - - /* we must resolve the address without holding the mmap semaphore. * This is ok since the userspace hypervisor is not supposed to change * the mapping while the guest queries the memory. Otherwise the guest * might crash or get wrong info anyway.
*/ - user_address = (unsigned long) __guestaddr_to_user(vcpu, address1); - down_read(&current->mm->mmap_sem); + user_address = __gmap_translate(address1, vcpu->arch.gmap); + if (IS_ERR_VALUE(user_address)) + goto out_inject; vma = find_vma(current->mm, user_address); - if (!vma) { - up_read(&current->mm->mmap_sem); - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - } - + if (!vma) + goto out_inject; vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); if (!(vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_READ)) vcpu->arch.sie_block->gpsw.mask |= (1ul << 44); @@ -597,6 +590,10 @@ static int handle_tprot(struct kvm_vcpu *vcpu) up_read(&current->mm->mmap_sem); return 0; + +out_inject: + up_read(&current->mm->mmap_sem); + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); } int kvm_s390_handle_e5(struct kvm_vcpu *vcpu) From dc5008b9bf6adb0c0a5afba6fb376a85451b2697 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 5 Mar 2013 13:14:43 +0100 Subject: [PATCH 030/204] s390/kvm: remove explicit -EFAULT return code checking on guest access Let's change to the paradigm that every return code from guest memory access functions that is not zero translates to -EFAULT and do not explicitly compare. Explicitly comparing the return value with -EFAULT has already shown to be a bit fragile. In addition this is closer to the handling of copy_to/from_user functions, which imho is in general a good idea. Also shorten the return code handling in interrupt.c a bit. Signed-off-by: Heiko Carstens Acked-by: Christian Borntraeger Signed-off-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti --- arch/s390/kvm/intercept.c | 4 +- arch/s390/kvm/interrupt.c | 241 +++++++++++--------------------------- arch/s390/kvm/priv.c | 6 +- 3 files changed, 74 insertions(+), 177 deletions(-) diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index f26ff1e31bdb..9b2204759445 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -45,7 +45,7 @@ static int handle_lctlg(struct kvm_vcpu *vcpu) do { rc = get_guest_u64(vcpu, useraddr, &vcpu->arch.sie_block->gcr[reg]); - if (rc == -EFAULT) { + if (rc) { kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); break; } @@ -79,7 +79,7 @@ static int handle_lctl(struct kvm_vcpu *vcpu) reg = reg1; do { rc = get_guest_u32(vcpu, useraddr, &val); - if (rc == -EFAULT) { + if (rc) { kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); break; } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 37116a77cb4b..5afa931aed11 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -180,7 +180,7 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, struct kvm_s390_interrupt_info *inti) { const unsigned short table[] = { 2, 4, 4, 6 }; - int rc, exception = 0; + int rc = 0; switch (inti->type) { case KVM_S390_INT_EMERGENCY: @@ -188,74 +188,38 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, vcpu->stat.deliver_emergency_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, inti->emerg.code, 0); - rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1201); - if (rc == -EFAULT) - exception = 1; - - rc = put_guest_u16(vcpu, __LC_EXT_CPU_ADDR, inti->emerg.code); - if (rc == -EFAULT) - exception = 1; - - rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - if (rc == -EFAULT) - exception = 1; - - rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, - __LC_EXT_NEW_PSW, sizeof(psw_t)); - if (rc == -EFAULT) - exception = 1; + rc = put_guest_u16(vcpu,
__LC_EXT_INT_CODE, 0x1201); + rc |= put_guest_u16(vcpu, __LC_EXT_CPU_ADDR, inti->emerg.code); + rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, + __LC_EXT_NEW_PSW, sizeof(psw_t)); break; - case KVM_S390_INT_EXTERNAL_CALL: VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call"); vcpu->stat.deliver_external_call++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, inti->extcall.code, 0); - rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1202); - if (rc == -EFAULT) - exception = 1; - - rc = put_guest_u16(vcpu, __LC_EXT_CPU_ADDR, inti->extcall.code); - if (rc == -EFAULT) - exception = 1; - - rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - if (rc == -EFAULT) - exception = 1; - - rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, - __LC_EXT_NEW_PSW, sizeof(psw_t)); - if (rc == -EFAULT) - exception = 1; + rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1202); + rc |= put_guest_u16(vcpu, __LC_EXT_CPU_ADDR, inti->extcall.code); + rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, + __LC_EXT_NEW_PSW, sizeof(psw_t)); break; - case KVM_S390_INT_SERVICE: VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x", inti->ext.ext_params); vcpu->stat.deliver_service_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, inti->ext.ext_params, 0); - rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2401); - if (rc == -EFAULT) - exception = 1; - - rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - if (rc == -EFAULT) - exception = 1; - - rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, - __LC_EXT_NEW_PSW, sizeof(psw_t)); - if (rc == -EFAULT) - exception = 1; - - rc = put_guest_u32(vcpu, __LC_EXT_PARAMS, inti->ext.ext_params); - if (rc == -EFAULT) - exception = 1; + rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2401); + rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, + __LC_EXT_NEW_PSW, sizeof(psw_t)); + rc |= put_guest_u32(vcpu, __LC_EXT_PARAMS, inti->ext.ext_params); break; - case KVM_S390_INT_VIRTIO: VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx", inti->ext.ext_params, inti->ext.ext_params2); @@ -263,34 +227,16 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, inti->ext.ext_params, inti->ext.ext_params2); - rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2603); - if (rc == -EFAULT) - exception = 1; - - rc = put_guest_u16(vcpu, __LC_EXT_CPU_ADDR, 0x0d00); - if (rc == -EFAULT) - exception = 1; - - rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - if (rc == -EFAULT) - exception = 1; - - rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, - __LC_EXT_NEW_PSW, sizeof(psw_t)); - if (rc == -EFAULT) - exception = 1; - - rc = put_guest_u32(vcpu, __LC_EXT_PARAMS, inti->ext.ext_params); - if (rc == -EFAULT) - exception = 1; - - rc = put_guest_u64(vcpu, __LC_EXT_PARAMS2, - inti->ext.ext_params2); - if (rc == -EFAULT) - exception = 1; + rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2603); + rc |= put_guest_u16(vcpu, __LC_EXT_CPU_ADDR, 0x0d00); + rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, + 
__LC_EXT_NEW_PSW, sizeof(psw_t)); + rc |= put_guest_u32(vcpu, __LC_EXT_PARAMS, inti->ext.ext_params); + rc |= put_guest_u64(vcpu, __LC_EXT_PARAMS2, + inti->ext.ext_params2); break; - case KVM_S390_SIGP_STOP: VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu stop"); vcpu->stat.deliver_stop_signal++; @@ -313,18 +259,14 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, vcpu->stat.deliver_restart_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 0, 0); - rc = copy_to_guest(vcpu, offsetof(struct _lowcore, - restart_old_psw), &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - if (rc == -EFAULT) - exception = 1; - - rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, - offsetof(struct _lowcore, restart_psw), sizeof(psw_t)); - if (rc == -EFAULT) - exception = 1; + rc = copy_to_guest(vcpu, + offsetof(struct _lowcore, restart_old_psw), + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, + offsetof(struct _lowcore, restart_psw), + sizeof(psw_t)); atomic_clear_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); break; - case KVM_S390_PROGRAM_INT: VCPU_EVENT(vcpu, 4, "interrupt: pgm check code:%x, ilc:%x", inti->pgm.code, @@ -332,24 +274,13 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, vcpu->stat.deliver_program_int++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, inti->pgm.code, 0); - rc = put_guest_u16(vcpu, __LC_PGM_INT_CODE, inti->pgm.code); - if (rc == -EFAULT) - exception = 1; - - rc = put_guest_u16(vcpu, __LC_PGM_ILC, - table[vcpu->arch.sie_block->ipa >> 14]); - if (rc == -EFAULT) - exception = 1; - - rc = copy_to_guest(vcpu, __LC_PGM_OLD_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - if (rc == -EFAULT) - exception = 1; - - rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, - __LC_PGM_NEW_PSW, sizeof(psw_t)); - if (rc == -EFAULT) - exception = 1; + rc = put_guest_u16(vcpu, __LC_PGM_INT_CODE, inti->pgm.code); + rc |= put_guest_u16(vcpu, __LC_PGM_ILC, + table[vcpu->arch.sie_block->ipa >> 14]); + rc |= copy_to_guest(vcpu, __LC_PGM_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, + __LC_PGM_NEW_PSW, sizeof(psw_t)); break; case KVM_S390_MCHK: @@ -358,24 +289,13 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, inti->mchk.cr14, inti->mchk.mcic); - rc = kvm_s390_vcpu_store_status(vcpu, - KVM_S390_STORE_STATUS_PREFIXED); - if (rc == -EFAULT) - exception = 1; - - rc = put_guest_u64(vcpu, __LC_MCCK_CODE, inti->mchk.mcic); - if (rc == -EFAULT) - exception = 1; - - rc = copy_to_guest(vcpu, __LC_MCK_OLD_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - if (rc == -EFAULT) - exception = 1; - - rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, - __LC_MCK_NEW_PSW, sizeof(psw_t)); - if (rc == -EFAULT) - exception = 1; + rc = kvm_s390_vcpu_store_status(vcpu, + KVM_S390_STORE_STATUS_PREFIXED); + rc |= put_guest_u64(vcpu, __LC_MCCK_CODE, inti->mchk.mcic); + rc |= copy_to_guest(vcpu, __LC_MCK_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, + __LC_MCK_NEW_PSW, sizeof(psw_t)); break; case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: @@ -388,67 +308,44 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, vcpu->stat.deliver_io_int++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, param0, param1); - rc = put_guest_u16(vcpu, __LC_SUBCHANNEL_ID, - inti->io.subchannel_id); - 
if (rc == -EFAULT) - exception = 1; - - rc = put_guest_u16(vcpu, __LC_SUBCHANNEL_NR, - inti->io.subchannel_nr); - if (rc == -EFAULT) - exception = 1; - - rc = put_guest_u32(vcpu, __LC_IO_INT_PARM, - inti->io.io_int_parm); - if (rc == -EFAULT) - exception = 1; - - rc = put_guest_u32(vcpu, __LC_IO_INT_WORD, - inti->io.io_int_word); - if (rc == -EFAULT) - exception = 1; - - rc = copy_to_guest(vcpu, __LC_IO_OLD_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - if (rc == -EFAULT) - exception = 1; - - rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, - __LC_IO_NEW_PSW, sizeof(psw_t)); - if (rc == -EFAULT) - exception = 1; + rc = put_guest_u16(vcpu, __LC_SUBCHANNEL_ID, + inti->io.subchannel_id); + rc |= put_guest_u16(vcpu, __LC_SUBCHANNEL_NR, + inti->io.subchannel_nr); + rc |= put_guest_u32(vcpu, __LC_IO_INT_PARM, + inti->io.io_int_parm); + rc |= put_guest_u32(vcpu, __LC_IO_INT_WORD, + inti->io.io_int_word); + rc |= copy_to_guest(vcpu, __LC_IO_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, + __LC_IO_NEW_PSW, sizeof(psw_t)); break; } default: BUG(); } - if (exception) { + if (rc) { printk("kvm: The guest lowcore is not mapped during interrupt " - "delivery, killing userspace\n"); + "delivery, killing userspace\n"); do_exit(SIGKILL); } } static int __try_deliver_ckc_interrupt(struct kvm_vcpu *vcpu) { - int rc, exception = 0; + int rc; if (psw_extint_disabled(vcpu)) return 0; if (!(vcpu->arch.sie_block->gcr[0] & 0x800ul)) return 0; - rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1004); - if (rc == -EFAULT) - exception = 1; - rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - if (rc == -EFAULT) - exception = 1; - rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, - __LC_EXT_NEW_PSW, sizeof(psw_t)); - if (rc == -EFAULT) - exception = 1; - if (exception) { + rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1004); + rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, + __LC_EXT_NEW_PSW, sizeof(psw_t)); + if (rc) { printk("kvm: The guest lowcore is not mapped during interrupt " "delivery, killing userspace\n"); do_exit(SIGKILL); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 75ad91e38e8a..34b42dc285ee 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -108,7 +108,7 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu) } rc = put_guest_u16(vcpu, useraddr, vcpu->vcpu_id); - if (rc == -EFAULT) { + if (rc) { kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); goto out; } @@ -230,7 +230,7 @@ static int handle_stfl(struct kvm_vcpu *vcpu) rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list), &facility_list, sizeof(facility_list)); - if (rc == -EFAULT) + if (rc) kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); else { VCPU_EVENT(vcpu, 5, "store facility list value %x", @@ -348,7 +348,7 @@ static int handle_stidp(struct kvm_vcpu *vcpu) } rc = put_guest_u64(vcpu, operand2, vcpu->arch.stidp_data); - if (rc == -EFAULT) { + if (rc) { kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); goto out; } From 396083a964aa4e86061d0e3449b1e0548a8197a9 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 5 Mar 2013 13:14:44 +0100 Subject: [PATCH 031/204] s390/kvm,gaccess: shorten put/get_guest code The put_guest_u*/get_guest_u* are nothing but wrappers for the regular put_user/get_user uaccess functions. 
The only difference is that before accessing user space the guest address must be translated to a user space address. Change the order of arguments for the guest access functions so they match their uaccess parts. Also remove the u* suffix, so we simply have put_guest/get_guest which will automatically use the right size dependent on pointer type of the destination/source that now must be correct. In result the same behaviour as put_user/get_user except that accesses must be aligned. Signed-off-by: Heiko Carstens Acked-by: Christian Borntraeger Signed-off-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti --- arch/s390/kvm/gaccess.h | 149 ++++++++++---------------------------- arch/s390/kvm/intercept.c | 6 +- arch/s390/kvm/interrupt.c | 52 +++++++------ arch/s390/kvm/priv.c | 22 +++--- 4 files changed, 79 insertions(+), 150 deletions(-) diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 84d01dd7a8e4..82f450ecb585 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -18,122 +18,47 @@ #include #include "kvm-s390.h" -static inline void __user *__guestaddr_to_user(struct kvm_vcpu *vcpu, - unsigned long guestaddr) +static inline void *__gptr_to_uptr(struct kvm_vcpu *vcpu, void *gptr) { unsigned long prefix = vcpu->arch.sie_block->prefix; - unsigned long uaddress; + unsigned long gaddr = (unsigned long) gptr; + unsigned long uaddr; - if (guestaddr < 2 * PAGE_SIZE) - guestaddr += prefix; - else if ((guestaddr >= prefix) && (guestaddr < prefix + 2 * PAGE_SIZE)) - guestaddr -= prefix; - uaddress = gmap_fault(guestaddr, vcpu->arch.gmap); - if (IS_ERR_VALUE(uaddress)) - uaddress = -EFAULT; - return (void __user *)uaddress; + if (gaddr < 2 * PAGE_SIZE) + gaddr += prefix; + else if ((gaddr >= prefix) && (gaddr < prefix + 2 * PAGE_SIZE)) + gaddr -= prefix; + uaddr = gmap_fault(gaddr, vcpu->arch.gmap); + if (IS_ERR_VALUE(uaddr)) + uaddr = -EFAULT; + return (void *)uaddr; } -static inline int get_guest_u64(struct kvm_vcpu *vcpu, unsigned long guestaddr, - u64 *result) -{ - void __user *uptr = __guestaddr_to_user(vcpu, guestaddr); - - BUG_ON(guestaddr & 7); - - if (IS_ERR((void __force *) uptr)) - return PTR_ERR((void __force *) uptr); - - return get_user(*result, (unsigned long __user *) uptr); -} - -static inline int get_guest_u32(struct kvm_vcpu *vcpu, unsigned long guestaddr, - u32 *result) -{ - void __user *uptr = __guestaddr_to_user(vcpu, guestaddr); - - BUG_ON(guestaddr & 3); - - if (IS_ERR((void __force *) uptr)) - return PTR_ERR((void __force *) uptr); - - return get_user(*result, (u32 __user *) uptr); -} - -static inline int get_guest_u16(struct kvm_vcpu *vcpu, unsigned long guestaddr, - u16 *result) -{ - void __user *uptr = __guestaddr_to_user(vcpu, guestaddr); - - BUG_ON(guestaddr & 1); - - if (IS_ERR(uptr)) - return PTR_ERR(uptr); - - return get_user(*result, (u16 __user *) uptr); -} - -static inline int get_guest_u8(struct kvm_vcpu *vcpu, unsigned long guestaddr, - u8 *result) -{ - void __user *uptr = __guestaddr_to_user(vcpu, guestaddr); - - if (IS_ERR((void __force *) uptr)) - return PTR_ERR((void __force *) uptr); - - return get_user(*result, (u8 __user *) uptr); -} - -static inline int put_guest_u64(struct kvm_vcpu *vcpu, unsigned long guestaddr, - u64 value) -{ - void __user *uptr = __guestaddr_to_user(vcpu, guestaddr); - - BUG_ON(guestaddr & 7); - - if (IS_ERR((void __force *) uptr)) - return PTR_ERR((void __force *) uptr); - - return put_user(value, (u64 __user *) uptr); -} - -static inline int 
put_guest_u32(struct kvm_vcpu *vcpu, unsigned long guestaddr, - u32 value) -{ - void __user *uptr = __guestaddr_to_user(vcpu, guestaddr); - - BUG_ON(guestaddr & 3); - - if (IS_ERR((void __force *) uptr)) - return PTR_ERR((void __force *) uptr); - - return put_user(value, (u32 __user *) uptr); -} - -static inline int put_guest_u16(struct kvm_vcpu *vcpu, unsigned long guestaddr, - u16 value) -{ - void __user *uptr = __guestaddr_to_user(vcpu, guestaddr); - - BUG_ON(guestaddr & 1); - - if (IS_ERR((void __force *) uptr)) - return PTR_ERR((void __force *) uptr); - - return put_user(value, (u16 __user *) uptr); -} - -static inline int put_guest_u8(struct kvm_vcpu *vcpu, unsigned long guestaddr, - u8 value) -{ - void __user *uptr = __guestaddr_to_user(vcpu, guestaddr); - - if (IS_ERR((void __force *) uptr)) - return PTR_ERR((void __force *) uptr); - - return put_user(value, (u8 __user *) uptr); -} +#define get_guest(vcpu, x, gptr) \ +({ \ + __typeof__(gptr) __uptr = __gptr_to_uptr(vcpu, gptr); \ + int __mask = sizeof(__typeof__(*(gptr))) - 1; \ + int __ret = PTR_RET(__uptr); \ + \ + if (!__ret) { \ + BUG_ON((unsigned long)__uptr & __mask); \ + __ret = get_user(x, __uptr); \ + } \ + __ret; \ +}) +#define put_guest(vcpu, x, gptr) \ +({ \ + __typeof__(gptr) __uptr = __gptr_to_uptr(vcpu, gptr); \ + int __mask = sizeof(__typeof__(*(gptr))) - 1; \ + int __ret = PTR_RET(__uptr); \ + \ + if (!__ret) { \ + BUG_ON((unsigned long)__uptr & __mask); \ + __ret = put_user(x, __uptr); \ + } \ + __ret; \ +}) static inline int __copy_to_guest_slow(struct kvm_vcpu *vcpu, unsigned long guestdest, @@ -144,7 +69,7 @@ static inline int __copy_to_guest_slow(struct kvm_vcpu *vcpu, u8 *data = from; for (i = 0; i < n; i++) { - rc = put_guest_u8(vcpu, guestdest++, *(data++)); + rc = put_guest(vcpu, *(data++), (u8 *)guestdest++); if (rc < 0) return rc; } @@ -270,7 +195,7 @@ static inline int __copy_from_guest_slow(struct kvm_vcpu *vcpu, void *to, u8 *data = to; for (i = 0; i < n; i++) { - rc = get_guest_u8(vcpu, guestsrc++, data++); + rc = get_guest(vcpu, *(data++), (u8 *)guestsrc++); if (rc < 0) return rc; } diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 9b2204759445..64744003a66e 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -43,8 +43,8 @@ static int handle_lctlg(struct kvm_vcpu *vcpu) trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr); do { - rc = get_guest_u64(vcpu, useraddr, - &vcpu->arch.sie_block->gcr[reg]); + rc = get_guest(vcpu, vcpu->arch.sie_block->gcr[reg], + (u64 *) useraddr); if (rc) { kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); break; @@ -78,7 +78,7 @@ static int handle_lctl(struct kvm_vcpu *vcpu) reg = reg1; do { - rc = get_guest_u32(vcpu, useraddr, &val); + rc = get_guest(vcpu, val, (u32 *) useraddr); if (rc) { kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); break; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 5afa931aed11..d78824b18e9d 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -188,8 +188,9 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, vcpu->stat.deliver_emergency_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, inti->emerg.code, 0); - rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1201); - rc |= put_guest_u16(vcpu, __LC_EXT_CPU_ADDR, inti->emerg.code); + rc = put_guest(vcpu, 0x1201, (u16 *)__LC_EXT_INT_CODE); + rc |= put_guest(vcpu, inti->emerg.code, + (u16 *)__LC_EXT_CPU_ADDR); rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, 
&vcpu->arch.sie_block->gpsw, sizeof(psw_t)); rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, @@ -200,8 +201,9 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, vcpu->stat.deliver_external_call++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, inti->extcall.code, 0); - rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1202); - rc |= put_guest_u16(vcpu, __LC_EXT_CPU_ADDR, inti->extcall.code); + rc = put_guest(vcpu, 0x1202, (u16 *)__LC_EXT_INT_CODE); + rc |= put_guest(vcpu, inti->extcall.code, + (u16 *)__LC_EXT_CPU_ADDR); rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, @@ -213,12 +215,13 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, vcpu->stat.deliver_service_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, inti->ext.ext_params, 0); - rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2401); + rc = put_guest(vcpu, 0x2401, (u16 *)__LC_EXT_INT_CODE); rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, __LC_EXT_NEW_PSW, sizeof(psw_t)); - rc |= put_guest_u32(vcpu, __LC_EXT_PARAMS, inti->ext.ext_params); + rc |= put_guest(vcpu, inti->ext.ext_params, + (u32 *)__LC_EXT_PARAMS); break; case KVM_S390_INT_VIRTIO: VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx", @@ -227,15 +230,16 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, inti->ext.ext_params, inti->ext.ext_params2); - rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2603); - rc |= put_guest_u16(vcpu, __LC_EXT_CPU_ADDR, 0x0d00); + rc = put_guest(vcpu, 0x2603, (u16 *)__LC_EXT_INT_CODE); + rc |= put_guest(vcpu, 0x0d00, (u16 *)__LC_EXT_CPU_ADDR); rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, __LC_EXT_NEW_PSW, sizeof(psw_t)); - rc |= put_guest_u32(vcpu, __LC_EXT_PARAMS, inti->ext.ext_params); - rc |= put_guest_u64(vcpu, __LC_EXT_PARAMS2, - inti->ext.ext_params2); + rc |= put_guest(vcpu, inti->ext.ext_params, + (u32 *)__LC_EXT_PARAMS); + rc |= put_guest(vcpu, inti->ext.ext_params2, + (u64 *)__LC_EXT_PARAMS2); break; case KVM_S390_SIGP_STOP: VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu stop"); @@ -274,9 +278,9 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, vcpu->stat.deliver_program_int++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, inti->pgm.code, 0); - rc = put_guest_u16(vcpu, __LC_PGM_INT_CODE, inti->pgm.code); - rc |= put_guest_u16(vcpu, __LC_PGM_ILC, - table[vcpu->arch.sie_block->ipa >> 14]); + rc = put_guest(vcpu, inti->pgm.code, (u16 *)__LC_PGM_INT_CODE); + rc |= put_guest(vcpu, table[vcpu->arch.sie_block->ipa >> 14], + (u16 *)__LC_PGM_ILC); rc |= copy_to_guest(vcpu, __LC_PGM_OLD_PSW, &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, @@ -291,7 +295,7 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, inti->mchk.mcic); rc = kvm_s390_vcpu_store_status(vcpu, KVM_S390_STORE_STATUS_PREFIXED); - rc |= put_guest_u64(vcpu, __LC_MCCK_CODE, inti->mchk.mcic); + rc |= put_guest(vcpu, inti->mchk.mcic, (u64 *) __LC_MCCK_CODE); rc |= copy_to_guest(vcpu, __LC_MCK_OLD_PSW, &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, @@ -308,14 +312,14 @@ static void __do_deliver_interrupt(struct kvm_vcpu 
*vcpu, vcpu->stat.deliver_io_int++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, param0, param1); - rc = put_guest_u16(vcpu, __LC_SUBCHANNEL_ID, - inti->io.subchannel_id); - rc |= put_guest_u16(vcpu, __LC_SUBCHANNEL_NR, - inti->io.subchannel_nr); - rc |= put_guest_u32(vcpu, __LC_IO_INT_PARM, - inti->io.io_int_parm); - rc |= put_guest_u32(vcpu, __LC_IO_INT_WORD, - inti->io.io_int_word); + rc = put_guest(vcpu, inti->io.subchannel_id, + (u16 *) __LC_SUBCHANNEL_ID); + rc |= put_guest(vcpu, inti->io.subchannel_nr, + (u16 *) __LC_SUBCHANNEL_NR); + rc |= put_guest(vcpu, inti->io.io_int_parm, + (u32 *) __LC_IO_INT_PARM); + rc |= put_guest(vcpu, inti->io.io_int_word, + (u32 *) __LC_IO_INT_WORD); rc |= copy_to_guest(vcpu, __LC_IO_OLD_PSW, &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, @@ -340,7 +344,7 @@ static int __try_deliver_ckc_interrupt(struct kvm_vcpu *vcpu) return 0; if (!(vcpu->arch.sie_block->gcr[0] & 0x800ul)) return 0; - rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1004); + rc = put_guest(vcpu, 0x1004, (u16 *)__LC_EXT_INT_CODE); rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 34b42dc285ee..cb07147cda73 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -41,7 +41,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu) } /* get the value */ - if (get_guest_u32(vcpu, operand2, &address)) { + if (get_guest(vcpu, address, (u32 *) operand2)) { kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); goto out; } @@ -82,7 +82,7 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu) address = address & 0x7fffe000u; /* get the value */ - if (put_guest_u32(vcpu, operand2, address)) { + if (put_guest(vcpu, address, (u32 *)operand2)) { kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); goto out; } @@ -107,7 +107,7 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu) goto out; } - rc = put_guest_u16(vcpu, useraddr, vcpu->vcpu_id); + rc = put_guest(vcpu, vcpu->vcpu_id, (u16 *)useraddr); if (rc) { kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); goto out; @@ -142,18 +142,18 @@ static int handle_tpi(struct kvm_vcpu *vcpu) * Store the two-word I/O interruption code into the * provided area. */ - put_guest_u16(vcpu, addr, inti->io.subchannel_id); - put_guest_u16(vcpu, addr + 2, inti->io.subchannel_nr); - put_guest_u32(vcpu, addr + 4, inti->io.io_int_parm); + put_guest(vcpu, inti->io.subchannel_id, (u16 *) addr); + put_guest(vcpu, inti->io.subchannel_nr, (u16 *) (addr + 2)); + put_guest(vcpu, inti->io.io_int_parm, (u32 *) (addr + 4)); } else { /* * Store the three-word I/O interruption code into * the appropriate lowcore area. 
*/ - put_guest_u16(vcpu, 184, inti->io.subchannel_id); - put_guest_u16(vcpu, 186, inti->io.subchannel_nr); - put_guest_u32(vcpu, 188, inti->io.io_int_parm); - put_guest_u32(vcpu, 192, inti->io.io_int_word); + put_guest(vcpu, inti->io.subchannel_id, (u16 *) 184); + put_guest(vcpu, inti->io.subchannel_nr, (u16 *) 186); + put_guest(vcpu, inti->io.io_int_parm, (u32 *) 188); + put_guest(vcpu, inti->io.io_int_word, (u32 *) 192); } cc = 1; } else @@ -347,7 +347,7 @@ static int handle_stidp(struct kvm_vcpu *vcpu) goto out; } - rc = put_guest_u64(vcpu, operand2, vcpu->arch.stidp_data); + rc = put_guest(vcpu, vcpu->arch.stidp_data, (u64 *)operand2); if (rc) { kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); goto out; From f9dc72e82d32cc9fe40d1dea7709d434bba2d4a9 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 5 Mar 2013 13:14:45 +0100 Subject: [PATCH 032/204] s390/kvm,gaccess: shorten copy_to/from_guest code The code can be significantly shortened. There is no functional change, except that for large (> PAGE_SIZE) copies the guest translation would be done more frequently. However, there is not a single user which does this currently. If one gets added later on this functionality can be added easily again. Signed-off-by: Heiko Carstens Reviewed-by: Christian Borntraeger Signed-off-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti --- arch/s390/kvm/gaccess.h | 292 ++++++---------------------------------- 1 file changed, 40 insertions(+), 252 deletions(-) diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 82f450ecb585..8608d7e6a334 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -18,16 +18,19 @@ #include #include "kvm-s390.h" -static inline void *__gptr_to_uptr(struct kvm_vcpu *vcpu, void *gptr) +static inline void *__gptr_to_uptr(struct kvm_vcpu *vcpu, void *gptr, + int prefixing) { unsigned long prefix = vcpu->arch.sie_block->prefix; unsigned long gaddr = (unsigned long) gptr; unsigned long uaddr; - if (gaddr < 2 * PAGE_SIZE) - gaddr += prefix; - else if ((gaddr >= prefix) && (gaddr < prefix + 2 * PAGE_SIZE)) - gaddr -= prefix; + if (prefixing) { + if (gaddr < 2 * PAGE_SIZE) + gaddr += prefix; + else if ((gaddr >= prefix) && (gaddr < prefix + 2 * PAGE_SIZE)) + gaddr -= prefix; + } uaddr = gmap_fault(gaddr, vcpu->arch.gmap); if (IS_ERR_VALUE(uaddr)) uaddr = -EFAULT; @@ -36,7 +39,7 @@ static inline void *__gptr_to_uptr(struct kvm_vcpu *vcpu, void *gptr) #define get_guest(vcpu, x, gptr) \ ({ \ - __typeof__(gptr) __uptr = __gptr_to_uptr(vcpu, gptr); \ + __typeof__(gptr) __uptr = __gptr_to_uptr(vcpu, gptr, 1);\ int __mask = sizeof(__typeof__(*(gptr))) - 1; \ int __ret = PTR_RET(__uptr); \ \ @@ -49,7 +52,7 @@ static inline void *__gptr_to_uptr(struct kvm_vcpu *vcpu, void *gptr) #define put_guest(vcpu, x, gptr) \ ({ \ - __typeof__(gptr) __uptr = __gptr_to_uptr(vcpu, gptr); \ + __typeof__(gptr) __uptr = __gptr_to_uptr(vcpu, gptr, 1);\ int __mask = sizeof(__typeof__(*(gptr))) - 1; \ int __ret = PTR_RET(__uptr); \ \ @@ -60,255 +63,40 @@ static inline void *__gptr_to_uptr(struct kvm_vcpu *vcpu, void *gptr) __ret; \ }) -static inline int __copy_to_guest_slow(struct kvm_vcpu *vcpu, - unsigned long guestdest, - void *from, unsigned long n) +static inline int __copy_guest(struct kvm_vcpu *vcpu, unsigned long to, + unsigned long from, unsigned long len, + int to_guest, int prefixing) { - int rc; - unsigned long i; - u8 *data = from; + unsigned long _len, rc; + void *uptr; - for (i = 0; i < n; i++) { - rc = put_guest(vcpu, 
*(data++), (u8 *)guestdest++); - if (rc < 0) - return rc; + while (len) { + uptr = to_guest ? (void *)to : (void *)from; + uptr = __gptr_to_uptr(vcpu, uptr, prefixing); + if (IS_ERR(uptr)) + return -EFAULT; + _len = PAGE_SIZE - ((unsigned long)uptr & (PAGE_SIZE - 1)); + _len = min(_len, len); + if (to_guest) + rc = copy_to_user(uptr, (void *)from, _len); + else + rc = copy_from_user((void *)to, uptr, _len); + if (rc) + return -EFAULT; + len -= _len; + from += _len; + to += _len; } return 0; } -static inline int __copy_to_guest_fast(struct kvm_vcpu *vcpu, - unsigned long guestdest, - void *from, unsigned long n) -{ - int r; - void __user *uptr; - unsigned long size; +#define copy_to_guest(vcpu, to, from, size) \ + __copy_guest(vcpu, to, (unsigned long)from, size, 1, 1) +#define copy_from_guest(vcpu, to, from, size) \ + __copy_guest(vcpu, (unsigned long)to, from, size, 0, 1) +#define copy_to_guest_absolute(vcpu, to, from, size) \ + __copy_guest(vcpu, to, (unsigned long)from, size, 1, 0) +#define copy_from_guest_absolute(vcpu, to, from, size) \ + __copy_guest(vcpu, (unsigned long)to, from, size, 0, 0) - if (guestdest + n < guestdest) - return -EFAULT; - - /* simple case: all within one segment table entry? */ - if ((guestdest & PMD_MASK) == ((guestdest+n) & PMD_MASK)) { - uptr = (void __user *) gmap_fault(guestdest, vcpu->arch.gmap); - - if (IS_ERR((void __force *) uptr)) - return PTR_ERR((void __force *) uptr); - - r = copy_to_user(uptr, from, n); - - if (r) - r = -EFAULT; - - goto out; - } - - /* copy first segment */ - uptr = (void __user *)gmap_fault(guestdest, vcpu->arch.gmap); - - if (IS_ERR((void __force *) uptr)) - return PTR_ERR((void __force *) uptr); - - size = PMD_SIZE - (guestdest & ~PMD_MASK); - - r = copy_to_user(uptr, from, size); - - if (r) { - r = -EFAULT; - goto out; - } - from += size; - n -= size; - guestdest += size; - - /* copy full segments */ - while (n >= PMD_SIZE) { - uptr = (void __user *)gmap_fault(guestdest, vcpu->arch.gmap); - - if (IS_ERR((void __force *) uptr)) - return PTR_ERR((void __force *) uptr); - - r = copy_to_user(uptr, from, PMD_SIZE); - - if (r) { - r = -EFAULT; - goto out; - } - from += PMD_SIZE; - n -= PMD_SIZE; - guestdest += PMD_SIZE; - } - - /* copy the tail segment */ - if (n) { - uptr = (void __user *)gmap_fault(guestdest, vcpu->arch.gmap); - - if (IS_ERR((void __force *) uptr)) - return PTR_ERR((void __force *) uptr); - - r = copy_to_user(uptr, from, n); - - if (r) - r = -EFAULT; - } -out: - return r; -} - -static inline int copy_to_guest_absolute(struct kvm_vcpu *vcpu, - unsigned long guestdest, - void *from, unsigned long n) -{ - return __copy_to_guest_fast(vcpu, guestdest, from, n); -} - -static inline int copy_to_guest(struct kvm_vcpu *vcpu, unsigned long guestdest, - void *from, unsigned long n) -{ - unsigned long prefix = vcpu->arch.sie_block->prefix; - - if ((guestdest < 2 * PAGE_SIZE) && (guestdest + n > 2 * PAGE_SIZE)) - goto slowpath; - - if ((guestdest < prefix) && (guestdest + n > prefix)) - goto slowpath; - - if ((guestdest < prefix + 2 * PAGE_SIZE) - && (guestdest + n > prefix + 2 * PAGE_SIZE)) - goto slowpath; - - if (guestdest < 2 * PAGE_SIZE) - guestdest += prefix; - else if ((guestdest >= prefix) && (guestdest < prefix + 2 * PAGE_SIZE)) - guestdest -= prefix; - - return __copy_to_guest_fast(vcpu, guestdest, from, n); -slowpath: - return __copy_to_guest_slow(vcpu, guestdest, from, n); -} - -static inline int __copy_from_guest_slow(struct kvm_vcpu *vcpu, void *to, - unsigned long guestsrc, - unsigned long n) -{ - int rc; - 
unsigned long i; - u8 *data = to; - - for (i = 0; i < n; i++) { - rc = get_guest(vcpu, *(data++), (u8 *)guestsrc++); - if (rc < 0) - return rc; - } - return 0; -} - -static inline int __copy_from_guest_fast(struct kvm_vcpu *vcpu, void *to, - unsigned long guestsrc, - unsigned long n) -{ - int r; - void __user *uptr; - unsigned long size; - - if (guestsrc + n < guestsrc) - return -EFAULT; - - /* simple case: all within one segment table entry? */ - if ((guestsrc & PMD_MASK) == ((guestsrc+n) & PMD_MASK)) { - uptr = (void __user *) gmap_fault(guestsrc, vcpu->arch.gmap); - - if (IS_ERR((void __force *) uptr)) - return PTR_ERR((void __force *) uptr); - - r = copy_from_user(to, uptr, n); - - if (r) - r = -EFAULT; - - goto out; - } - - /* copy first segment */ - uptr = (void __user *)gmap_fault(guestsrc, vcpu->arch.gmap); - - if (IS_ERR((void __force *) uptr)) - return PTR_ERR((void __force *) uptr); - - size = PMD_SIZE - (guestsrc & ~PMD_MASK); - - r = copy_from_user(to, uptr, size); - - if (r) { - r = -EFAULT; - goto out; - } - to += size; - n -= size; - guestsrc += size; - - /* copy full segments */ - while (n >= PMD_SIZE) { - uptr = (void __user *)gmap_fault(guestsrc, vcpu->arch.gmap); - - if (IS_ERR((void __force *) uptr)) - return PTR_ERR((void __force *) uptr); - - r = copy_from_user(to, uptr, PMD_SIZE); - - if (r) { - r = -EFAULT; - goto out; - } - to += PMD_SIZE; - n -= PMD_SIZE; - guestsrc += PMD_SIZE; - } - - /* copy the tail segment */ - if (n) { - uptr = (void __user *)gmap_fault(guestsrc, vcpu->arch.gmap); - - if (IS_ERR((void __force *) uptr)) - return PTR_ERR((void __force *) uptr); - - r = copy_from_user(to, uptr, n); - - if (r) - r = -EFAULT; - } -out: - return r; -} - -static inline int copy_from_guest_absolute(struct kvm_vcpu *vcpu, void *to, - unsigned long guestsrc, - unsigned long n) -{ - return __copy_from_guest_fast(vcpu, to, guestsrc, n); -} - -static inline int copy_from_guest(struct kvm_vcpu *vcpu, void *to, - unsigned long guestsrc, unsigned long n) -{ - unsigned long prefix = vcpu->arch.sie_block->prefix; - - if ((guestsrc < 2 * PAGE_SIZE) && (guestsrc + n > 2 * PAGE_SIZE)) - goto slowpath; - - if ((guestsrc < prefix) && (guestsrc + n > prefix)) - goto slowpath; - - if ((guestsrc < prefix + 2 * PAGE_SIZE) - && (guestsrc + n > prefix + 2 * PAGE_SIZE)) - goto slowpath; - - if (guestsrc < 2 * PAGE_SIZE) - guestsrc += prefix; - else if ((guestsrc >= prefix) && (guestsrc < prefix + 2 * PAGE_SIZE)) - guestsrc -= prefix; - - return __copy_from_guest_fast(vcpu, to, guestsrc, n); -slowpath: - return __copy_from_guest_slow(vcpu, to, guestsrc, n); -} -#endif +#endif /* __KVM_S390_GACCESS_H */ From 7c959e82ac331396d05e7118a48c7c1debbefdf8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 5 Mar 2013 13:14:46 +0100 Subject: [PATCH 033/204] s390/kvm: cleanup/fix handle_tpi() - add missing specification exception check - remove one level of indentation - use defines instead of magic numbers Signed-off-by: Heiko Carstens Reviewed-by: Cornelia Huck Signed-off-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti --- arch/s390/kvm/priv.c | 54 ++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index cb07147cda73..d64382c1ed61 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -129,39 +130,44 @@ static int handle_skey(struct kvm_vcpu *vcpu) static int 
handle_tpi(struct kvm_vcpu *vcpu) { - u64 addr; struct kvm_s390_interrupt_info *inti; + u64 addr; int cc; addr = kvm_s390_get_base_disp_s(vcpu); - + if (addr & 3) { + kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + goto out; + } + cc = 0; inti = kvm_s390_get_io_int(vcpu->kvm, vcpu->run->s.regs.crs[6], 0); - if (inti) { - if (addr) { - /* - * Store the two-word I/O interruption code into the - * provided area. - */ - put_guest(vcpu, inti->io.subchannel_id, (u16 *) addr); - put_guest(vcpu, inti->io.subchannel_nr, (u16 *) (addr + 2)); - put_guest(vcpu, inti->io.io_int_parm, (u32 *) (addr + 4)); - } else { - /* - * Store the three-word I/O interruption code into - * the appropriate lowcore area. - */ - put_guest(vcpu, inti->io.subchannel_id, (u16 *) 184); - put_guest(vcpu, inti->io.subchannel_nr, (u16 *) 186); - put_guest(vcpu, inti->io.io_int_parm, (u32 *) 188); - put_guest(vcpu, inti->io.io_int_word, (u32 *) 192); - } - cc = 1; - } else - cc = 0; + if (!inti) + goto no_interrupt; + cc = 1; + if (addr) { + /* + * Store the two-word I/O interruption code into the + * provided area. + */ + put_guest(vcpu, inti->io.subchannel_id, (u16 *) addr); + put_guest(vcpu, inti->io.subchannel_nr, (u16 *) (addr + 2)); + put_guest(vcpu, inti->io.io_int_parm, (u32 *) (addr + 4)); + } else { + /* + * Store the three-word I/O interruption code into + * the appropriate lowcore area. + */ + put_guest(vcpu, inti->io.subchannel_id, (u16 *) __LC_SUBCHANNEL_ID); + put_guest(vcpu, inti->io.subchannel_nr, (u16 *) __LC_SUBCHANNEL_NR); + put_guest(vcpu, inti->io.io_int_parm, (u32 *) __LC_IO_INT_PARM); + put_guest(vcpu, inti->io.io_int_word, (u32 *) __LC_IO_INT_WORD); + } kfree(inti); +no_interrupt: /* Set condition code and we're done. */ vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); vcpu->arch.sie_block->gpsw.mask |= (cc & 3ul) << 44; +out: return 0; } From 0a75ca277c9f1145df37f8bbad10aecf0049a554 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 5 Mar 2013 13:14:47 +0100 Subject: [PATCH 034/204] s390/kvm,gaccess: add address space annotations Add missing address space annotations to all put_guest()/get_guest() callers. 
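Sketch of the change at a typical call site (handle_lctl(), shown in the hunks below):

	before: rc = get_guest(vcpu, val, (u32 *) useraddr);
	after:  rc = get_guest(vcpu, val, (u32 __user *) useraddr);

The __user annotation has no effect on generated code; it only lets static checkers like sparse verify that user space pointers are never dereferenced directly but only passed to the proper accessors.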
Signed-off-by: Heiko Carstens Acked-by: Christian Borntraeger Acked-by: Martin Schwidefsky Signed-off-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger Signed-off-by: Marcelo Tosatti --- arch/s390/kvm/gaccess.h | 21 +++++++++++---------- arch/s390/kvm/intercept.c | 4 ++-- arch/s390/kvm/interrupt.c | 36 ++++++++++++++++++------------------ arch/s390/kvm/priv.c | 22 +++++++++++----------- 4 files changed, 42 insertions(+), 41 deletions(-) diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 8608d7e6a334..302e0e52b009 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -18,8 +18,9 @@ #include #include "kvm-s390.h" -static inline void *__gptr_to_uptr(struct kvm_vcpu *vcpu, void *gptr, - int prefixing) +static inline void __user *__gptr_to_uptr(struct kvm_vcpu *vcpu, + void __user *gptr, + int prefixing) { unsigned long prefix = vcpu->arch.sie_block->prefix; unsigned long gaddr = (unsigned long) gptr; @@ -34,14 +35,14 @@ static inline void *__gptr_to_uptr(struct kvm_vcpu *vcpu, void *gptr, uaddr = gmap_fault(gaddr, vcpu->arch.gmap); if (IS_ERR_VALUE(uaddr)) uaddr = -EFAULT; - return (void *)uaddr; + return (void __user *)uaddr; } #define get_guest(vcpu, x, gptr) \ ({ \ __typeof__(gptr) __uptr = __gptr_to_uptr(vcpu, gptr, 1);\ int __mask = sizeof(__typeof__(*(gptr))) - 1; \ - int __ret = PTR_RET(__uptr); \ + int __ret = PTR_RET((void __force *)__uptr); \ \ if (!__ret) { \ BUG_ON((unsigned long)__uptr & __mask); \ @@ -54,7 +55,7 @@ static inline void *__gptr_to_uptr(struct kvm_vcpu *vcpu, void *gptr, ({ \ __typeof__(gptr) __uptr = __gptr_to_uptr(vcpu, gptr, 1);\ int __mask = sizeof(__typeof__(*(gptr))) - 1; \ - int __ret = PTR_RET(__uptr); \ + int __ret = PTR_RET((void __force *)__uptr); \ \ if (!__ret) { \ BUG_ON((unsigned long)__uptr & __mask); \ @@ -68,19 +69,19 @@ static inline int __copy_guest(struct kvm_vcpu *vcpu, unsigned long to, int to_guest, int prefixing) { unsigned long _len, rc; - void *uptr; + void __user *uptr; while (len) { - uptr = to_guest ? (void *)to : (void *)from; + uptr = to_guest ? 
(void __user *)to : (void __user *)from; uptr = __gptr_to_uptr(vcpu, uptr, prefixing); - if (IS_ERR(uptr)) + if (IS_ERR((void __force *)uptr)) return -EFAULT; _len = PAGE_SIZE - ((unsigned long)uptr & (PAGE_SIZE - 1)); _len = min(_len, len); if (to_guest) - rc = copy_to_user(uptr, (void *)from, _len); + rc = copy_to_user((void __user *) uptr, (void *)from, _len); else - rc = copy_from_user((void *)to, uptr, _len); + rc = copy_from_user((void *)to, (void __user *)uptr, _len); if (rc) return -EFAULT; len -= _len; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 64744003a66e..c6ba4dfd7f1e 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -44,7 +44,7 @@ static int handle_lctlg(struct kvm_vcpu *vcpu) do { rc = get_guest(vcpu, vcpu->arch.sie_block->gcr[reg], - (u64 *) useraddr); + (u64 __user *) useraddr); if (rc) { kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); break; @@ -78,7 +78,7 @@ static int handle_lctl(struct kvm_vcpu *vcpu) reg = reg1; do { - rc = get_guest(vcpu, val, (u32 *) useraddr); + rc = get_guest(vcpu, val, (u32 __user *) useraddr); if (rc) { kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); break; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index d78824b18e9d..5c948177529e 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -188,9 +188,9 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, vcpu->stat.deliver_emergency_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, inti->emerg.code, 0); - rc = put_guest(vcpu, 0x1201, (u16 *)__LC_EXT_INT_CODE); + rc = put_guest(vcpu, 0x1201, (u16 __user *)__LC_EXT_INT_CODE); rc |= put_guest(vcpu, inti->emerg.code, - (u16 *)__LC_EXT_CPU_ADDR); + (u16 __user *)__LC_EXT_CPU_ADDR); rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, @@ -201,9 +201,9 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, vcpu->stat.deliver_external_call++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, inti->extcall.code, 0); - rc = put_guest(vcpu, 0x1202, (u16 *)__LC_EXT_INT_CODE); + rc = put_guest(vcpu, 0x1202, (u16 __user *)__LC_EXT_INT_CODE); rc |= put_guest(vcpu, inti->extcall.code, - (u16 *)__LC_EXT_CPU_ADDR); + (u16 __user *)__LC_EXT_CPU_ADDR); rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, @@ -215,13 +215,13 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, vcpu->stat.deliver_service_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, inti->ext.ext_params, 0); - rc = put_guest(vcpu, 0x2401, (u16 *)__LC_EXT_INT_CODE); + rc = put_guest(vcpu, 0x2401, (u16 __user *)__LC_EXT_INT_CODE); rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, __LC_EXT_NEW_PSW, sizeof(psw_t)); rc |= put_guest(vcpu, inti->ext.ext_params, - (u32 *)__LC_EXT_PARAMS); + (u32 __user *)__LC_EXT_PARAMS); break; case KVM_S390_INT_VIRTIO: VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx", @@ -230,16 +230,16 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, inti->ext.ext_params, inti->ext.ext_params2); - rc = put_guest(vcpu, 0x2603, (u16 *)__LC_EXT_INT_CODE); - rc |= put_guest(vcpu, 0x0d00, (u16 *)__LC_EXT_CPU_ADDR); + rc = put_guest(vcpu, 0x2603, (u16 __user 
*)__LC_EXT_INT_CODE); + rc |= put_guest(vcpu, 0x0d00, (u16 __user *)__LC_EXT_CPU_ADDR); rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, __LC_EXT_NEW_PSW, sizeof(psw_t)); rc |= put_guest(vcpu, inti->ext.ext_params, - (u32 *)__LC_EXT_PARAMS); + (u32 __user *)__LC_EXT_PARAMS); rc |= put_guest(vcpu, inti->ext.ext_params2, - (u64 *)__LC_EXT_PARAMS2); + (u64 __user *)__LC_EXT_PARAMS2); break; case KVM_S390_SIGP_STOP: VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu stop"); @@ -278,9 +278,9 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, vcpu->stat.deliver_program_int++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, inti->pgm.code, 0); - rc = put_guest(vcpu, inti->pgm.code, (u16 *)__LC_PGM_INT_CODE); + rc = put_guest(vcpu, inti->pgm.code, (u16 __user *)__LC_PGM_INT_CODE); rc |= put_guest(vcpu, table[vcpu->arch.sie_block->ipa >> 14], - (u16 *)__LC_PGM_ILC); + (u16 __user *)__LC_PGM_ILC); rc |= copy_to_guest(vcpu, __LC_PGM_OLD_PSW, &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, @@ -295,7 +295,7 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, inti->mchk.mcic); rc = kvm_s390_vcpu_store_status(vcpu, KVM_S390_STORE_STATUS_PREFIXED); - rc |= put_guest(vcpu, inti->mchk.mcic, (u64 *) __LC_MCCK_CODE); + rc |= put_guest(vcpu, inti->mchk.mcic, (u64 __user *) __LC_MCCK_CODE); rc |= copy_to_guest(vcpu, __LC_MCK_OLD_PSW, &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, @@ -313,13 +313,13 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu, trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, param0, param1); rc = put_guest(vcpu, inti->io.subchannel_id, - (u16 *) __LC_SUBCHANNEL_ID); + (u16 __user *) __LC_SUBCHANNEL_ID); rc |= put_guest(vcpu, inti->io.subchannel_nr, - (u16 *) __LC_SUBCHANNEL_NR); + (u16 __user *) __LC_SUBCHANNEL_NR); rc |= put_guest(vcpu, inti->io.io_int_parm, - (u32 *) __LC_IO_INT_PARM); + (u32 __user *) __LC_IO_INT_PARM); rc |= put_guest(vcpu, inti->io.io_int_word, - (u32 *) __LC_IO_INT_WORD); + (u32 __user *) __LC_IO_INT_WORD); rc |= copy_to_guest(vcpu, __LC_IO_OLD_PSW, &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, @@ -344,7 +344,7 @@ static int __try_deliver_ckc_interrupt(struct kvm_vcpu *vcpu) return 0; if (!(vcpu->arch.sie_block->gcr[0] & 0x800ul)) return 0; - rc = put_guest(vcpu, 0x1004, (u16 *)__LC_EXT_INT_CODE); + rc = put_guest(vcpu, 0x1004, (u16 __user *)__LC_EXT_INT_CODE); rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW, &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index d64382c1ed61..7db2ad076f31 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -42,7 +42,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu) } /* get the value */ - if (get_guest(vcpu, address, (u32 *) operand2)) { + if (get_guest(vcpu, address, (u32 __user *) operand2)) { kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); goto out; } @@ -83,7 +83,7 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu) address = address & 0x7fffe000u; /* get the value */ - if (put_guest(vcpu, address, (u32 *)operand2)) { + if (put_guest(vcpu, address, (u32 __user *)operand2)) { kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); goto out; } @@ -108,7 +108,7 @@ static int handle_store_cpu_address(struct 
kvm_vcpu *vcpu) goto out; } - rc = put_guest(vcpu, vcpu->vcpu_id, (u16 *)useraddr); + rc = put_guest(vcpu, vcpu->vcpu_id, (u16 __user *)useraddr); if (rc) { kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); goto out; @@ -149,18 +149,18 @@ static int handle_tpi(struct kvm_vcpu *vcpu) * Store the two-word I/O interruption code into the * provided area. */ - put_guest(vcpu, inti->io.subchannel_id, (u16 *) addr); - put_guest(vcpu, inti->io.subchannel_nr, (u16 *) (addr + 2)); - put_guest(vcpu, inti->io.io_int_parm, (u32 *) (addr + 4)); + put_guest(vcpu, inti->io.subchannel_id, (u16 __user *) addr); + put_guest(vcpu, inti->io.subchannel_nr, (u16 __user *) (addr + 2)); + put_guest(vcpu, inti->io.io_int_parm, (u32 __user *) (addr + 4)); } else { /* * Store the three-word I/O interruption code into * the appropriate lowcore area. */ - put_guest(vcpu, inti->io.subchannel_id, (u16 *) __LC_SUBCHANNEL_ID); - put_guest(vcpu, inti->io.subchannel_nr, (u16 *) __LC_SUBCHANNEL_NR); - put_guest(vcpu, inti->io.io_int_parm, (u32 *) __LC_IO_INT_PARM); - put_guest(vcpu, inti->io.io_int_word, (u32 *) __LC_IO_INT_WORD); + put_guest(vcpu, inti->io.subchannel_id, (u16 __user *) __LC_SUBCHANNEL_ID); + put_guest(vcpu, inti->io.subchannel_nr, (u16 __user *) __LC_SUBCHANNEL_NR); + put_guest(vcpu, inti->io.io_int_parm, (u32 __user *) __LC_IO_INT_PARM); + put_guest(vcpu, inti->io.io_int_word, (u32 __user *) __LC_IO_INT_WORD); } kfree(inti); no_interrupt: @@ -353,7 +353,7 @@ static int handle_stidp(struct kvm_vcpu *vcpu) goto out; } - rc = put_guest(vcpu, vcpu->arch.stidp_data, (u64 *)operand2); + rc = put_guest(vcpu, vcpu->arch.stidp_data, (u64 __user *)operand2); if (rc) { kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); goto out; From 1044b0303464788820984a5b01c0a81860dce749 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 6 Mar 2013 16:05:07 +0900 Subject: [PATCH 035/204] KVM: MMU: Fix and clean up for_each_gfn_* macros The expression (sp)->gfn should not be expanded using @gfn. Although no user of these macros passes a string other than gfn now, this should be fixed before anyone sees strange errors. 
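As a hypothetical example of such a strange error: the preprocessor substitutes every occurrence of a macro parameter, including tokens that follow '->', so with the old definition a caller using a different variable name

	gfn_t base_gfn = sp->gfn;
	for_each_gfn_sp(kvm, sp, base_gfn)
		...;

would have the test (sp)->gfn != (gfn) expand to (sp)->base_gfn != (base_gfn) and fail to compile with a confusing complaint about a nonexistent member. After renaming the parameters to _kvm/_sp/_gfn, the member name gfn is no longer a parameter and is never substituted.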
Note: ignored the following checkpatch errors: ERROR: Macros with complex values should be enclosed in parenthesis ERROR: trailing statements should be on next line Reviewed-by: Gleb Natapov Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 956ca358108a..3e4822b449da 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1644,16 +1644,14 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); -#define for_each_gfn_sp(kvm, sp, gfn) \ - hlist_for_each_entry(sp, \ - &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ - if ((sp)->gfn != (gfn)) {} else +#define for_each_gfn_sp(_kvm, _sp, _gfn) \ + hlist_for_each_entry(_sp, \ + &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ + if ((_sp)->gfn != (_gfn)) {} else -#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn) \ - hlist_for_each_entry(sp, \ - &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ - if ((sp)->gfn != (gfn) || (sp)->role.direct || \ - (sp)->role.invalid) {} else +#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ + for_each_gfn_sp(_kvm, _sp, _gfn) \ + if ((_sp)->role.direct || (_sp)->role.invalid) {} else /* @sp->gfn should be write-protected at the call site */ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, From 945315b9dbbe102bb3393a34ea4a10fb2a5ff303 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 6 Mar 2013 16:05:52 +0900 Subject: [PATCH 036/204] KVM: MMU: Use list_for_each_entry_safe in kvm_mmu_commit_zap_page() We are traversing the linked list, invalid_list, deleting each entry by kvm_mmu_free_page(). _safe version is there for such a case. Reviewed-by: Gleb Natapov Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3e4822b449da..0f42645a063c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2087,7 +2087,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list) { - struct kvm_mmu_page *sp; + struct kvm_mmu_page *sp, *nsp; if (list_empty(invalid_list)) return; @@ -2104,11 +2104,10 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, */ kvm_flush_remote_tlbs(kvm); - do { - sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); + list_for_each_entry_safe(sp, nsp, invalid_list, link) { WARN_ON(!sp->role.invalid || sp->root_count); kvm_mmu_free_page(sp); - } while (!list_empty(invalid_list)); + } } /* From 5da596078f915a62e39a20e582308eab91b88c9a Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 6 Mar 2013 16:06:58 +0900 Subject: [PATCH 037/204] KVM: MMU: Introduce a helper function for FIFO zapping Make the code for zapping the oldest mmu page, placed at the tail of the active list, a separate function. 
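With the helper factored out, every FIFO-zapping site collapses to the same pattern, roughly (a sketch; the exact code is in the diff below):

	while (/* above the page limit */) {
		if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
			break;		/* active list is empty */
	}
	kvm_mmu_commit_zap_page(kvm, &invalid_list);

Returning false when active_mmu_pages is empty lets the callers drop their separate list_empty() checks.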
Reviewed-by: Gleb Natapov Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 55 +++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 32 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0f42645a063c..fdacabba6f62 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2110,6 +2110,21 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, } } +static bool prepare_zap_oldest_mmu_page(struct kvm *kvm, + struct list_head *invalid_list) +{ + struct kvm_mmu_page *sp; + + if (list_empty(&kvm->arch.active_mmu_pages)) + return false; + + sp = list_entry(kvm->arch.active_mmu_pages.prev, + struct kvm_mmu_page, link); + kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); + + return true; +} + /* * Changing the number of mmu pages allocated to the vm * Note: if goal_nr_mmu_pages is too small, you will get dead lock @@ -2117,23 +2132,15 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) { LIST_HEAD(invalid_list); - /* - * If we set the number of mmu pages to be smaller be than the - * number of actived pages , we must to free some mmu pages before we - * change the value - */ spin_lock(&kvm->mmu_lock); if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { - while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages && - !list_empty(&kvm->arch.active_mmu_pages)) { - struct kvm_mmu_page *page; + /* Need to free some mmu pages to achieve the goal. */ + while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) + if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list)) + break; - page = container_of(kvm->arch.active_mmu_pages.prev, - struct kvm_mmu_page, link); - kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); - } kvm_mmu_commit_zap_page(kvm, &invalid_list); goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; } @@ -4007,13 +4014,10 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { LIST_HEAD(invalid_list); - while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES && - !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { - struct kvm_mmu_page *sp; + while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) { + if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list)) + break; - sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, - struct kvm_mmu_page, link); - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); ++vcpu->kvm->stat.mmu_recycled; } kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); @@ -4182,19 +4186,6 @@ restart: spin_unlock(&kvm->mmu_lock); } -static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, - struct list_head *invalid_list) -{ - struct kvm_mmu_page *page; - - if (list_empty(&kvm->arch.active_mmu_pages)) - return; - - page = container_of(kvm->arch.active_mmu_pages.prev, - struct kvm_mmu_page, link); - kvm_mmu_prepare_zap_page(kvm, page, invalid_list); -} - static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) { struct kvm *kvm; @@ -4229,7 +4220,7 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list); + prepare_zap_oldest_mmu_page(kvm, &invalid_list); kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); From 3a08a8f9f0936e182d387afd85fdc5d303381521 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Mon, 4 Mar 2013 23:32:07 +0530 Subject: [PATCH 038/204] kvm: Record the preemption status of vcpus using preempt 
notifiers Note that we mark a vcpu as preempted only when its task state was TASK_RUNNING during preemption. Thanks Jiannan, Avi for preemption notifier ideas. Thanks Gleb, PeterZ for their precious suggestions. Thanks Srikar for an idea on avoiding rcu lock while checking task state that improved overcommit numbers. Reviewed-by: Chegu Vinod Reviewed-by: Marcelo Tosatti Signed-off-by: Raghavendra K T Signed-off-by: Gleb Natapov --- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9fa13ebc3381..0f4941a9c9c8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -253,6 +253,7 @@ struct kvm_vcpu { bool dy_eligible; } spin_loop; #endif + bool preempted; struct kvm_vcpu_arch arch; }; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index faf05bddd131..470f2bc8205a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -244,6 +244,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) kvm_vcpu_set_in_spin_loop(vcpu, false); kvm_vcpu_set_dy_eligible(vcpu, false); + vcpu->preempted = false; r = kvm_arch_vcpu_init(vcpu); if (r < 0) @@ -2880,6 +2881,8 @@ struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) static void kvm_sched_in(struct preempt_notifier *pn, int cpu) { struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); + if (vcpu->preempted) + vcpu->preempted = false; kvm_arch_vcpu_load(vcpu, cpu); } @@ -2889,6 +2892,8 @@ static void kvm_sched_out(struct preempt_notifier *pn, { struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); + if (current->state == TASK_RUNNING) + vcpu->preempted = true; kvm_arch_vcpu_put(vcpu); } From 7bc7ae25b1438bb9fe1f176b951d758789847640 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Mon, 4 Mar 2013 23:32:27 +0530 Subject: [PATCH 039/204] kvm: Iterate over only vcpus that are preempted This helps in filtering out the eligible candidates further and thus potentially helps in quickly allowing preempted lockholders to run. Note that if a vcpu was spinning during preemption, we filter it by checking whether it was preempted due to pause loop exit.
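The resulting candidate loop in kvm_vcpu_on_spin() then looks roughly like this (a sketch; see the diff below for the exact placement):

	kvm_for_each_vcpu(i, vcpu, kvm) {
		...
		if (!ACCESS_ONCE(vcpu->preempted))
			continue;	/* still running, unlikely to be a preempted lock holder */
		if (vcpu == me)
			continue;
		...			/* existing eligibility checks and yield_to() */
	}

A plain ACCESS_ONCE() read is sufficient because the flag is only a heuristic; a stale value merely costs one boosting opportunity.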
Reviewed-by: Chegu Vinod Reviewed-by: Marcelo Tosatti Signed-off-by: Raghavendra K T Signed-off-by: Gleb Natapov --- virt/kvm/kvm_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 470f2bc8205a..ff7154188b5f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1768,6 +1768,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) continue; } else if (pass && i > last_boosted_vcpu) break; + if (!ACCESS_ONCE(vcpu->preempted)) + continue; if (vcpu == me) continue; if (waitqueue_active(&vcpu->wq)) From 0fa24ce3f57144e390a1566774c23434975a52a9 Mon Sep 17 00:00:00 2001 From: Ioan Orghici Date: Sun, 10 Mar 2013 15:46:00 +0200 Subject: [PATCH 040/204] kvm: remove cast for kmalloc return value Signed-off-by: Ioan Orghici Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 260da9ac1678..473a5fe7e006 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5286,8 +5286,7 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) } /* Create a new VMCS */ - item = (struct vmcs02_list *) - kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL); + item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL); if (!item) return NULL; item->vmcs02.vmcs = alloc_vmcs(); From 03ba32cae66e3798d697e582633af2c7dd6907e5 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 11 Mar 2013 23:10:24 -0300 Subject: [PATCH 041/204] VMX: x86: handle host TSC calibration failure If the host TSC calibration fails, tsc_khz is zero (see tsc_init.c). Handle such case properly in KVM (instead of dividing by zero). https://bugzilla.redhat.com/show_bug.cgi?id=859282 Signed-off-by: Marcelo Tosatti Signed-off-by: Gleb Natapov --- arch/x86/kvm/x86.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 35b491229c3a..b67985af1753 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1079,6 +1079,10 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) u32 thresh_lo, thresh_hi; int use_scaling = 0; + /* tsc_khz can be zero if TSC calibration fails */ + if (this_tsc_khz == 0) + return; + /* Compute a scale to convert nanoseconds in TSC cycles */ kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, &vcpu->arch.virtual_tsc_shift, @@ -1156,20 +1160,23 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) ns = get_kernel_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; - /* n.b - signed multiplication and division required */ - usdiff = data - kvm->arch.last_tsc_write; + if (vcpu->arch.virtual_tsc_khz) { + /* n.b - signed multiplication and division required */ + usdiff = data - kvm->arch.last_tsc_write; #ifdef CONFIG_X86_64 - usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; + usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; #else - /* do_div() only does unsigned */ - asm("idivl %2; xor %%edx, %%edx" - : "=A"(usdiff) - : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); + /* do_div() only does unsigned */ + asm("idivl %2; xor %%edx, %%edx" + : "=A"(usdiff) + : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); #endif - do_div(elapsed, 1000); - usdiff -= elapsed; - if (usdiff < 0) - usdiff = -usdiff; + do_div(elapsed, 1000); + usdiff -= elapsed; + if (usdiff < 0) + usdiff = -usdiff; + } else + usdiff = USEC_PER_SEC; /* disable TSC match window below */ /* * Special case: TSC write with a small delta (1 second) of 
virtual From 57f252f22908535e04d520f3833a6e3116eb159d Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 12 Mar 2013 10:20:24 +0100 Subject: [PATCH 042/204] KVM: x86: Drop unused return code from VCPU reset callback Neither vmx nor svm nor the common part may generate an error on kvm_vcpu_reset. So drop the return code. Reviewed-by: Paolo Bonzini Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 4 +--- arch/x86/kvm/vmx.c | 7 +------ arch/x86/kvm/x86.c | 15 ++++++--------- 4 files changed, 9 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 635a74d22409..348d85965ead 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -643,7 +643,7 @@ struct kvm_x86_ops { /* Create, but do not attach this VCPU */ struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); void (*vcpu_free)(struct kvm_vcpu *vcpu); - int (*vcpu_reset)(struct kvm_vcpu *vcpu); + void (*vcpu_reset)(struct kvm_vcpu *vcpu); void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e1b1ce21bc00..907e4280116d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1191,7 +1191,7 @@ static void init_vmcb(struct vcpu_svm *svm) enable_gif(svm); } -static int svm_vcpu_reset(struct kvm_vcpu *vcpu) +static void svm_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); u32 dummy; @@ -1207,8 +1207,6 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu) kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); kvm_register_write(vcpu, VCPU_REGS_RDX, eax); - - return 0; } static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 473a5fe7e006..f588171be177 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4100,11 +4100,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) return 0; } -static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) +static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 msr; - int ret; vmx->rmode.vm86_active = 0; @@ -4195,10 +4194,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) update_exception_bitmap(&vmx->vcpu); vpid_sync_context(vmx); - - ret = 0; - - return ret; } /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b67985af1753..fadd5a750476 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -162,7 +162,7 @@ u64 __read_mostly host_xcr0; static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); -static int kvm_vcpu_reset(struct kvm_vcpu *vcpu); +static void kvm_vcpu_reset(struct kvm_vcpu *vcpu); static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) { @@ -5858,9 +5858,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) pr_debug("vcpu %d received sipi with vector # %x\n", vcpu->vcpu_id, vcpu->arch.sipi_vector); kvm_lapic_reset(vcpu); - r = kvm_vcpu_reset(vcpu); - if (r) - return r; + kvm_vcpu_reset(vcpu); vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } @@ -6486,9 +6484,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) r = vcpu_load(vcpu); if (r) return r; - r = kvm_vcpu_reset(vcpu); - if (r == 0) - r = kvm_mmu_setup(vcpu); + kvm_vcpu_reset(vcpu); + r = kvm_mmu_setup(vcpu); vcpu_put(vcpu); return r; @@ -6525,7 +6522,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_x86_ops->vcpu_free(vcpu); } -static int kvm_vcpu_reset(struct kvm_vcpu *vcpu) +static void 
kvm_vcpu_reset(struct kvm_vcpu *vcpu) { atomic_set(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = 0; @@ -6552,7 +6549,7 @@ static int kvm_vcpu_reset(struct kvm_vcpu *vcpu) vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; - return kvm_x86_ops->vcpu_reset(vcpu); + kvm_x86_ops->vcpu_reset(vcpu); } int kvm_arch_hardware_enable(void *garbage) From 5d218814328da91a27e982748443e7e375e11396 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 12 Mar 2013 22:36:43 -0300 Subject: [PATCH 043/204] KVM: MMU: make kvm_mmu_available_pages robust against n_used_mmu_pages > n_max_mmu_pages As noticed by Ulrich Obergfell, the mmu counters are for beancounting purposes only - so n_used_mmu_pages and n_max_mmu_pages could be relaxed (example: before f0f5933a1626c8df7b), resulting in n_used_mmu_pages > n_max_mmu_pages. Make the code robust against n_used_mmu_pages > n_max_mmu_pages. Reviewed-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti Signed-off-by: Gleb Natapov --- arch/x86/kvm/mmu.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 69871080e866..3b1ad0049ea4 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -57,8 +57,11 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { - return kvm->arch.n_max_mmu_pages - - kvm->arch.n_used_mmu_pages; + if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) + return kvm->arch.n_max_mmu_pages - + kvm->arch.n_used_mmu_pages; + + return 0; } static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) From 66450a21f99636af4fafac2afd33f1a40631bc3a Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 13 Mar 2013 12:42:34 +0100 Subject: [PATCH 044/204] KVM: x86: Rework INIT and SIPI handling A VCPU sending INIT or SIPI to some other VCPU races for setting the remote VCPU's mp_state. When we were unlucky, KVM_MP_STATE_INIT_RECEIVED was overwritten by kvm_emulate_halt and, thus, got lost. This introduces APIC events for those two signals, keeping them in kvm_apic until kvm_apic_accept_events is run in the target vcpu context. kvm_apic_has_events reports to kvm_arch_vcpu_runnable whether there are pending events and thus whether vcpu blocking should end. The patch comes with the side effect of effectively obsoleting KVM_MP_STATE_SIPI_RECEIVED. We still accept it from user space, but immediately translate it to KVM_MP_STATE_INIT_RECEIVED + KVM_APIC_SIPI. The vcpu itself will no longer enter the KVM_MP_STATE_SIPI_RECEIVED state. That also means we no longer exit to user space after receiving a SIPI event. Furthermore, we already reset the VCPU on INIT, only fixing up the code segment later on when SIPI arrives. Moreover, we fix INIT handling for the BSP: it never enters wait-for-SIPI but directly starts over on INIT.
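At its core, the new mechanism is a flag-plus-payload handoff between the sending and the receiving vcpu; condensed from the diff below:

	/* sender, running in another vcpu's context */
	apic->sipi_vector = vector;
	smp_wmb();				/* publish the vector before the flag */
	set_bit(KVM_APIC_SIPI, &apic->pending_events);
	kvm_make_request(KVM_REQ_EVENT, vcpu);
	kvm_vcpu_kick(vcpu);

	/* receiver, in kvm_apic_accept_events() on the target vcpu */
	if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events)) {
		smp_rmb();			/* pairs with the sender's smp_wmb() */
		kvm_vcpu_deliver_sipi_vector(vcpu, apic->sipi_vector);
	}

The smp_wmb()/smp_rmb() pairing guarantees the receiver never sees the bit set while still reading a stale sipi_vector.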
Tested-by: Paolo Bonzini Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/include/asm/kvm_host.h | 3 +- arch/x86/kvm/lapic.c | 48 ++++++++++++++++++++++----- arch/x86/kvm/lapic.h | 11 +++++++ arch/x86/kvm/svm.c | 6 ---- arch/x86/kvm/vmx.c | 12 ++----- arch/x86/kvm/x86.c | 58 +++++++++++++++++++++------------ 6 files changed, 93 insertions(+), 45 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 348d85965ead..ef7f4a5cf8c7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -345,7 +345,6 @@ struct kvm_vcpu_arch { unsigned long apic_attention; int32_t apic_arb_prio; int mp_state; - int sipi_vector; u64 ia32_misc_enable_msr; bool tpr_access_reporting; @@ -819,6 +818,7 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); +void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); @@ -1002,6 +1002,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); +void kvm_vcpu_reset(struct kvm_vcpu *vcpu); void kvm_define_shared_msr(unsigned index, u32 msr); void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 02b51dd4e4ad..a8e9369f41c5 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -731,7 +731,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_INIT: if (!trig_mode || level) { result = 1; - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + /* assumes that there are only KVM_APIC_INIT/SIPI */ + apic->pending_events = (1UL << KVM_APIC_INIT); + /* make sure pending_events is visible before sending + * the request */ + smp_wmb(); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } else { @@ -743,13 +747,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_STARTUP: apic_debug("SIPI to vcpu %d vector 0x%02x\n", vcpu->vcpu_id, vector); - if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { - result = 1; - vcpu->arch.sipi_vector = vector; - vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; - kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_vcpu_kick(vcpu); - } + result = 1; + apic->sipi_vector = vector; + /* make sure sipi_vector is visible for the receiver */ + smp_wmb(); + set_bit(KVM_APIC_SIPI, &apic->pending_events); + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); break; case APIC_DM_EXTINT: @@ -1860,6 +1864,34 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) addr); } +void kvm_apic_accept_events(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + unsigned int sipi_vector; + + if (!kvm_vcpu_has_lapic(vcpu)) + return; + + if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) { + kvm_lapic_reset(vcpu); + kvm_vcpu_reset(vcpu); + if (kvm_vcpu_is_bsp(apic->vcpu)) + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + else + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + } + if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events) && + vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { + /* evaluate pending_events before reading the 
vector */ + smp_rmb(); + sipi_vector = apic->sipi_vector; + pr_debug("vcpu %d received sipi with vector # %x\n", + vcpu->vcpu_id, sipi_vector); + kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector); + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + } +} + void kvm_lapic_init(void) { /* do not patch jump label more than once per second */ diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 1676d34ddb4e..2c721b986eec 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -5,6 +5,9 @@ #include +#define KVM_APIC_INIT 0 +#define KVM_APIC_SIPI 1 + struct kvm_timer { struct hrtimer timer; s64 period; /* unit: ns */ @@ -32,6 +35,8 @@ struct kvm_lapic { void *regs; gpa_t vapic_addr; struct page *vapic_page; + unsigned long pending_events; + unsigned int sipi_vector; }; int kvm_create_lapic(struct kvm_vcpu *vcpu); void kvm_free_lapic(struct kvm_vcpu *vcpu); @@ -39,6 +44,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu); int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); +void kvm_apic_accept_events(struct kvm_vcpu *vcpu); void kvm_lapic_reset(struct kvm_vcpu *vcpu); u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); @@ -158,4 +164,9 @@ void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, u64 *eoi_bitmap); +static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.apic->pending_events; +} + #endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 907e4280116d..7219a4012a0e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1199,12 +1199,6 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu) init_vmcb(svm); - if (!kvm_vcpu_is_bsp(vcpu)) { - kvm_rip_write(vcpu, 0); - svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; - svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; - } - kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); kvm_register_write(vcpu, VCPU_REGS_RDX, eax); } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f588171be177..af1ffaf20892 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4119,12 +4119,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx_segment_cache_clear(vmx); seg_setup(VCPU_SREG_CS); - if (kvm_vcpu_is_bsp(&vmx->vcpu)) - vmcs_write16(GUEST_CS_SELECTOR, 0xf000); - else { - vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); - vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); - } + vmcs_write16(GUEST_CS_SELECTOR, 0xf000); seg_setup(VCPU_SREG_DS); seg_setup(VCPU_SREG_ES); @@ -4147,10 +4142,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_SYSENTER_EIP, 0); vmcs_writel(GUEST_RFLAGS, 0x02); - if (kvm_vcpu_is_bsp(&vmx->vcpu)) - kvm_rip_write(vcpu, 0xfff0); - else - kvm_rip_write(vcpu, 0); + kvm_rip_write(vcpu, 0xfff0); vmcs_writel(GUEST_GDTR_BASE, 0); vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fadd5a750476..61a5bb60af86 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -162,8 +162,6 @@ u64 __read_mostly host_xcr0; static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); -static void kvm_vcpu_reset(struct kvm_vcpu *vcpu); - static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) { int i; @@ -2830,10 +2828,9 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); events->nmi.pad = 0; - 
events->sipi_vector = vcpu->arch.sipi_vector; + events->sipi_vector = 0; /* never valid when reporting to user space */ events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING - | KVM_VCPUEVENT_VALID_SIPI_VECTOR | KVM_VCPUEVENT_VALID_SHADOW); memset(&events->reserved, 0, sizeof(events->reserved)); } @@ -2864,8 +2861,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.nmi_pending = events->nmi.pending; kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); - if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) - vcpu->arch.sipi_vector = events->sipi_vector; + if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && + kvm_vcpu_has_lapic(vcpu)) + vcpu->arch.apic->sipi_vector = events->sipi_vector; kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -5720,6 +5718,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { + kvm_apic_accept_events(vcpu); + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { + r = 1; + goto out; + } + inject_pending_event(vcpu); /* enable NMI/IRQ window open exits if needed */ @@ -5854,14 +5858,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) int r; struct kvm *kvm = vcpu->kvm; - if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { - pr_debug("vcpu %d received sipi with vector # %x\n", - vcpu->vcpu_id, vcpu->arch.sipi_vector); - kvm_lapic_reset(vcpu); - kvm_vcpu_reset(vcpu); - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - } - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); r = vapic_enter(vcpu); if (r) { @@ -5878,8 +5874,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) - { + if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) { + kvm_apic_accept_events(vcpu); switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: vcpu->arch.mp_state = @@ -5887,7 +5883,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) case KVM_MP_STATE_RUNNABLE: vcpu->arch.apf.halted = false; break; - case KVM_MP_STATE_SIPI_RECEIVED: + case KVM_MP_STATE_INIT_RECEIVED: + break; default: r = -EINTR; break; @@ -6022,6 +6019,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { kvm_vcpu_block(vcpu); + kvm_apic_accept_events(vcpu); clear_bit(KVM_REQ_UNHALT, &vcpu->requests); r = -EAGAIN; goto out; @@ -6178,6 +6176,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { + kvm_apic_accept_events(vcpu); mp_state->mp_state = vcpu->arch.mp_state; return 0; } @@ -6185,7 +6184,15 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - vcpu->arch.mp_state = mp_state->mp_state; + if (!kvm_vcpu_has_lapic(vcpu) && + mp_state->mp_state != KVM_MP_STATE_RUNNABLE) + return -EINVAL; + + if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); + } else + vcpu->arch.mp_state = mp_state->mp_state; kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; } @@ -6522,7 +6529,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_x86_ops->vcpu_free(vcpu); } -static void kvm_vcpu_reset(struct kvm_vcpu *vcpu) +void kvm_vcpu_reset(struct kvm_vcpu *vcpu) { atomic_set(&vcpu->arch.nmi_queued, 0); 
vcpu->arch.nmi_pending = 0; @@ -6552,6 +6559,17 @@ static void kvm_vcpu_reset(struct kvm_vcpu *vcpu) kvm_x86_ops->vcpu_reset(vcpu); } +void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector) +{ + struct kvm_segment cs; + + kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); + cs.selector = vector << 8; + cs.base = vector << 12; + kvm_set_segment(vcpu, &cs, VCPU_SREG_CS); + kvm_rip_write(vcpu, 0); +} + int kvm_arch_hardware_enable(void *garbage) { struct kvm *kvm; @@ -6995,7 +7013,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted) || !list_empty_careful(&vcpu->async_pf.done) - || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED + || kvm_apic_has_events(vcpu) || atomic_read(&vcpu->arch.nmi_queued) || (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)); From eabeaaccfca0ed61b8e00a09b8cfa703c4f11b59 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 13 Mar 2013 11:30:50 +0100 Subject: [PATCH 045/204] KVM: nVMX: Clean up and fix pin-based execution controls Only interrupt and NMI exiting are mandatory for KVM to work and can thus be exposed to the guest unconditionally; virtual NMI exiting is optional, so we must not advertise it unless the host supports it. While at it, introduce the symbolic constant PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR. Reviewed-by: Paolo Bonzini Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/include/asm/vmx.h | 2 ++ arch/x86/kvm/vmx.c | 10 ++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 5fb6e24f0649..3c9f455bacee 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -71,6 +71,8 @@ #define PIN_BASED_NMI_EXITING 0x00000008 #define PIN_BASED_VIRTUAL_NMIS 0x00000020 +#define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x00000016 + #define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002 #define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 #define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index af1ffaf20892..8eaabfb20232 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2040,14 +2040,16 @@ static __init void nested_vmx_setup_ctls_msrs(void) */ /* pin-based controls */ + rdmsr(MSR_IA32_VMX_PINBASED_CTLS, + nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high); /* * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR. */ - nested_vmx_pinbased_ctls_low = 0x16 ; - nested_vmx_pinbased_ctls_high = 0x16 | - PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | - PIN_BASED_VIRTUAL_NMIS; + nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | + PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; + nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; /* * Exit controls From c18911a23ce1dec27fa3325b50587de2569d26f8 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 13 Mar 2013 16:06:41 +0100 Subject: [PATCH 046/204] KVM: nVMX: Provide EFER.LMA saving support We will need EFER.LMA saving to provide unrestricted guest mode. All that is missing for this is picking up EFER.LMA from VM_ENTRY_CONTROLS on L2->L1 switches. If the host does not support EFER.LMA saving, no change is performed; otherwise we properly emulate for L1 what the hardware does for L0. Advertise the support, depending on the host feature.
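The emulation itself boils down to one read-modify-write of vmcs12's entry controls on the L2->L1 switch (see prepare_vmcs12() in the diff below):

	vmcs12->vm_entry_controls =
		(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
		(vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE);

Every bit L1 wrote is preserved; only the IA32e-mode bit, which mirrors the guest's EFER.LMA, is refreshed from what the hardware recorded while running L2.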
Reviewed-by: Paolo Bonzini Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/include/asm/vmx.h | 2 ++ arch/x86/kvm/vmx.c | 13 ++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 3c9f455bacee..056bda586a45 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -95,6 +95,8 @@ #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff +#define VMX_MISC_SAVE_EFER_LMA 0x00000020 + /* VMCS Encodings */ enum vmcs_field { VIRTUAL_PROCESSOR_ID = 0x00000000, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8eaabfb20232..02f8c32b9b08 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2022,6 +2022,7 @@ static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high; static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; +static u32 nested_vmx_misc_low, nested_vmx_misc_high; static __init void nested_vmx_setup_ctls_msrs(void) { /* @@ -2106,6 +2107,11 @@ static __init void nested_vmx_setup_ctls_msrs(void) nested_vmx_secondary_ctls_high &= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_WBINVD_EXITING; + + /* miscellaneous data */ + rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); + nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; + nested_vmx_misc_high = 0; } static inline bool vmx_control_verify(u32 control, u32 low, u32 high) @@ -2176,7 +2182,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) nested_vmx_entry_ctls_high); break; case MSR_IA32_VMX_MISC: - *pdata = 0; + *pdata = vmx_control_msr(nested_vmx_misc_low, + nested_vmx_misc_high); break; /* * These MSRs specify bits which the guest must keep fixed (on or off) @@ -7398,6 +7405,10 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_pending_dbg_exceptions = vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); + vmcs12->vm_entry_controls = + (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | + (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE); + /* TODO: These cannot have changed unless we have MSR bitmaps and * the relevant bit asks not to trap the change */ vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); From 0238ea913c21a89387f93097acfbdfeebc9c9257 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 13 Mar 2013 11:31:24 +0100 Subject: [PATCH 047/204] KVM: nVMX: Add preemption timer support Provided the host has this feature, it's straightforward to offer it to the guest as well. We just need to load the timer value on L2 entry if the feature was enabled by L1 and watch out for the corresponding exit reason.
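From L1's perspective, usage follows the architectural VMX interface; a hypothetical guest-hypervisor sequence (vmread/vmwrite stand in for the usual accessor helpers):

	pin_ctls = vmread(PIN_BASED_VM_EXEC_CONTROL);
	pin_ctls |= PIN_BASED_VMX_PREEMPTION_TIMER;
	vmwrite(PIN_BASED_VM_EXEC_CONTROL, pin_ctls);
	vmwrite(VMX_PREEMPTION_TIMER_VALUE, ticks);	/* countdown value */
	/* vmlaunch/vmresume: when the counter reaches zero, L2 exits with
	   EXIT_REASON_PREEMPTION_TIMER, which KVM now reflects to L1 */

On the KVM side this reduces to copying vmx_preemption_timer_value into the hardware VMCS on L2 entry and forwarding the new exit reason in nested_vmx_exit_handled().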
Reviewed-by: Paolo Bonzini Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/include/asm/vmx.h | 3 +++ arch/x86/include/uapi/asm/vmx.h | 5 +++-- arch/x86/kvm/vmx.c | 17 ++++++++++++++--- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 056bda586a45..fc1c3134473b 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -70,6 +70,7 @@ #define PIN_BASED_EXT_INTR_MASK 0x00000001 #define PIN_BASED_NMI_EXITING 0x00000008 #define PIN_BASED_VIRTUAL_NMIS 0x00000020 +#define PIN_BASED_VMX_PREEMPTION_TIMER 0x00000040 #define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x00000016 @@ -95,6 +96,7 @@ #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff +#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f #define VMX_MISC_SAVE_EFER_LMA 0x00000020 /* VMCS Encodings */ @@ -217,6 +219,7 @@ enum vmcs_field { GUEST_INTERRUPTIBILITY_INFO = 0x00004824, GUEST_ACTIVITY_STATE = 0X00004826, GUEST_SYSENTER_CS = 0x0000482A, + VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, HOST_IA32_SYSENTER_CS = 0x00004c00, CR0_GUEST_HOST_MASK = 0x00006000, CR4_GUEST_HOST_MASK = 0x00006002, diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 2871fccfee68..d651082c7cf7 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -65,6 +65,7 @@ #define EXIT_REASON_EOI_INDUCED 45 #define EXIT_REASON_EPT_VIOLATION 48 #define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_PREEMPTION_TIMER 52 #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 #define EXIT_REASON_APIC_WRITE 56 @@ -110,7 +111,7 @@ { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ { EXIT_REASON_INVD, "INVD" }, \ - { EXIT_REASON_INVPCID, "INVPCID" } - + { EXIT_REASON_INVPCID, "INVPCID" }, \ + { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" } #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 02f8c32b9b08..17a693868458 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -298,7 +298,8 @@ struct __packed vmcs12 { u32 guest_activity_state; u32 guest_sysenter_cs; u32 host_ia32_sysenter_cs; - u32 padding32[8]; /* room for future expansion */ + u32 vmx_preemption_timer_value; + u32 padding32[7]; /* room for future expansion */ u16 virtual_processor_id; u16 guest_es_selector; u16 guest_cs_selector; @@ -537,6 +538,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD(GUEST_ACTIVITY_STATE, guest_activity_state), FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs), FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs), + FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value), FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask), FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask), FIELD(CR0_READ_SHADOW, cr0_read_shadow), @@ -2049,7 +2051,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) */ nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | - PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; + PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS | + PIN_BASED_VMX_PREEMPTION_TIMER; nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; /* @@ -2110,7 +2113,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); - nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; + nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK | + VMX_MISC_SAVE_EFER_LMA; 
nested_vmx_misc_high = 0; } @@ -6190,6 +6194,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_EPT_VIOLATION: case EXIT_REASON_EPT_MISCONFIG: return 0; + case EXIT_REASON_PREEMPTION_TIMER: + return vmcs12->pin_based_vm_exec_control & + PIN_BASED_VMX_PREEMPTION_TIMER; case EXIT_REASON_WBINVD: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); case EXIT_REASON_XSETBV: @@ -7011,6 +7018,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) (vmcs_config.pin_based_exec_ctrl | vmcs12->pin_based_vm_exec_control)); + if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, + vmcs12->vmx_preemption_timer_value); + /* * Whether page-faults are trapped is determined by a combination of * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. From 95b0430d1a53541076ffbaf453f8b49a547cceba Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 12 Mar 2013 17:44:40 +0900 Subject: [PATCH 048/204] KVM: MMU: Mark sp mmio cached when creating mmio spte This will be used not to zap unrelated mmu pages when creating/moving a memory slot later. Reviewed-by: Marcelo Tosatti Signed-off-by: Takuya Yoshikawa Signed-off-by: Gleb Natapov --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ef7f4a5cf8c7..9b75cae83d10 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -230,6 +230,7 @@ struct kvm_mmu_page { #endif int write_flooding_count; + bool mmio_cached; }; struct kvm_pio_request { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index fdacabba6f62..de45ec195346 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -199,8 +199,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access) { + struct kvm_mmu_page *sp = page_header(__pa(sptep)); + access &= ACC_WRITE_MASK | ACC_USER_MASK; + sp->mmio_cached = true; trace_mark_mmio_spte(sptep, gfn, access); mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT); } From 982b3394dd23eec6e5a2f7871238435a167b63cc Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 12 Mar 2013 17:45:30 +0900 Subject: [PATCH 049/204] KVM: x86: Optimize mmio spte zapping when creating/moving memslot When we create or move a memory slot, we need to zap mmio sptes. Currently, zap_all() is used for this and this is causing two problems: - extra page faults after zapping mmu pages - long mmu_lock hold time during zapping mmu pages For the latter, Marcelo reported a disastrous mmu_lock hold time during hot-plug, which made the guest unresponsive for a long time. This patch takes a simple way to fix these problems: do not zap mmu pages unless they are marked mmio cached. On our test box, this took only 50us for the 4GB guest and we did not see ms of mmu_lock hold time any more. Note that we still need to do zap_all() for other cases. So another work is also needed: Xiao's work may be the one. 
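A note on the list walk in kvm_mmu_zap_mmio_sptes() below: kvm_mmu_prepare_zap_page() may unlink further pages from active_mmu_pages, which can invalidate even the lookahead entry cached by the _safe iterator. Its nonzero return value therefore makes the walk start over conservatively:

	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
		if (!sp->mmio_cached)
			continue;
		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
			goto restart;	/* the list changed under us */
	}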
Reviewed-by: Marcelo Tosatti Signed-off-by: Takuya Yoshikawa Signed-off-by: Gleb Natapov --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 18 ++++++++++++++++++ arch/x86/kvm/x86.c | 2 +- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9b75cae83d10..3f205c6cde59 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -767,6 +767,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); +void kvm_mmu_zap_mmio_sptes(struct kvm *kvm); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index de45ec195346..c1a9b7b08ab7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4189,6 +4189,24 @@ restart: spin_unlock(&kvm->mmu_lock); } +void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) +{ + struct kvm_mmu_page *sp, *node; + LIST_HEAD(invalid_list); + + spin_lock(&kvm->mmu_lock); +restart: + list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { + if (!sp->mmio_cached) + continue; + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) + goto restart; + } + + kvm_mmu_commit_zap_page(kvm, &invalid_list); + spin_unlock(&kvm->mmu_lock); +} + static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) { struct kvm *kvm; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 61a5bb60af86..d3c478742e2c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6991,7 +6991,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * mmio sptes. */ if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { - kvm_mmu_zap_all(kvm); + kvm_mmu_zap_mmio_sptes(kvm); kvm_reload_remote_mmus(kvm); } } From 4918c6ca6838658b71d3ce75e1a4373195ac8d40 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 15 Mar 2013 08:38:56 +0100 Subject: [PATCH 050/204] KVM: VMX: Require KVM_SET_TSS_ADDR to be called prior to running a VCPU Very old user space (namely qemu-kvm before kvm-49) didn't set the TSS base before running the VCPU. We always warned about this bug, but no reports about users actually seeing this are known. Time to finally remove the workaround that effectively prevented calling vmx_vcpu_reset while already holding the KVM srcu lock. Reviewed-by: Gleb Natapov Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 30 ++++-------------------- 1 file changed, 4 insertions(+), 26 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 17a693868458..ad978a6c282e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2898,22 +2898,6 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmx->cpl = 0; } -static gva_t rmode_tss_base(struct kvm *kvm) -{ - if (!kvm->arch.tss_addr) { - struct kvm_memslots *slots; - struct kvm_memory_slot *slot; - gfn_t base_gfn; - - slots = kvm_memslots(kvm); - slot = id_to_memslot(slots, 0); - base_gfn = slot->base_gfn + slot->npages - 3; - - return base_gfn << PAGE_SHIFT; - } - return kvm->arch.tss_addr; -} - static void fix_rmode_seg(int seg, struct kvm_segment *save) { const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -2964,19 +2948,15 @@ static void enter_rmode(struct kvm_vcpu *vcpu) /* * Very old userspace does not call KVM_SET_TSS_ADDR before entering - * vcpu.
Call it here with phys address pointing 16M below 4G. + * vcpu. Warn the user that an update is overdue. */ - if (!vcpu->kvm->arch.tss_addr) { + if (!vcpu->kvm->arch.tss_addr) printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " "called before entering vcpu\n"); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - vmx_set_tss_addr(vcpu->kvm, 0xfeffd000); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - } vmx_segment_cache_clear(vmx); - vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); + vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr); vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); @@ -3623,7 +3603,7 @@ static int init_rmode_tss(struct kvm *kvm) int r, idx, ret = 0; idx = srcu_read_lock(&kvm->srcu); - fn = rmode_tss_base(kvm) >> PAGE_SHIFT; + fn = kvm->arch.tss_addr >> PAGE_SHIFT; r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); if (r < 0) goto out; @@ -4190,9 +4170,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */ - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); vmx_set_cr4(&vmx->vcpu, 0); vmx_set_efer(&vmx->vcpu, 0); vmx_fpu_activate(&vmx->vcpu); From 04b66839d312d3bdaff77f265eb7305347fa1fb7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 19 Mar 2013 16:30:26 +0100 Subject: [PATCH 051/204] KVM: x86: correctly initialize the CS base on reset The CS base was initialized to 0 on VMX (wrong, but usually overridden by userspace before starting) or 0xf0000 on SVM. The correct value is 0xffff0000, and VMX is able to emulate it now, so use it. Reviewed-by: Gleb Natapov Signed-off-by: Paolo Bonzini Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 8 +------- arch/x86/kvm/vmx.c | 1 + 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7219a4012a0e..7a46c1f46861 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1131,17 +1131,11 @@ static void init_vmcb(struct vcpu_svm *svm) init_seg(&save->gs); save->cs.selector = 0xf000; + save->cs.base = 0xffff0000; /* Executable/Readable Code Segment */ save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; save->cs.limit = 0xffff; - /* - * cs.base should really be 0xffff0000, but vmx can't handle that, so - * be consistent with it. - * - * Replace when we have real mode working for vmx. - */ - save->cs.base = 0xf0000; save->gdtr.limit = 0xffff; save->idtr.limit = 0xffff; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ad978a6c282e..03f574641852 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4113,6 +4113,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) seg_setup(VCPU_SREG_CS); vmcs_write16(GUEST_CS_SELECTOR, 0xf000); + vmcs_write32(GUEST_CS_BASE, 0xffff0000); seg_setup(VCPU_SREG_DS); seg_setup(VCPU_SREG_ES); From 7ddca7e43c8f28f9419da81a0e7730b66aa60fe9 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Thu, 21 Mar 2013 19:33:43 +0900 Subject: [PATCH 052/204] KVM: MMU: Move kvm_mmu_free_some_pages() into kvm_mmu_alloc_page() What this function does is ensure that the number of shadow pages does not exceed the maximum limit stored in n_max_mmu_pages: so it is placed at every code path that can reach kvm_mmu_alloc_page().
Although it might have made some sense to keep this function spread across each such code path, where it could be called before taking mmu_lock, the rule was changed not to do so. Taking this background into account, this patch moves it into kvm_mmu_alloc_page() and simplifies the code. Note: the unlikely hint in kvm_mmu_free_some_pages() guarantees that the overhead of this function is almost zero except when we actually need to allocate some shadow pages, so we do not need to care about calling it multiple times in one path by doing kvm_mmu_get_page() a few times. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 9 +++------ arch/x86/kvm/paging_tmpl.h | 1 - 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c1a9b7b08ab7..38f34c5361f4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1505,6 +1505,9 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte, int direct) { struct kvm_mmu_page *sp; + + kvm_mmu_free_some_pages(vcpu); + sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); if (!direct) @@ -2842,7 +2845,6 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; - kvm_mmu_free_some_pages(vcpu); if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, @@ -2920,7 +2922,6 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, 1, ACC_ALL, NULL); ++sp->root_count; @@ -2932,7 +2933,6 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL, @@ -2971,7 +2971,6 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, 0, ACC_ALL, NULL); root = __pa(sp->spt); @@ -3005,7 +3004,6 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) return 1; } spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, 0, ACC_ALL, NULL); @@ -3311,7 +3309,6 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; - kvm_mmu_free_some_pages(vcpu); if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); r = __direct_map(vcpu, gpa, write, map_writable, diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 105dd5bd550e..af143f065532 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -627,7 +627,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); - kvm_mmu_free_some_pages(vcpu); if (!force_pt_level) transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); r = FNAME(fetch)(vcpu, addr, &walker, write_fault, From 81f4f76bbc712a2dff8bb020057c554e285370e1 Mon Sep
17 00:00:00 2001 From: Takuya Yoshikawa Date: Thu, 21 Mar 2013 19:34:27 +0900 Subject: [PATCH 053/204] KVM: MMU: Rename kvm_mmu_free_some_pages() to make_mmu_pages_available() The current name "kvm_mmu_free_some_pages" should be reserved for something that actually frees shadow pages, as the name suggests, but what the function really does is make a minimum number, KVM_MIN_FREE_MMU_PAGES, of shadow pages available: it does nothing when there are already enough. This patch changes the name to reflect this meaning better; while doing this renaming, the code in the wrapper function is inlined into the main body since the whole function will be inlined into the only caller now. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 9 +++++++-- arch/x86/kvm/mmu.h | 6 ------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 38f34c5361f4..633e30cfbd63 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1501,12 +1501,14 @@ static void drop_parent_pte(struct kvm_mmu_page *sp, mmu_spte_clear_no_track(parent_pte); } +static void make_mmu_pages_available(struct kvm_vcpu *vcpu); + static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte, int direct) { struct kvm_mmu_page *sp; - kvm_mmu_free_some_pages(vcpu); + make_mmu_pages_available(vcpu); sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); @@ -4010,10 +4012,13 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) } EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); -void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) +static void make_mmu_pages_available(struct kvm_vcpu *vcpu) { LIST_HEAD(invalid_list); + if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES)) + return; + while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) { if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list)) break; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 3b1ad0049ea4..2adcbc2cac6d 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -64,12 +64,6 @@ static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) return 0; } -static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) -{ - if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES)) - __kvm_mmu_free_some_pages(vcpu); -} - static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) { if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE)) From d26f22c9cdfa935e674b2ff747dbcfaf9fa048f8 Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Sun, 24 Feb 2013 18:57:11 +0000 Subject: [PATCH 054/204] KVM: PPC: move tsr update in a separate function This is done so that the same function can be called from the SREGS and ONE_REG interfaces (follow-up patch).
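The XOR test in the helper introduced by the diff below fires only when a watchdog-related bit actually changes. A small worked example (the bit values are assumed for illustration):

/*
 * Illustrative: TSR_ENW set in the old value, cleared in the new one.
 *
 *      old_tsr = TSR_ENW | TSR_DIS;
 *      new_tsr = TSR_DIS;
 *      (old_tsr ^ new_tsr) & (TSR_ENW | TSR_WIS) == TSR_ENW  -> nonzero
 *
 * so arm_next_watchdog() runs; a write that only toggles TSR_DIS makes
 * the XOR result zero in the watchdog bits and skips the rearm.
 */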
Signed-off-by: Bharat Bhushan Signed-off-by: Alexander Graf --- arch/powerpc/kvm/booke.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index eb88fa621073..f2fd47d35ab5 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1148,6 +1148,18 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, return r; } +static void kvmppc_set_tsr(struct kvm_vcpu *vcpu, u32 new_tsr) +{ + u32 old_tsr = vcpu->arch.tsr; + + vcpu->arch.tsr = new_tsr; + + if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS)) + arm_next_watchdog(vcpu); + + update_timer_ints(vcpu); +} + /* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { @@ -1287,16 +1299,8 @@ static int set_sregs_base(struct kvm_vcpu *vcpu, kvmppc_emulate_dec(vcpu); } - if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) { - u32 old_tsr = vcpu->arch.tsr; - - vcpu->arch.tsr = sregs->u.e.tsr; - - if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS)) - arm_next_watchdog(vcpu); - - update_timer_ints(vcpu); - } + if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) + kvmppc_set_tsr(vcpu, sregs->u.e.tsr); return 0; } From 78accda4f888c77122cf3da6185f905d4677eb07 Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Sun, 24 Feb 2013 18:57:12 +0000 Subject: [PATCH 055/204] KVM: PPC: Added one_reg interface for timer registers If userspace wants to change some specific bits of the TSR (timer status register), then it uses the GET/SET_SREGS ioctl interface. So the steps will be: i) userspace makes the get ioctl, ii) changes the TSR in userspace, and iii) then makes the set ioctl. It can happen that the TSR gets changed by the kernel after step i) and before step iii). To avoid this, we add the one_reg ioctls below for OR-ing and clearing specific bits in the TSR. This patch adds a one_reg interface for: 1) setting specific bits in the TSR (timer status register) 2) clearing specific bits in the TSR (timer status register) 3) setting/getting the TCR register. There are cases where we want to change only the TCR and not the TSR. Although we could use SREGS without the KVM_SREGS_E_UPDATE_TSR flag, I think one_reg is better. I am open to it if someone feels we should use SREGS only here. 4) getting/setting the TSR register Signed-off-by: Bharat Bhushan Signed-off-by: Alexander Graf --- Documentation/virtual/kvm/api.txt | 4 ++++ arch/powerpc/include/uapi/asm/kvm.h | 5 +++++ arch/powerpc/kvm/booke.c | 30 +++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index c16b442556e8..976eb650e7ef 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1788,6 +1788,10 @@ registers, find a list below: PPC | KVM_REG_PPC_VPA_DTL | 128 PPC | KVM_REG_PPC_EPCR | 32 PPC | KVM_REG_PPC_EPR | 32 + PPC | KVM_REG_PPC_TCR | 32 + PPC | KVM_REG_PPC_TSR | 32 + PPC | KVM_REG_PPC_OR_TSR | 32 + PPC | KVM_REG_PPC_CLEAR_TSR | 32 ARM registers are mapped using the lower 32 bits.
The upper 16 of that is the register group type, or coprocessor number: diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 16064d00adb9..ef072b1a6e3f 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -417,4 +417,9 @@ struct kvm_get_htab_header { #define KVM_REG_PPC_EPCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x85) #define KVM_REG_PPC_EPR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x86) +/* Timer Status Register OR/CLEAR interface */ +#define KVM_REG_PPC_OR_TSR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x87) +#define KVM_REG_PPC_CLEAR_TSR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x88) +#define KVM_REG_PPC_TCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x89) +#define KVM_REG_PPC_TSR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8a) #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index f2fd47d35ab5..11825539e2b1 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1442,6 +1442,12 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) r = put_user(vcpu->arch.epcr, (u32 __user *)(long)reg->addr); break; #endif + case KVM_REG_PPC_TCR: + r = put_user(vcpu->arch.tcr, (u32 __user *)(long)reg->addr); + break; + case KVM_REG_PPC_TSR: + r = put_user(vcpu->arch.tsr, (u32 __user *)(long)reg->addr); + break; default: break; } @@ -1485,6 +1491,30 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) break; } #endif + case KVM_REG_PPC_OR_TSR: { + u32 tsr_bits; + r = get_user(tsr_bits, (u32 __user *)(long)reg->addr); + kvmppc_set_tsr_bits(vcpu, tsr_bits); + break; + } + case KVM_REG_PPC_CLEAR_TSR: { + u32 tsr_bits; + r = get_user(tsr_bits, (u32 __user *)(long)reg->addr); + kvmppc_clr_tsr_bits(vcpu, tsr_bits); + break; + } + case KVM_REG_PPC_TSR: { + u32 tsr; + r = get_user(tsr, (u32 __user *)(long)reg->addr); + kvmppc_set_tsr(vcpu, tsr); + break; + } + case KVM_REG_PPC_TCR: { + u32 tcr; + r = get_user(tcr, (u32 __user *)(long)reg->addr); + kvmppc_set_tcr(vcpu, tcr); + break; + } default: break; } From 15b708beee6841e0a59ded702c8bfe3042a5b5a4 Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Wed, 27 Feb 2013 18:13:10 +0000 Subject: [PATCH 056/204] KVM: PPC: booke: Added debug handler Installed debug handler will be used for guest debug support and debug facility emulation features (patches for these features will follow this patch). 
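As a usage sketch for the timer one_reg interface added in PATCH 055 above, a hypothetical userspace fragment (vcpu_fd and the chosen bits are assumptions) that sets TSR bits atomically, avoiding the get/modify/set race described there:

/* Illustrative fragment: OR bits into the TSR via one_reg. */
__u32 tsr_bits = 0;     /* bits to set, e.g. a timer status flag */
struct kvm_one_reg reg = {
        .id   = KVM_REG_PPC_OR_TSR,
        .addr = (__u64)(unsigned long)&tsr_bits,
};

if (ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
        perror("KVM_SET_ONE_REG");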
Signed-off-by: Liu Yu [bharat.bhushan@freescale.com: Substantial changes] Signed-off-by: Bharat Bhushan Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kvm/booke_interrupts.S | 42 ++++++++++++++++++++++++++--- 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index d1bb86074721..e34f8fee9080 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -504,6 +504,7 @@ struct kvm_vcpu_arch { u32 tlbcfg[4]; u32 mmucfg; u32 epr; + u32 crit_save; struct kvmppc_booke_debug_reg dbg_reg; #endif gpa_t paddr_accessed; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index b6c17ec9b169..d87c90886c75 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -596,6 +596,7 @@ int main(void) DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear)); DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr)); + DEFINE(VCPU_CRIT_SAVE, offsetof(struct kvm_vcpu, arch.crit_save)); #endif /* CONFIG_PPC_BOOK3S */ #endif /* CONFIG_KVM */ diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index f4bb55c96517..2c6deb5ef2fe 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ b/arch/powerpc/kvm/booke_interrupts.S @@ -54,8 +54,7 @@ (1< Date: Wed, 13 Feb 2013 19:37:48 +0000 Subject: [PATCH 057/204] kvm/ppc/e500: h2g_tlb1_rmap: esel 0 is valid Add one to esel values in h2g_tlb1_rmap, so that "no mapping" can be distinguished from "esel 0". Note that we're not saved by the fact that host esel 0 is reserved for non-KVM use, because KVM host esel numbering is not the raw host numbering (see to_htlb1_esel). Signed-off-by: Scott Wood Signed-off-by: Alexander Graf --- arch/powerpc/kvm/e500_mmu_host.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index a222edfb9a9b..35fb80ec1f57 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -511,10 +511,10 @@ static int kvmppc_e500_tlb1_map_tlb1(struct kvmppc_vcpu_e500 *vcpu_e500, vcpu_e500->g2h_tlb1_map[esel] |= (u64)1 << sesel; vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_BITMAP; if (vcpu_e500->h2g_tlb1_rmap[sesel]) { - unsigned int idx = vcpu_e500->h2g_tlb1_rmap[sesel]; + unsigned int idx = vcpu_e500->h2g_tlb1_rmap[sesel] - 1; vcpu_e500->g2h_tlb1_map[idx] &= ~(1ULL << sesel); } - vcpu_e500->h2g_tlb1_rmap[sesel] = esel; + vcpu_e500->h2g_tlb1_rmap[sesel] = esel + 1; return sesel; } From 36ada4f4317e27bf52f52aef5c72f553eef08f4a Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Wed, 13 Feb 2013 19:37:49 +0000 Subject: [PATCH 058/204] kvm/ppc/e500: g2h_tlb1_map: clear old bit before setting new bit It's possible that we're using the same host TLB1 slot to map (a presumably different portion of) the same guest TLB1 entry. Clear the bit in the map before setting it, so that if the esels are the same the bit will remain set. 
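The +1 encoding used by PATCH 057 above is worth spelling out, since both the store and the lookup depend on it (values assumed for illustration):

/*
 * Illustrative: h2g_tlb1_rmap stores esel + 1 so that 0 can serve as
 * a "no mapping" sentinel even though esel 0 is a valid guest entry.
 *
 *      store:  h2g_tlb1_rmap[sesel] = esel + 1;   // esel 0 stored as 1
 *      probe:  if (h2g_tlb1_rmap[sesel]) {        // 0 means unmapped
 *                      idx = h2g_tlb1_rmap[sesel] - 1;  // real esel
 *                      ...
 *              }
 */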
Signed-off-by: Scott Wood Signed-off-by: Alexander Graf --- arch/powerpc/kvm/e500_mmu_host.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 35fb80ec1f57..8e72b2124f63 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -507,13 +507,14 @@ static int kvmppc_e500_tlb1_map_tlb1(struct kvmppc_vcpu_e500 *vcpu_e500, if (unlikely(vcpu_e500->host_tlb1_nv >= tlb1_max_shadow_size())) vcpu_e500->host_tlb1_nv = 0; - vcpu_e500->tlb_refs[1][sesel] = *ref; - vcpu_e500->g2h_tlb1_map[esel] |= (u64)1 << sesel; - vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_BITMAP; if (vcpu_e500->h2g_tlb1_rmap[sesel]) { unsigned int idx = vcpu_e500->h2g_tlb1_rmap[sesel] - 1; vcpu_e500->g2h_tlb1_map[idx] &= ~(1ULL << sesel); } + + vcpu_e500->tlb_refs[1][sesel] = *ref; + vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_BITMAP; + vcpu_e500->g2h_tlb1_map[esel] |= (u64)1 << sesel; vcpu_e500->h2g_tlb1_rmap[sesel] = esel + 1; return sesel; From 47bf379742bf1baad9624e203912b72c3fa9c80a Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Wed, 6 Mar 2013 16:02:49 +0000 Subject: [PATCH 059/204] kvm/ppc/e500: eliminate tlb_refs Commit 523f0e5421c12610527c620b983b443f329e3a32 ("KVM: PPC: E500: Explicitly mark shadow maps invalid") began using E500_TLB_VALID for guest TLB1 entries, and skipping invalidations if it's not set. However, when E500_TLB_VALID was set for such entries, it was on a fake local ref, and so the invalidations never happen. gtlb_privs is documented as being only for guest TLB0, though we already violate that with E500_TLB_BITMAP. Now that we have MMU notifiers, and thus don't need to actually retain a reference to the mapped pages, get rid of tlb_refs, and use gtlb_privs for E500_TLB_VALID in TLB1. Since we can have more than one host TLB entry for a given tlbe_ref, be careful not to clear existing flags that are relevant to other host TLB entries when preparing a new host TLB entry. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf --- arch/powerpc/kvm/e500.h | 24 ++++------ arch/powerpc/kvm/e500_mmu_host.c | 75 ++++++++++---------------------- 2 files changed, 30 insertions(+), 69 deletions(-) diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h index 41cefd43655f..33db48a8ce24 100644 --- a/arch/powerpc/kvm/e500.h +++ b/arch/powerpc/kvm/e500.h @@ -26,17 +26,20 @@ #define E500_PID_NUM 3 #define E500_TLB_NUM 2 -#define E500_TLB_VALID 1 -#define E500_TLB_BITMAP 2 +/* entry is mapped somewhere in host TLB */ +#define E500_TLB_VALID (1 << 0) +/* TLB1 entry is mapped by host TLB1, tracked by bitmaps */ +#define E500_TLB_BITMAP (1 << 1) +/* TLB1 entry is mapped by host TLB0 */ #define E500_TLB_TLB0 (1 << 2) struct tlbe_ref { - pfn_t pfn; - unsigned int flags; /* E500_TLB_* */ + pfn_t pfn; /* valid only for TLB0, except briefly */ + unsigned int flags; /* E500_TLB_* */ }; struct tlbe_priv { - struct tlbe_ref ref; /* TLB0 only -- TLB1 uses tlb_refs */ + struct tlbe_ref ref; }; #ifdef CONFIG_KVM_E500V2 @@ -63,17 +66,6 @@ struct kvmppc_vcpu_e500 { unsigned int gtlb_nv[E500_TLB_NUM]; - /* - * information associated with each host TLB entry -- - * TLB1 only for now. If/when guest TLB1 entries can be - * mapped with host TLB0, this will be used for that too. - * - * We don't want to use this for guest TLB0 because then we'd - * have the overhead of doing the translation again even if - * the entry is still in the guest TLB (e.g. 
we swapped out - * and back, and our host TLB entries got evicted). - */ - struct tlbe_ref *tlb_refs[E500_TLB_NUM]; unsigned int host_tlb1_nv; u32 svr; diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 8e72b2124f63..1c6a9d729df4 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -193,8 +193,11 @@ void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, struct tlbe_ref *ref = &vcpu_e500->gtlb_priv[tlbsel][esel].ref; /* Don't bother with unmapped entries */ - if (!(ref->flags & E500_TLB_VALID)) - return; + if (!(ref->flags & E500_TLB_VALID)) { + WARN(ref->flags & (E500_TLB_BITMAP | E500_TLB_TLB0), + "%s: flags %x\n", __func__, ref->flags); + WARN_ON(tlbsel == 1 && vcpu_e500->g2h_tlb1_map[esel]); + } if (tlbsel == 1 && ref->flags & E500_TLB_BITMAP) { u64 tmp = vcpu_e500->g2h_tlb1_map[esel]; @@ -248,7 +251,7 @@ static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, pfn_t pfn) { ref->pfn = pfn; - ref->flags = E500_TLB_VALID; + ref->flags |= E500_TLB_VALID; if (tlbe_is_writable(gtlbe)) kvm_set_pfn_dirty(pfn); @@ -257,6 +260,7 @@ static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref) { if (ref->flags & E500_TLB_VALID) { + /* FIXME: don't log bogus pfn for TLB1 */ trace_kvm_booke206_ref_release(ref->pfn, ref->flags); ref->flags = 0; } @@ -274,36 +278,23 @@ static void clear_tlb1_bitmap(struct kvmppc_vcpu_e500 *vcpu_e500) static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500) { - int tlbsel = 0; + int tlbsel; int i; - for (i = 0; i < vcpu_e500->gtlb_params[tlbsel].entries; i++) { - struct tlbe_ref *ref = - &vcpu_e500->gtlb_priv[tlbsel][i].ref; - kvmppc_e500_ref_release(ref); + for (tlbsel = 0; tlbsel <= 1; tlbsel++) { + for (i = 0; i < vcpu_e500->gtlb_params[tlbsel].entries; i++) { + struct tlbe_ref *ref = + &vcpu_e500->gtlb_priv[tlbsel][i].ref; + kvmppc_e500_ref_release(ref); + } } } -static void clear_tlb_refs(struct kvmppc_vcpu_e500 *vcpu_e500) -{ - int stlbsel = 1; - int i; - - kvmppc_e500_tlbil_all(vcpu_e500); - - for (i = 0; i < host_tlb_params[stlbsel].entries; i++) { - struct tlbe_ref *ref = - &vcpu_e500->tlb_refs[stlbsel][i]; - kvmppc_e500_ref_release(ref); - } - - clear_tlb_privs(vcpu_e500); -} - void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - clear_tlb_refs(vcpu_e500); + kvmppc_e500_tlbil_all(vcpu_e500); + clear_tlb_privs(vcpu_e500); clear_tlb1_bitmap(vcpu_e500); } @@ -458,8 +449,6 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); } - /* Drop old ref and setup new one. 
*/ - kvmppc_e500_ref_release(ref); kvmppc_e500_ref_setup(ref, gtlbe, pfn); kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, @@ -512,10 +501,10 @@ static int kvmppc_e500_tlb1_map_tlb1(struct kvmppc_vcpu_e500 *vcpu_e500, vcpu_e500->g2h_tlb1_map[idx] &= ~(1ULL << sesel); } - vcpu_e500->tlb_refs[1][sesel] = *ref; vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_BITMAP; vcpu_e500->g2h_tlb1_map[esel] |= (u64)1 << sesel; vcpu_e500->h2g_tlb1_rmap[sesel] = esel + 1; + WARN_ON(!(ref->flags & E500_TLB_VALID)); return sesel; } @@ -527,13 +516,12 @@ static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500, u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe, struct kvm_book3e_206_tlb_entry *stlbe, int esel) { - struct tlbe_ref ref; + struct tlbe_ref *ref = &vcpu_e500->gtlb_priv[1][esel].ref; int sesel; int r; - ref.flags = 0; r = kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, stlbe, - &ref); + ref); if (r) return r; @@ -545,7 +533,7 @@ static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500, } /* Otherwise map into TLB1 */ - sesel = kvmppc_e500_tlb1_map_tlb1(vcpu_e500, &ref, esel); + sesel = kvmppc_e500_tlb1_map_tlb1(vcpu_e500, ref, esel); write_stlbe(vcpu_e500, gtlbe, stlbe, 1, sesel); return 0; @@ -566,7 +554,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, case 0: priv = &vcpu_e500->gtlb_priv[tlbsel][esel]; - /* Triggers after clear_tlb_refs or on initial mapping */ + /* Triggers after clear_tlb_privs or on initial mapping */ if (!(priv->ref.flags & E500_TLB_VALID)) { kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe); } else { @@ -666,35 +654,16 @@ int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500) host_tlb_params[0].entries / host_tlb_params[0].ways; host_tlb_params[1].sets = 1; - vcpu_e500->tlb_refs[0] = - kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[0].entries, - GFP_KERNEL); - if (!vcpu_e500->tlb_refs[0]) - goto err; - - vcpu_e500->tlb_refs[1] = - kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[1].entries, - GFP_KERNEL); - if (!vcpu_e500->tlb_refs[1]) - goto err; - vcpu_e500->h2g_tlb1_rmap = kzalloc(sizeof(unsigned int) * host_tlb_params[1].entries, GFP_KERNEL); if (!vcpu_e500->h2g_tlb1_rmap) - goto err; + return -EINVAL; return 0; - -err: - kfree(vcpu_e500->tlb_refs[0]); - kfree(vcpu_e500->tlb_refs[1]); - return -EINVAL; } void e500_mmu_host_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) { kfree(vcpu_e500->h2g_tlb1_rmap); - kfree(vcpu_e500->tlb_refs[0]); - kfree(vcpu_e500->tlb_refs[1]); } From 4fe27d2addda8af7714546a69369fb92dddcf9a3 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 14 Feb 2013 14:00:25 +0000 Subject: [PATCH 060/204] KVM: PPC: Remove unused argument to kvmppc_core_dequeue_external Currently kvmppc_core_dequeue_external() takes a struct kvm_interrupt * argument and does nothing with it, in any of its implementations. This removes it in order to make things easier for forthcoming in-kernel interrupt controller emulation code. 
Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_ppc.h | 3 +-- arch/powerpc/kvm/book3s.c | 3 +-- arch/powerpc/kvm/booke.c | 3 +-- arch/powerpc/kvm/powerpc.c | 2 +- 4 files changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 44fa9ad1d62c..f58930779ae8 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -104,8 +104,7 @@ extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu); extern void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu); extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq); -extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, - struct kvm_interrupt *irq); +extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu); extern void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu); extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index a4b645285240..6548445fd823 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -160,8 +160,7 @@ void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, kvmppc_book3s_queue_irqprio(vcpu, vec); } -void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, - struct kvm_interrupt *irq) +void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu) { kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL); kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL); diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 11825539e2b1..58057d6f146d 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -222,8 +222,7 @@ void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, kvmppc_booke_queue_irqprio(vcpu, prio); } -void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, - struct kvm_interrupt *irq) +void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu) { clear_bit(BOOKE_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions); clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 7b5d4d20cdc5..16b45954511c 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -739,7 +739,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { if (irq->irq == KVM_INTERRUPT_UNSET) { - kvmppc_core_dequeue_external(vcpu, irq); + kvmppc_core_dequeue_external(vcpu); return 0; } From 2cef4deb4018c02fb3cd08f76c8a988f7ddee480 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 25 Mar 2013 17:22:48 +0100 Subject: [PATCH 061/204] KVM: s390: Don't do a gmap update on minor memslot changes Some memslot updates don't affect the gmap implementation, e.g. setting/unsetting dirty tracking. Since a gmap update will cause tlb flushes and segment table invalidations, we want to avoid that. Signed-off-by: Christian Borntraeger Signed-off-by: Cornelia Huck Signed-off-by: Gleb Natapov --- arch/s390/kvm/kvm-s390.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 33161b4a8280..f241e3315ebb 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1007,6 +1007,16 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, { int rc; + /* If the basics of the memslot do not change, we do not want
Every update causes several unnecessary + * segment translation exceptions. This is usually handled just + * fine by the normal fault handler + gmap, but it will also + * cause faults on the prefix page of running guest CPUs. + */ + if (old->userspace_addr == mem->userspace_addr && + old->base_gfn * PAGE_SIZE == mem->guest_phys_addr && + old->npages * PAGE_SIZE == mem->memory_size) + return; rc = gmap_map_segment(kvm->arch.gmap, mem->userspace_addr, mem->guest_phys_addr, mem->memory_size); From d21683ea1f1b03823928a98b6380332b9385e3a7 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 25 Mar 2013 17:22:49 +0100 Subject: [PATCH 062/204] KVM: s390: fix 24 bit psw handling in lpsw/lpswe handler When checking for validity the lpsw/lpswe handler check that only the lower 20 bits instead of 24 bits have a non-zero value. There handling valid psws as invalid ones. Fix the 24 bit psw mask. Signed-off-by: Heiko Carstens Acked-by: Cornelia Huck Signed-off-by: Cornelia Huck Signed-off-by: Gleb Natapov --- arch/s390/kvm/priv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 7db2ad076f31..7b397b37d11a 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -255,7 +255,7 @@ static void handle_new_psw(struct kvm_vcpu *vcpu) #define PSW_MASK_ADDR_MODE (PSW_MASK_EA | PSW_MASK_BA) #define PSW_MASK_UNASSIGNED 0xb80800fe7fffffffUL -#define PSW_ADDR_24 0x00000000000fffffUL +#define PSW_ADDR_24 0x0000000000ffffffUL #define PSW_ADDR_31 0x000000007fffffffUL int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu) From ace5058763b72d128efcbe27969e89226c9c593a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 25 Mar 2013 17:22:50 +0100 Subject: [PATCH 063/204] KVM: s390: fix psw conversion in lpsw handler When converting a 64 bit psw to a 128 bit psw the addressing mode bit of the "addr" part of the 64 bit psw must be moved to the basic addressing mode bit of the "mask" part of the 128 bit psw. In addition the addressing mode bit must be cleared when moved to the "addr" part of the 128 bit psw. Otherwise an invalid psw would be generated if the orginal psw was in the 31 bit addressing mode. Signed-off-by: Heiko Carstens Acked-by: Cornelia Huck Signed-off-by: Cornelia Huck Signed-off-by: Gleb Natapov --- arch/s390/kvm/priv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 7b397b37d11a..844a2b986112 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -286,7 +286,8 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->gpsw.mask = (new_psw.mask & ~PSW32_MASK_BASE) << 32; - vcpu->arch.sie_block->gpsw.addr = new_psw.addr; + vcpu->arch.sie_block->gpsw.mask |= new_psw.addr & PSW32_ADDR_AMODE; + vcpu->arch.sie_block->gpsw.addr = new_psw.addr & ~PSW32_ADDR_AMODE; if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_UNASSIGNED) || (!(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) && From 6fd0fcc93b1eaf82911782de5c7aa35c174bf620 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 25 Mar 2013 17:22:51 +0100 Subject: [PATCH 064/204] KVM: s390: fix return code handling in lpsw/lpswe handlers kvm_s390_inject_program_int() may return with a non-zero return value, in case of an error (out of memory). Report that to the calling functions instead of ignoring the error case. 
Signed-off-by: Heiko Carstens Acked-by: Cornelia Huck Signed-off-by: Cornelia Huck Signed-off-by: Gleb Natapov --- arch/s390/kvm/priv.c | 44 ++++++++++++++------------------------------ 1 file changed, 14 insertions(+), 30 deletions(-) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 844a2b986112..9d32c56fb02c 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -269,20 +269,14 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu) addr = kvm_s390_get_base_disp_s(vcpu); - if (addr & 7) { - kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - goto out; - } + if (addr & 7) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) { - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - goto out; - } + if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - if (!(new_psw.mask & PSW32_MASK_BASE)) { - kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - goto out; - } + if (!(new_psw.mask & PSW32_MASK_BASE)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); vcpu->arch.sie_block->gpsw.mask = (new_psw.mask & ~PSW32_MASK_BASE) << 32; @@ -293,13 +287,10 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu) (!(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) && (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_24)) || ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) == - PSW_MASK_EA)) { - kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - goto out; - } + PSW_MASK_EA)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); handle_new_psw(vcpu); -out: return 0; } @@ -310,15 +301,11 @@ static int handle_lpswe(struct kvm_vcpu *vcpu) addr = kvm_s390_get_base_disp_s(vcpu); - if (addr & 7) { - kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - goto out; - } + if (addr & 7) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) { - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - goto out; - } + if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); vcpu->arch.sie_block->gpsw.mask = new_psw.mask; vcpu->arch.sie_block->gpsw.addr = new_psw.addr; @@ -330,13 +317,10 @@ static int handle_lpswe(struct kvm_vcpu *vcpu) (!(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) && (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_24)) || ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) == - PSW_MASK_EA)) { - kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - goto out; - } + PSW_MASK_EA)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); handle_new_psw(vcpu); -out: return 0; } From 3736b874a39a1df2a94186c357aabeb6a7d7d4f6 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 25 Mar 2013 17:22:52 +0100 Subject: [PATCH 065/204] KVM: s390: make if statements in lpsw/lpswe handlers readable Being unable to parse the 5- and 8-line if statements I had to split them to be able to make any sense of them and verify that they match the architecture. So change the code since I guess that other people will also have a hard time parsing such long conditional statements with line breaks. Introduce a common is_valid_psw() function which does all the checks needed. In case of lpsw (64 bit psw -> 128 bit psw conversion) it will do some not needed additional checks, since a couple of bits can't be set anyway, but that doesn't hurt. 
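The consolidated checks are easiest to read as a rejection table. Illustrative inputs for the new is_valid_psw() helper in the diff below (all values assumed):

/*
 * Illustrative rejections:
 *      mask with an unassigned bit set (mask & PSW_MASK_UNASSIGNED) -> invalid
 *      31-bit mode (BA set) with addr 0x80000000                    -> invalid
 *      24-bit mode (EA and BA clear) with addr 0x01000000           -> invalid
 *      EA set without BA (mode bits == PSW_MASK_EA)                 -> invalid
 * Everything else passes, matching the architected psw formats.
 */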
Signed-off-by: Heiko Carstens Acked-by: Cornelia Huck Signed-off-by: Cornelia Huck Signed-off-by: Gleb Natapov --- arch/s390/kvm/priv.c | 58 ++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 34 deletions(-) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 9d32c56fb02c..05d186c21eca 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -258,68 +258,58 @@ static void handle_new_psw(struct kvm_vcpu *vcpu) #define PSW_ADDR_24 0x0000000000ffffffUL #define PSW_ADDR_31 0x000000007fffffffUL +static int is_valid_psw(psw_t *psw) { + if (psw->mask & PSW_MASK_UNASSIGNED) + return 0; + if ((psw->mask & PSW_MASK_ADDR_MODE) == PSW_MASK_BA) { + if (psw->addr & ~PSW_ADDR_31) + return 0; + } + if (!(psw->mask & PSW_MASK_ADDR_MODE) && (psw->addr & ~PSW_ADDR_24)) + return 0; + if ((psw->mask & PSW_MASK_ADDR_MODE) == PSW_MASK_EA) + return 0; + return 1; +} + int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu) { - u64 addr; + psw_t *gpsw = &vcpu->arch.sie_block->gpsw; psw_compat_t new_psw; + u64 addr; - if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + if (gpsw->mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OPERATION); - addr = kvm_s390_get_base_disp_s(vcpu); - if (addr & 7) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - if (!(new_psw.mask & PSW32_MASK_BASE)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - - vcpu->arch.sie_block->gpsw.mask = - (new_psw.mask & ~PSW32_MASK_BASE) << 32; - vcpu->arch.sie_block->gpsw.mask |= new_psw.addr & PSW32_ADDR_AMODE; - vcpu->arch.sie_block->gpsw.addr = new_psw.addr & ~PSW32_ADDR_AMODE; - - if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_UNASSIGNED) || - (!(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) && - (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_24)) || - ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) == - PSW_MASK_EA)) + gpsw->mask = (new_psw.mask & ~PSW32_MASK_BASE) << 32; + gpsw->mask |= new_psw.addr & PSW32_ADDR_AMODE; + gpsw->addr = new_psw.addr & ~PSW32_ADDR_AMODE; + if (!is_valid_psw(gpsw)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - handle_new_psw(vcpu); return 0; } static int handle_lpswe(struct kvm_vcpu *vcpu) { - u64 addr; psw_t new_psw; + u64 addr; addr = kvm_s390_get_base_disp_s(vcpu); - if (addr & 7) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - - vcpu->arch.sie_block->gpsw.mask = new_psw.mask; - vcpu->arch.sie_block->gpsw.addr = new_psw.addr; - - if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_UNASSIGNED) || - (((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) == - PSW_MASK_BA) && - (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_31)) || - (!(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) && - (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_24)) || - ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) == - PSW_MASK_EA)) + vcpu->arch.sie_block->gpsw = new_psw; + if (!is_valid_psw(&vcpu->arch.sie_block->gpsw)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - handle_new_psw(vcpu); return 0; } From db4a29cb6ac7b2fda505923bdbc58fc35a719f62 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 25 Mar 2013 17:22:53 +0100 Subject: [PATCH 066/204] KVM: s390: fix and enforce return code handling for irq injections 
kvm_s390_inject_program_int() and friends may fail if no memory is available. This must be reported to the calling functions, so that this gets passed down to user space which should fix the situation. Alternatively we end up with guest state corruption. So fix this and enforce return value checking by adding a __must_check annotation to all of these function prototypes. Signed-off-by: Heiko Carstens Acked-by: Cornelia Huck Signed-off-by: Cornelia Huck Signed-off-by: Gleb Natapov --- arch/s390/kvm/intercept.c | 12 ++---- arch/s390/kvm/kvm-s390.c | 3 +- arch/s390/kvm/kvm-s390.h | 12 +++--- arch/s390/kvm/priv.c | 83 ++++++++++++--------------------------- 4 files changed, 37 insertions(+), 73 deletions(-) diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index c6ba4dfd7f1e..b7d1b2edeeb3 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -45,10 +45,8 @@ static int handle_lctlg(struct kvm_vcpu *vcpu) do { rc = get_guest(vcpu, vcpu->arch.sie_block->gcr[reg], (u64 __user *) useraddr); - if (rc) { - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - break; - } + if (rc) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); useraddr += 8; if (reg == reg3) break; @@ -79,10 +77,8 @@ static int handle_lctl(struct kvm_vcpu *vcpu) reg = reg1; do { rc = get_guest(vcpu, val, (u32 __user *) useraddr); - if (rc) { - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - break; - } + if (rc) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); vcpu->arch.sie_block->gcr[reg] &= 0xffffffff00000000ul; vcpu->arch.sie_block->gcr[reg] |= val; useraddr += 4; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index f241e3315ebb..d05a59c1eea7 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -633,8 +633,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) } else { VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction"); trace_kvm_s390_sie_fault(vcpu); - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - rc = 0; + rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); } } VCPU_EVENT(vcpu, 6, "exit sie icptcode %d", diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 4d89d64a8161..efc14f687265 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -110,12 +110,12 @@ enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer); void kvm_s390_tasklet(unsigned long parm); void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu); void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu); -int kvm_s390_inject_vm(struct kvm *kvm, - struct kvm_s390_interrupt *s390int); -int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, - struct kvm_s390_interrupt *s390int); -int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code); -int kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action); +int __must_check kvm_s390_inject_vm(struct kvm *kvm, + struct kvm_s390_interrupt *s390int); +int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, + struct kvm_s390_interrupt *s390int); +int __must_check kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code); +int __must_check kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action); struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, u64 cr6, u64 schid); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 05d186c21eca..23a8370b1045 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -36,31 +36,24 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu) operand2 = kvm_s390_get_base_disp_s(vcpu); 
/* must be word boundary */ - if (operand2 & 3) { - kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - goto out; - } + if (operand2 & 3) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); /* get the value */ - if (get_guest(vcpu, address, (u32 __user *) operand2)) { - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - goto out; - } + if (get_guest(vcpu, address, (u32 __user *) operand2)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); address = address & 0x7fffe000u; /* make sure that the new value is valid memory */ if (copy_from_guest_absolute(vcpu, &tmp, address, 1) || - (copy_from_guest_absolute(vcpu, &tmp, address + PAGE_SIZE, 1))) { - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - goto out; - } + (copy_from_guest_absolute(vcpu, &tmp, address + PAGE_SIZE, 1))) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); kvm_s390_set_prefix(vcpu, address); VCPU_EVENT(vcpu, 5, "setting prefix to %x", address); trace_kvm_s390_handle_prefix(vcpu, 1, address); -out: return 0; } @@ -74,49 +67,37 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu) operand2 = kvm_s390_get_base_disp_s(vcpu); /* must be word boundary */ - if (operand2 & 3) { - kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - goto out; - } + if (operand2 & 3) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); address = vcpu->arch.sie_block->prefix; address = address & 0x7fffe000u; /* get the value */ - if (put_guest(vcpu, address, (u32 __user *)operand2)) { - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - goto out; - } + if (put_guest(vcpu, address, (u32 __user *)operand2)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); VCPU_EVENT(vcpu, 5, "storing prefix to %x", address); trace_kvm_s390_handle_prefix(vcpu, 0, address); -out: return 0; } static int handle_store_cpu_address(struct kvm_vcpu *vcpu) { u64 useraddr; - int rc; vcpu->stat.instruction_stap++; useraddr = kvm_s390_get_base_disp_s(vcpu); - if (useraddr & 1) { - kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - goto out; - } + if (useraddr & 1) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - rc = put_guest(vcpu, vcpu->vcpu_id, (u16 __user *)useraddr); - if (rc) { - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - goto out; - } + if (put_guest(vcpu, vcpu->vcpu_id, (u16 __user *)useraddr)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); VCPU_EVENT(vcpu, 5, "storing cpu address to %llx", useraddr); trace_kvm_s390_handle_stap(vcpu, useraddr); -out: return 0; } @@ -135,10 +116,8 @@ static int handle_tpi(struct kvm_vcpu *vcpu) int cc; addr = kvm_s390_get_base_disp_s(vcpu); - if (addr & 3) { - kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - goto out; - } + if (addr & 3) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); cc = 0; inti = kvm_s390_get_io_int(vcpu->kvm, vcpu->run->s.regs.crs[6], 0); if (!inti) @@ -167,7 +146,6 @@ no_interrupt: /* Set condition code and we're done. 
*/ vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); vcpu->arch.sie_block->gpsw.mask |= (cc & 3ul) << 44; -out: return 0; } @@ -237,12 +215,9 @@ static int handle_stfl(struct kvm_vcpu *vcpu) rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list), &facility_list, sizeof(facility_list)); if (rc) - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - else { - VCPU_EVENT(vcpu, 5, "store facility list value %x", - facility_list); - trace_kvm_s390_handle_stfl(vcpu, facility_list); - } + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + VCPU_EVENT(vcpu, 5, "store facility list value %x", facility_list); + trace_kvm_s390_handle_stfl(vcpu, facility_list); return 0; } @@ -317,25 +292,18 @@ static int handle_lpswe(struct kvm_vcpu *vcpu) static int handle_stidp(struct kvm_vcpu *vcpu) { u64 operand2; - int rc; vcpu->stat.instruction_stidp++; operand2 = kvm_s390_get_base_disp_s(vcpu); - if (operand2 & 7) { - kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - goto out; - } + if (operand2 & 7) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - rc = put_guest(vcpu, vcpu->arch.stidp_data, (u64 __user *)operand2); - if (rc) { - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - goto out; - } + if (put_guest(vcpu, vcpu->arch.stidp_data, (u64 __user *)operand2)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); VCPU_EVENT(vcpu, 5, "%s", "store cpu id"); -out: return 0; } @@ -377,6 +345,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu) int sel2 = vcpu->run->s.regs.gprs[1] & 0xffff; u64 operand2; unsigned long mem; + int rc = 0; vcpu->stat.instruction_stsi++; VCPU_EVENT(vcpu, 4, "stsi: fc: %x sel1: %x sel2: %x", fc, sel1, sel2); @@ -412,7 +381,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu) } if (copy_to_guest_absolute(vcpu, operand2, (void *) mem, PAGE_SIZE)) { - kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); goto out_mem; } trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2); @@ -425,7 +394,7 @@ out_mem: out_fail: /* condition code 3 */ vcpu->arch.sie_block->gpsw.mask |= 3ul << 44; - return 0; + return rc; } static const intercept_handler_t b2_handlers[256] = { From c51f068c23c76a86d427260b8219430ee6f99516 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 25 Mar 2013 17:22:54 +0100 Subject: [PATCH 067/204] KVM: s390: fix stsi exception handling In case of an exception the guest psw condition code should be left alone. 
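Concretely, the condition code sits in bits 18-19 of the guest psw, which the 64-bit mask word addresses as a two-bit field at bit 44. An illustrative summary of the convention used throughout the diff below:

/*
 * Illustrative: cc manipulation in the 64-bit psw mask word.
 *
 *      set cc 3:  gpsw.mask |= 3ul << 44;
 *      clear cc:  gpsw.mask &= ~(3ul << 44);
 *
 * After this fix, the exception path performs neither operation, so a
 * failed copy_to_guest_absolute() leaves the guest's cc untouched.
 */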
Signed-off-by: Heiko Carstens Acked-By: Cornelia Huck Signed-off-by: Cornelia Huck Signed-off-by: Gleb Natapov --- arch/s390/kvm/priv.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 23a8370b1045..de1b1b6128e1 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -343,8 +343,8 @@ static int handle_stsi(struct kvm_vcpu *vcpu) int fc = (vcpu->run->s.regs.gprs[0] & 0xf0000000) >> 28; int sel1 = vcpu->run->s.regs.gprs[0] & 0xff; int sel2 = vcpu->run->s.regs.gprs[1] & 0xffff; + unsigned long mem = 0; u64 operand2; - unsigned long mem; int rc = 0; vcpu->stat.instruction_stsi++; @@ -364,36 +364,36 @@ static int handle_stsi(struct kvm_vcpu *vcpu) case 2: mem = get_zeroed_page(GFP_KERNEL); if (!mem) - goto out_fail; + goto out_no_data; if (stsi((void *) mem, fc, sel1, sel2)) - goto out_mem; + goto out_no_data; break; case 3: if (sel1 != 2 || sel2 != 2) - goto out_fail; + goto out_no_data; mem = get_zeroed_page(GFP_KERNEL); if (!mem) - goto out_fail; + goto out_no_data; handle_stsi_3_2_2(vcpu, (void *) mem); break; default: - goto out_fail; + goto out_no_data; } if (copy_to_guest_absolute(vcpu, operand2, (void *) mem, PAGE_SIZE)) { rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - goto out_mem; + goto out_exception; } trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2); free_page(mem); vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); vcpu->run->s.regs.gprs[0] = 0; return 0; -out_mem: - free_page(mem); -out_fail: +out_no_data: /* condition code 3 */ vcpu->arch.sie_block->gpsw.mask |= 3ul << 44; +out_exception: + free_page(mem); return rc; } From b13b5dc7c96d40ebdadbdb752a92ecde5a9f2914 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 25 Mar 2013 17:22:55 +0100 Subject: [PATCH 068/204] KVM: s390: fix compile with !CONFIG_COMPAT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit arch/s390/kvm/priv.c should include both linux/compat.h and asm/compat.h. Fixes this one: In file included from arch/s390/kvm/priv.c:23:0: arch/s390/include/asm/compat.h: In function ‘arch_compat_alloc_user_space’: arch/s390/include/asm/compat.h:258:2: error: implicit declaration of function ‘is_compat_task’ Signed-off-by: Heiko Carstens Signed-off-by: Cornelia Huck Signed-off-by: Gleb Natapov --- arch/s390/kvm/priv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index de1b1b6128e1..6bbd7b5a0bbe 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include From 3188bf6b809ba5e7df7b9f000634f08e8abbb76a Mon Sep 17 00:00:00 2001 From: Nick Wang Date: Mon, 25 Mar 2013 17:22:56 +0100 Subject: [PATCH 069/204] KVM: s390: Change the virtual memory mapping location for virtio devices The current location for mapping virtio devices does not take into consideration the standby memory. This causes the failure of mapping standby memory since the location for the mapping is already taken by the virtio devices. To fix the problem, we move the location to beyond the end of standby memory. 
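The new upper bound comes straight from SCLP. A worked example of the computation used in the diff below (the concrete sizes are assumptions for illustration):

/*
 * Illustrative: sclp_get_rzm() returns the storage-increment size and
 * sclp_get_rnmax() the maximum number of increments, so
 *
 *      total_memory_size = sclp_get_rzm() * sclp_get_rnmax();
 *
 * e.g. an increment size of 256MB with at most 16 increments puts the
 * device page at 4GB, above any standby memory that could come online.
 */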
Signed-off-by: Nick Wang Reviewed-by: Christian Borntraeger Signed-off-by: Cornelia Huck Signed-off-by: Gleb Natapov --- drivers/s390/kvm/kvm_virtio.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c index 6711e65764b5..2ea6165366b6 100644 --- a/drivers/s390/kvm/kvm_virtio.c +++ b/drivers/s390/kvm/kvm_virtio.c @@ -443,29 +443,30 @@ static int __init test_devices_support(unsigned long addr) } /* * Init function for virtio - * devices are in a single page above top of "normal" mem + * devices are in a single page above top of "normal" + standby mem */ static int __init kvm_devices_init(void) { int rc; + unsigned long total_memory_size = sclp_get_rzm() * sclp_get_rnmax(); if (!MACHINE_IS_KVM) return -ENODEV; - if (test_devices_support(real_memory_size) < 0) + if (test_devices_support(total_memory_size) < 0) return -ENODEV; - rc = vmem_add_mapping(real_memory_size, PAGE_SIZE); + rc = vmem_add_mapping(total_memory_size, PAGE_SIZE); if (rc) return rc; - kvm_devices = (void *) real_memory_size; + kvm_devices = (void *) total_memory_size; kvm_root = root_device_register("kvm_s390"); if (IS_ERR(kvm_root)) { rc = PTR_ERR(kvm_root); printk(KERN_ERR "Could not register kvm_s390 root device"); - vmem_remove_mapping(real_memory_size, PAGE_SIZE); + vmem_remove_mapping(total_memory_size, PAGE_SIZE); return rc; } From dd2887e7c36d0be986ef17a9dbec904e3e334566 Mon Sep 17 00:00:00 2001 From: Nick Wang Date: Mon, 25 Mar 2013 17:22:57 +0100 Subject: [PATCH 070/204] KVM: s390: Remove the sanity checks for kvm memory slot To model the standby memory with memory_region_add_subregion and friends, the guest would have one or more regions of RAM. Remove the check allowing only one memory slot and the check requiring that the real address of the memory slot start at zero. Signed-off-by: Nick Wang Signed-off-by: Cornelia Huck Signed-off-by: Gleb Natapov --- arch/s390/kvm/kvm-s390.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d05a59c1eea7..b322ff15751d 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -977,18 +977,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { - /* A few sanity checks. We can have exactly one memory slot which has to start at guest virtual zero and which has to be located at a page boundary in userland and which has to end at a page boundary. The memory in userland is ok to be fragmented into various different vmas. It is okay to mmap() and munmap() stuff in this slot after doing this call at any time */ - - if (mem->slot) - return -EINVAL; - - if (mem->guest_phys_addr) - return -EINVAL; + /* A few sanity checks. We can have memory slots which have to be located/ended at a segment boundary (1MB). The memory in userland is ok to be fragmented into various different vmas. It is okay to mmap() and munmap() stuff in this slot after doing this call at any time */ if (mem->userspace_addr & 0xffffful) return -EINVAL; From e1e2e605c2ad6791ce6346b22443ce611709fa65 Mon Sep 17 00:00:00 2001 From: Nick Wang Date: Mon, 25 Mar 2013 17:22:58 +0100 Subject: [PATCH 071/204] KVM: s390: Enable KVM_CAP_NR_MEMSLOTS on s390 Return KVM_USER_MEM_SLOTS in kvm_dev_ioctl_check_extension().
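Userspace can now size its memslot bookkeeping from the capability instead of hardcoding a limit. A hypothetical probe (kvm_fd, an open /dev/kvm descriptor, and the fallback value are assumptions):

/* Illustrative fragment: query the slot count. */
int nr_slots = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_NR_MEMSLOTS);

if (nr_slots <= 0)
        nr_slots = 32;  /* capability absent: fall back conservatively */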
Signed-off-by: Nick Wang Reviewed-by: Christian Borntraeger Signed-off-by: Cornelia Huck Signed-off-by: Gleb Natapov --- arch/s390/kvm/kvm-s390.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index b322ff15751d..c1c7c683fa26 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -149,6 +149,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; + case KVM_CAP_NR_MEMSLOTS: + r = KVM_USER_MEM_SLOTS; + break; case KVM_CAP_S390_COW: r = MACHINE_HAS_ESOP; break; From afd80d85aefac27e6e2f9dc10f60515357c504d2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 28 Mar 2013 17:18:35 +0100 Subject: [PATCH 072/204] pmu: prepare for migration support In order to migrate the PMU state correctly, we need to restore the values of MSR_CORE_PERF_GLOBAL_STATUS (a read-only register) and MSR_CORE_PERF_GLOBAL_OVF_CTRL (which has side effects when written). We also need to write the full 40-bit value of the performance counter, which would only be possible with a v3 architectural PMU's full-width counter MSRs. To distinguish host-initiated writes from the guest's, pass the full struct msr_data to kvm_pmu_set_msr. Signed-off-by: Paolo Bonzini Signed-off-by: Gleb Natapov --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/pmu.c | 14 +++++++++++--- arch/x86/kvm/x86.c | 4 ++-- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b5a64621d5af..3dd84c996d56 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1030,7 +1030,7 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu); void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu); bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr); int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data); -int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); void kvm_handle_pmu_event(struct kvm_vcpu *vcpu); void kvm_deliver_pmi(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index cfc258a6bf97..c53e797e7369 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -360,10 +360,12 @@ int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) return 1; } -int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) +int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct kvm_pmu *pmu = &vcpu->arch.pmu; struct kvm_pmc *pmc; + u32 index = msr_info->index; + u64 data = msr_info->data; switch (index) { case MSR_CORE_PERF_FIXED_CTR_CTRL: @@ -375,6 +377,10 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) } break; case MSR_CORE_PERF_GLOBAL_STATUS: + if (msr_info->host_initiated) { + pmu->global_status = data; + return 0; + } break; /* RO MSR */ case MSR_CORE_PERF_GLOBAL_CTRL: if (pmu->global_ctrl == data) @@ -386,7 +392,8 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) break; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) { - pmu->global_status &= ~data; + if (!msr_info->host_initiated) + pmu->global_status &= ~data; pmu->global_ovf_ctrl = data; return 0; } @@ -394,7 +401,8 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) default: if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) || (pmc = get_fixed_pmc(pmu, index))) { - data = (s64)(s32)data; + if 
(!msr_info->host_initiated) + data = (s64)(s32)data; pmc->counter += data - read_pmc(pmc); return 0; } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2aaba814f1c8..78c6f90a60cc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2040,7 +2040,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_P6_EVNTSEL0: case MSR_P6_EVNTSEL1: if (kvm_pmu_msr(vcpu, msr)) - return kvm_pmu_set_msr(vcpu, msr, data); + return kvm_pmu_set_msr(vcpu, msr_info); if (pr || data != 0) vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " @@ -2086,7 +2086,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); if (kvm_pmu_msr(vcpu, msr)) - return kvm_pmu_set_msr(vcpu, msr, data); + return kvm_pmu_set_msr(vcpu, msr_info); if (!ignore_msrs) { vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); From 450e0b411f56db9839c9725d4d49e70e24f87ba3 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 29 Mar 2013 14:05:26 +0900 Subject: [PATCH 073/204] Revert "KVM: MMU: Move kvm_mmu_free_some_pages() into kvm_mmu_alloc_page()" With the following commit, shadow pages can be zapped at random during a shadow page table walk: KVM: MMU: Move kvm_mmu_free_some_pages() into kvm_mmu_alloc_page() 7ddca7e43c8f28f9419da81a0e7730b66aa60fe9 This patch reverts it and fixes __direct_map() and FNAME(fetch)(). Signed-off-by: Takuya Yoshikawa Signed-off-by: Gleb Natapov --- arch/x86/kvm/mmu.c | 11 +++++++---- arch/x86/kvm/paging_tmpl.h | 1 + 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 633e30cfbd63..004cc87b781c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1501,15 +1501,11 @@ static void drop_parent_pte(struct kvm_mmu_page *sp, mmu_spte_clear_no_track(parent_pte); } -static void make_mmu_pages_available(struct kvm_vcpu *vcpu); - static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte, int direct) { struct kvm_mmu_page *sp; - make_mmu_pages_available(vcpu); - sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); if (!direct) @@ -2806,6 +2802,7 @@ exit: static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, pfn_t *pfn, bool write, bool *writable); +static void make_mmu_pages_available(struct kvm_vcpu *vcpu); static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, gfn_t gfn, bool prefault) @@ -2847,6 +2844,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; + make_mmu_pages_available(vcpu); if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, @@ -2924,6 +2922,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { spin_lock(&vcpu->kvm->mmu_lock); + make_mmu_pages_available(vcpu); sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, 1, ACC_ALL, NULL); ++sp->root_count; @@ -2935,6 +2934,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); + make_mmu_pages_available(vcpu); sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), i << 30, PT32_ROOT_LEVEL, 1,
ACC_ALL, @@ -2973,6 +2973,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); + make_mmu_pages_available(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, 0, ACC_ALL, NULL); root = __pa(sp->spt); @@ -3006,6 +3007,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) return 1; } spin_lock(&vcpu->kvm->mmu_lock); + make_mmu_pages_available(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, 0, ACC_ALL, NULL); @@ -3311,6 +3313,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; + make_mmu_pages_available(vcpu); if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); r = __direct_map(vcpu, gpa, write, map_writable, diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index af143f065532..da20860b457a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -627,6 +627,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); + make_mmu_pages_available(vcpu); if (!force_pt_level) transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); r = FNAME(fetch)(vcpu, addr, &walker, write_fault, From 99437a2782730ec8c7e6cfebb6143d00b091e4a8 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Thu, 4 Apr 2013 10:25:06 +0200 Subject: [PATCH 074/204] KVM: s390: virtio_ccw: reset errors for new I/O. ccw_io_helper neglected to reset vcdev->err after a new channel program had been successfully started, resulting in stale errors delivered after one I/O failed. Reset the error after a new channel program has been successfully started with no old I/O pending. Signed-off-by: Cornelia Huck Signed-off-by: Gleb Natapov --- drivers/s390/kvm/virtio_ccw.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/s390/kvm/virtio_ccw.c b/drivers/s390/kvm/virtio_ccw.c index 42d507c4e06b..5948f2a7b080 100644 --- a/drivers/s390/kvm/virtio_ccw.c +++ b/drivers/s390/kvm/virtio_ccw.c @@ -133,8 +133,11 @@ static int ccw_io_helper(struct virtio_ccw_device *vcdev, do { spin_lock_irqsave(get_ccwdev_lock(vcdev->cdev), flags); ret = ccw_device_start(vcdev->cdev, ccw, intparm, 0, 0); - if (!ret) + if (!ret) { + if (!vcdev->curr_io) + vcdev->err = 0; vcdev->curr_io |= flag; + } spin_unlock_irqrestore(get_ccwdev_lock(vcdev->cdev), flags); cpu_relax(); } while (ret == -EBUSY); From 44944d4d28948c71b110b09a2e924e505cd39e8b Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Sun, 7 Apr 2013 08:25:18 +0800 Subject: [PATCH 075/204] KVM: Call kvm_apic_match_dest() to check destination vcpu For a given vcpu, kvm_apic_match_dest() will tell you quickly whether the vcpu is in the destination list. Drop kvm_calculate_eoi_exitmap() and use kvm_apic_match_dest() instead.
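The simplification rests on kvm_apic_match_dest() answering, for one vcpu at a time, whether a redirection-table entry targets it. A sketch of the resulting per-entry check, mirroring the ioapic.c hunk below (e is the redirection-table entry):

	/* No source APIC, no shorthand: match dest_id/dest_mode against
	 * this vcpu and record the vector on a hit. */
	if (kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id,
				e->fields.dest_mode))
		__set_bit(e->fields.vector, (unsigned long *)eoi_exit_bitmap);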
Signed-off-by: Yang Zhang Signed-off-by: Gleb Natapov --- arch/x86/kvm/lapic.c | 47 -------------------------------------------- arch/x86/kvm/lapic.h | 4 ---- virt/kvm/ioapic.c | 9 +++------ 3 files changed, 3 insertions(+), 57 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a8e9369f41c5..e2274745ab3d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -145,53 +145,6 @@ static inline int kvm_apic_id(struct kvm_lapic *apic) return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; } -void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, - struct kvm_lapic_irq *irq, - u64 *eoi_exit_bitmap) -{ - struct kvm_lapic **dst; - struct kvm_apic_map *map; - unsigned long bitmap = 1; - int i; - - rcu_read_lock(); - map = rcu_dereference(vcpu->kvm->arch.apic_map); - - if (unlikely(!map)) { - __set_bit(irq->vector, (unsigned long *)eoi_exit_bitmap); - goto out; - } - - if (irq->dest_mode == 0) { /* physical mode */ - if (irq->delivery_mode == APIC_DM_LOWEST || - irq->dest_id == 0xff) { - __set_bit(irq->vector, - (unsigned long *)eoi_exit_bitmap); - goto out; - } - dst = &map->phys_map[irq->dest_id & 0xff]; - } else { - u32 mda = irq->dest_id << (32 - map->ldr_bits); - - dst = map->logical_map[apic_cluster_id(map, mda)]; - - bitmap = apic_logical_id(map, mda); - } - - for_each_set_bit(i, &bitmap, 16) { - if (!dst[i]) - continue; - if (dst[i]->vcpu == vcpu) { - __set_bit(irq->vector, - (unsigned long *)eoi_exit_bitmap); - break; - } - } - -out: - rcu_read_unlock(); -} - static void recalculate_apic_map(struct kvm *kvm) { struct kvm_apic_map *new, *old = NULL; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 2c721b986eec..baa20cfb95ae 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -160,10 +160,6 @@ static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr) return ldr & map->lid_mask; } -void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, - struct kvm_lapic_irq *irq, - u64 *eoi_bitmap); - static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) { return vcpu->arch.apic->pending_events; diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 5ba005c00e2f..914cbe027d06 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -124,7 +124,6 @@ void kvm_ioapic_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, { struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; union kvm_ioapic_redirect_entry *e; - struct kvm_lapic_irq irqe; int index; spin_lock(&ioapic->lock); @@ -135,11 +134,9 @@ void kvm_ioapic_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, (e->fields.trig_mode == IOAPIC_LEVEL_TRIG || kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index))) { - irqe.dest_id = e->fields.dest_id; - irqe.vector = e->fields.vector; - irqe.dest_mode = e->fields.dest_mode; - irqe.delivery_mode = e->fields.delivery_mode << 8; - kvm_calculate_eoi_exitmap(vcpu, &irqe, eoi_exit_bitmap); + if (kvm_apic_match_dest(vcpu, NULL, 0, + e->fields.dest_id, e->fields.dest_mode)) + __set_bit(e->fields.vector, (unsigned long *)eoi_exit_bitmap); } } spin_unlock(&ioapic->lock); From b8c07d55d010702eff61562cf9a77366833d9da2 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sat, 6 Apr 2013 13:51:21 +0200 Subject: [PATCH 076/204] KVM: nVMX: Check exit control for VM_EXIT_SAVE_IA32_PAT, not entry controls Obviously a copy&paste mistake: prepare_vmcs12 has to check L1's exit controls for VM_EXIT_SAVE_IA32_PAT. 
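VM_EXIT_SAVE_IA32_PAT is defined as a bit in the VM-exit controls, so testing it against vm_entry_controls silently evaluated an unrelated entry-control bit. The corrected guard, as a sketch of the one-line hunk below:

	/* Save L2's PAT only if L1 asked for it in its VM-exit controls. */
	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
		vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);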
Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 03f574641852..b67f35475b09 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7402,7 +7402,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) /* TODO: These cannot have changed unless we have MSR bitmaps and * the relevant bit asks not to trap the change */ vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); - if (vmcs12->vm_entry_controls & VM_EXIT_SAVE_IA32_PAT) + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); From 05e07f9bdb0ea3c1c52029e6c4db77c9f7c92a5c Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 4 Apr 2013 13:27:21 +0300 Subject: [PATCH 077/204] kvm: fix MMIO/PIO collision misdetection PIO and MMIO are separate address spaces, but ioeventfd registration code mistakenly detected two eventfds as duplicate if they use the same address, even if one is PIO and another one MMIO. Reviewed-by: Paolo Bonzini Signed-off-by: Michael S. Tsirkin Signed-off-by: Gleb Natapov --- virt/kvm/eventfd.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 020522ed9094..48790989f8d2 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -574,6 +574,7 @@ struct _ioeventfd { struct eventfd_ctx *eventfd; u64 datamatch; struct kvm_io_device dev; + u8 bus_idx; bool wildcard; }; @@ -666,7 +667,8 @@ ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p) struct _ioeventfd *_p; list_for_each_entry(_p, &kvm->ioeventfds, list) - if (_p->addr == p->addr && _p->length == p->length && + if (_p->bus_idx == p->bus_idx && + _p->addr == p->addr && _p->length == p->length && (_p->wildcard || p->wildcard || _p->datamatch == p->datamatch)) return true; @@ -723,6 +725,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) INIT_LIST_HEAD(&p->list); p->addr = args->addr; + p->bus_idx = bus_idx; p->length = args->len; p->eventfd = eventfd; @@ -781,7 +784,8 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) { bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH); - if (p->eventfd != eventfd || + if (p->bus_idx != bus_idx || + p->eventfd != eventfd || p->addr != args->addr || p->length != args->len || p->wildcard != wildcard) From 458f212e36b291067b74c0cac2bdcf8278817ee7 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Mon, 8 Apr 2013 15:26:33 +0800 Subject: [PATCH 078/204] KVM: x86: fix memory leak in vmx_init Free vmx_msr_bitmap_longmode_x2apic and vmx_msr_bitmap_longmode if kvm_init() fails. 
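The fix follows the usual kernel unwind idiom: each allocation gets its own label, and any later failure jumps to the label that frees everything allocated so far. Schematically (a non-runnable sketch of the error path, unrelated steps elided):

	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
		     __alignof__(struct vcpu_vmx), THIS_MODULE);
	if (r)
		goto out5;	/* jumping to out3 here leaked two pages */
	/* ... */
out5:
	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
out4:
	free_page((unsigned long)vmx_msr_bitmap_longmode);
out3:
	/* ... earlier allocations are released from here on ... */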
Signed-off-by: Yang Zhang Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b67f35475b09..1cf202cb47d5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7741,7 +7741,7 @@ static int __init vmx_init(void) r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), THIS_MODULE); if (r) - goto out3; + goto out5; #ifdef CONFIG_KEXEC rcu_assign_pointer(crash_vmclear_loaded_vmcss, @@ -7789,6 +7789,8 @@ static int __init vmx_init(void) return 0; +out5: + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); out4: free_page((unsigned long)vmx_msr_bitmap_longmode); out3: From a63cb56061239372fce2452dbedd35c95bd665aa Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 8 Apr 2013 11:07:46 +0200 Subject: [PATCH 079/204] KVM: VMX: Add missing braces to avoid redundant error check The code was already properly aligned, now also add the braces to avoid that err is checked even if alloc_apic_access_page didn't run and change it. Found via Coccinelle by Fengguang Wu. Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1cf202cb47d5..669b80378731 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6785,10 +6785,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) put_cpu(); if (err) goto free_vmcs; - if (vm_need_virtualize_apic_accesses(kvm)) + if (vm_need_virtualize_apic_accesses(kvm)) { err = alloc_apic_access_page(kvm); if (err) goto free_vmcs; + } if (enable_ept) { if (!kvm->arch.ept_identity_map_addr) From fc1b74925f87f6aca5432eb73f6a57eff30afde7 Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Fri, 5 Apr 2013 19:20:30 +0000 Subject: [PATCH 080/204] KVM: Move vm_list kvm_lock declarations out of x86 The variables vm_list and kvm_lock are common to all architectures, so move the declarations from arch/x86/include/asm/kvm_host.h to include/linux/kvm_host.h. Fixes sparse warnings like these when building for arm64: virt/kvm/kvm_main.c: warning: symbol 'kvm_lock' was not declared. Should it be static? virt/kvm/kvm_main.c: warning: symbol 'vm_list' was not declared. Should it be static? 
Signed-off-by: Geoff Levand Signed-off-by: Gleb Natapov --- arch/x86/include/asm/kvm_host.h | 3 --- include/linux/kvm_host.h | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3dd84c996d56..628163c0c6e4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -94,9 +94,6 @@ #define ASYNC_PF_PER_VCPU 64 -extern raw_spinlock_t kvm_lock; -extern struct list_head vm_list; - struct kvm_vcpu; struct kvm; struct kvm_async_pf; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1c0be23f874d..71fed38c7200 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -135,6 +135,9 @@ struct kvm; struct kvm_vcpu; extern struct kmem_cache *kvm_vcpu_cache; +extern raw_spinlock_t kvm_lock; +extern struct list_head vm_list; + struct kvm_io_range { gpa_t addr; int len; From 39369f7a8b7314b8f7860c880a2198c11cebdf5a Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Fri, 5 Apr 2013 19:20:30 +0000 Subject: [PATCH 081/204] KVM: Make local routines static The routines get_user_page_nowait(), kvm_io_bus_sort_cmp(), kvm_io_bus_insert_dev() and kvm_io_bus_get_first_dev() are only referenced within kvm_main.c, so give them static linkage. Fixes sparse warnings like these: virt/kvm/kvm_main.c: warning: symbol 'get_user_page_nowait' was not declared. Should it be static? Signed-off-by: Geoff Levand Signed-off-by: Gleb Natapov --- virt/kvm/kvm_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ff7154188b5f..9ad98769dd75 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1078,7 +1078,7 @@ static int kvm_read_hva_atomic(void *data, void __user *hva, int len) return __copy_from_user_inatomic(data, hva, len); } -int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, +static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int write, struct page **page) { int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET; @@ -2612,7 +2612,7 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus) kfree(bus); } -int kvm_io_bus_sort_cmp(const void *p1, const void *p2) +static int kvm_io_bus_sort_cmp(const void *p1, const void *p2) { const struct kvm_io_range *r1 = p1; const struct kvm_io_range *r2 = p2; @@ -2624,7 +2624,7 @@ int kvm_io_bus_sort_cmp(const void *p1, const void *p2) return 0; } -int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev, +static int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev, gpa_t addr, int len) { bus->range[bus->dev_count++] = (struct kvm_io_range) { @@ -2639,7 +2639,7 @@ int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev, return 0; } -int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, +static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, gpa_t addr, int len) { struct kvm_io_range *range, key; From e3ba45b8041740f4ab8bbba3c6239876661aeed6 Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Fri, 5 Apr 2013 19:20:30 +0000 Subject: [PATCH 082/204] KVM: Move kvm_spurious_fault to x86.c The routine kvm_spurious_fault() is an x86 specific routine, so move it from virt/kvm/kvm_main.c to arch/x86/kvm/x86.c. Fixes this sparse warning when building on arm64: virt/kvm/kvm_main.c:warning: symbol 'kvm_spurious_fault' was not declared. Should it be static? 
Signed-off-by: Geoff Levand Signed-off-by: Gleb Natapov --- arch/x86/kvm/x86.c | 7 +++++++ virt/kvm/kvm_main.c | 8 -------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 78c6f90a60cc..eb9927e0af11 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -261,6 +261,13 @@ void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) } EXPORT_SYMBOL_GPL(kvm_set_apic_base); +asmlinkage void kvm_spurious_fault(void) +{ + /* Fault while not rebooting. We want the trace. */ + BUG(); +} +EXPORT_SYMBOL_GPL(kvm_spurious_fault); + #define EXCPT_BENIGN 0 #define EXCPT_CONTRIBUTORY 1 #define EXCPT_PF 2 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9ad98769dd75..5cc53c907d3b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2572,14 +2572,6 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, return NOTIFY_OK; } - -asmlinkage void kvm_spurious_fault(void) -{ - /* Fault while not rebooting. We want the trace. */ - BUG(); -} -EXPORT_SYMBOL_GPL(kvm_spurious_fault); - static int kvm_reboot(struct notifier_block *notifier, unsigned long val, void *v) { From 8b415dcd762607379cf0a69c9dd25940da1d174e Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Fri, 5 Apr 2013 19:20:30 +0000 Subject: [PATCH 083/204] KVM: Move kvm_rebooting declaration out of x86 The variable kvm_rebooting is a common kvm variable, so move its declaration from arch/x86/include/asm/kvm_host.h to include/asm/kvm_host.h. Fixes this sparse warning when building on arm64: virt/kvm/kvm_main.c:warning: symbol 'kvm_rebooting' was not declared. Should it be static? Signed-off-by: Geoff Levand Signed-off-by: Gleb Natapov --- arch/x86/include/asm/kvm_host.h | 1 - include/linux/kvm_host.h | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 628163c0c6e4..b2c7263c93b6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -972,7 +972,6 @@ enum { * Trap the fault and ignore the instruction if that happens. */ asmlinkage void kvm_spurious_fault(void); -extern bool kvm_rebooting; #define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \ "666: " insn "\n\t" \ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 71fed38c7200..20d77d24d764 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1061,6 +1061,8 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) } } +extern bool kvm_rebooting; + #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) From f8da94e9e44b237fa5cc8521faeb714dc2e83b54 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 11 Apr 2013 14:06:03 +0200 Subject: [PATCH 084/204] KVM: x86 emulator: Fix segment loading in VM86 This fixes a regression introduced in commit 03ebebeb1 ("KVM: x86 emulator: Leave segment limit and attributs alone in real mode"). The mentioned commit changed the segment descriptors for both real mode and VM86 to only update the segment base instead of creating a completely new descriptor with limit 0xffff so that unreal mode keeps working across a segment register reload. 
This leads to an invalid segment descriptor in the eyes of VMX, which seems to be okay for real mode because KVM will fix it up before the next VM entry or emulate the state, but it doesn't do this if the guest is in VM86, so we end up with: KVM: entry failed, hardware error 0x80000021 Fix this by effectively reverting commit 03ebebeb1 for VM86 and leaving it only in place for real mode, which is where it's really needed. Signed-off-by: Kevin Wolf Signed-off-by: Gleb Natapov --- arch/x86/kvm/emulate.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a335cc6cde72..069d79926e2b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1578,12 +1578,21 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, memset(&seg_desc, 0, sizeof seg_desc); - if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) - || ctxt->mode == X86EMUL_MODE_REAL) { - /* set real mode segment descriptor */ + if (ctxt->mode == X86EMUL_MODE_REAL) { + /* set real mode segment descriptor (keep limit etc. for + * unreal mode) */ ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg); set_desc_base(&seg_desc, selector << 4); goto load; + } else if (seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) { + /* VM86 needs a clean new segment descriptor */ + set_desc_base(&seg_desc, selector << 4); + set_desc_limit(&seg_desc, 0xffff); + seg_desc.type = 3; + seg_desc.p = 1; + seg_desc.s = 1; + seg_desc.dpl = 3; + goto load; } rpl = selector & 3; From 0b789eee2c0204da83278f181428560faf6efefb Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 11 Apr 2013 11:59:55 +0300 Subject: [PATCH 085/204] KVM: emulator: fix unimplemented instruction detection Unimplemented instruction detection is broken for group instructions since it relies on "flags" field of opcode to be zero, but all instructions in a group inherit flags from a group encoding. Fix that by having a separate flag for unimplemented instructions. Signed-off-by: Gleb Natapov --- arch/x86/kvm/emulate.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 069d79926e2b..ab6fda4eb98f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -132,8 +132,9 @@ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) #define PageTable (1 << 29) /* instruction used to write page table */ +#define NotImpl (1 << 30) /* instruction is not implemented */ /* Source 2 operand type */ -#define Src2Shift (30) +#define Src2Shift (31) #define Src2None (OpNone << Src2Shift) #define Src2CL (OpCL << Src2Shift) #define Src2ImmByte (OpImmByte << Src2Shift) @@ -3624,7 +3625,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) #define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i } #define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \ .check_perm = (_p) } -#define N D(0) +#define N D(NotImpl) #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } #define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) } @@ -4382,7 +4383,7 @@ done_prefixes: ctxt->intercept = opcode.intercept; /* Unrecognised? 
*/ - if (ctxt->d == 0 || (ctxt->d & Undefined)) + if (ctxt->d == 0 || (ctxt->d & NotImpl) || (ctxt->d & Undefined)) return EMULATION_FAILED; if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) From 991eebf9f8e523e7ff1e4d31ac80641582b2e57a Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 11 Apr 2013 12:10:51 +0300 Subject: [PATCH 086/204] KVM: VMX: do not try to reexecute failed instruction while emulating invalid guest state During invalid guest state emulation the vcpu cannot enter guest mode to try to reexecute an instruction that the emulator failed to emulate, so emulation will happen again and again. Prevent that by telling the emulator that instruction reexecution should not be attempted. Signed-off-by: Gleb Natapov --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 13 +++++++++---- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b2c7263c93b6..82f1dc67782f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -795,6 +795,7 @@ enum emulation_result { #define EMULTYPE_TRAP_UD (1 << 1) #define EMULTYPE_SKIP (1 << 2) #define EMULTYPE_RETRY (1 << 3) +#define EMULTYPE_NO_REEXECUTE (1 << 4) int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, int emulation_type, void *insn, int insn_len); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 669b80378731..d2686771481f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5189,7 +5189,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) return 1; - err = emulate_instruction(vcpu, 0); + err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); if (err == EMULATE_DO_MMIO) { ret = 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index eb9927e0af11..999d1243a6ce 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4765,11 +4765,15 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu) } static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, - bool write_fault_to_shadow_pgtable) + bool write_fault_to_shadow_pgtable, + int emulation_type) { gpa_t gpa = cr2; pfn_t pfn; + if (emulation_type & EMULTYPE_NO_REEXECUTE) + return false; + if (!vcpu->arch.mmu.direct_map) { /* * Write permission should be allowed since only @@ -4912,8 +4916,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, if (r != EMULATION_OK) { if (emulation_type & EMULTYPE_TRAP_UD) return EMULATE_FAIL; - if (reexecute_instruction(vcpu, cr2, - write_fault_to_spt)) + if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, + emulation_type)) return EMULATE_DONE; if (emulation_type & EMULTYPE_SKIP) return EMULATE_FAIL; @@ -4943,7 +4947,8 @@ restart: return EMULATE_DONE; if (r == EMULATION_FAILED) { - if (reexecute_instruction(vcpu, cr2, write_fault_to_spt)) + if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, + emulation_type)) return EMULATE_DONE; return handle_emulation_failure(vcpu); From 1146a78b8d88fc0e0ca2ab9c549821242e289432 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 11 Apr 2013 12:30:01 +0300 Subject: [PATCH 087/204] KVM: emulator: Do not fail on emulation of undefined opcode Emulation of an undefined opcode should inject #UD instead of causing an emulation failure. Do that by moving the Undefined flag check to the emulation stage and injecting #UD there.
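Schematically, the decode stage now rejects only genuinely unimplemented opcodes, while architecturally undefined ones reach the execution stage and inject #UD into the guest; a sketch combining the two hunks below:

	/* decode: only a missing entry or NotImpl aborts emulation */
	if (ctxt->d == 0 || (ctxt->d & NotImpl))
		return EMULATION_FAILED;

	/* execute: Undefined opcodes raise #UD, like real hardware */
	if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
	    (ctxt->d & Undefined)) {
		rc = emulate_ud(ctxt);
		goto done;
	}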
Signed-off-by: Gleb Natapov --- arch/x86/kvm/emulate.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ab6fda4eb98f..77b56d5be3e1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4383,7 +4383,7 @@ done_prefixes: ctxt->intercept = opcode.intercept; /* Unrecognised? */ - if (ctxt->d == 0 || (ctxt->d & NotImpl) || (ctxt->d & Undefined)) + if (ctxt->d == 0 || (ctxt->d & NotImpl)) return EMULATION_FAILED; if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) @@ -4521,7 +4521,8 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) ctxt->mem_read.pos = 0; - if (ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) { + if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || + (ctxt->d & Undefined)) { rc = emulate_ud(ctxt); goto done; } From 188424ba10313bbbc015727d418b7f18593becdd Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 11 Apr 2013 12:32:14 +0300 Subject: [PATCH 088/204] KVM: emulator: mark 0xff 0x7d opcode as undefined. Signed-off-by: Gleb Natapov --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 77b56d5be3e1..46f63b8d09f4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3723,7 +3723,7 @@ static const struct opcode group5[] = { I(SrcMemFAddr | ImplicitOps | Stack, em_call_far), I(SrcMem | Stack, em_grp45), I(SrcMemFAddr | ImplicitOps, em_grp45), - I(SrcMem | Stack, em_grp45), N, + I(SrcMem | Stack, em_grp45), D(Undefined), }; static const struct opcode group6[] = { From 3b656cf764cbc43d3efb9bf5f45c618d4cf0989f Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 14 Apr 2013 12:12:45 +0200 Subject: [PATCH 089/204] KVM: nVMX: Fix injection of PENDING_INTERRUPT and NMI_WINDOW exits to L1 Check if the interrupt or NMI window exit is for L1 by testing if it has the corresponding controls enabled. This is required when we allow direct injection from L0 to L2 Signed-off-by: Jan Kiszka Reviewed-by: Gleb Natapov Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d2686771481f..5ae969b741f8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6112,14 +6112,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_TRIPLE_FAULT: return 1; case EXIT_REASON_PENDING_INTERRUPT: + return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); case EXIT_REASON_NMI_WINDOW: - /* - * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit - * (aka Interrupt Window Exiting) only when L1 turned it on, - * so if we got a PENDING_INTERRUPT exit, this must be for L1. - * Same for NMI Window Exiting. - */ - return 1; + return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); case EXIT_REASON_TASK_SWITCH: return 1; case EXIT_REASON_CPUID: From 5f3d5799974b89100268ba813cec8db7bd0693fb Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 14 Apr 2013 12:12:46 +0200 Subject: [PATCH 090/204] KVM: nVMX: Rework event injection and recovery The basic idea is to always transfer the pending event injection on vmexit into the architectural state of the VCPU and then drop it from there if it turns out that we left L2 to enter L1, i.e. if we enter prepare_vmcs12. vmcs12_save_pending_events takes care to transfer pending L0 events into the queue of L1. 
That is mandatory as L1 may decide to switch the guest state completely, invalidating or preserving the pending events for later injection (including on a different node, once we support migration). This concept is based on the rule that a pending vmlaunch/vmresume is not canceled. Otherwise, we would risk to lose injected events or leak them into the wrong queues. Encode this rule via a WARN_ON_ONCE at the entry of nested_vmx_vmexit. Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 99 ++++++++++++++++++++++++++++++---------------- 1 file changed, 64 insertions(+), 35 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5ae969b741f8..4fb72a764dbd 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6493,8 +6493,6 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { - if (is_guest_mode(&vmx->vcpu)) - return; __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, VM_EXIT_INSTRUCTION_LEN, IDT_VECTORING_ERROR_CODE); @@ -6502,8 +6500,6 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) static void vmx_cancel_injection(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) - return; __vmx_complete_interrupts(vcpu, vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), VM_ENTRY_INSTRUCTION_LEN, @@ -6535,21 +6531,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long debugctlmsr; - if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - if (vmcs12->idt_vectoring_info_field & - VECTORING_INFO_VALID_MASK) { - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - vmcs12->idt_vectoring_info_field); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, - vmcs12->vm_exit_instruction_len); - if (vmcs12->idt_vectoring_info_field & - VECTORING_INFO_DELIVER_CODE_MASK) - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, - vmcs12->idt_vectoring_error_code); - } - } - /* Record the guest's net vcpu time for enforced NMI injections. 
*/ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) vmx->entry_time = ktime_get(); @@ -6708,17 +6689,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - if (is_guest_mode(vcpu)) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info; - if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { - vmcs12->idt_vectoring_error_code = - vmcs_read32(IDT_VECTORING_ERROR_CODE); - vmcs12->vm_exit_instruction_len = - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - } - } - vmx->loaded_vmcs->launched = 1; vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); @@ -7326,6 +7296,48 @@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vcpu->arch.cr4_guest_owned_bits)); } +static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + u32 idt_vectoring; + unsigned int nr; + + if (vcpu->arch.exception.pending) { + nr = vcpu->arch.exception.nr; + idt_vectoring = nr | VECTORING_INFO_VALID_MASK; + + if (kvm_exception_is_soft(nr)) { + vmcs12->vm_exit_instruction_len = + vcpu->arch.event_exit_inst_len; + idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; + } else + idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; + + if (vcpu->arch.exception.has_error_code) { + idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; + vmcs12->idt_vectoring_error_code = + vcpu->arch.exception.error_code; + } + + vmcs12->idt_vectoring_info_field = idt_vectoring; + } else if (vcpu->arch.nmi_pending) { + vmcs12->idt_vectoring_info_field = + INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; + } else if (vcpu->arch.interrupt.pending) { + nr = vcpu->arch.interrupt.nr; + idt_vectoring = nr | VECTORING_INFO_VALID_MASK; + + if (vcpu->arch.interrupt.soft) { + idt_vectoring |= INTR_TYPE_SOFT_INTR; + vmcs12->vm_entry_instruction_len = + vcpu->arch.event_exit_inst_len; + } else + idt_vectoring |= INTR_TYPE_EXT_INTR; + + vmcs12->idt_vectoring_info_field = idt_vectoring; + } +} + /* * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), @@ -7411,15 +7423,29 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); - vmcs12->idt_vectoring_info_field = to_vmx(vcpu)->idt_vectoring_info; - vmcs12->idt_vectoring_error_code = - vmcs_read32(IDT_VECTORING_ERROR_CODE); + vmcs12->idt_vectoring_info_field = 0; vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - /* clear vm-entry fields which are to be cleared on exit */ - if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { + /* vm_entry_intr_info_field is cleared on exit. Emulate this + * instead of reading the real value. */ vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; + + /* + * Transfer the event that L0 or L1 may wanted to inject into + * L2 to IDT_VECTORING_INFO_FIELD. + */ + vmcs12_save_pending_event(vcpu, vmcs12); + } + + /* + * Drop what we picked up for L2 via vmx_complete_interrupts. It is + * preserved above and would only end up incorrectly in L1. 
+ */ + vcpu->arch.nmi_injected = false; + kvm_clear_exception_queue(vcpu); + kvm_clear_interrupt_queue(vcpu); } /* @@ -7519,6 +7545,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) int cpu; struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + /* trying to cancel vmlaunch/vmresume is a bug */ + WARN_ON_ONCE(vmx->nested.nested_run_pending); + leave_guest_mode(vcpu); prepare_vmcs12(vcpu, vmcs12); From e8457c67a4ec1268ec616bd8be1d9f1cc20f1493 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 14 Apr 2013 12:12:48 +0200 Subject: [PATCH 091/204] KVM: nVMX: Fix conditions for interrupt injection If we are entering guest mode, we do not want L0 to interrupt this vmentry with all its side effects on the vmcs. Therefore, injection shall be disallowed during L1->L2 transitions, as in the previous version. However, this check is conceptually independent of nested_exit_on_intr, so decouple it. If L1 traps external interrupts, we can kick the guest from L2 to L1, also just like the previous code worked. But we no longer need to consider L1's idt_vectoring_info_field. It will always be empty at this point. Instead, if L2 has pending events, those are now found in the architectural queues and will, thus, prevent vmx_interrupt_allowed from being called at all. Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4fb72a764dbd..5e6391112275 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4325,16 +4325,20 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { + if (is_guest_mode(vcpu)) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - if (to_vmx(vcpu)->nested.nested_run_pending || - (vmcs12->idt_vectoring_info_field & - VECTORING_INFO_VALID_MASK)) + + if (to_vmx(vcpu)->nested.nested_run_pending) return 0; - nested_vmx_vmexit(vcpu); - vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT; - vmcs12->vm_exit_intr_info = 0; - /* fall through to normal code, but now in L1, not L2 */ + if (nested_exit_on_intr(vcpu)) { + nested_vmx_vmexit(vcpu); + vmcs12->vm_exit_reason = + EXIT_REASON_EXTERNAL_INTERRUPT; + vmcs12->vm_exit_intr_info = 0; + /* + * fall through to normal code, but now in L1, not L2 + */ + } } return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && From c0d1c770c05ac7051df86914f9627b68f29c1d67 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 14 Apr 2013 12:12:50 +0200 Subject: [PATCH 092/204] KVM: nVMX: Avoid reading VM_EXIT_INTR_ERROR_CODE needlessly on nested exits We only need to update vm_exit_intr_error_code if there is a valid exit interruption information and it comes with a valid error code. 
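Both the valid bit and the deliver-code bit must be set before the error-code field carries meaningful data; a compact sketch of the added guard (the mask temporary is illustrative):

	u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK;

	if ((vmcs12->vm_exit_intr_info & mask) == mask)
		vmcs12->vm_exit_intr_error_code =
			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);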
Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5e6391112275..71755573b7ca 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7426,7 +7426,11 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + if ((vmcs12->vm_exit_intr_info & + (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == + (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) + vmcs12->vm_exit_intr_error_code = + vmcs_read32(VM_EXIT_INTR_ERROR_CODE); vmcs12->idt_vectoring_info_field = 0; vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); From 1fcc7890dbf32571c25278803ee19182c801c006 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:21:35 +0800 Subject: [PATCH 093/204] KVM: Add vcpu info to ioapic_update_eoi() Add vcpu info to ioapic_update_eoi, so we can know which vcpu issued this EOI. Signed-off-by: Yang Zhang Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 2 +- virt/kvm/ioapic.c | 12 ++++++------ virt/kvm/ioapic.h | 3 ++- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e2274745ab3d..3e22536a7031 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -739,7 +739,7 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) trigger_mode = IOAPIC_LEVEL_TRIG; else trigger_mode = IOAPIC_EDGE_TRIG; - kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); + kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); } } diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 914cbe027d06..1d8906d39214 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -264,8 +264,8 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id) spin_unlock(&ioapic->lock); } -static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector, - int trigger_mode) +static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, + struct kvm_ioapic *ioapic, int vector, int trigger_mode) { int i; @@ -304,12 +304,12 @@ bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector) return test_bit(vector, ioapic->handled_vectors); } -void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode) +void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode) { - struct kvm_ioapic *ioapic = kvm->arch.vioapic; + struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; spin_lock(&ioapic->lock); - __kvm_ioapic_update_eoi(ioapic, vector, trigger_mode); + __kvm_ioapic_update_eoi(vcpu, ioapic, vector, trigger_mode); spin_unlock(&ioapic->lock); } @@ -407,7 +407,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, break; #ifdef CONFIG_IA64 case IOAPIC_REG_EOI: - __kvm_ioapic_update_eoi(ioapic, data, IOAPIC_LEVEL_TRIG); + __kvm_ioapic_update_eoi(NULL, ioapic, data, IOAPIC_LEVEL_TRIG); break; #endif diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 0400a466c50c..2fc61a5013d2 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -70,7 +70,8 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int 
short_hand, int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); -void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); +void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, + int trigger_mode); bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector); int kvm_ioapic_init(struct kvm *kvm); void kvm_ioapic_destroy(struct kvm *kvm); From 8dc6aade5bdc9d436d4ab8328cb15f0adbbc47bf Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:21:36 +0800 Subject: [PATCH 094/204] KVM: Introduce struct rtc_status rtc_status is used to track RTC interrupt delivery status. pending_eoi is increased by each vcpu that receives the RTC interrupt and decreased when that vcpu EOIs it. Also, we use dest_map to record the destination vcpus, so that a vcpu which did not get the RTC interrupt but issues an EOI with the same vector as the RTC does not decrease pending_eoi by mistake. Signed-off-by: Yang Zhang Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- virt/kvm/ioapic.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 2fc61a5013d2..87cd94bae372 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -34,6 +34,17 @@ struct kvm_vcpu; #define IOAPIC_INIT 0x5 #define IOAPIC_EXTINT 0x7 +#ifdef CONFIG_X86 +#define RTC_GSI 8 +#else +#define RTC_GSI -1U +#endif + +struct rtc_status { + int pending_eoi; + DECLARE_BITMAP(dest_map, KVM_MAX_VCPUS); +}; + struct kvm_ioapic { u64 base_address; u32 ioregsel; @@ -47,6 +58,7 @@ struct kvm_ioapic { void (*ack_notifier)(void *opaque, int irq); spinlock_t lock; DECLARE_BITMAP(handled_vectors, 256); + struct rtc_status rtc_status; }; #ifdef DEBUG From b4f2225c07dd4d8eef7aa7f5b36a3b72c3cbbe5b Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:21:37 +0800 Subject: [PATCH 095/204] KVM: Return destination vcpu on interrupt injection Add a new parameter to report which vcpus received the interrupt.
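A hypothetical caller showing how the new parameter is meant to be used (the surrounding variables are illustrative; passing NULL keeps the old behaviour): hand in a zeroed bitmap and, on return, each set bit is the vcpu_id of a vcpu that accepted the interrupt.

	DECLARE_BITMAP(dest_map, KVM_MAX_VCPUS);
	int r;

	bitmap_zero(dest_map, KVM_MAX_VCPUS);
	/* r counts accepting vcpus; dest_map records which ones */
	r = kvm_irq_delivery_to_apic(kvm, NULL, &irq, dest_map);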
Signed-off-by: Yang Zhang Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 25 ++++++++++++++++--------- arch/x86/kvm/lapic.h | 5 +++-- virt/kvm/ioapic.c | 2 +- virt/kvm/ioapic.h | 2 +- virt/kvm/irq_comm.c | 12 ++++++------ 5 files changed, 27 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3e22536a7031..0b734025fc87 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -384,14 +384,16 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) } static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, - int vector, int level, int trig_mode); + int vector, int level, int trig_mode, + unsigned long *dest_map); -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, + unsigned long *dest_map) { struct kvm_lapic *apic = vcpu->arch.apic; return __apic_accept_irq(apic, irq->delivery_mode, irq->vector, - irq->level, irq->trig_mode); + irq->level, irq->trig_mode, dest_map); } static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) @@ -564,7 +566,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, } bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, int *r) + struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map) { struct kvm_apic_map *map; unsigned long bitmap = 1; @@ -575,7 +577,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, *r = -1; if (irq->shorthand == APIC_DEST_SELF) { - *r = kvm_apic_set_irq(src->vcpu, irq); + *r = kvm_apic_set_irq(src->vcpu, irq, dest_map); return true; } @@ -620,7 +622,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, continue; if (*r < 0) *r = 0; - *r += kvm_apic_set_irq(dst[i]->vcpu, irq); + *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); } ret = true; @@ -634,7 +636,8 @@ out: * Return 1 if successfully added and 0 if discarded. 
*/ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, - int vector, int level, int trig_mode) + int vector, int level, int trig_mode, + unsigned long *dest_map) { int result = 0; struct kvm_vcpu *vcpu = apic->vcpu; @@ -647,6 +650,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (unlikely(!apic_enabled(apic))) break; + if (dest_map) + __set_bit(vcpu->vcpu_id, dest_map); + if (trig_mode) { apic_debug("level trig mode for vector %d", vector); apic_set_vector(vector, apic->regs + APIC_TMR); @@ -805,7 +811,7 @@ static void apic_send_ipi(struct kvm_lapic *apic) irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, irq.vector); - kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); + kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL); } static u32 apic_get_tmcct(struct kvm_lapic *apic) @@ -1441,7 +1447,8 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) vector = reg & APIC_VECTOR_MASK; mode = reg & APIC_MODE_MASK; trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; - return __apic_accept_irq(apic, mode, vector, 1, trig_mode); + return __apic_accept_irq(apic, mode, vector, 1, trig_mode, + NULL); } return 0; } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index baa20cfb95ae..3e5a43160cb7 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -55,11 +55,12 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu); int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, + unsigned long *dest_map); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, int *r); + struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map); u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 1d8906d39214..27ae8dd64e29 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -217,7 +217,7 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) irqe.level = 1; irqe.shorthand = 0; - return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); + return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL); } int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 87cd94bae372..761e5b57099e 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -92,7 +92,7 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); void kvm_ioapic_reset(struct kvm_ioapic *ioapic); int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq); + struct kvm_lapic_irq *irq, unsigned long *dest_map); int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_ioapic_make_eoibitmap_request(struct kvm *kvm); diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index e9073cf4d040..2f07d2e59a2d 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -63,7 +63,7 @@ inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq) } int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, - struct 
kvm_lapic_irq *irq) + struct kvm_lapic_irq *irq, unsigned long *dest_map) { int i, r = -1; struct kvm_vcpu *vcpu, *lowest = NULL; @@ -74,7 +74,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, irq->delivery_mode = APIC_DM_FIXED; } - if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r)) + if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) return r; kvm_for_each_vcpu(i, vcpu, kvm) { @@ -88,7 +88,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, if (!kvm_is_dm_lowest_prio(irq)) { if (r < 0) r = 0; - r += kvm_apic_set_irq(vcpu, irq); + r += kvm_apic_set_irq(vcpu, irq, dest_map); } else if (kvm_lapic_enabled(vcpu)) { if (!lowest) lowest = vcpu; @@ -98,7 +98,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, } if (lowest) - r = kvm_apic_set_irq(lowest, irq); + r = kvm_apic_set_irq(lowest, irq, dest_map); return r; } @@ -130,7 +130,7 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, kvm_set_msi_irq(e, &irq); - return kvm_irq_delivery_to_apic(kvm, NULL, &irq); + return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL); } @@ -142,7 +142,7 @@ static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e, kvm_set_msi_irq(e, &irq); - if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r)) + if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) return r; else return -EWOULDBLOCK; From 106069193ce501ea68649037bde7ea50ed492948 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:21:38 +0800 Subject: [PATCH 096/204] KVM: Add reset/restore rtc_status support restore rtc_status from migration or save/restore Signed-off-by: Yang Zhang Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 9 +++++++ arch/x86/kvm/lapic.h | 2 ++ virt/kvm/ioapic.c | 58 ++++++++++++++++++++++++++++++++++++++++++++ virt/kvm/ioapic.h | 1 + 4 files changed, 70 insertions(+) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0b734025fc87..67962188c775 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -94,6 +94,14 @@ static inline int apic_test_vector(int vec, void *bitmap) return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); } +bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + return apic_test_vector(vector, apic->regs + APIC_ISR) || + apic_test_vector(vector, apic->regs + APIC_IRR); +} + static inline void apic_set_vector(int vec, void *bitmap) { set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); @@ -1618,6 +1626,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, apic->highest_isr_cache = -1; kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_rtc_eoi_tracking_restore_one(vcpu); } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 3e5a43160cb7..16304b1a8cba 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -166,4 +166,6 @@ static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) return vcpu->arch.apic->pending_events; } +bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); + #endif diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 27ae8dd64e29..9d76baa78e89 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -90,6 +90,62 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, return result; } +static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic) +{ + ioapic->rtc_status.pending_eoi = 0; + 
bitmap_zero(ioapic->rtc_status.dest_map, KVM_MAX_VCPUS); +} + +static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) +{ + bool new_val, old_val; + struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; + union kvm_ioapic_redirect_entry *e; + + e = &ioapic->redirtbl[RTC_GSI]; + if (!kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id, + e->fields.dest_mode)) + return; + + new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector); + old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); + + if (new_val == old_val) + return; + + if (new_val) { + __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); + ioapic->rtc_status.pending_eoi++; + } else { + __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); + ioapic->rtc_status.pending_eoi--; + } + + WARN_ON(ioapic->rtc_status.pending_eoi < 0); +} + +void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) +{ + struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; + + spin_lock(&ioapic->lock); + __rtc_irq_eoi_tracking_restore_one(vcpu); + spin_unlock(&ioapic->lock); +} + +static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic) +{ + struct kvm_vcpu *vcpu; + int i; + + if (RTC_GSI >= IOAPIC_NUM_PINS) + return; + + rtc_irq_eoi_tracking_reset(ioapic); + kvm_for_each_vcpu(i, vcpu, ioapic->kvm) + __rtc_irq_eoi_tracking_restore_one(vcpu); +} + static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) { union kvm_ioapic_redirect_entry *pent; @@ -428,6 +484,7 @@ void kvm_ioapic_reset(struct kvm_ioapic *ioapic) ioapic->ioregsel = 0; ioapic->irr = 0; ioapic->id = 0; + rtc_irq_eoi_tracking_reset(ioapic); update_handled_vectors(ioapic); } @@ -494,6 +551,7 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); update_handled_vectors(ioapic); kvm_ioapic_make_eoibitmap_request(kvm); + kvm_rtc_eoi_tracking_restore_all(ioapic); spin_unlock(&ioapic->lock); return 0; } diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 761e5b57099e..313fc4ea61d9 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -79,6 +79,7 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) return kvm->arch.vioapic; } +void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); From f3bff6318fa0f54956b02ed451d9b120441006ea Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:21:39 +0800 Subject: [PATCH 097/204] KVM: Force vmexit with virtual interrupt delivery We need the EOI to track interrupt delivery status, so force a vmexit on EOI for the RTC interrupt when virtual interrupt delivery is enabled.
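As an illustration (not part of the patch itself), the effect of the one-line change below can be read as the following hypothetical helper; the function name is invented here, but the bitmap operation mirrors the hunk:

static void rtc_force_eoi_exit(u64 *eoi_exit_bitmap,
			       union kvm_ioapic_redirect_entry *e)
{
	/* Mark the RTC vector in the EOI-exit bitmap so that a guest EOI
	 * for it always causes a vmexit, even when virtual interrupt
	 * delivery would otherwise handle the EOI without exiting. */
	__set_bit(e->fields.vector, (unsigned long *)eoi_exit_bitmap);
}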
Signed-off-by: Yang Zhang Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- virt/kvm/ioapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 9d76baa78e89..76528fff252d 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -189,7 +189,7 @@ void kvm_ioapic_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, if (!e->fields.mask && (e->fields.trig_mode == IOAPIC_LEVEL_TRIG || kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, - index))) { + index) || index == RTC_GSI)) { if (kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id, e->fields.dest_mode)) __set_bit(e->fields.vector, (unsigned long *)eoi_exit_bitmap); From aa2fbe6d44892070d78995f0df875ce930904e29 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:21:40 +0800 Subject: [PATCH 098/204] KVM: Let ioapic know the irq line status Userspace may deliver an RTC interrupt without querying the status. So we want to track RTC EOI for this case. Signed-off-by: Yang Zhang Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8254.c | 4 ++-- arch/x86/kvm/x86.c | 6 ++++-- include/linux/kvm_host.h | 11 +++++++---- virt/kvm/assigned-dev.c | 13 +++++++------ virt/kvm/eventfd.c | 15 +++++++++------ virt/kvm/ioapic.c | 18 ++++++++++-------- virt/kvm/ioapic.h | 2 +- virt/kvm/irq_comm.c | 19 ++++++++++++------- virt/kvm/kvm_main.c | 3 ++- 9 files changed, 54 insertions(+), 37 deletions(-) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index c1d30b2fc9bb..412a5aa0ef94 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -290,8 +290,8 @@ static void pit_do_work(struct kthread_work *work) } spin_unlock(&ps->inject_lock); if (inject) { - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); + kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1, false); + kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0, false); /* * Provides NMI watchdog support via Virtual Wire mode.
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 999d1243a6ce..ae9744d03c83 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3491,13 +3491,15 @@ out: return r; } -int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event) +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, + bool line_status) { if (!irqchip_in_kernel(kvm)) return -ENXIO; irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, - irq_event->irq, irq_event->level); + irq_event->irq, irq_event->level, + line_status); return 0; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 20d77d24d764..4a76aca31478 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -292,7 +292,8 @@ struct kvm_kernel_irq_routing_entry { u32 gsi; u32 type; int (*set)(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, int level); + struct kvm *kvm, int irq_source_id, int level, + bool line_status); union { struct { unsigned irqchip; @@ -591,7 +592,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem); -int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level); +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, + bool line_status); long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); @@ -722,10 +724,11 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, union kvm_ioapic_redirect_entry *entry, unsigned long *deliver_bitmask); #endif -int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level); +int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, + bool line_status); int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level); int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, - int irq_source_id, int level); + int irq_source_id, int level, bool line_status); bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_register_irq_ack_notifier(struct kvm *kvm, diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index 3642239252b0..f4c7f591b5d8 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -80,11 +80,12 @@ kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev, spin_lock(&assigned_dev->intx_mask_lock); if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) kvm_set_irq(assigned_dev->kvm, - assigned_dev->irq_source_id, vector, 1); + assigned_dev->irq_source_id, vector, 1, + false); spin_unlock(&assigned_dev->intx_mask_lock); } else kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - vector, 1); + vector, 1, false); } static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) @@ -165,7 +166,7 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) container_of(kian, struct kvm_assigned_dev_kernel, ack_notifier); - kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0); + kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false); spin_lock(&dev->intx_mask_lock); @@ -188,7 +189,7 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) if (reassert) kvm_set_irq(dev->kvm, dev->irq_source_id, - dev->guest_irq, 1); + dev->guest_irq, 1, false); } spin_unlock(&dev->intx_mask_lock); @@ -202,7 +203,7 @@ static void deassign_guest_irq(struct kvm 
*kvm, &assigned_dev->ack_notifier); kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - assigned_dev->guest_irq, 0); + assigned_dev->guest_irq, 0, false); if (assigned_dev->irq_source_id != -1) kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); @@ -901,7 +902,7 @@ static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) { kvm_set_irq(match->kvm, match->irq_source_id, - match->guest_irq, 0); + match->guest_irq, 0, false); /* * Masking at hardware-level is performed on demand, * i.e. when an IRQ actually arrives at the host. diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 48790989f8d2..c5d43ffbf1f3 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -100,11 +100,13 @@ irqfd_inject(struct work_struct *work) struct kvm *kvm = irqfd->kvm; if (!irqfd->resampler) { - kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1); - kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0); + kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1, + false); + kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0, + false); } else kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, - irqfd->gsi, 1); + irqfd->gsi, 1, false); } /* @@ -121,7 +123,7 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian) resampler = container_of(kian, struct _irqfd_resampler, notifier); kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, - resampler->notifier.gsi, 0); + resampler->notifier.gsi, 0, false); rcu_read_lock(); @@ -146,7 +148,7 @@ irqfd_resampler_shutdown(struct _irqfd *irqfd) list_del(&resampler->link); kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier); kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, - resampler->notifier.gsi, 0); + resampler->notifier.gsi, 0, false); kfree(resampler); } @@ -225,7 +227,8 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) irq = rcu_dereference(irqfd->irq_entry); /* An event has been signaled, inject an interrupt */ if (irq) - kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1); + kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, + false); else schedule_work(&irqfd->inject); rcu_read_unlock(); diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 76528fff252d..a49fcd55b378 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -50,7 +50,8 @@ #else #define ioapic_debug(fmt, arg...) 
#endif -static int ioapic_deliver(struct kvm_ioapic *vioapic, int irq); +static int ioapic_deliver(struct kvm_ioapic *vioapic, int irq, + bool line_status); static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, unsigned long addr, @@ -146,7 +147,8 @@ static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic) __rtc_irq_eoi_tracking_restore_one(vcpu); } -static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) +static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx, + bool line_status) { union kvm_ioapic_redirect_entry *pent; int injected = -1; @@ -154,7 +156,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) pent = &ioapic->redirtbl[idx]; if (!pent->fields.mask) { - injected = ioapic_deliver(ioapic, idx); + injected = ioapic_deliver(ioapic, idx, line_status); if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG) pent->fields.remote_irr = 1; } @@ -248,13 +250,13 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG && ioapic->irr & (1 << index)) - ioapic_service(ioapic, index); + ioapic_service(ioapic, index, false); kvm_ioapic_make_eoibitmap_request(ioapic->kvm); break; } } -static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) +static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq, bool line_status) { union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq]; struct kvm_lapic_irq irqe; @@ -277,7 +279,7 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) } int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, - int level) + int level, bool line_status) { u32 old_irr; u32 mask = 1 << irq; @@ -300,7 +302,7 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, ioapic->irr |= mask; if ((edge && old_irr != ioapic->irr) || (!edge && !entry.fields.remote_irr)) - ret = ioapic_service(ioapic, irq); + ret = ioapic_service(ioapic, irq, line_status); else ret = 0; /* report coalesced interrupt */ } @@ -349,7 +351,7 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); ent->fields.remote_irr = 0; if (!ent->fields.mask && (ioapic->irr & (1 << i))) - ioapic_service(ioapic, i); + ioapic_service(ioapic, i, false); } } diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 313fc4ea61d9..554157bbb586 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -89,7 +89,7 @@ bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector); int kvm_ioapic_init(struct kvm *kvm); void kvm_ioapic_destroy(struct kvm *kvm); int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, - int level); + int level, bool line_status); void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); void kvm_ioapic_reset(struct kvm_ioapic *ioapic); int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 2f07d2e59a2d..8efb580edfef 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -35,7 +35,8 @@ #include "ioapic.h" static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, int level) + struct kvm *kvm, int irq_source_id, int level, + bool line_status) { #ifdef CONFIG_X86 struct kvm_pic *pic = pic_irqchip(kvm); @@ -46,10 +47,12 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, } static int 
kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, int level) + struct kvm *kvm, int irq_source_id, int level, + bool line_status) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; - return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level); + return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level, + line_status); } inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq) @@ -121,7 +124,7 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, } int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, int level) + struct kvm *kvm, int irq_source_id, int level, bool line_status) { struct kvm_lapic_irq irq; @@ -159,7 +162,7 @@ int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) route.msi.address_hi = msi->address_hi; route.msi.data = msi->data; - return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1); + return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, false); } /* @@ -168,7 +171,8 @@ int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) * = 0 Interrupt was coalesced (previous irq is still pending) * > 0 Number of CPUs interrupt was delivered to */ -int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level) +int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, + bool line_status) { struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS]; int ret = -1, i = 0; @@ -189,7 +193,8 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level) while(i--) { int r; - r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level); + r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level, + line_status); if (r < 0) continue; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5cc53c907d3b..ac3182eed462 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2258,7 +2258,8 @@ static long kvm_vm_ioctl(struct file *filp, if (copy_from_user(&irq_event, argp, sizeof irq_event)) goto out; - r = kvm_vm_ioctl_irq_line(kvm, &irq_event); + r = kvm_vm_ioctl_irq_line(kvm, &irq_event, + ioctl == KVM_IRQ_LINE_STATUS); if (r) goto out; From 2c2bf01136971c33e3b3fabce23925f372c1017e Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:21:41 +0800 Subject: [PATCH 099/204] KVM: Use eoi to track RTC interrupt delivery status The current interrupt coalescing logic, which is only used by the RTC, conflicts with Posted Interrupts. This patch introduces a new mechanism that uses EOI to track interrupt delivery: when delivering an interrupt to a vcpu, pending_eoi is set to the number of vcpus that received the interrupt, and it is decremented as each vcpu writes EOI. No subsequent RTC interrupt can be delivered to a vcpu until all vcpus have written EOI.
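In pseudo-C, the scheme amounts to the following sketch (a condensed illustration; the real helpers appear in the diff below):

	/* On delivery: record which vcpus got the RTC interrupt. */
	pending_eoi = kvm_irq_delivery_to_apic(kvm, NULL, &irqe, dest_map);

	/* On EOI: one receiver has acknowledged. */
	if (test_and_clear_bit(vcpu->vcpu_id, dest_map))
		--pending_eoi;

	/* On the next RTC tick: outstanding EOIs mean coalescing. */
	if (pending_eoi > 0)
		return 0;	/* report coalesced, do not inject */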
Signed-off-by: Yang Zhang Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- virt/kvm/ioapic.c | 36 +++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index a49fcd55b378..97c67a50f904 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -147,6 +147,22 @@ static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic) __rtc_irq_eoi_tracking_restore_one(vcpu); } +static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu) +{ + if (test_and_clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map)) + --ioapic->rtc_status.pending_eoi; + + WARN_ON(ioapic->rtc_status.pending_eoi < 0); +} + +static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic) +{ + if (ioapic->rtc_status.pending_eoi > 0) + return true; /* coalesced */ + + return false; +} + static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx, bool line_status) { @@ -260,6 +276,7 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq, bool line_status) { union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq]; struct kvm_lapic_irq irqe; + int ret; ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " "vector=%x trig_mode=%x\n", @@ -275,7 +292,15 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq, bool line_status) irqe.level = 1; irqe.shorthand = 0; - return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL); + if (irq == RTC_GSI && line_status) { + BUG_ON(ioapic->rtc_status.pending_eoi != 0); + ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, + ioapic->rtc_status.dest_map); + ioapic->rtc_status.pending_eoi = ret; + } else + ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL); + + return ret; } int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, @@ -299,6 +324,12 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, ret = 1; } else { int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG); + + if (irq == RTC_GSI && line_status && + rtc_irq_check_coalesced(ioapic)) { + ret = 0; /* coalesced */ + goto out; + } ioapic->irr |= mask; if ((edge && old_irr != ioapic->irr) || (!edge && !entry.fields.remote_irr)) @@ -306,6 +337,7 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, else ret = 0; /* report coalesced interrupt */ } +out: trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0); spin_unlock(&ioapic->lock); @@ -333,6 +365,8 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, if (ent->fields.vector != vector) continue; + if (i == RTC_GSI) + rtc_irq_eoi(ioapic, vcpu); /* * We are dropping lock while calling ack notifiers because ack * notifier callbacks for assigned devices call into IOAPIC From a547c6db4d2f16ba5ce8e7054bffad6acc248d40 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:25:10 +0800 Subject: [PATCH 100/204] KVM: VMX: Enable acknowledge interrupt on vmexit The "acknowledge interrupt on exit" feature controls processor behavior for external interrupt acknowledgement. When this control is set, the processor acknowledges the interrupt controller to acquire the interrupt vector on VM exit. After this feature is enabled, an interrupt that arrives while the target cpu is running in VMX non-root mode is handled by the VMX handler instead of the handler in the IDT. Currently, the VMX handler only fakes an interrupt stack and jumps to the IDT table to let the real handler handle it.
Further, we will recognize the interrupt and deliver only the interrupts that do not belong to the current vcpu through the IDT table. Interrupts that belong to the current vcpu will be handled inside the VMX handler. This will reduce KVM's interrupt handling cost. Also, the interrupt enabling logic changes when this feature is turned on: before this patch, the hypervisor called local_irq_enable() to enable interrupts directly. Now the IF bit is set on the interrupt stack frame, and interrupts are enabled on return from the interrupt handler if an external interrupt exists. If there is no external interrupt, local_irq_enable() is still called. Refer to Intel SDM volume 3, chapter 33.2. Signed-off-by: Yang Zhang Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 6 ++++ arch/x86/kvm/vmx.c | 58 ++++++++++++++++++++++++++++++--- arch/x86/kvm/x86.c | 4 ++- 4 files changed, 64 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 82f1dc67782f..68d438630dd3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -727,6 +727,7 @@ struct kvm_x86_ops { int (*check_intercept)(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, enum x86_intercept_stage stage); + void (*handle_external_intr)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7a46c1f46861..2f8fe3f06837 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4233,6 +4233,11 @@ out: return ret; } +static void svm_handle_external_intr(struct kvm_vcpu *vcpu) +{ + local_irq_enable(); +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -4328,6 +4333,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_tdp_cr3 = set_tdp_cr3, .check_intercept = svm_check_intercept, + .handle_external_intr = svm_handle_external_intr, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 71755573b7ca..7a7605f0444b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -378,6 +378,7 @@ struct vcpu_vmx { struct shared_msr_entry *guest_msrs; int nmsrs; int save_nmsrs; + unsigned long host_idt_base; #ifdef CONFIG_X86_64 u64 msr_host_kernel_gs_base; u64 msr_guest_kernel_gs_base; @@ -2627,7 +2628,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif - opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT; + opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | + VM_EXIT_ACK_INTR_ON_EXIT; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; @@ -3879,7 +3881,7 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr) * Note that host-state that does change is set elsewhere. E.g., host-state * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
*/ -static void vmx_set_constant_host_state(void) +static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) { u32 low32, high32; unsigned long tmpl; @@ -3907,6 +3909,7 @@ static void vmx_set_constant_host_state(void) native_store_idt(&dt); vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ + vmx->host_idt_base = dt.address; vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */ @@ -4039,7 +4042,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ - vmx_set_constant_host_state(); + vmx_set_constant_host_state(vmx); #ifdef CONFIG_X86_64 rdmsrl(MSR_FS_BASE, a); vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ @@ -6399,6 +6402,52 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) } } +static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) +{ + u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + /* + * If external interrupt exists, IF bit is set in rflags/eflags on the + * interrupt stack frame, and interrupt will be enabled on a return + * from interrupt handler. + */ + if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) + == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { + unsigned int vector; + unsigned long entry; + gate_desc *desc; + struct vcpu_vmx *vmx = to_vmx(vcpu); +#ifdef CONFIG_X86_64 + unsigned long tmp; +#endif + + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; + desc = (gate_desc *)vmx->host_idt_base + vector; + entry = gate_offset(*desc); + asm volatile( +#ifdef CONFIG_X86_64 + "mov %%" _ASM_SP ", %[sp]\n\t" + "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t" + "push $%c[ss]\n\t" + "push %[sp]\n\t" +#endif + "pushf\n\t" + "orl $0x200, (%%" _ASM_SP ")\n\t" + __ASM_SIZE(push) " $%c[cs]\n\t" + "call *%[entry]\n\t" + : +#ifdef CONFIG_X86_64 + [sp]"=&r"(tmp) +#endif + : + [entry]"r"(entry), + [ss]"i"(__KERNEL_DS), + [cs]"i"(__KERNEL_CS) + ); + } else + local_irq_enable(); +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -7041,7 +7090,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * Other fields are different per CPU, and will be set later when * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. */ - vmx_set_constant_host_state(); + vmx_set_constant_host_state(vmx); /* * HOST_RSP is normally set correctly in vmx_vcpu_run() just before @@ -7718,6 +7767,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_tdp_cr3 = vmx_set_cr3, .check_intercept = vmx_check_intercept, + .handle_external_intr = vmx_handle_external_intr, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ae9744d03c83..1562671a8e18 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5820,7 +5820,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - local_irq_enable(); + + /* Interrupt is enabled by handle_external_intr() */ + kvm_x86_ops->handle_external_intr(vcpu); ++vcpu->stat.exits; From d78f2664832f8d70e36422af9a10e44276dced48 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:25:11 +0800 Subject: [PATCH 101/204] KVM: VMX: Register a new IPI for posted interrupt The Posted Interrupt feature requires a special IPI to deliver a posted interrupt to a guest. It should have a high priority so the interrupt will not be blocked by others. Normally, the posted interrupt is consumed by the vcpu if the target vcpu is running, and is transparent to the OS.
But in some cases, the interrupt will arrive when the target vcpu is scheduled out, and the host will see it. So we need to register a dumb handler to handle it. Signed-off-by: Yang Zhang Acked-by: Ingo Molnar Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/entry_arch.h | 4 ++++ arch/x86/include/asm/hardirq.h | 3 +++ arch/x86/include/asm/hw_irq.h | 1 + arch/x86/include/asm/irq_vectors.h | 5 +++++ arch/x86/kernel/entry_64.S | 5 +++++ arch/x86/kernel/irq.c | 22 ++++++++++++++++++++++ arch/x86/kernel/irqinit.c | 4 ++++ 7 files changed, 44 insertions(+) diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 40afa0005c69..9bd4ecac72be 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -19,6 +19,10 @@ BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) +#ifdef CONFIG_HAVE_KVM +BUILD_INTERRUPT(kvm_posted_intr_ipi, POSTED_INTR_VECTOR) +#endif + /* * every pentium local APIC has two 'local interrupts', with a * soft-definable vector attached to both interrupts, one of diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 81f04cee5f74..ab0ae1aa6d0a 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -11,6 +11,9 @@ typedef struct { unsigned int apic_timer_irqs; /* arch dependent */ unsigned int irq_spurious_count; unsigned int icr_read_retry_count; +#endif +#ifdef CONFIG_HAVE_KVM + unsigned int kvm_posted_intr_ipis; #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 10a78c3d3d5a..1da97efad08a 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -28,6 +28,7 @@ /* Interrupt handlers registered during init_IRQ */ extern void apic_timer_interrupt(void); extern void x86_platform_ipi(void); +extern void kvm_posted_intr_ipi(void); extern void error_interrupt(void); extern void irq_work_interrupt(void); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index aac5fa62a86c..5702d7e3111d 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -102,6 +102,11 @@ */ #define X86_PLATFORM_IPI_VECTOR 0xf7 +/* Vector for KVM to deliver posted interrupt IPI */ +#ifdef CONFIG_HAVE_KVM +#define POSTED_INTR_VECTOR 0xf2 +#endif + /* * IRQ work vector: */ diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c1d01e6ca790..727208941030 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1166,6 +1166,11 @@ apicinterrupt LOCAL_TIMER_VECTOR \ apicinterrupt X86_PLATFORM_IPI_VECTOR \ x86_platform_ipi smp_x86_platform_ipi +#ifdef CONFIG_HAVE_KVM +apicinterrupt POSTED_INTR_VECTOR \ + kvm_posted_intr_ipi smp_kvm_posted_intr_ipi +#endif + apicinterrupt THRESHOLD_APIC_VECTOR \ threshold_interrupt smp_threshold_interrupt apicinterrupt THERMAL_APIC_VECTOR \ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index e4595f105910..6ae6ea1d27d9 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -228,6 +228,28 @@ void smp_x86_platform_ipi(struct pt_regs *regs) set_irq_regs(old_regs); } +#ifdef CONFIG_HAVE_KVM +/* + * Handler for POSTED_INTERRUPT_VECTOR.
+ */ +void smp_kvm_posted_intr_ipi(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + ack_APIC_irq(); + + irq_enter(); + + exit_idle(); + + inc_irq_stat(kvm_posted_intr_ipis); + + irq_exit(); + + set_irq_regs(old_regs); +} +#endif + EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 7dc4e459c2b3..a2a1fbc594ff 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -172,6 +172,10 @@ static void __init apic_intr_init(void) /* IPI for X86 platform specific use */ alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi); +#ifdef CONFIG_HAVE_KVM + /* IPI for KVM to deliver posted interrupt */ + alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi); +#endif /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); From 01e439be7753c163932538276f04f95cb1b66697 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:25:12 +0800 Subject: [PATCH 102/204] KVM: VMX: Check the posted interrupt capability Detect the posted interrupt feature. If it exists, then set it in vmcs_config. Signed-off-by: Yang Zhang Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/vmx.h | 4 ++ arch/x86/kvm/vmx.c | 82 ++++++++++++++++++++++++++++---------- 2 files changed, 66 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index fc1c3134473b..6f07f1999138 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -71,6 +71,7 @@ #define PIN_BASED_NMI_EXITING 0x00000008 #define PIN_BASED_VIRTUAL_NMIS 0x00000020 #define PIN_BASED_VMX_PREEMPTION_TIMER 0x00000040 +#define PIN_BASED_POSTED_INTR 0x00000080 #define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x00000016 @@ -102,6 +103,7 @@ /* VMCS Encodings */ enum vmcs_field { VIRTUAL_PROCESSOR_ID = 0x00000000, + POSTED_INTR_NV = 0x00000002, GUEST_ES_SELECTOR = 0x00000800, GUEST_CS_SELECTOR = 0x00000802, GUEST_SS_SELECTOR = 0x00000804, @@ -136,6 +138,8 @@ enum vmcs_field { VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, APIC_ACCESS_ADDR = 0x00002014, APIC_ACCESS_ADDR_HIGH = 0x00002015, + POSTED_INTR_DESC_ADDR = 0x00002016, + POSTED_INTR_DESC_ADDR_HIGH = 0x00002017, EPT_POINTER = 0x0000201a, EPT_POINTER_HIGH = 0x0000201b, EOI_EXIT_BITMAP0 = 0x0000201c, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7a7605f0444b..7607c6c09c74 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -84,7 +84,8 @@ module_param(vmm_exclusive, bool, S_IRUGO); static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); -static bool __read_mostly enable_apicv_reg_vid; +static bool __read_mostly enable_apicv; +module_param(enable_apicv, bool, S_IRUGO); /* * If nested=1, nested virtualization is supported, i.e., guests may use @@ -366,6 +367,14 @@ struct nested_vmx { struct page *apic_access_page; }; +#define POSTED_INTR_ON 0 +/* Posted-Interrupt Descriptor */ +struct pi_desc { + u32 pir[8]; /* Posted interrupt requested */ + u32 control; /* bit 0 of control is outstanding notification bit */ + u32 rsvd[7]; +} __aligned(64); + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; @@ -430,6 +439,9 @@ struct vcpu_vmx { bool rdtscp_enabled; + /* Posted interrupt descriptor */ + struct pi_desc pi_desc; + /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; }; @@ -785,6 +797,18 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void) 
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; } +static inline bool cpu_has_vmx_posted_intr(void) +{ + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; +} + +static inline bool cpu_has_vmx_apicv(void) +{ + return cpu_has_vmx_apic_register_virt() && + cpu_has_vmx_virtual_intr_delivery() && + cpu_has_vmx_posted_intr(); +} + static inline bool cpu_has_vmx_flexpriority(void) { return cpu_has_vmx_tpr_shadow() && @@ -2552,12 +2576,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) u32 _vmexit_control = 0; u32 _vmentry_control = 0; - min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = PIN_BASED_VIRTUAL_NMIS; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, - &_pin_based_exec_control) < 0) - return -EIO; - min = CPU_BASED_HLT_EXITING | #ifdef CONFIG_X86_64 CPU_BASED_CR8_LOAD_EXITING | @@ -2634,6 +2652,17 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) &_vmexit_control) < 0) return -EIO; + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; + opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, + &_pin_based_exec_control) < 0) + return -EIO; + + if (!(_cpu_based_2nd_exec_control & + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) || + !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) + _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; + min = 0; opt = VM_ENTRY_LOAD_IA32_PAT; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, @@ -2812,11 +2841,10 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_ple()) ple_gap = 0; - if (!cpu_has_vmx_apic_register_virt() || - !cpu_has_vmx_virtual_intr_delivery()) - enable_apicv_reg_vid = 0; + if (!cpu_has_vmx_apicv()) + enable_apicv = 0; - if (enable_apicv_reg_vid) + if (enable_apicv) kvm_x86_ops->update_cr8_intercept = NULL; else kvm_x86_ops->hwapic_irr_update = NULL; @@ -3875,6 +3903,11 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr) msr, MSR_TYPE_W); } +static int vmx_vm_has_apicv(struct kvm *kvm) +{ + return enable_apicv && irqchip_in_kernel(kvm); +} + /* * Set up the vmcs's constant host-state fields, i.e., host-state fields that * will not change in the lifetime of the guest. 
@@ -3935,6 +3968,15 @@ static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); } +static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) +{ + u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; + + if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) + pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; + return pin_based_exec_ctrl; +} + static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; @@ -3952,11 +3994,6 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } -static int vmx_vm_has_apicv(struct kvm *kvm) -{ - return enable_apicv_reg_vid && irqchip_in_kernel(kvm); -} - static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; @@ -4012,8 +4049,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ /* Control */ - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, - vmcs_config.pin_based_exec_ctrl); + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); @@ -4022,13 +4058,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx_secondary_exec_control(vmx)); } - if (enable_apicv_reg_vid) { + if (vmx_vm_has_apicv(vmx->vcpu.kvm)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); vmcs_write64(EOI_EXIT_BITMAP2, 0); vmcs_write64(EOI_EXIT_BITMAP3, 0); vmcs_write16(GUEST_INTR_STATUS, 0); + + vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR); + vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); } if (ple_gap) { @@ -4170,6 +4209,9 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); + if (vmx_vm_has_apicv(vcpu->kvm)) + memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); + if (vmx->vpid != 0) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); @@ -7842,7 +7884,7 @@ static int __init vmx_init(void) memcpy(vmx_msr_bitmap_longmode_x2apic, vmx_msr_bitmap_longmode, PAGE_SIZE); - if (enable_apicv_reg_vid) { + if (enable_apicv) { for (msr = 0x800; msr <= 0x8ff; msr++) vmx_disable_intercept_msr_read_x2apic(msr); From 3d81bc7e96d6bca0b8f8b7d1bf6ea72caa3aac57 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:25:13 +0800 Subject: [PATCH 103/204] KVM: Call common update function when ioapic entry changed. Both the TMR and the EOI exit bitmap need to be updated when the ioapic changes or a vcpu's id/ldr/dfr changes. So use a common function instead of the EOI-exit-bitmap-specific one.
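The common path is request-based. Roughly, using the names introduced by the diff below (a condensed sketch of the x86 variant, without the #ifdef plumbing):

void kvm_vcpu_request_scan_ioapic(struct kvm *kvm)
{
	/* Any ioapic change, or a vcpu id/ldr/dfr change, simply asks
	 * every vcpu to rescan the ioapic on its next entry; the vcpu
	 * then recomputes the EOI exit bitmap (and, later, the TMR). */
	if (kvm->arch.vioapic)
		kvm_make_scan_ioapic_request(kvm);
}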
Signed-off-by: Yang Zhang Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/ia64/kvm/lapic.h | 6 ------ arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/vmx.c | 3 +++ arch/x86/kvm/x86.c | 11 +++++++---- include/linux/kvm_host.h | 4 ++-- virt/kvm/ioapic.c | 22 +++++++++++++--------- virt/kvm/ioapic.h | 6 ++---- virt/kvm/irq_comm.c | 4 ++-- virt/kvm/kvm_main.c | 4 ++-- 9 files changed, 32 insertions(+), 30 deletions(-) diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h index c3e2935b6db4..c5f92a926a9a 100644 --- a/arch/ia64/kvm/lapic.h +++ b/arch/ia64/kvm/lapic.h @@ -27,10 +27,4 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); #define kvm_apic_present(x) (true) #define kvm_lapic_enabled(x) (true) -static inline bool kvm_apic_vid_enabled(void) -{ - /* IA64 has no apicv supporting, do nothing here */ - return false; -} - #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 67962188c775..34a8ca845280 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -217,7 +217,7 @@ out: if (old) kfree_rcu(old, rcu); - kvm_ioapic_make_eoibitmap_request(kvm); + kvm_vcpu_request_scan_ioapic(kvm); } static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7607c6c09c74..8535519d0352 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6414,6 +6414,9 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { + if (!vmx_vm_has_apicv(vcpu->kvm)) + return; + vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1562671a8e18..87a05df3eae4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5661,13 +5661,16 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) #endif } -static void update_eoi_exitmap(struct kvm_vcpu *vcpu) +static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { u64 eoi_exit_bitmap[4]; + if (!kvm_apic_hw_enabled(vcpu->arch.apic)) + return; + memset(eoi_exit_bitmap, 0, 32); - kvm_ioapic_calculate_eoi_exitmap(vcpu, eoi_exit_bitmap); + kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap); kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); } @@ -5724,8 +5727,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_handle_pmu_event(vcpu); if (kvm_check_request(KVM_REQ_PMI, vcpu)) kvm_deliver_pmi(vcpu); - if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu)) - update_eoi_exitmap(vcpu); + if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) + vcpu_scan_ioapic(vcpu); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4a76aca31478..93a50054d46c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -126,7 +126,7 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_MASTERCLOCK_UPDATE 19 #define KVM_REQ_MCLOCK_INPROGRESS 20 #define KVM_REQ_EPR_EXIT 21 -#define KVM_REQ_EOIBITMAP 22 +#define KVM_REQ_SCAN_IOAPIC 22 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 @@ -575,7 +575,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); void kvm_flush_remote_tlbs(struct kvm *kvm); void kvm_reload_remote_mmus(struct kvm *kvm); void kvm_make_mclock_inprogress_request(struct kvm *kvm); -void kvm_make_update_eoibitmap_request(struct kvm *kvm); +void kvm_make_scan_ioapic_request(struct 
kvm *kvm); long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 97c67a50f904..f2157a985a1f 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -193,15 +193,13 @@ static void update_handled_vectors(struct kvm_ioapic *ioapic) smp_wmb(); } -void kvm_ioapic_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, - u64 *eoi_exit_bitmap) +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; union kvm_ioapic_redirect_entry *e; int index; spin_lock(&ioapic->lock); - /* traverse ioapic entry to set eoi exit bitmap*/ for (index = 0; index < IOAPIC_NUM_PINS; index++) { e = &ioapic->redirtbl[index]; if (!e->fields.mask && @@ -215,16 +213,22 @@ void kvm_ioapic_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, } spin_unlock(&ioapic->lock); } -EXPORT_SYMBOL_GPL(kvm_ioapic_calculate_eoi_exitmap); -void kvm_ioapic_make_eoibitmap_request(struct kvm *kvm) +#ifdef CONFIG_X86 +void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; - if (!kvm_apic_vid_enabled(kvm) || !ioapic) + if (!ioapic) return; - kvm_make_update_eoibitmap_request(kvm); + kvm_make_scan_ioapic_request(kvm); } +#else +void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) +{ + return; +} +#endif static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) { @@ -267,7 +271,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG && ioapic->irr & (1 << index)) ioapic_service(ioapic, index, false); - kvm_ioapic_make_eoibitmap_request(ioapic->kvm); + kvm_vcpu_request_scan_ioapic(ioapic->kvm); break; } } @@ -586,7 +590,7 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) spin_lock(&ioapic->lock); memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); update_handled_vectors(ioapic); - kvm_ioapic_make_eoibitmap_request(kvm); + kvm_vcpu_request_scan_ioapic(kvm); kvm_rtc_eoi_tracking_restore_all(ioapic); spin_unlock(&ioapic->lock); return 0; diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 554157bbb586..674a388612b4 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -96,9 +96,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, unsigned long *dest_map); int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); -void kvm_ioapic_make_eoibitmap_request(struct kvm *kvm); -void kvm_ioapic_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, - u64 *eoi_exit_bitmap); - +void kvm_vcpu_request_scan_ioapic(struct kvm *kvm); +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); #endif diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 8efb580edfef..25ab48007adb 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -285,7 +285,7 @@ void kvm_register_irq_ack_notifier(struct kvm *kvm, mutex_lock(&kvm->irq_lock); hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list); mutex_unlock(&kvm->irq_lock); - kvm_ioapic_make_eoibitmap_request(kvm); + kvm_vcpu_request_scan_ioapic(kvm); } void kvm_unregister_irq_ack_notifier(struct kvm *kvm, @@ -295,7 +295,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, hlist_del_init_rcu(&kian->link); mutex_unlock(&kvm->irq_lock); synchronize_rcu(); - kvm_ioapic_make_eoibitmap_request(kvm); + kvm_vcpu_request_scan_ioapic(kvm); } int kvm_request_irq_source_id(struct kvm *kvm) 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ac3182eed462..65a9e0716a8d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -217,9 +217,9 @@ void kvm_make_mclock_inprogress_request(struct kvm *kvm) make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); } -void kvm_make_update_eoibitmap_request(struct kvm *kvm) +void kvm_make_scan_ioapic_request(struct kvm *kvm) { - make_all_cpus_request(kvm, KVM_REQ_EOIBITMAP); + make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); } int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) From cf9e65b773394c5ad8fa7455c43268bc8ec2109f Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:25:14 +0800 Subject: [PATCH 104/204] KVM: Set TMR when programming ioapic entry We already know the trigger mode of a given interrupt when programming the ioapic entry. So it's not necessary to set it in each interrupt delivery. Signed-off-by: Yang Zhang Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 15 +++++++++------ arch/x86/kvm/lapic.h | 1 + arch/x86/kvm/x86.c | 5 ++++- virt/kvm/ioapic.c | 12 +++++++++--- virt/kvm/ioapic.h | 3 ++- 5 files changed, 25 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 34a8ca845280..d197579435d0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -468,6 +468,15 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic) return result; } +void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + int i; + + for (i = 0; i < 8; i++) + apic_set_reg(apic, APIC_TMR + 0x10 * i, tmr[i]); +} + static void apic_update_ppr(struct kvm_lapic *apic) { u32 tpr, isrv, ppr, old_ppr; @@ -661,12 +670,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (dest_map) __set_bit(vcpu->vcpu_id, dest_map); - if (trig_mode) { - apic_debug("level trig mode for vector %d", vector); - apic_set_vector(vector, apic->regs + APIC_TMR); - } else - apic_clear_vector(vector, apic->regs + APIC_TMR); - result = !apic_test_and_set_irr(vector, apic); trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector, !result); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 16304b1a8cba..7fe0c9180ea1 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -53,6 +53,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_apic_set_version(struct kvm_vcpu *vcpu); +void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 87a05df3eae4..276b4a9a5605 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5664,14 +5664,17 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { u64 eoi_exit_bitmap[4]; + u32 tmr[8]; if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return; memset(eoi_exit_bitmap, 0, 32); + memset(tmr, 0, 32); - kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap); + kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr); kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); + kvm_apic_update_tmr(vcpu, tmr); } static int vcpu_enter_guest(struct kvm_vcpu *vcpu) diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index f2157a985a1f..2d682977ce82 100644 ---
a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -193,7 +193,8 @@ static void update_handled_vectors(struct kvm_ioapic *ioapic) smp_wmb(); } -void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, + u32 *tmr) { struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; union kvm_ioapic_redirect_entry *e; @@ -207,8 +208,13 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) || index == RTC_GSI)) { if (kvm_apic_match_dest(vcpu, NULL, 0, - e->fields.dest_id, e->fields.dest_mode)) - __set_bit(e->fields.vector, (unsigned long *)eoi_exit_bitmap); + e->fields.dest_id, e->fields.dest_mode)) { + __set_bit(e->fields.vector, + (unsigned long *)eoi_exit_bitmap); + if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG) + __set_bit(e->fields.vector, + (unsigned long *)tmr); + } } } spin_unlock(&ioapic->lock); diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 674a388612b4..615d8c995c3c 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -97,6 +97,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_vcpu_request_scan_ioapic(struct kvm *kvm); -void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, + u32 *tmr); #endif From a20ed54d6e470bf0d28921b7aadb6ca0da0ff0c3 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:25:15 +0800 Subject: [PATCH 105/204] KVM: VMX: Add the deliver posted interrupt algorithm Only deliver the posted interrupt when target vcpu is running and there is no previous interrupt pending in pir. 
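In outline, the delivery path described above reads as follows (condensed from the vmx.c hunk below; the KVM_REQ_EVENT bookkeeping is omitted here):

	if (pi_test_and_set_pir(vector, &vmx->pi_desc))
		return;	/* a previous interrupt is still pending in PIR */

	if (!pi_test_and_set_on(&vmx->pi_desc) && vcpu->mode == IN_GUEST_MODE)
		/* vcpu runs in non-root mode: the notification IPI makes
		 * the CPU sync PIR into vIRR without a vmexit */
		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
				    POSTED_INTR_VECTOR);
	else
		/* vcpu is not running: kick it so it picks the interrupt
		 * up from PIR on the next vmentry */
		kvm_vcpu_kick(vcpu);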
Signed-off-by: Yang Zhang Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/lapic.c | 13 +++++++ arch/x86/kvm/lapic.h | 1 + arch/x86/kvm/svm.c | 6 ++++ arch/x86/kvm/vmx.c | 64 ++++++++++++++++++++++++++++++++- virt/kvm/kvm_main.c | 1 + 6 files changed, 86 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 68d438630dd3..599f98b612d4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -701,6 +701,8 @@ struct kvm_x86_ops { void (*hwapic_isr_update)(struct kvm *kvm, int isr); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); + void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); + void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d197579435d0..dbf74c922aa1 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -318,6 +318,19 @@ static u8 count_vectors(void *bitmap) return count; } +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) +{ + u32 i, pir_val; + struct kvm_lapic *apic = vcpu->arch.apic; + + for (i = 0; i <= 7; i++) { + pir_val = xchg(&pir[i], 0); + if (pir_val) + *((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val; + } +} +EXPORT_SYMBOL_GPL(kvm_apic_update_irr); + static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) { apic->irr_pending = true; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 7fe0c9180ea1..c730ac9fe801 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -54,6 +54,7 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_apic_set_version(struct kvm_vcpu *vcpu); void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2f8fe3f06837..d6713e18bbc1 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3577,6 +3577,11 @@ static void svm_hwapic_isr_update(struct kvm *kvm, int isr) return; } +static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu) +{ + return; +} + static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4305,6 +4310,7 @@ static struct kvm_x86_ops svm_x86_ops = { .vm_has_apicv = svm_vm_has_apicv, .load_eoi_exitmap = svm_load_eoi_exitmap, .hwapic_isr_update = svm_hwapic_isr_update, + .sync_pir_to_irr = svm_sync_pir_to_irr, .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8535519d0352..3a14d8a0ee46 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -375,6 +375,23 @@ struct pi_desc { u32 rsvd[7]; } __aligned(64); +static bool pi_test_and_set_on(struct pi_desc *pi_desc) +{ + return test_and_set_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static bool pi_test_and_clear_on(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) +{ + return 
test_and_set_bit(vector, (unsigned long *)pi_desc->pir); +} + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; @@ -639,6 +656,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); static bool guest_state_valid(struct kvm_vcpu *vcpu); static u32 vmx_segment_access_rights(struct kvm_segment *var); +static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -2846,8 +2864,11 @@ static __init int hardware_setup(void) if (enable_apicv) kvm_x86_ops->update_cr8_intercept = NULL; - else + else { kvm_x86_ops->hwapic_irr_update = NULL; + kvm_x86_ops->deliver_posted_interrupt = NULL; + kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; + } if (nested) nested_vmx_setup_ctls_msrs(); @@ -3908,6 +3929,45 @@ static int vmx_vm_has_apicv(struct kvm *kvm) return enable_apicv && irqchip_in_kernel(kvm); } +/* + * Send interrupt to vcpu via posted interrupt way. + * 1. If target vcpu is running(non-root mode), send posted interrupt + * notification to vcpu and hardware will sync PIR to vIRR atomically. + * 2. If target vcpu isn't running(root mode), kick it to pick up the + * interrupt from PIR in next vmentry. + */ +static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int r; + + if (pi_test_and_set_pir(vector, &vmx->pi_desc)) + return; + + r = pi_test_and_set_on(&vmx->pi_desc); + kvm_make_request(KVM_REQ_EVENT, vcpu); + if (!r && (vcpu->mode == IN_GUEST_MODE)) + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), + POSTED_INTR_VECTOR); + else + kvm_vcpu_kick(vcpu); +} + +static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!pi_test_and_clear_on(&vmx->pi_desc)) + return; + + kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); +} + +static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu) +{ + return; +} + /* * Set up the vmcs's constant host-state fields, i.e., host-state fields that * will not change in the lifetime of the guest. @@ -7784,6 +7844,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .load_eoi_exitmap = vmx_load_eoi_exitmap, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, + .sync_pir_to_irr = vmx_sync_pir_to_irr, + .deliver_posted_interrupt = vmx_deliver_posted_interrupt, .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 65a9e0716a8d..aaac1a7a9ea8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1671,6 +1671,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) smp_send_reschedule(cpu); put_cpu(); } +EXPORT_SYMBOL_GPL(kvm_vcpu_kick); #endif /* !CONFIG_S390 */ void kvm_resched(struct kvm_vcpu *vcpu) From 5a71785dde307f6ac80e83c0ad3fd694912010a1 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 11 Apr 2013 19:25:16 +0800 Subject: [PATCH 106/204] KVM: VMX: Use posted interrupt to deliver virtual interrupt If posted interrupt is available, use it to inject virtual interrupts into the guest.
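Conceptually, the accept path in the local APIC code becomes (condensed from the lapic.c hunk below):

	if (kvm_x86_ops->deliver_posted_interrupt) {
		/* Hardware, or the posted-interrupt IPI handler, moves
		 * the vector from PIR into vIRR. */
		result = 1;
		kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
	} else {
		/* Legacy software injection path. */
		result = !apic_test_and_set_irr(vector, apic);
		if (result) {
			kvm_make_request(KVM_REQ_EVENT, vcpu);
			kvm_vcpu_kick(vcpu);
		}
	}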
Signed-off-by: Yang Zhang Reviewed-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 30 +++++++++++++++++++----------- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 1 + 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index dbf74c922aa1..e29883c604ff 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -353,6 +353,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) if (!apic->irr_pending) return -1; + kvm_x86_ops->sync_pir_to_irr(apic->vcpu); result = apic_search_irr(apic); ASSERT(result == -1 || result >= 16); @@ -683,18 +684,25 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (dest_map) __set_bit(vcpu->vcpu_id, dest_map); - result = !apic_test_and_set_irr(vector, apic); - trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, - trig_mode, vector, !result); - if (!result) { - if (trig_mode) - apic_debug("level trig mode repeatedly for " - "vector %d", vector); - break; - } + if (kvm_x86_ops->deliver_posted_interrupt) { + result = 1; + kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); + } else { + result = !apic_test_and_set_irr(vector, apic); - kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_vcpu_kick(vcpu); + if (!result) { + if (trig_mode) + apic_debug("level trig mode repeatedly " + "for vector %d", vector); + goto out; + } + + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + } +out: + trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, + trig_mode, vector, !result); break; case APIC_DM_REMRD: diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3a14d8a0ee46..5a87a58af49d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -84,7 +84,7 @@ module_param(vmm_exclusive, bool, S_IRUGO); static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); -static bool __read_mostly enable_apicv; +static bool __read_mostly enable_apicv = 1; module_param(enable_apicv, bool, S_IRUGO); /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 276b4a9a5605..50e2e10b8041 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2692,6 +2692,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { + kvm_x86_ops->sync_pir_to_irr(vcpu); memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); return 0; From 79558f112fc0352e057f7b5e158e3d88b8b62c60 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 16 Apr 2013 19:21:41 +0200 Subject: [PATCH 107/204] KVM: ARM: Fix kvm_vm_ioctl_irq_line Commit aa2fbe6d broke the ARM KVM target by introducing a new parameter to irq handling functions. 
Fix the function prototype to get things compiling again and ignore the parameter just like we did before. Signed-off-by: Alexander Graf Acked-by: Christoffer Dall Signed-off-by: Marcelo Tosatti --- arch/arm/kvm/arm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index e4ad0bb01843..678596f699f3 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -805,7 +805,8 @@ static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level) return 0; } -int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level) +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, + bool line_status) { u32 irq = irq_level->irq; unsigned int irq_type, vcpu_idx, irq_num; From 26539bd0e446a54665f9d6f4c69a21140b2e1d85 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 15 Apr 2013 15:00:27 +0200 Subject: [PATCH 108/204] KVM: nVMX: check vmcs12 for valid activity state KVM does not use the activity state VMCS field, and does not support it in nested VMX either (the corresponding bits in the misc VMX feature MSR are zero). Fail entry if the activity state is set to anything but "active". Since the value will always be the same for L1 and L2, we do not need to read and write the corresponding VMCS field on L1/L2 transitions, either. Signed-off-by: Paolo Bonzini Reviewed-by: Gleb Natapov Reviewed-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5a87a58af49d..8d52bcf371df 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7106,7 +7106,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->vm_entry_instruction_len); vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, vmcs12->guest_interruptibility_info); - vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags); @@ -7325,6 +7324,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } + if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE) { + nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + return 1; + } + if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) && !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) { /*TODO: Also verify bits beyond physical address width are 0*/ @@ -7555,7 +7559,6 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); - vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE); vmcs12->guest_interruptibility_info = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); vmcs12->guest_pending_dbg_exceptions = From f13882d84df31a8567032b6bcbfbdd76ac378513 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 14 Apr 2013 16:07:37 +0300 Subject: [PATCH 109/204] KVM: VMX: Fix check guest state validity if a guest is in VM86 mode If the guest vcpu is in VM86 mode, the vcpu state should be checked as if in real mode.
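For background, a vcpu in virtual-8086 mode runs with CR0.PE set but with real-mode segmentation semantics, so the validity check has to key on RFLAGS.VM as well. A condensed view of the new condition (the full hunk follows below):

	/* treat VM86 like real mode when validating segment state */
	if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
		/* ... rmode_segment_valid() checks for CS, SS, DS, ... */
	}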
Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8d52bcf371df..c84f0cb921f9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3605,7 +3605,7 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu) return true; /* real mode guest state checks */ - if (!is_protmode(vcpu)) { + if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) return false; if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) From 8c32a2ea655d035798d3270717924ad8be903e24 Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Wed, 20 Mar 2013 20:24:58 +0000 Subject: [PATCH 110/204] Added ONE_REG interface for debug instruction This patch adds the one_reg interface to get the special instruction to be used for setting software breakpoint from userspace. Signed-off-by: Bharat Bhushan Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_book3s.h | 2 ++ arch/powerpc/include/asm/kvm_booke.h | 2 ++ arch/powerpc/include/uapi/asm/kvm.h | 4 ++++ arch/powerpc/kvm/book3s.c | 6 ++++++ arch/powerpc/kvm/booke.c | 6 ++++++ 5 files changed, 20 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 5a56e1c5f851..bc81842ea25a 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -458,6 +458,8 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) #define OSI_SC_MAGIC_R4 0x77810F9B #define INS_DCBZ 0x7c0007ec +/* TO = 31 for unconditional trap */ +#define INS_TW 0x7fe00008 /* LPIDs we support with this build -- runtime limit may be lower */ #define KVMPPC_NR_LPIDS (LPID_RSVD + 1) diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h index b7cd3356a532..d3c1eb34c986 100644 --- a/arch/powerpc/include/asm/kvm_booke.h +++ b/arch/powerpc/include/asm/kvm_booke.h @@ -26,6 +26,8 @@ /* LPIDs we support with this build -- runtime limit may be lower */ #define KVMPPC_NR_LPIDS 64 +#define KVMPPC_INST_EHPRIV 0x7c00021c + static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) { vcpu->arch.gpr[num] = val; diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index ef072b1a6e3f..c2ff99c01562 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -422,4 +422,8 @@ struct kvm_get_htab_header { #define KVM_REG_PPC_CLEAR_TSR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x88) #define KVM_REG_PPC_TCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x89) #define KVM_REG_PPC_TSR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8a) + +/* Debugging: Special instruction for software breakpoint */ +#define KVM_REG_PPC_DEBUG_INST (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8b) + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 6548445fd823..2d32ae4bc439 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -529,6 +529,12 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) val = get_reg_val(reg->id, vcpu->arch.vscr.u[3]); break; #endif /* CONFIG_ALTIVEC */ + case KVM_REG_PPC_DEBUG_INST: { + u32 opcode = INS_TW; + r = copy_to_user((u32 __user *)(long)reg->addr, + &opcode, sizeof(u32)); + break; + } default: r = -EINVAL; break; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 58057d6f146d..a49a68a25c39 100644 --- a/arch/powerpc/kvm/booke.c 
+++ b/arch/powerpc/kvm/booke.c @@ -1447,6 +1447,12 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) case KVM_REG_PPC_TSR: r = put_user(vcpu->arch.tsr, (u32 __user *)(long)reg->addr); break; + case KVM_REG_PPC_DEBUG_INST: { + u32 opcode = KVMPPC_INST_EHPRIV; + r = copy_to_user((u32 __user *)(long)reg->addr, + &opcode, sizeof(u32)); + break; + } default: break; } From c32498ee64165cfcbcac9c4318d537c97fd66428 Mon Sep 17 00:00:00 2001 From: Stuart Yoder Date: Tue, 9 Apr 2013 10:36:23 +0000 Subject: [PATCH 111/204] KVM: PPC: emulate dcbst Signed-off-by: Stuart Yoder Signed-off-by: Alexander Graf --- arch/powerpc/kvm/emulate.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index 7a73b6f72a8b..631a2650e4e4 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -38,6 +38,7 @@ #define OP_31_XOP_TRAP 4 #define OP_31_XOP_LWZX 23 +#define OP_31_XOP_DCBST 54 #define OP_31_XOP_TRAP_64 68 #define OP_31_XOP_DCBF 86 #define OP_31_XOP_LBZX 87 @@ -370,6 +371,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs); break; + case OP_31_XOP_DCBST: case OP_31_XOP_DCBF: case OP_31_XOP_DCBI: /* Do nothing. The guest is performing dcbi because From be28a27c993ca6f806145d02dbe493baac83a8e9 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Mon, 15 Apr 2013 15:07:11 +0000 Subject: [PATCH 112/204] kvm/ppc: don't call complete_mmio_load when it's a store complete_mmio_load writes back the mmio result into the destination register. Doing this on a store results in register corruption. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf --- arch/powerpc/kvm/powerpc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 16b45954511c..a822659db50a 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -683,7 +683,6 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, if (!kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr, bytes, &run->mmio.data)) { - kvmppc_complete_mmio_load(vcpu, run); vcpu->mmio_needed = 0; return EMULATE_DONE; } From 6ffbbbbab34ef84b89f8b9f50a5f0fed5795d79a Mon Sep 17 00:00:00 2001 From: "Zhang, Yang Z" Date: Wed, 17 Apr 2013 23:11:54 -0300 Subject: [PATCH 113/204] KVM: x86: Fix posted interrupt with CONFIG_SMP=n ->send_IPI_mask is not defined on UP. Signed-off-by: Yang Zhang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c84f0cb921f9..a05cca608848 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3946,10 +3946,12 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) r = pi_test_and_set_on(&vmx->pi_desc); kvm_make_request(KVM_REQ_EVENT, vcpu); +#ifdef CONFIG_SMP if (!r && (vcpu->mode == IN_GUEST_MODE)) apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), POSTED_INTR_VECTOR); else +#endif kvm_vcpu_kick(vcpu); } From e606264797d11d6488bc079fb95e38fb5bf5c60a Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Wed, 17 Apr 2013 08:46:41 +0800 Subject: [PATCH 114/204] KVM: ia64: Fix kvm_vm_ioctl_irq_line Fix the compile error with kvm_vm_ioctl_irq_line. 
Signed-off-by: Yang Zhang Signed-off-by: Marcelo Tosatti --- arch/ia64/kvm/kvm-ia64.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 7a54455dde39..032c54d63c3d 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -924,13 +924,15 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return 0; } -int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event) +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, + bool line_status) { if (!irqchip_in_kernel(kvm)) return -ENXIO; irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, - irq_event->irq, irq_event->level); + irq_event->irq, irq_event->level, + line_status); return 0; } From 89662e566c73230dab9f1d99dbe2f6d9c8c189a8 Mon Sep 17 00:00:00 2001 From: Abel Gordon Date: Thu, 18 Apr 2013 14:34:55 +0300 Subject: [PATCH 115/204] KVM: nVMX: Shadow-vmcs control fields/bits Add definitions for all the vmcs control fields/bits required to enable vmcs-shadowing Signed-off-by: Abel Gordon Reviewed-by: Orit Wasserman Signed-off-by: Gleb Natapov --- arch/x86/include/asm/vmx.h | 3 +++ arch/x86/include/uapi/asm/msr-index.h | 2 ++ 2 files changed, 5 insertions(+) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 6f07f1999138..f3e01a2cbaa1 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -65,6 +65,7 @@ #define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 +#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 #define PIN_BASED_EXT_INTR_MASK 0x00000001 @@ -150,6 +151,8 @@ enum vmcs_field { EOI_EXIT_BITMAP2_HIGH = 0x00002021, EOI_EXIT_BITMAP3 = 0x00002022, EOI_EXIT_BITMAP3_HIGH = 0x00002023, + VMREAD_BITMAP = 0x00002026, + VMWRITE_BITMAP = 0x00002028, GUEST_PHYSICAL_ADDRESS = 0x00002400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index 892ce40a7470..dc469e3e708c 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -522,6 +522,8 @@ #define VMX_BASIC_MEM_TYPE_WB 6LLU #define VMX_BASIC_INOUT 0x0040000000000000LLU +/* MSR_IA32_VMX_MISC bits */ +#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) /* AMD-V MSRs */ #define MSR_VM_CR 0xc0010114 From abc4fc58c5ba1a794092bcd97fdb1680b0b3398d Mon Sep 17 00:00:00 2001 From: Abel Gordon Date: Thu, 18 Apr 2013 14:35:25 +0300 Subject: [PATCH 116/204] KVM: nVMX: Detect shadow-vmcs capability Add logic required to detect if shadow-vmcs is supported by the processor. Introduce a new kernel module parameter to specify if L0 should use shadow vmcs (or not) to run L1. 
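Detection is a two-part test, sketched below from the hunk that follows: the secondary execution control must be available, and the CPU must allow VMWRITE to the read-only exit-information fields (MSR_IA32_VMX_MISC bit 29), since L0 writes those fields into the shadow when it emulates exits for L1:

	u64 vmx_msr;

	rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
	if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
		return false;	/* cannot shadow the r/o exit-info fields */
	return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_SHADOW_VMCS;

The new enable_shadow_vmcs module parameter then lets an administrator force the feature off at load time even on capable hardware.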
Signed-off-by: Abel Gordon Reviewed-by: Orit Wasserman Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a05cca608848..7042b6921961 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -87,6 +87,8 @@ module_param(fasteoi, bool, S_IRUGO); static bool __read_mostly enable_apicv = 1; module_param(enable_apicv, bool, S_IRUGO); +static bool __read_mostly enable_shadow_vmcs = 1; +module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -940,6 +942,18 @@ static inline bool cpu_has_vmx_wbinvd_exit(void) SECONDARY_EXEC_WBINVD_EXITING; } +static inline bool cpu_has_vmx_shadow_vmcs(void) +{ + u64 vmx_msr; + rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); + /* check if the cpu supports writing r/o exit information fields */ + if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) + return false; + + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_SHADOW_VMCS; +} + static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -2632,7 +2646,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | + SECONDARY_EXEC_SHADOW_VMCS; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -2833,6 +2848,8 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_vpid()) enable_vpid = 0; + if (!cpu_has_vmx_shadow_vmcs()) + enable_shadow_vmcs = 0; if (!cpu_has_vmx_ept() || !cpu_has_vmx_ept_4levels()) { @@ -4077,6 +4094,12 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD + (handle_vmptrld). + We can NOT enable shadow_vmcs here because we don't have yet + a current VMCS12 + */ + exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; return exec_control; } From 4607c2d7a2ee90707de2b3b37e4d9156e05cdf29 Mon Sep 17 00:00:00 2001 From: Abel Gordon Date: Thu, 18 Apr 2013 14:35:55 +0300 Subject: [PATCH 117/204] KVM: nVMX: Introduce vmread and vmwrite bitmaps Prepare vmread and vmwrite bitmaps according to a pre-specified list of fields. These lists are intended to specify the most frequently accessed fields, so we can minimize the number of fields that are copied from/to the software-controlled VMCS12 format to/from the processor-specific shadow vmcs. The lists were built by measuring the VMCS field access rate after L2 Ubuntu 12.04 booted while running on top of L1 KVM, also Ubuntu 12.04. Note that during boot there were additional fields which were frequently modified, but they were not added to these lists because after boot these fields were no longer accessed by L1.
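The bitmaps follow the usual VMX trap-bitmap convention: a bit set to 1 makes the corresponding VMREAD/VMWRITE exit to L0, while a clear bit lets L1 access the shadowed field directly. A condensed view of the initialization this patch adds to vmx_init():

	/* default: every VMREAD/VMWRITE by L1 traps to L0 */
	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	/* clear the bits for shadowed fields: no exit for these */
	for (i = 0; i < max_shadow_read_write_fields; i++) {
		clear_bit(shadow_read_write_fields[i], vmx_vmread_bitmap);
		clear_bit(shadow_read_write_fields[i], vmx_vmwrite_bitmap);
	}
	for (i = 0; i < max_shadow_read_only_fields; i++)
		clear_bit(shadow_read_only_fields[i], vmx_vmread_bitmap);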
Signed-off-by: Abel Gordon Reviewed-by: Orit Wasserman Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 90 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7042b6921961..7dc599630430 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -484,6 +484,64 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) #define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ [number##_HIGH] = VMCS12_OFFSET(name)+4 + +static const unsigned long shadow_read_only_fields[] = { + /* + * We do NOT shadow fields that are modified when L0 + * traps and emulates any vmx instruction (e.g. VMPTRLD, + * VMXON...) executed by L1. + * For example, VM_INSTRUCTION_ERROR is read + * by L1 if a vmx instruction fails (part of the error path). + * Note the code assumes this logic. If for some reason + * we start shadowing these fields then we need to + * force a shadow sync when L0 emulates vmx instructions + * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified + * by nested_vmx_failValid) + */ + VM_EXIT_REASON, + VM_EXIT_INTR_INFO, + VM_EXIT_INSTRUCTION_LEN, + IDT_VECTORING_INFO_FIELD, + IDT_VECTORING_ERROR_CODE, + VM_EXIT_INTR_ERROR_CODE, + EXIT_QUALIFICATION, + GUEST_LINEAR_ADDRESS, + GUEST_PHYSICAL_ADDRESS +}; +static const int max_shadow_read_only_fields = + ARRAY_SIZE(shadow_read_only_fields); + +static const unsigned long shadow_read_write_fields[] = { + GUEST_RIP, + GUEST_RSP, + GUEST_CR0, + GUEST_CR3, + GUEST_CR4, + GUEST_INTERRUPTIBILITY_INFO, + GUEST_RFLAGS, + GUEST_CS_SELECTOR, + GUEST_CS_AR_BYTES, + GUEST_CS_LIMIT, + GUEST_CS_BASE, + GUEST_ES_BASE, + CR0_GUEST_HOST_MASK, + CR0_READ_SHADOW, + CR4_READ_SHADOW, + TSC_OFFSET, + EXCEPTION_BITMAP, + CPU_BASED_VM_EXEC_CONTROL, + VM_ENTRY_EXCEPTION_ERROR_CODE, + VM_ENTRY_INTR_INFO_FIELD, + VM_ENTRY_INSTRUCTION_LEN, + VM_ENTRY_EXCEPTION_ERROR_CODE, + HOST_FS_BASE, + HOST_GS_BASE, + HOST_FS_SELECTOR, + HOST_GS_SELECTOR +}; +static const int max_shadow_read_write_fields = + ARRAY_SIZE(shadow_read_write_fields); + static const unsigned short vmcs_field_to_offset_table[] = { FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), FIELD(GUEST_ES_SELECTOR, guest_es_selector), @@ -675,6 +733,8 @@ static unsigned long *vmx_msr_bitmap_legacy; static unsigned long *vmx_msr_bitmap_longmode; static unsigned long *vmx_msr_bitmap_legacy_x2apic; static unsigned long *vmx_msr_bitmap_longmode_x2apic; +static unsigned long *vmx_vmread_bitmap; +static unsigned long *vmx_vmwrite_bitmap; static bool cpu_has_load_ia32_efer; static bool cpu_has_load_perf_global_ctrl; @@ -4128,6 +4188,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b)); + if (enable_shadow_vmcs) { + vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); + vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); + } if (cpu_has_vmx_msr_bitmap()) vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); @@ -7941,6 +8005,24 @@ static int __init vmx_init(void) (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_longmode_x2apic) goto out4; + vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_vmread_bitmap) + goto out5; + + vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_vmwrite_bitmap) + goto out6; + + memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); + memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); + /* shadowed read/write fields */ + for (i = 0; 
i < max_shadow_read_write_fields; i++) { + clear_bit(shadow_read_write_fields[i], vmx_vmwrite_bitmap); + clear_bit(shadow_read_write_fields[i], vmx_vmread_bitmap); + } + /* shadowed read only fields */ + for (i = 0; i < max_shadow_read_only_fields; i++) + clear_bit(shadow_read_only_fields[i], vmx_vmread_bitmap); /* * Allow direct access to the PC debug port (it is often used for I/O @@ -7959,7 +8041,7 @@ static int __init vmx_init(void) r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), THIS_MODULE); if (r) - goto out5; + goto out7; #ifdef CONFIG_KEXEC rcu_assign_pointer(crash_vmclear_loaded_vmcss, @@ -8007,6 +8089,10 @@ static int __init vmx_init(void) return 0; +out7: + free_page((unsigned long)vmx_vmwrite_bitmap); +out6: + free_page((unsigned long)vmx_vmread_bitmap); out5: free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); out4: @@ -8030,6 +8116,8 @@ static void __exit vmx_exit(void) free_page((unsigned long)vmx_msr_bitmap_longmode); free_page((unsigned long)vmx_io_bitmap_b); free_page((unsigned long)vmx_io_bitmap_a); + free_page((unsigned long)vmx_vmwrite_bitmap); + free_page((unsigned long)vmx_vmread_bitmap); #ifdef CONFIG_KEXEC rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL); From 20b97feaf615da2c43361e1dda7efd068a4446ef Mon Sep 17 00:00:00 2001 From: Abel Gordon Date: Thu, 18 Apr 2013 14:36:25 +0300 Subject: [PATCH 118/204] KVM: nVMX: Refactor handle_vmwrite Refactor existing code so we reuse vmcs12_write_any to copy fields from the shadow vmcs specified by the link pointer (used by the processor, implementation-specific) to the VMCS12 software format used by L0 to hold the fields in L1 memory address space. Signed-off-by: Abel Gordon Reviewed-by: Orit Wasserman Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 52 +++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7dc599630430..1ee63cae82cd 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5842,6 +5842,33 @@ static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu, } } + +static inline bool vmcs12_write_any(struct kvm_vcpu *vcpu, + unsigned long field, u64 field_value){ + short offset = vmcs_field_to_offset(field); + char *p = ((char *) get_vmcs12(vcpu)) + offset; + if (offset < 0) + return false; + + switch (vmcs_field_type(field)) { + case VMCS_FIELD_TYPE_U16: + *(u16 *)p = field_value; + return true; + case VMCS_FIELD_TYPE_U32: + *(u32 *)p = field_value; + return true; + case VMCS_FIELD_TYPE_U64: + *(u64 *)p = field_value; + return true; + case VMCS_FIELD_TYPE_NATURAL_WIDTH: + *(natural_width *)p = field_value; + return true; + default: + return false; /* can never happen. */ + } + +} + /* * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was * used before) all generate the same failure when it is missing. @@ -5906,8 +5933,6 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) gva_t gva; unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - char *p; - short offset; /* The value to write might be 32 or 64 bits, depending on L1's long * mode, and eventually we need to write that into a field of several * possible lengths.
The code below first zero-extends the value to 64 @@ -5944,28 +5969,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return 1; } - offset = vmcs_field_to_offset(field); - if (offset < 0) { - nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); - skip_emulated_instruction(vcpu); - return 1; - } - p = ((char *) get_vmcs12(vcpu)) + offset; - - switch (vmcs_field_type(field)) { - case VMCS_FIELD_TYPE_U16: - *(u16 *)p = field_value; - break; - case VMCS_FIELD_TYPE_U32: - *(u32 *)p = field_value; - break; - case VMCS_FIELD_TYPE_U64: - *(u64 *)p = field_value; - break; - case VMCS_FIELD_TYPE_NATURAL_WIDTH: - *(natural_width *)p = field_value; - break; - default: + if (!vmcs12_write_any(vcpu, field, field_value)) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); skip_emulated_instruction(vcpu); return 1; From 145c28dd19a2a05aa798e77bd7e679847fe8bee3 Mon Sep 17 00:00:00 2001 From: Abel Gordon Date: Thu, 18 Apr 2013 14:36:55 +0300 Subject: [PATCH 119/204] KVM: nVMX: Fix VMXON emulation handle_vmon doesn't check if L1 is already in root mode (VMXON was previously called). This patch adds this missing check and calls nested_vmx_failValid if VMX is already ON. We need this check because L0 will allocate the shadow vmcs when L1 executes VMXON and we want to avoid host leaks (due to shadow vmcs allocation) if L1 executes VMXON repeatedly. Signed-off-by: Abel Gordon Reviewed-by: Orit Wasserman Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1ee63cae82cd..1cdfb5d7580a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5512,6 +5512,9 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) free_loaded_vmcs(&vmx->vmcs01); } +static void nested_vmx_failValid(struct kvm_vcpu *vcpu, + u32 vm_instruction_error); + /* * Emulate the VMXON instruction. * Currently, we just remember that VMX is active, and do not save or even @@ -5547,6 +5550,11 @@ static int handle_vmon(struct kvm_vcpu *vcpu) kvm_inject_gp(vcpu, 0); return 1; } + if (vmx->nested.vmxon) { + nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); + skip_emulated_instruction(vcpu); + return 1; + } INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); vmx->nested.vmcs02_num = 0; From 8de48833708a7834f1ba65a80944b1045082553a Mon Sep 17 00:00:00 2001 From: Abel Gordon Date: Thu, 18 Apr 2013 14:37:25 +0300 Subject: [PATCH 120/204] KVM: nVMX: Allocate shadow vmcs Allocate a shadow vmcs used by the processor to shadow part of the fields stored in the software defined VMCS12 (let L1 access fields without causing exits). Note we keep a shadow vmcs only for the current vmcs12. Once a vmcs12 becomes non-current, its shadow vmcs is released. 
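For context, bit 31 of the VMCS revision identifier is what tags a VMCS region as a shadow VMCS, which is why the allocation path sets it before clearing the new region. Condensed from the handle_vmon() hunk below:

	shadow_vmcs = alloc_vmcs();
	if (!shadow_vmcs)
		return -ENOMEM;
	/* revision-id bit 31 marks this region as a shadow VMCS */
	shadow_vmcs->revision_id |= (1u << 31);
	vmcs_clear(shadow_vmcs);	/* start from the clear state */
	vmx->nested.current_shadow_vmcs = shadow_vmcs;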
Signed-off-by: Abel Gordon Reviewed-by: Orit Wasserman Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1cdfb5d7580a..7b27af9a14d8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -355,6 +355,7 @@ struct nested_vmx { /* The host-usable pointer to the above */ struct page *current_vmcs12_page; struct vmcs12 *current_vmcs12; + struct vmcs *current_shadow_vmcs; /* vmcs02_list cache of VMCSs recently used to run L2 guests */ struct list_head vmcs02_pool; @@ -5527,6 +5528,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) { struct kvm_segment cs; struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs *shadow_vmcs; /* The Intel VMX Instruction Reference lists a bunch of bits that * are prerequisite to running VMXON, most notably cr4.VMXE must be @@ -5555,6 +5557,16 @@ static int handle_vmon(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); return 1; } + if (enable_shadow_vmcs) { + shadow_vmcs = alloc_vmcs(); + if (!shadow_vmcs) + return -ENOMEM; + /* mark vmcs as shadow */ + shadow_vmcs->revision_id |= (1u << 31); + /* init shadow vmcs */ + vmcs_clear(shadow_vmcs); + vmx->nested.current_shadow_vmcs = shadow_vmcs; + } INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); vmx->nested.vmcs02_num = 0; From e7953d7fab47a72e0d82e5d4dbb022864d8c2a41 Mon Sep 17 00:00:00 2001 From: Abel Gordon Date: Thu, 18 Apr 2013 14:37:55 +0300 Subject: [PATCH 121/204] KVM: nVMX: Release shadow vmcs Unmap vmcs12 and release the corresponding shadow vmcs Signed-off-by: Abel Gordon Reviewed-by: Orit Wasserman Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7b27af9a14d8..ad3957e5f2e8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5607,6 +5607,12 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) return 1; } +static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) +{ + kunmap(vmx->nested.current_vmcs12_page); + nested_release_page(vmx->nested.current_vmcs12_page); +} + /* * Free whatever needs to be freed from vmx->nested when L1 goes down, or * just stops using VMX. 
@@ -5617,11 +5623,12 @@ static void free_nested(struct vcpu_vmx *vmx) return; vmx->nested.vmxon = false; if (vmx->nested.current_vmptr != -1ull) { - kunmap(vmx->nested.current_vmcs12_page); - nested_release_page(vmx->nested.current_vmcs12_page); + nested_release_vmcs12(vmx); vmx->nested.current_vmptr = -1ull; vmx->nested.current_vmcs12 = NULL; } + if (enable_shadow_vmcs) + free_vmcs(vmx->nested.current_shadow_vmcs); /* Unpin physical memory we referred to in current vmcs02 */ if (vmx->nested.apic_access_page) { nested_release_page(vmx->nested.apic_access_page); @@ -5762,8 +5769,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) } if (vmptr == vmx->nested.current_vmptr) { - kunmap(vmx->nested.current_vmcs12_page); - nested_release_page(vmx->nested.current_vmcs12_page); + nested_release_vmcs12(vmx); vmx->nested.current_vmptr = -1ull; vmx->nested.current_vmcs12 = NULL; } @@ -6045,10 +6051,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); return 1; } - if (vmx->nested.current_vmptr != -1ull) { - kunmap(vmx->nested.current_vmcs12_page); - nested_release_page(vmx->nested.current_vmcs12_page); - } + if (vmx->nested.current_vmptr != -1ull) + nested_release_vmcs12(vmx); vmx->nested.current_vmptr = vmptr; vmx->nested.current_vmcs12 = new_vmcs12; From 16f5b9034b699dea2f8862c324a0fd3275b4a50f Mon Sep 17 00:00:00 2001 From: Abel Gordon Date: Thu, 18 Apr 2013 14:38:25 +0300 Subject: [PATCH 122/204] KVM: nVMX: Copy processor-specific shadow-vmcs to VMCS12 Introduce a function used to copy fields from the processor-specific shadow vmcs to the software controlled VMCS12 Signed-off-by: Abel Gordon Reviewed-by: Orit Wasserman Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ad3957e5f2e8..628d012112f5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -718,6 +718,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, static bool guest_state_valid(struct kvm_vcpu *vcpu); static u32 vmx_segment_access_rights(struct kvm_segment *var); static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu); +static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -5895,6 +5896,40 @@ static inline bool vmcs12_write_any(struct kvm_vcpu *vcpu, } +static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) +{ + int i; + unsigned long field; + u64 field_value; + struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; + unsigned long *fields = (unsigned long *)shadow_read_write_fields; + int num_fields = max_shadow_read_write_fields; + + vmcs_load(shadow_vmcs); + + for (i = 0; i < num_fields; i++) { + field = fields[i]; + switch (vmcs_field_type(field)) { + case VMCS_FIELD_TYPE_U16: + field_value = vmcs_read16(field); + break; + case VMCS_FIELD_TYPE_U32: + field_value = vmcs_read32(field); + break; + case VMCS_FIELD_TYPE_U64: + field_value = vmcs_read64(field); + break; + case VMCS_FIELD_TYPE_NATURAL_WIDTH: + field_value = vmcs_readl(field); + break; + } + vmcs12_write_any(&vmx->vcpu, field, field_value); + } + + vmcs_clear(shadow_vmcs); + vmcs_load(vmx->loaded_vmcs->vmcs); +} + /* * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was * used before) all generate the same failure when it is missing. 
From c3114420d1c7a6075fb0cfdc69b567423e5cfc13 Mon Sep 17 00:00:00 2001 From: Abel Gordon Date: Thu, 18 Apr 2013 14:38:55 +0300 Subject: [PATCH 123/204] KVM: nVMX: Copy VMCS12 to processor-specific shadow vmcs Introduce a function used to copy fields from the software controlled VMCS12 to the processor-specific shadow vmcs Signed-off-by: Abel Gordon Reviewed-by: Orit Wasserman Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 628d012112f5..8dc59aaab3db 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -718,6 +718,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, static bool guest_state_valid(struct kvm_vcpu *vcpu); static u32 vmx_segment_access_rights(struct kvm_segment *var); static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu); +static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); static DEFINE_PER_CPU(struct vmcs *, vmxarea); @@ -5930,6 +5931,50 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) vmcs_load(vmx->loaded_vmcs->vmcs); } +static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) +{ + unsigned long *fields[] = { + (unsigned long *)shadow_read_write_fields, + (unsigned long *)shadow_read_only_fields + }; + int num_lists = ARRAY_SIZE(fields); + int max_fields[] = { + max_shadow_read_write_fields, + max_shadow_read_only_fields + }; + int i, q; + unsigned long field; + u64 field_value = 0; + struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; + + vmcs_load(shadow_vmcs); + + for (q = 0; q < num_lists; q++) { + for (i = 0; i < max_fields[q]; i++) { + field = fields[q][i]; + vmcs12_read_any(&vmx->vcpu, field, &field_value); + + switch (vmcs_field_type(field)) { + case VMCS_FIELD_TYPE_U16: + vmcs_write16(field, (u16)field_value); + break; + case VMCS_FIELD_TYPE_U32: + vmcs_write32(field, (u32)field_value); + break; + case VMCS_FIELD_TYPE_U64: + vmcs_write64(field, (u64)field_value); + break; + case VMCS_FIELD_TYPE_NATURAL_WIDTH: + vmcs_writel(field, (long)field_value); + break; + } + } + } + + vmcs_clear(shadow_vmcs); + vmcs_load(vmx->loaded_vmcs->vmcs); +} + /* * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was * used before) all generate the same failure when it is missing. 
From 012f83cb2f8d7b9b7ad3b65e7e53a9365a357014 Mon Sep 17 00:00:00 2001 From: Abel Gordon Date: Thu, 18 Apr 2013 14:39:25 +0300 Subject: [PATCH 124/204] KVM: nVMX: Synchronize VMCS12 content with the shadow vmcs Synchronize between the VMCS12 software controlled structure and the processor-specific shadow vmcs Signed-off-by: Abel Gordon Reviewed-by: Orit Wasserman Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8dc59aaab3db..c5baecc37c98 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -356,6 +356,11 @@ struct nested_vmx { struct page *current_vmcs12_page; struct vmcs12 *current_vmcs12; struct vmcs *current_shadow_vmcs; + /* + * Indicates if the shadow vmcs must be updated with the + * data hold by vmcs12 + */ + bool sync_shadow_vmcs; /* vmcs02_list cache of VMCSs recently used to run L2 guests */ struct list_head vmcs02_pool; @@ -5611,6 +5616,14 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) { + if (enable_shadow_vmcs) { + if (vmx->nested.current_vmcs12 != NULL) { + /* copy to memory all shadowed fields in case + they were modified */ + copy_shadow_to_vmcs12(vmx); + vmx->nested.sync_shadow_vmcs = false; + } + } kunmap(vmx->nested.current_vmcs12_page); nested_release_page(vmx->nested.current_vmcs12_page); } @@ -5739,6 +5752,10 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu, X86_EFLAGS_SF | X86_EFLAGS_OF)) | X86_EFLAGS_ZF); get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; + /* + * We don't need to force a shadow sync because + * VM_INSTRUCTION_ERROR is not shadowed + */ } /* Emulate the VMCLEAR instruction */ @@ -6137,6 +6154,9 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) vmx->nested.current_vmptr = vmptr; vmx->nested.current_vmcs12 = new_vmcs12; vmx->nested.current_vmcs12_page = page; + if (enable_shadow_vmcs) { + vmx->nested.sync_shadow_vmcs = true; + } } nested_vmx_succeed(vcpu); @@ -6895,6 +6915,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vmx->emulation_required) return; + if (vmx->nested.sync_shadow_vmcs) { + copy_vmcs12_to_shadow(vmx); + vmx->nested.sync_shadow_vmcs = false; + } + if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) @@ -7504,6 +7529,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) skip_emulated_instruction(vcpu); vmcs12 = get_vmcs12(vcpu); + if (enable_shadow_vmcs) + copy_shadow_to_vmcs12(vmx); + /* * The nested entry process starts with enforcing various prerequisites * on vmcs12 as required by the Intel SDM, and act appropriately when @@ -7950,6 +7978,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR)); } else nested_vmx_succeed(vcpu); + if (enable_shadow_vmcs) + vmx->nested.sync_shadow_vmcs = true; } /* @@ -7967,6 +7997,8 @@ static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY; vmcs12->exit_qualification = qualification; nested_vmx_succeed(vcpu); + if (enable_shadow_vmcs) + to_vmx(vcpu)->nested.sync_shadow_vmcs = true; } static int vmx_check_intercept(struct kvm_vcpu *vcpu, From 8a1b9dd0006bce5cc770fd80bc95f9916670c151 Mon Sep 17 00:00:00 2001 From: Abel Gordon Date: Thu, 18 Apr 2013 14:39:55 
+0300 Subject: [PATCH 125/204] KVM: nVMX: Enable and disable shadow vmcs functionality Once L1 loads VMCS12 we enable shadow-vmcs capability and copy all the VMCS12 shadowed fields to the shadow vmcs. When we release the VMCS12, we also disable shadow-vmcs capability. Signed-off-by: Abel Gordon Reviewed-by: Orit Wasserman Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c5baecc37c98..6abce2d5d18e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5616,12 +5616,17 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) { + u32 exec_control; if (enable_shadow_vmcs) { if (vmx->nested.current_vmcs12 != NULL) { /* copy to memory all shadowed fields in case they were modified */ copy_shadow_to_vmcs12(vmx); vmx->nested.sync_shadow_vmcs = false; + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_write64(VMCS_LINK_POINTER, -1ull); } } kunmap(vmx->nested.current_vmcs12_page); @@ -6110,6 +6115,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) gva_t gva; gpa_t vmptr; struct x86_exception e; + u32 exec_control; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -6155,6 +6161,11 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) vmx->nested.current_vmcs12 = new_vmcs12; vmx->nested.current_vmcs12_page = page; if (enable_shadow_vmcs) { + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control |= SECONDARY_EXEC_SHADOW_VMCS; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_write64(VMCS_LINK_POINTER, + __pa(vmx->nested.current_shadow_vmcs)); vmx->nested.sync_shadow_vmcs = true; } } From f1797359216c1daa145a354d07b8b2b7459668f4 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 18 Apr 2013 07:41:00 +0800 Subject: [PATCH 126/204] KVM: x86: fix error return code in kvm_arch_vcpu_init() Fix to return a negative error code from the error handling case instead of 0, as returned elsewhere in this function. Signed-off-by: Wei Yongjun Reviewed-by: Paolo Bonzini Signed-off-by: Gleb Natapov --- arch/x86/kvm/x86.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 50e2e10b8041..cbe16b4cad5c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6752,8 +6752,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; - if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) { + r = -ENOMEM; goto fail_free_mce_banks; + } r = fx_init(vcpu); if (r) From 27469d29b3caf889ddf81c7d89f0676e45eb551d Mon Sep 17 00:00:00 2001 From: Andrew Honig Date: Thu, 18 Apr 2013 09:38:14 -0700 Subject: [PATCH 127/204] KVM: x86: Fix memory leak in vmx.c If userspace creates and destroys multiple VMs within the same process we leak 20k of memory in the userspace process context per VM. This patch frees the memory in kvm_arch_destroy_vm. If the process exits without closing the VM file descriptor or the file descriptor has been shared with another process then we don't free the memory. It's still possible for a user space process to leak memory if the last process to close the fd for the VM is not the process that created it. 
However, this is an unexpected case that's only caused by a user space process that's misbehaving. Signed-off-by: Andrew Honig Signed-off-by: Gleb Natapov --- arch/x86/kvm/x86.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cbe16b4cad5c..e730a462ed05 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6859,6 +6859,23 @@ void kvm_arch_sync_events(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { + if (current->mm == kvm->mm) { + /* + * Free memory regions allocated on behalf of userspace, + * unless the the memory map has changed due to process exit + * or fd copying. + */ + struct kvm_userspace_memory_region mem; + memset(&mem, 0, sizeof(mem)); + mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; + kvm_set_memory_region(kvm, &mem); + + mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; + kvm_set_memory_region(kvm, &mem); + + mem.slot = TSS_PRIVATE_MEMSLOT; + kvm_set_memory_region(kvm, &mem); + } kvm_iommu_unmap_guest(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); From 2505dc9fad44491ac4ce06290f358c030abe6be3 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 14 Apr 2013 12:12:47 +0200 Subject: [PATCH 128/204] KVM: VMX: Move vmx_nmi_allowed after vmx_set_nmi_mask vmx_set_nmi_mask will soon be used by vmx_nmi_allowed. No functional changes. Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6abce2d5d18e..451ab2a57ca0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4486,16 +4486,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); } -static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) -{ - if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) - return 0; - - return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI - | GUEST_INTR_STATE_NMI)); -} - static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) { if (!cpu_has_virtual_nmis()) @@ -4525,6 +4515,16 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) } } +static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) +{ + if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) + return 0; + + return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI + | GUEST_INTR_STATE_NMI)); +} + static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu)) { From ea8ceb8354e1c84a13cf2a8e915dc74f96759393 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 14 Apr 2013 21:04:26 +0200 Subject: [PATCH 129/204] KVM: nVMX: Fix conditions for NMI injection The logic for checking if interrupts can be injected has to be applied also on NMIs. The difference is that if NMI interception is on these events are consumed and blocked by the VM exit. 
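In outline, vmx_nmi_allowed() now handles the nested case the same way the interrupt path does; a condensed sketch of the hunk below (the nested_run_pending check is omitted):

	if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) {
		/* reflect the NMI to L1 as an exception/NMI VM exit ... */
		nested_vmx_vmexit(vcpu);
		/* ... and count that exit as the injection itself */
		vcpu->arch.nmi_pending = 0;
		vmx_set_nmi_mask(vcpu, true);
		return 0;	/* nothing left to inject right now */
	}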
Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 451ab2a57ca0..11ea3cbbb78c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4392,6 +4392,12 @@ static bool nested_exit_on_intr(struct kvm_vcpu *vcpu) PIN_BASED_EXT_INTR_MASK; } +static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) +{ + return get_vmcs12(vcpu)->pin_based_vm_exec_control & + PIN_BASED_NMI_EXITING; +} + static void enable_irq_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; @@ -4517,6 +4523,26 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) { + if (is_guest_mode(vcpu)) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (to_vmx(vcpu)->nested.nested_run_pending) + return 0; + if (nested_exit_on_nmi(vcpu)) { + nested_vmx_vmexit(vcpu); + vmcs12->vm_exit_reason = EXIT_REASON_EXCEPTION_NMI; + vmcs12->vm_exit_intr_info = NMI_VECTOR | + INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK; + /* + * The NMI-triggered VM exit counts as injection: + * clear this one and block further NMIs. + */ + vcpu->arch.nmi_pending = 0; + vmx_set_nmi_mask(vcpu, true); + return 0; + } + } + if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) return 0; From 384bb783275145b70d769acf4c687957d1c61802 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sat, 20 Apr 2013 10:52:36 +0200 Subject: [PATCH 130/204] KVM: nVMX: Validate EFER values for VM_ENTRY/EXIT_LOAD_IA32_EFER As we may emulate the loading of EFER on VM-entry and VM-exit, implement the checks that VMX performs on the guest and host values on vmlaunch/ vmresume. Factor out kvm_valid_efer for this purpose which checks for set reserved bits. Signed-off-by: Jan Kiszka Reviewed-by: Paolo Bonzini Signed-off-by: Gleb Natapov --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx.c | 40 +++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 29 +++++++++++++++--------- 3 files changed, 60 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 599f98b612d4..18635ae42a8e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -809,6 +809,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu, } void kvm_enable_efer_bits(u64); +bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 11ea3cbbb78c..d7ef55686227 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7558,6 +7558,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) struct vcpu_vmx *vmx = to_vmx(vcpu); int cpu; struct loaded_vmcs *vmcs02; + bool ia32e; if (!nested_vmx_check_permission(vcpu) || !nested_vmx_check_vmcs12(vcpu)) @@ -7648,6 +7649,45 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } + /* + * If the “load IA32_EFER” VM-entry control is 1, the following checks + * are performed on the field for the IA32_EFER MSR: + * - Bits reserved in the IA32_EFER MSR must be 0. + * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of + * the IA-32e mode guest VM-exit control. It must also be identical + * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to + * CR0.PG) is 1. 
+ */ + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) { + ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; + if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) || + ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || + ((vmcs12->guest_cr0 & X86_CR0_PG) && + ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) { + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); + return 1; + } + } + + /* + * If the load IA32_EFER VM-exit control is 1, bits reserved in the + * IA32_EFER MSR must be 0 in the field for that register. In addition, + * the values of the LMA and LME bits in the field must each be that of + * the host address-space size VM-exit control. + */ + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { + ia32e = (vmcs12->vm_exit_controls & + VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; + if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || + ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || + ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) { + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); + return 1; + } + } + /* * We're finally done with prerequisite checking, and can start with * the nested entry. diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e730a462ed05..2a434bf3918d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -845,23 +845,17 @@ static const u32 emulated_msrs[] = { MSR_IA32_MCG_CTL, }; -static int set_efer(struct kvm_vcpu *vcpu, u64 efer) +bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { - u64 old_efer = vcpu->arch.efer; - if (efer & efer_reserved_bits) - return 1; - - if (is_paging(vcpu) - && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) - return 1; + return false; if (efer & EFER_FFXSR) { struct kvm_cpuid_entry2 *feat; feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) - return 1; + return false; } if (efer & EFER_SVME) { @@ -869,9 +863,24 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer) feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) - return 1; + return false; } + return true; +} +EXPORT_SYMBOL_GPL(kvm_valid_efer); + +static int set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + u64 old_efer = vcpu->arch.efer; + + if (!kvm_valid_efer(vcpu, efer)) + return 1; + + if (is_paging(vcpu) + && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) + return 1; + efer &= ~EFER_LMA; efer |= vcpu->arch.efer & EFER_LMA; From d1fa0352a151a597e8749e7be84121a1ff0d3502 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 14 Apr 2013 12:44:54 +0200 Subject: [PATCH 131/204] KVM: nVMX: VM_ENTRY/EXIT_LOAD_IA32_EFER overrides EFER.LMA settings If we load the complete EFER MSR on entry or exit, EFER.LMA (and LME) loading is skipped. Their consistency is already checked now before starting the transition. 
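The resulting precedence on emulated VM-entry is shown below; the VM-exit side is symmetric, using the host fields and the host address-space size control:

	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
		vcpu->arch.efer = vmcs12->guest_ia32_efer;	/* full load wins */
	else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
	else
		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);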
Signed-off-by: Jan Kiszka Reviewed-by: Paolo Bonzini Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d7ef55686227..5863adf71ede 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7519,7 +7519,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->guest_ia32_efer; - if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) + else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) vcpu->arch.efer |= (EFER_LMA | EFER_LME); else vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); @@ -7929,7 +7929,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, { if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->host_ia32_efer; - if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) + else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) vcpu->arch.efer |= (EFER_LMA | EFER_LME); else vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); From 660696d1d16a71e15549ce1bf74953be1592bcd3 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 24 Apr 2013 13:38:36 +0300 Subject: [PATCH 132/204] KVM: X86 emulator: fix source operand decoding for 8bit mov[zs]x instructions The source operand for a one-byte mov[zs]x is decoded incorrectly if it is in a high byte register. Fix that. Cc: stable@vger.kernel.org Signed-off-by: Gleb Natapov --- arch/x86/kvm/emulate.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 46f63b8d09f4..8e517bba6a7c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4172,6 +4172,10 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, break; case OpMem8: ctxt->memop.bytes = 1; + if (ctxt->memop.type == OP_REG) { + ctxt->memop.addr.reg = decode_register(ctxt, ctxt->modrm_rm, 1); + fetch_register_operand(&ctxt->memop); + } goto mem_common; case OpMem16: ctxt->memop.bytes = 2; From adccf65ca431b41733483f476e8de9e3cf171c44 Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Thu, 25 Apr 2013 06:33:57 +0000 Subject: [PATCH 133/204] KVM: PPC: cache flush for kernel managed pages The kernel can only access pages that are mapped as memory, so flush only valid kernel pages. Signed-off-by: Bharat Bhushan Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_ppc.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index f58930779ae8..4794de6ea379 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -282,8 +282,15 @@ void kvmppc_init_lpid(unsigned long nr_lpids); static inline void kvmppc_mmu_flush_icache(pfn_t pfn) { - /* Clear i-cache for new pages */ struct page *page; + /* + * We can only access pages that the kernel maps + * as memory. Bail out for unmapped ones. + */ + if (!pfn_valid(pfn)) + return; + + /* Clear i-cache for new pages */ page = pfn_to_page(pfn); if (!test_bit(PG_arch_1, &page->flags)) { flush_dcache_icache_page(page); From 092d62ee93039bfccbb3a36c69d0c3ee0966a97a Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Mon, 8 Apr 2013 00:32:12 +0000 Subject: [PATCH 134/204] KVM: PPC: debug stub interface parameter defined This patch defines the interface parameter for KVM_SET_GUEST_DEBUG ioctl support.
Follow-up patches will use this for setting up hardware breakpoints, watchpoints and software breakpoints. kvm_arch_vcpu_ioctl_set_guest_debug() is also brought down one level, because it is not yet clear what book3s requires; this ioctl's behaviour therefore does not change for book3s. Signed-off-by: Bharat Bhushan Signed-off-by: Alexander Graf --- arch/powerpc/include/uapi/asm/kvm.h | 23 +++++++++++++++++++++++ arch/powerpc/kvm/book3s.c | 6 ++++++ arch/powerpc/kvm/booke.c | 6 ++++++ arch/powerpc/kvm/powerpc.c | 6 ------ 4 files changed, 35 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index c2ff99c01562..c0c38ed9c97d 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -272,8 +272,31 @@ struct kvm_debug_exit_arch { /* for KVM_SET_GUEST_DEBUG */ struct kvm_guest_debug_arch { + struct { + /* H/W breakpoint/watchpoint address */ + __u64 addr; + /* + * Type denotes h/w breakpoint, read watchpoint, write + * watchpoint or watchpoint (both read and write). + */ +#define KVMPPC_DEBUG_NONE 0x0 +#define KVMPPC_DEBUG_BREAKPOINT (1UL << 1) +#define KVMPPC_DEBUG_WATCH_WRITE (1UL << 2) +#define KVMPPC_DEBUG_WATCH_READ (1UL << 3) + __u32 type; + __u32 reserved; + } bp[16]; }; +/* Debug related defines */ +/* + * kvm_guest_debug->control is a 32 bit field. The lower 16 bits are generic + * and upper 16 bits are architecture specific. Architecture specific defines + * that ioctl is for setting hardware breakpoint or software breakpoint. + */ +#define KVM_GUESTDBG_USE_SW_BP 0x00010000 +#define KVM_GUESTDBG_USE_HW_BP 0x00020000 + /* definition of registers in kvm_run */ struct kvm_sync_regs { }; diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 2d32ae4bc439..128ed3a856b9 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -612,6 +612,12 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, return 0; } +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) +{ + return -EINVAL; +} + void kvmppc_decrementer_func(unsigned long data) { struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index a49a68a25c39..a3e2db0cf2f9 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1526,6 +1526,12 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) return r; } +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) +{ + return -EINVAL; +} + int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { return -ENOTSUPP; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index a822659db50a..6b8108624851 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -531,12 +531,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) #endif } -int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, - struct kvm_guest_debug *dbg) -{ - return -EINVAL; -} - static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu, struct kvm_run *run) { From c402a3f457b9689451c4e422781026633a5b6287 Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Mon, 8 Apr 2013 00:32:13 +0000 Subject: [PATCH 135/204] Rename EMULATE_DO_PAPR to EMULATE_EXIT_USER Instruction emulation returns EMULATE_DO_PAPR when it requires an exit to user space on book3s. A similar return is required for booke.
From c402a3f457b9689451c4e422781026633a5b6287 Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Mon, 8 Apr 2013 00:32:13 +0000 Subject: [PATCH 135/204] Rename EMULATE_DO_PAPR to EMULATE_EXIT_USER Instruction emulation returns EMULATE_DO_PAPR when it requires an exit to userspace on book3s, and a similar return value is needed for booke. The name EMULATE_DO_PAPR is confusing in that generic role, so rename it to EMULATE_EXIT_USER. Signed-off-by: Bharat Bhushan Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_ppc.h | 2 +- arch/powerpc/kvm/book3s_emulate.c | 2 +- arch/powerpc/kvm/book3s_pr.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 4794de6ea379..bcc68b1afc66 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -44,7 +44,7 @@ enum emulation_result { EMULATE_DO_DCR, /* kvm_run filled with DCR request */ EMULATE_FAIL, /* can't emulate this instruction */ EMULATE_AGAIN, /* something went wrong. go again */ - EMULATE_DO_PAPR, /* kvm_run filled with PAPR request */ + EMULATE_EXIT_USER, /* emulation requires exit to user-space */ }; extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 836c56975e21..cdd19d64e354 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -194,7 +194,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, run->papr_hcall.args[i] = gpr; } - emulated = EMULATE_DO_PAPR; + emulated = EMULATE_EXIT_USER; break; } #endif diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 286e23e6b92d..b960faf990bf 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -762,7 +762,7 @@ program_interrupt: run->exit_reason = KVM_EXIT_MMIO; r = RESUME_HOST_NV; break; - case EMULATE_DO_PAPR: + case EMULATE_EXIT_USER: run->exit_reason = KVM_EXIT_PAPR_HCALL; vcpu->arch.hcall_needed = 1; r = RESUME_HOST_NV;
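For orientation, the emulation_result contract that this and the next two patches refine looks roughly like this on the booke exit path (a condensed sketch, not literal kernel code; see emulation_exit() in booke.c for the real switch):

switch (emulated) {
case EMULATE_DONE:	/* handled entirely inside the kernel */
	return RESUME_GUEST;
case EMULATE_DO_DCR:	/* kvm_run already carries a DCR request */
	return RESUME_HOST;
case EMULATE_EXIT_USER:	/* emulator wants user space to take over */
	return RESUME_HOST;
case EMULATE_FAIL:	/* reflect a program interrupt into the guest */
	kvmppc_core_queue_program(vcpu, ESR_PIL);
	return RESUME_HOST;
default:
	BUG();
}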
From 0f47f9b51734290968189286e6b65f1139d0f72d Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Mon, 8 Apr 2013 00:32:14 +0000 Subject: [PATCH 136/204] KVM: extend EMULATE_EXIT_USER to support different exit reasons Currently the instruction emulator returns EMULATE_EXIT_USER and common code initializes run->exit_reason and vcpu->arch.hcall_needed with one fixed reason. But there can be different reasons for the emulator to exit to user space. To support them, the initialization of run->exit_reason and vcpu->arch.hcall_needed is moved up a level, into the emulator itself. Signed-off-by: Bharat Bhushan Signed-off-by: Alexander Graf --- arch/powerpc/kvm/book3s_emulate.c | 2 ++ arch/powerpc/kvm/book3s_pr.c | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index cdd19d64e354..1f6344c4408d 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -194,6 +194,8 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, run->papr_hcall.args[i] = gpr; } + run->exit_reason = KVM_EXIT_PAPR_HCALL; + vcpu->arch.hcall_needed = 1; emulated = EMULATE_EXIT_USER; break; } diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index b960faf990bf..c1cffa882a69 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -763,8 +763,6 @@ program_interrupt: r = RESUME_HOST_NV; break; case EMULATE_EXIT_USER: - run->exit_reason = KVM_EXIT_PAPR_HCALL; - vcpu->arch.hcall_needed = 1; r = RESUME_HOST_NV; break; default: From 9b4f530807e4243d8c4bc02c5bbd5b9e3e2ed9e0 Mon Sep 17 00:00:00 2001 From: Bharat Bhushan Date: Mon, 8 Apr 2013 00:32:15 +0000 Subject: [PATCH 137/204] booke: exit to user space if emulator request This allows exiting to user space when the emulator requests it by returning EMULATE_EXIT_USER. This will be used by subsequent patches in this series. Signed-off-by: Bharat Bhushan Signed-off-by: Alexander Graf --- arch/powerpc/kvm/booke.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index a3e2db0cf2f9..97ae158ca4c8 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -745,6 +745,9 @@ static int emulation_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) kvmppc_core_queue_program(vcpu, ESR_PIL); return RESUME_HOST; + case EMULATE_EXIT_USER: + return RESUME_HOST; + default: BUG(); } From 35b299e279eb5bc4622ec883b5f388c3224cbd61 Mon Sep 17 00:00:00 2001 From: Mihai Caraman Date: Thu, 11 Apr 2013 00:03:07 +0000 Subject: [PATCH 138/204] KVM: PPC: Book3E: Refactor ONE_REG ioctl implementation Refactor the Book3E ONE_REG ioctl implementation to use the kvmppc_get_one_reg/kvmppc_set_one_reg delegation interface introduced by Book3S. This is necessary for MMU SPRs, which are platform specific. Get rid of useless case braces in the process.
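For reference, the one_reg_size() helper used by the refactored code below does not appear in this diff; every KVM_REG_* id encodes its transfer width in the KVM_REG_SIZE field, so the size can be recovered from the id alone. A sketch mirroring the uapi encoding (illustrative):

/* Width in bytes of a ONE_REG transfer, derived from the register id. */
static inline int one_reg_size(__u64 id)
{
	id = (id & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT;
	return 1 << id;			/* e.g. KVM_REG_SIZE_U64 -> 8 */
}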
Signed-off-by: Mihai Caraman Signed-off-by: Alexander Graf --- arch/powerpc/kvm/44x.c | 12 +++++ arch/powerpc/kvm/booke.c | 102 ++++++++++++++++++++------------------ arch/powerpc/kvm/e500.c | 12 +++++ arch/powerpc/kvm/e500mc.c | 12 +++++ 4 files changed, 91 insertions(+), 47 deletions(-) diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c index 3d7fd21c65f9..2f5c6b6d6877 100644 --- a/arch/powerpc/kvm/44x.c +++ b/arch/powerpc/kvm/44x.c @@ -124,6 +124,18 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) return kvmppc_set_sregs_ivor(vcpu, sregs); } +int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + return -EINVAL; +} + +int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + return -EINVAL; +} + struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvmppc_vcpu_44x *vcpu_44x; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 97ae158ca4c8..02756537cd90 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1415,117 +1415,125 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) { - int r = -EINVAL; + int r = 0; + union kvmppc_one_reg val; + int size; + long int i; + + size = one_reg_size(reg->id); + if (size > sizeof(val)) + return -EINVAL; switch (reg->id) { case KVM_REG_PPC_IAC1: case KVM_REG_PPC_IAC2: case KVM_REG_PPC_IAC3: - case KVM_REG_PPC_IAC4: { - int iac = reg->id - KVM_REG_PPC_IAC1; - r = copy_to_user((u64 __user *)(long)reg->addr, - &vcpu->arch.dbg_reg.iac[iac], sizeof(u64)); + case KVM_REG_PPC_IAC4: + i = reg->id - KVM_REG_PPC_IAC1; + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac[i]); break; - } case KVM_REG_PPC_DAC1: - case KVM_REG_PPC_DAC2: { - int dac = reg->id - KVM_REG_PPC_DAC1; - r = copy_to_user((u64 __user *)(long)reg->addr, - &vcpu->arch.dbg_reg.dac[dac], sizeof(u64)); + case KVM_REG_PPC_DAC2: + i = reg->id - KVM_REG_PPC_DAC1; + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.dac[i]); break; - } case KVM_REG_PPC_EPR: { u32 epr = get_guest_epr(vcpu); - r = put_user(epr, (u32 __user *)(long)reg->addr); + val = get_reg_val(reg->id, epr); break; } #if defined(CONFIG_64BIT) case KVM_REG_PPC_EPCR: - r = put_user(vcpu->arch.epcr, (u32 __user *)(long)reg->addr); + val = get_reg_val(reg->id, vcpu->arch.epcr); break; #endif case KVM_REG_PPC_TCR: - r = put_user(vcpu->arch.tcr, (u32 __user *)(long)reg->addr); + val = get_reg_val(reg->id, vcpu->arch.tcr); break; case KVM_REG_PPC_TSR: - r = put_user(vcpu->arch.tsr, (u32 __user *)(long)reg->addr); + val = get_reg_val(reg->id, vcpu->arch.tsr); break; - case KVM_REG_PPC_DEBUG_INST: { - u32 opcode = KVMPPC_INST_EHPRIV; - r = copy_to_user((u32 __user *)(long)reg->addr, - &opcode, sizeof(u32)); + case KVM_REG_PPC_DEBUG_INST: + val = get_reg_val(reg->id, KVMPPC_INST_EHPRIV); break; - } default: + r = kvmppc_get_one_reg(vcpu, reg->id, &val); break; } + + if (r) + return r; + + if (copy_to_user((char __user *)(unsigned long)reg->addr, &val, size)) + r = -EFAULT; + return r; } int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) { - int r = -EINVAL; + int r = 0; + union kvmppc_one_reg val; + int size; + long int i; + + size = one_reg_size(reg->id); + if (size > sizeof(val)) + return -EINVAL; + + if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size)) + return -EFAULT; switch (reg->id) { case KVM_REG_PPC_IAC1: case KVM_REG_PPC_IAC2: case 
KVM_REG_PPC_IAC3: - case KVM_REG_PPC_IAC4: { - int iac = reg->id - KVM_REG_PPC_IAC1; - r = copy_from_user(&vcpu->arch.dbg_reg.iac[iac], - (u64 __user *)(long)reg->addr, sizeof(u64)); + case KVM_REG_PPC_IAC4: + i = reg->id - KVM_REG_PPC_IAC1; + vcpu->arch.dbg_reg.iac[i] = set_reg_val(reg->id, val); break; - } case KVM_REG_PPC_DAC1: - case KVM_REG_PPC_DAC2: { - int dac = reg->id - KVM_REG_PPC_DAC1; - r = copy_from_user(&vcpu->arch.dbg_reg.dac[dac], - (u64 __user *)(long)reg->addr, sizeof(u64)); + case KVM_REG_PPC_DAC2: + i = reg->id - KVM_REG_PPC_DAC1; + vcpu->arch.dbg_reg.dac[i] = set_reg_val(reg->id, val); break; - } case KVM_REG_PPC_EPR: { - u32 new_epr; - r = get_user(new_epr, (u32 __user *)(long)reg->addr); - if (!r) - kvmppc_set_epr(vcpu, new_epr); + u32 new_epr = set_reg_val(reg->id, val); + kvmppc_set_epr(vcpu, new_epr); break; } #if defined(CONFIG_64BIT) case KVM_REG_PPC_EPCR: { - u32 new_epcr; - r = get_user(new_epcr, (u32 __user *)(long)reg->addr); - if (r == 0) - kvmppc_set_epcr(vcpu, new_epcr); + u32 new_epcr = set_reg_val(reg->id, val); + kvmppc_set_epcr(vcpu, new_epcr); break; } #endif case KVM_REG_PPC_OR_TSR: { - u32 tsr_bits; - r = get_user(tsr_bits, (u32 __user *)(long)reg->addr); + u32 tsr_bits = set_reg_val(reg->id, val); kvmppc_set_tsr_bits(vcpu, tsr_bits); break; } case KVM_REG_PPC_CLEAR_TSR: { - u32 tsr_bits; - r = get_user(tsr_bits, (u32 __user *)(long)reg->addr); + u32 tsr_bits = set_reg_val(reg->id, val); kvmppc_clr_tsr_bits(vcpu, tsr_bits); break; } case KVM_REG_PPC_TSR: { - u32 tsr; - r = get_user(tsr, (u32 __user *)(long)reg->addr); + u32 tsr = set_reg_val(reg->id, val); kvmppc_set_tsr(vcpu, tsr); break; } case KVM_REG_PPC_TCR: { - u32 tcr; - r = get_user(tcr, (u32 __user *)(long)reg->addr); + u32 tcr = set_reg_val(reg->id, val); kvmppc_set_tcr(vcpu, tcr); break; } default: + r = kvmppc_set_one_reg(vcpu, reg->id, &val); break; } + return r; } diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 6dd4de7802bf..576010f6c27d 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -425,6 +425,18 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) return kvmppc_set_sregs_ivor(vcpu, sregs); } +int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + return -EINVAL; +} + +int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + return -EINVAL; +} + struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvmppc_vcpu_e500 *vcpu_e500; diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index 1f89d26e65fb..b071bdcbe37d 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -255,6 +255,18 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) return kvmppc_set_sregs_ivor(vcpu, sregs); } +int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + return -EINVAL; +} + +int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + return -EINVAL; +} + struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvmppc_vcpu_e500 *vcpu_e500; From a85d2aa23e51a9460e034e283da2513930b4f183 Mon Sep 17 00:00:00 2001 From: Mihai Caraman Date: Thu, 11 Apr 2013 00:03:08 +0000 Subject: [PATCH 139/204] KVM: PPC: e500: Expose MMU registers via ONE_REG MMU registers were exposed to user-space using sregs interface. Add them to ONE_REG interface using kvmppc_get_one_reg/kvmppc_set_one_reg delegation mechanism. 
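From user space the new registers are then accessed like any other ONE_REG. For instance (a hedged sketch: vcpu_fd is an existing vcpu file descriptor, and error handling is elided):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Read the guest's MAS0 through the ONE_REG interface. */
static __u32 read_mas0(int vcpu_fd)
{
	__u32 val = 0;
	struct kvm_one_reg reg = {
		.id   = KVM_REG_PPC_MAS0,
		.addr = (__u64)(unsigned long)&val,
	};

	ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
	return val;
}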
Signed-off-by: Mihai Caraman Signed-off-by: Alexander Graf --- Documentation/virtual/kvm/api.txt | 11 ++++ arch/powerpc/include/uapi/asm/kvm.h | 17 ++++++ arch/powerpc/kvm/e500.c | 6 +- arch/powerpc/kvm/e500.h | 4 ++ arch/powerpc/kvm/e500_mmu.c | 94 +++++++++++++++++++++++++++++ arch/powerpc/kvm/e500mc.c | 6 +- 6 files changed, 134 insertions(+), 4 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 976eb650e7ef..1a766637ac21 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1792,6 +1792,17 @@ registers, find a list below: PPC | KVM_REG_PPC_TSR | 32 PPC | KVM_REG_PPC_OR_TSR | 32 PPC | KVM_REG_PPC_CLEAR_TSR | 32 + PPC | KVM_REG_PPC_MAS0 | 32 + PPC | KVM_REG_PPC_MAS1 | 32 + PPC | KVM_REG_PPC_MAS2 | 64 + PPC | KVM_REG_PPC_MAS7_3 | 64 + PPC | KVM_REG_PPC_MAS4 | 32 + PPC | KVM_REG_PPC_MAS6 | 32 + PPC | KVM_REG_PPC_MMUCFG | 32 + PPC | KVM_REG_PPC_TLB0CFG | 32 + PPC | KVM_REG_PPC_TLB1CFG | 32 + PPC | KVM_REG_PPC_TLB2CFG | 32 + PPC | KVM_REG_PPC_TLB3CFG | 32 ARM registers are mapped using the lower 32 bits. The upper 16 of that is the register group type, or coprocessor number: diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index c0c38ed9c97d..0c5cffb6a58e 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -449,4 +449,21 @@ struct kvm_get_htab_header { /* Debugging: Special instruction for software breakpoint */ #define KVM_REG_PPC_DEBUG_INST (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8b) +/* MMU registers */ +#define KVM_REG_PPC_MAS0 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8c) +#define KVM_REG_PPC_MAS1 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8d) +#define KVM_REG_PPC_MAS2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8e) +#define KVM_REG_PPC_MAS7_3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8f) +#define KVM_REG_PPC_MAS4 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x90) +#define KVM_REG_PPC_MAS6 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x91) +#define KVM_REG_PPC_MMUCFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x92) +/* + * TLBnCFG fields TLBnCFG_N_ENTRY and TLBnCFG_ASSOC can be changed only using + * KVM_CAP_SW_TLB ioctl + */ +#define KVM_REG_PPC_TLB0CFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x93) +#define KVM_REG_PPC_TLB1CFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x94) +#define KVM_REG_PPC_TLB2CFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x95) +#define KVM_REG_PPC_TLB3CFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x96) + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 576010f6c27d..ce6b73c29612 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -428,13 +428,15 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) { - return -EINVAL; + int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val); + return r; } int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) { - return -EINVAL; + int r = kvmppc_set_one_reg_e500_tlb(vcpu, id, val); + return r; } struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvmppc_vcpu_e500 *vcpu_e500; diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h index 33db48a8ce24..b73ca7a1c09f 100644 --- a/arch/powerpc/kvm/e500.h +++ b/arch/powerpc/kvm/e500.h @@ -131,6 +131,10 @@ void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500); void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs
*sregs); +int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val); +int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val); #ifdef CONFIG_KVM_E500V2 unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500, diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c index 5c4475983f78..44f7762694ba 100644 --- a/arch/powerpc/kvm/e500_mmu.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -596,6 +596,100 @@ int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) return 0; } +int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = 0; + long int i; + + switch (id) { + case KVM_REG_PPC_MAS0: + *val = get_reg_val(id, vcpu->arch.shared->mas0); + break; + case KVM_REG_PPC_MAS1: + *val = get_reg_val(id, vcpu->arch.shared->mas1); + break; + case KVM_REG_PPC_MAS2: + *val = get_reg_val(id, vcpu->arch.shared->mas2); + break; + case KVM_REG_PPC_MAS7_3: + *val = get_reg_val(id, vcpu->arch.shared->mas7_3); + break; + case KVM_REG_PPC_MAS4: + *val = get_reg_val(id, vcpu->arch.shared->mas4); + break; + case KVM_REG_PPC_MAS6: + *val = get_reg_val(id, vcpu->arch.shared->mas6); + break; + case KVM_REG_PPC_MMUCFG: + *val = get_reg_val(id, vcpu->arch.mmucfg); + break; + case KVM_REG_PPC_TLB0CFG: + case KVM_REG_PPC_TLB1CFG: + case KVM_REG_PPC_TLB2CFG: + case KVM_REG_PPC_TLB3CFG: + i = id - KVM_REG_PPC_TLB0CFG; + *val = get_reg_val(id, vcpu->arch.tlbcfg[i]); + break; + default: + r = -EINVAL; + break; + } + + return r; +} + +int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = 0; + long int i; + + switch (id) { + case KVM_REG_PPC_MAS0: + vcpu->arch.shared->mas0 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_MAS1: + vcpu->arch.shared->mas1 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_MAS2: + vcpu->arch.shared->mas2 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_MAS7_3: + vcpu->arch.shared->mas7_3 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_MAS4: + vcpu->arch.shared->mas4 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_MAS6: + vcpu->arch.shared->mas6 = set_reg_val(id, *val); + break; + /* Only allow MMU registers to be set to the config supported by KVM */ + case KVM_REG_PPC_MMUCFG: { + u32 reg = set_reg_val(id, *val); + if (reg != vcpu->arch.mmucfg) + r = -EINVAL; + break; + } + case KVM_REG_PPC_TLB0CFG: + case KVM_REG_PPC_TLB1CFG: + case KVM_REG_PPC_TLB2CFG: + case KVM_REG_PPC_TLB3CFG: { + /* MMU geometry (N_ENTRY/ASSOC) can be set only using SW_TLB */ + u32 reg = set_reg_val(id, *val); + i = id - KVM_REG_PPC_TLB0CFG; + if (reg != vcpu->arch.tlbcfg[i]) + r = -EINVAL; + break; + } + default: + r = -EINVAL; + break; + } + + return r; +} + int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, struct kvm_config_tlb *cfg) { diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index b071bdcbe37d..ab073a80cd45 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -258,13 +258,15 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) { - return -EINVAL; + int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val); + return r; } int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) { - return -EINVAL; + int r = kvmppc_set_one_reg_e500_tlb(vcpu, id, val); + return r; } struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned 
int id) From 8893a188b13160ee4b228fab02d802cf4f0a3e78 Mon Sep 17 00:00:00 2001 From: Mihai Caraman Date: Thu, 11 Apr 2013 00:03:09 +0000 Subject: [PATCH 140/204] KVM: PPC: e500: Move vcpu's MMU configuration to dedicated functions Vcpu's MMU default configuration and geometry update logic was buried in a chunk of code. Move them to dedicated functions to add more clarity. Signed-off-by: Mihai Caraman Signed-off-by: Alexander Graf --- arch/powerpc/kvm/e500_mmu.c | 60 +++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c index 44f7762694ba..08a5b0d296fa 100644 --- a/arch/powerpc/kvm/e500_mmu.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -690,6 +690,20 @@ int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, return r; } +static int vcpu_mmu_geometry_update(struct kvm_vcpu *vcpu, + struct kvm_book3e_206_tlb_params *params) +{ + vcpu->arch.tlbcfg[0] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + if (params->tlb_sizes[0] <= 2048) + vcpu->arch.tlbcfg[0] |= params->tlb_sizes[0]; + vcpu->arch.tlbcfg[0] |= params->tlb_ways[0] << TLBnCFG_ASSOC_SHIFT; + + vcpu->arch.tlbcfg[1] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + vcpu->arch.tlbcfg[1] |= params->tlb_sizes[1]; + vcpu->arch.tlbcfg[1] |= params->tlb_ways[1] << TLBnCFG_ASSOC_SHIFT; + return 0; +} + int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, struct kvm_config_tlb *cfg) { @@ -786,16 +800,8 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, vcpu_e500->gtlb_offset[0] = 0; vcpu_e500->gtlb_offset[1] = params.tlb_sizes[0]; - vcpu->arch.mmucfg = mfspr(SPRN_MMUCFG) & ~MMUCFG_LPIDSIZE; - - vcpu->arch.tlbcfg[0] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); - if (params.tlb_sizes[0] <= 2048) - vcpu->arch.tlbcfg[0] |= params.tlb_sizes[0]; - vcpu->arch.tlbcfg[0] |= params.tlb_ways[0] << TLBnCFG_ASSOC_SHIFT; - - vcpu->arch.tlbcfg[1] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); - vcpu->arch.tlbcfg[1] |= params.tlb_sizes[1]; - vcpu->arch.tlbcfg[1] |= params.tlb_ways[1] << TLBnCFG_ASSOC_SHIFT; + /* Update vcpu's MMU geometry based on SW_TLB input */ + vcpu_mmu_geometry_update(vcpu, ¶ms); vcpu_e500->shared_tlb_pages = pages; vcpu_e500->num_shared_tlb_pages = num_pages; @@ -831,6 +837,27 @@ int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu, return 0; } +/* Vcpu's MMU default configuration */ +static int vcpu_mmu_init(struct kvm_vcpu *vcpu, + struct kvmppc_e500_tlb_params *params) +{ + /* Initialize RASIZE, PIDSIZE, NTLBS and MAVN fields with host values*/ + vcpu->arch.mmucfg = mfspr(SPRN_MMUCFG) & ~MMUCFG_LPIDSIZE; + + /* Initialize TLBnCFG fields with host values and SW_TLB geometry*/ + vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) & + ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + vcpu->arch.tlbcfg[0] |= params[0].entries; + vcpu->arch.tlbcfg[0] |= params[0].ways << TLBnCFG_ASSOC_SHIFT; + + vcpu->arch.tlbcfg[1] = mfspr(SPRN_TLB1CFG) & + ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + vcpu->arch.tlbcfg[1] |= params[1].entries; + vcpu->arch.tlbcfg[1] |= params[1].ways << TLBnCFG_ASSOC_SHIFT; + + return 0; +} + int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) { struct kvm_vcpu *vcpu = &vcpu_e500->vcpu; @@ -875,18 +902,7 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) if (!vcpu_e500->g2h_tlb1_map) goto err; - /* Init TLB configuration register */ - vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) & - ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); - vcpu->arch.tlbcfg[0] |= vcpu_e500->gtlb_params[0].entries; - vcpu->arch.tlbcfg[0] |= - vcpu_e500->gtlb_params[0].ways << 
TLBnCFG_ASSOC_SHIFT; - - vcpu->arch.tlbcfg[1] = mfspr(SPRN_TLB1CFG) & - ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); - vcpu->arch.tlbcfg[1] |= vcpu_e500->gtlb_params[1].entries; - vcpu->arch.tlbcfg[1] |= - vcpu_e500->gtlb_params[1].ways << TLBnCFG_ASSOC_SHIFT; + vcpu_mmu_init(vcpu, vcpu_e500->gtlb_params); kvmppc_recalc_tlb1map_range(vcpu_e500); return 0; From 307d9008ed4f28920e0e78719e10d0f407341e00 Mon Sep 17 00:00:00 2001 From: Mihai Caraman Date: Thu, 11 Apr 2013 00:03:10 +0000 Subject: [PATCH 141/204] KVM: PPC: e500: Add support for TLBnPS registers Add support for TLBnPS registers available in MMU Architecture Version (MAV) 2.0. Signed-off-by: Mihai Caraman Signed-off-by: Alexander Graf --- Documentation/virtual/kvm/api.txt | 4 ++++ arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/include/uapi/asm/kvm.h | 4 ++++ arch/powerpc/kvm/e500.h | 18 ++++++++++++++++++ arch/powerpc/kvm/e500_emulate.c | 10 ++++++++++ arch/powerpc/kvm/e500_mmu.c | 22 ++++++++++++++++++++++ 6 files changed, 59 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 1a766637ac21..f045377ae5a0 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1803,6 +1803,10 @@ registers, find a list below: PPC | KVM_REG_PPC_TLB1CFG | 32 PPC | KVM_REG_PPC_TLB2CFG | 32 PPC | KVM_REG_PPC_TLB3CFG | 32 + PPC | KVM_REG_PPC_TLB0PS | 32 + PPC | KVM_REG_PPC_TLB1PS | 32 + PPC | KVM_REG_PPC_TLB2PS | 32 + PPC | KVM_REG_PPC_TLB3PS | 32 ARM registers are mapped using the lower 32 bits. The upper 16 of that is the register group type, or coprocessor number: diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index e34f8fee9080..3b6cee3e33a8 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -502,6 +502,7 @@ struct kvm_vcpu_arch { spinlock_t wdt_lock; struct timer_list wdt_timer; u32 tlbcfg[4]; + u32 tlbps[4]; u32 mmucfg; u32 epr; u32 crit_save; diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 0c5cffb6a58e..4dd36c399842 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -465,5 +465,9 @@ struct kvm_get_htab_header { #define KVM_REG_PPC_TLB1CFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x94) #define KVM_REG_PPC_TLB2CFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x95) #define KVM_REG_PPC_TLB3CFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x96) +#define KVM_REG_PPC_TLB0PS (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x97) +#define KVM_REG_PPC_TLB1PS (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x98) +#define KVM_REG_PPC_TLB2PS (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x99) +#define KVM_REG_PPC_TLB3PS (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9a) #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h index b73ca7a1c09f..c2e5e98453a6 100644 --- a/arch/powerpc/kvm/e500.h +++ b/arch/powerpc/kvm/e500.h @@ -23,6 +23,10 @@ #include #include +enum vcpu_ftr { + VCPU_FTR_MMU_V2 +}; + #define E500_PID_NUM 3 #define E500_TLB_NUM 2 @@ -299,4 +303,18 @@ static inline unsigned int get_tlbmiss_tid(struct kvm_vcpu *vcpu) #define get_tlb_sts(gtlbe) (MAS1_TS) #endif /* !BOOKE_HV */ +static inline bool has_feature(const struct kvm_vcpu *vcpu, + enum vcpu_ftr ftr) +{ + bool has_ftr; + switch (ftr) { + case VCPU_FTR_MMU_V2: + has_ftr = ((vcpu->arch.mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2); + break; + default: + return false; + } + return has_ftr; +} + #endif /* KVM_E500_H */ diff --git a/arch/powerpc/kvm/e500_emulate.c 
b/arch/powerpc/kvm/e500_emulate.c index e78f353a836a..12b8de2f91ed 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -284,6 +284,16 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) case SPRN_TLB1CFG: *spr_val = vcpu->arch.tlbcfg[1]; break; + case SPRN_TLB0PS: + if (!has_feature(vcpu, VCPU_FTR_MMU_V2)) + return EMULATE_FAIL; + *spr_val = vcpu->arch.tlbps[0]; + break; + case SPRN_TLB1PS: + if (!has_feature(vcpu, VCPU_FTR_MMU_V2)) + return EMULATE_FAIL; + *spr_val = vcpu->arch.tlbps[1]; + break; case SPRN_L1CSR0: *spr_val = vcpu_e500->l1csr0; break; diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c index 08a5b0d296fa..a863dc1791eb 100644 --- a/arch/powerpc/kvm/e500_mmu.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -631,6 +631,13 @@ int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, i = id - KVM_REG_PPC_TLB0CFG; *val = get_reg_val(id, vcpu->arch.tlbcfg[i]); break; + case KVM_REG_PPC_TLB0PS: + case KVM_REG_PPC_TLB1PS: + case KVM_REG_PPC_TLB2PS: + case KVM_REG_PPC_TLB3PS: + i = id - KVM_REG_PPC_TLB0PS; + *val = get_reg_val(id, vcpu->arch.tlbps[i]); + break; default: r = -EINVAL; break; @@ -682,6 +689,16 @@ int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, r = -EINVAL; break; } + case KVM_REG_PPC_TLB0PS: + case KVM_REG_PPC_TLB1PS: + case KVM_REG_PPC_TLB2PS: + case KVM_REG_PPC_TLB3PS: { + u32 reg = set_reg_val(id, *val); + i = id - KVM_REG_PPC_TLB0PS; + if (reg != vcpu->arch.tlbps[i]) + r = -EINVAL; + break; + } default: r = -EINVAL; break; @@ -855,6 +872,11 @@ static int vcpu_mmu_init(struct kvm_vcpu *vcpu, vcpu->arch.tlbcfg[1] |= params[1].entries; vcpu->arch.tlbcfg[1] |= params[1].ways << TLBnCFG_ASSOC_SHIFT; + if (has_feature(vcpu, VCPU_FTR_MMU_V2)) { + vcpu->arch.tlbps[0] = mfspr(SPRN_TLB0PS); + vcpu->arch.tlbps[1] = mfspr(SPRN_TLB1PS); + } + return 0; } From 9a6061d7fdedbf025549adf5c9d920d90bbf4a69 Mon Sep 17 00:00:00 2001 From: Mihai Caraman Date: Thu, 11 Apr 2013 00:03:11 +0000 Subject: [PATCH 142/204] KVM: PPC: e500: Add support for EPTCFG register EPTCFG register defined by E.PT is accessed unconditionally by Linux guests in the presence of MAV 2.0. Emulate it now. Signed-off-by: Mihai Caraman Signed-off-by: Alexander Graf --- Documentation/virtual/kvm/api.txt | 1 + arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/include/uapi/asm/kvm.h | 1 + arch/powerpc/kvm/e500_emulate.c | 9 +++++++++ arch/powerpc/kvm/e500_mmu.c | 12 ++++++++++++ 5 files changed, 24 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index f045377ae5a0..a1f2200e43d0 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1807,6 +1807,7 @@ registers, find a list below: PPC | KVM_REG_PPC_TLB1PS | 32 PPC | KVM_REG_PPC_TLB2PS | 32 PPC | KVM_REG_PPC_TLB3PS | 32 + PPC | KVM_REG_PPC_EPTCFG | 32 ARM registers are mapped using the lower 32 bits. 
The upper 16 of that is the register group type, or coprocessor number: diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 3b6cee3e33a8..8a48e686a755 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -504,6 +504,7 @@ struct kvm_vcpu_arch { u32 tlbcfg[4]; u32 tlbps[4]; u32 mmucfg; + u32 eptcfg; u32 epr; u32 crit_save; struct kvmppc_booke_debug_reg dbg_reg; diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 4dd36c399842..41d59d8bdfe8 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -469,5 +469,6 @@ struct kvm_get_htab_header { #define KVM_REG_PPC_TLB1PS (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x98) #define KVM_REG_PPC_TLB2PS (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x99) #define KVM_REG_PPC_TLB3PS (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9a) +#define KVM_REG_PPC_EPTCFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9b) #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index 12b8de2f91ed..b10a01243abd 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -317,6 +317,15 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) case SPRN_MMUCFG: *spr_val = vcpu->arch.mmucfg; break; + case SPRN_EPTCFG: + if (!has_feature(vcpu, VCPU_FTR_MMU_V2)) + return EMULATE_FAIL; + /* + * Legacy Linux guests access EPTCFG register even if the E.PT + * category is disabled in the VM. Give them a chance to live. + */ + *spr_val = vcpu->arch.eptcfg; + break; /* extra exceptions */ case SPRN_IVOR32: diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c index a863dc1791eb..1c1c5cb78495 100644 --- a/arch/powerpc/kvm/e500_mmu.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -624,6 +624,9 @@ int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_MMUCFG: *val = get_reg_val(id, vcpu->arch.mmucfg); break; + case KVM_REG_PPC_EPTCFG: + *val = get_reg_val(id, vcpu->arch.eptcfg); + break; case KVM_REG_PPC_TLB0CFG: case KVM_REG_PPC_TLB1CFG: case KVM_REG_PPC_TLB2CFG: @@ -678,6 +681,12 @@ int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, r = -EINVAL; break; } + case KVM_REG_PPC_EPTCFG: { + u32 reg = set_reg_val(id, *val); + if (reg != vcpu->arch.eptcfg) + r = -EINVAL; + break; + } case KVM_REG_PPC_TLB0CFG: case KVM_REG_PPC_TLB1CFG: case KVM_REG_PPC_TLB2CFG: @@ -875,6 +884,9 @@ static int vcpu_mmu_init(struct kvm_vcpu *vcpu, if (has_feature(vcpu, VCPU_FTR_MMU_V2)) { vcpu->arch.tlbps[0] = mfspr(SPRN_TLB0PS); vcpu->arch.tlbps[1] = mfspr(SPRN_TLB1PS); + + /* Guest mmu emulation currently doesn't handle E.PT */ + vcpu->arch.eptcfg = 0; } return 0; From 5b2150104534f9d715d4339b9a074a1190b30336 Mon Sep 17 00:00:00 2001 From: Mihai Caraman Date: Thu, 11 Apr 2013 00:03:12 +0000 Subject: [PATCH 143/204] KVM: PPC: e500: Remove E.PT and E.HV.LRAT categories from VCPUs Embedded.Page Table (E.PT) category is not supported yet in e6500 kernel. Configure TLBnCFG to remove E.PT and E.HV.LRAT categories from VCPUs. 
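The guest-visible effect, sketched in pseudo-kernel C (use_sw_tlb_management() stands in for whatever fallback the guest implements and is hypothetical):

/* A guest probing TLB0CFG now sees the hardware page-table bit clear. */
u32 tlb0cfg = mfspr(SPRN_TLB0CFG);

if (!(tlb0cfg & TLBnCFG_PT))
	use_sw_tlb_management();	/* hypothetical guest fallback */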
Signed-off-by: Mihai Caraman Signed-off-by: Alexander Graf --- arch/powerpc/kvm/e500_mmu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c index 1c1c5cb78495..c41a5a96b558 100644 --- a/arch/powerpc/kvm/e500_mmu.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -885,8 +885,12 @@ static int vcpu_mmu_init(struct kvm_vcpu *vcpu, vcpu->arch.tlbps[0] = mfspr(SPRN_TLB0PS); vcpu->arch.tlbps[1] = mfspr(SPRN_TLB1PS); + vcpu->arch.mmucfg &= ~MMUCFG_LRAT; + /* Guest mmu emulation currently doesn't handle E.PT */ vcpu->arch.eptcfg = 0; + vcpu->arch.tlbcfg[0] &= ~TLBnCFG_PT; + vcpu->arch.tlbcfg[1] &= ~TLBnCFG_IND; } return 0; From ea17a971c2b1b35b446a98c2e437a8611eb2b80a Mon Sep 17 00:00:00 2001 From: Mihai Caraman Date: Thu, 11 Apr 2013 00:03:13 +0000 Subject: [PATCH 144/204] KVM: PPC: e500mc: Enable e6500 cores Extend processor compatibility names to e6500 cores. Signed-off-by: Mihai Caraman Reviewed-by: Alexander Graf Signed-off-by: Alexander Graf --- arch/powerpc/kvm/e500mc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index ab073a80cd45..c3bdc0aeabe2 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -172,6 +172,8 @@ int kvmppc_core_check_processor_compat(void) r = 0; else if (strcmp(cur_cpu_spec->cpu_name, "e5500") == 0) r = 0; + else if (strcmp(cur_cpu_spec->cpu_name, "e6500") == 0) + r = 0; else r = -ENOTSUPP; From d9ce6041b39cf0b48978db63713f9b7e25ef9792 Mon Sep 17 00:00:00 2001 From: Mihai Caraman Date: Thu, 11 Apr 2013 00:03:14 +0000 Subject: [PATCH 145/204] KVM: PPC: e500: Add e6500 core to Kconfig description Add e6500 core to Kconfig description. Signed-off-by: Mihai Caraman Signed-off-by: Alexander Graf --- arch/powerpc/kvm/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 63c67ec72e43..448952035b6e 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -136,15 +136,15 @@ config KVM_E500V2 If unsure, say N. config KVM_E500MC - bool "KVM support for PowerPC E500MC/E5500 processors" + bool "KVM support for PowerPC E500MC/E5500/E6500 processors" depends on PPC_E500MC select KVM select KVM_MMIO select KVM_BOOKE_HV select MMU_NOTIFIER ---help--- - Support running unmodified E500MC/E5500 (32-bit) guest kernels in - virtual machines on E500MC/E5500 host processors. + Support running unmodified E500MC/E5500/E6500 guest kernels in + virtual machines on E500MC/E5500/E6500 host processors. This module provides access to the hardware capabilities through a character device node named /dev/kvm. From a1b4a0f6064aacad0d708105e6f60a06e93fbf37 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 18 Apr 2013 19:50:24 +0000 Subject: [PATCH 146/204] KVM: PPC: Book3S HV: Make HPT reading code notice R/C bit changes At present, the code that determines whether a HPT entry has changed, and thus needs to be sent to userspace when it is copying the HPT, doesn't consider a hardware update to the reference and change bits (R and C) in the HPT entries to constitute a change that needs to be sent to userspace. This adds code to check for changes in R and C when we are scanning the HPT to find changed entries, and adds code to set the changed flag for the HPTE when we update the R and C bits in the guest view of the HPTE. 
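Concretely, the new dirtiness test amounts to the following (a simplified rendering of the hpte_dirty() helper added below):

/* An HPTE must be sent if its software MODIFIED flag is set, or if
 * hardware has set R/C bits that the guest view still lacks. */
dirty = (rev->guest_rpte & HPTE_GR_MODIFIED) ||
	((hptp[0] & HPTE_V_VALID) &&
	 (hptp[1] & (~rev->guest_rpte & (HPTE_R_R | HPTE_R_C))));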
Since we now need to set the HPTE changed flag in book3s_64_mmu_hv.c as well as book3s_hv_rm_mmu.c, we move the note_hpte_modification() function into kvm_book3s_64.h. Current Linux guest kernels don't use the hardware updates of R and C in the HPT, so this change won't affect them. Linux (or other) kernels might in future want to use the R and C bits and have them correctly transferred across when a guest is migrated, so it is better to correct this deficiency. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_book3s_64.h | 13 ++++++ arch/powerpc/kvm/book3s_64_mmu_hv.c | 59 ++++++++++++++++++++---- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 11 ----- 3 files changed, 63 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 38bec1dc9928..9c1ff330c805 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -268,4 +268,17 @@ static inline int is_vrma_hpte(unsigned long hpte_v) (HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16))); } +#ifdef CONFIG_KVM_BOOK3S_64_HV +/* + * Note modification of an HPTE; set the HPTE modified bit + * if anyone is interested. + */ +static inline void note_hpte_modification(struct kvm *kvm, + struct revmap_entry *rev) +{ + if (atomic_read(&kvm->arch.hpte_mod_interest)) + rev->guest_rpte |= HPTE_GR_MODIFIED; +} +#endif /* CONFIG_KVM_BOOK3S_64_HV */ + #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 8cc18abd6dde..d641a6634b02 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -893,7 +893,10 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, /* Harvest R and C */ rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C); *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; - rev[i].guest_rpte = ptel | rcbits; + if (rcbits & ~rev[i].guest_rpte) { + rev[i].guest_rpte = ptel | rcbits; + note_hpte_modification(kvm, &rev[i]); + } } unlock_rmap(rmapp); hptep[0] &= ~HPTE_V_HVLOCK; @@ -976,7 +979,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, /* Now check and modify the HPTE */ if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) { kvmppc_clear_ref_hpte(kvm, hptep, i); - rev[i].guest_rpte |= HPTE_R_R; + if (!(rev[i].guest_rpte & HPTE_R_R)) { + rev[i].guest_rpte |= HPTE_R_R; + note_hpte_modification(kvm, &rev[i]); + } ret = 1; } hptep[0] &= ~HPTE_V_HVLOCK; @@ -1080,7 +1086,10 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp) hptep[1] &= ~HPTE_R_C; eieio(); hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; - rev[i].guest_rpte |= HPTE_R_C; + if (!(rev[i].guest_rpte & HPTE_R_C)) { + rev[i].guest_rpte |= HPTE_R_C; + note_hpte_modification(kvm, &rev[i]); + } ret = 1; } hptep[0] &= ~HPTE_V_HVLOCK; @@ -1193,16 +1202,36 @@ struct kvm_htab_ctx { #define HPTE_SIZE (2 * sizeof(unsigned long)) +/* + * Returns 1 if this HPT entry has been modified or has pending + * R/C bit changes. 
+ */ +static int hpte_dirty(struct revmap_entry *revp, unsigned long *hptp) +{ + unsigned long rcbits_unset; + + if (revp->guest_rpte & HPTE_GR_MODIFIED) + return 1; + + /* Also need to consider changes in reference and changed bits */ + rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); + if ((hptp[0] & HPTE_V_VALID) && (hptp[1] & rcbits_unset)) + return 1; + + return 0; +} + static long record_hpte(unsigned long flags, unsigned long *hptp, unsigned long *hpte, struct revmap_entry *revp, int want_valid, int first_pass) { unsigned long v, r; + unsigned long rcbits_unset; int ok = 1; int valid, dirty; /* Unmodified entries are uninteresting except on the first pass */ - dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); + dirty = hpte_dirty(revp, hptp); if (!first_pass && !dirty) return 0; @@ -1223,16 +1252,28 @@ static long record_hpte(unsigned long flags, unsigned long *hptp, while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) cpu_relax(); v = hptp[0]; + + /* re-evaluate valid and dirty from synchronized HPTE value */ + valid = !!(v & HPTE_V_VALID); + dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); + + /* Harvest R and C into guest view if necessary */ + rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); + if (valid && (rcbits_unset & hptp[1])) { + revp->guest_rpte |= (hptp[1] & (HPTE_R_R | HPTE_R_C)) | + HPTE_GR_MODIFIED; + dirty = 1; + } + if (v & HPTE_V_ABSENT) { v &= ~HPTE_V_ABSENT; v |= HPTE_V_VALID; + valid = 1; } - /* re-evaluate valid and dirty from synchronized HPTE value */ - valid = !!(v & HPTE_V_VALID); if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED)) valid = 0; - r = revp->guest_rpte | (hptp[1] & (HPTE_R_R | HPTE_R_C)); - dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); + + r = revp->guest_rpte; /* only clear modified if this is the right sort of entry */ if (valid == want_valid && dirty) { r &= ~HPTE_GR_MODIFIED; @@ -1288,7 +1329,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, /* Skip uninteresting entries, i.e. clean on not-first pass */ if (!first_pass) { while (i < kvm->arch.hpt_npte && - !(revp->guest_rpte & HPTE_GR_MODIFIED)) { + !hpte_dirty(revp, hptp)) { ++i; hptp += 2; ++revp; diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 19c93bae1aea..6dcbb49105a4 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -97,17 +97,6 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, } EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); -/* - * Note modification of an HPTE; set the HPTE modified bit - * if anyone is interested. - */ -static inline void note_hpte_modification(struct kvm *kvm, - struct revmap_entry *rev) -{ - if (atomic_read(&kvm->arch.hpte_mod_interest)) - rev->guest_rpte |= HPTE_GR_MODIFIED; -} - /* Remove this HPTE from the chain for a real page */ static void remove_revmap_chain(struct kvm *kvm, long pte_index, struct revmap_entry *rev, From c35635efdc0312e013ebda1c8f3b5dd038c0d0e7 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 18 Apr 2013 19:51:04 +0000 Subject: [PATCH 147/204] KVM: PPC: Book3S HV: Report VPA and DTL modifications in dirty map At present, the KVM_GET_DIRTY_LOG ioctl doesn't report modifications done by the host to the virtual processor areas (VPAs) and dispatch trace logs (DTLs) registered by the guest. 
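For reference, the consumer of this information is the ordinary dirty-log loop in user space (a hedged sketch: vm_fd, slot and a caller-allocated bitmap are assumed, and error handling is elided):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Fetch the per-memslot dirty bitmap, one bit per guest page. */
struct kvm_dirty_log log = {
	.slot = slot,			/* memslot id, assumed */
	.dirty_bitmap = bitmap,		/* npages/8 bytes, caller-allocated */
};
ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);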
This is because those modifications are done either in real mode or in the host kernel context, and in neither case does the access go through the guest's HPT, and thus no change (C) bit gets set in the guest's HPT. However, the changes done by the host do need to be tracked so that the modified pages get transferred when doing live migration. In order to track these modifications, this adds a dirty flag to the struct representing the VPA/DTL areas, and arranges to set the flag when the VPA/DTL gets modified by the host. Then, when we are collecting the dirty log, we also check the dirty flags for the VPA and DTL for each vcpu and set the relevant bit in the dirty log if necessary. Doing this also means we now need to keep track of the guest physical address of the VPA/DTL areas. So as not to lose track of modifications to a VPA/DTL area when it gets unregistered, or when a new area gets registered in its place, we need to transfer the dirty state to the rmap chain. This adds code to kvmppc_unpin_guest_page() to do that if the area was dirty. To simplify that code, we now require that all VPA, DTL and SLB shadow buffer areas fit within a single host page. Guests already comply with this requirement because pHyp requires that these areas not cross a 4k boundary. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_book3s.h | 3 +- arch/powerpc/include/asm/kvm_host.h | 2 + arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kvm/book3s_64_mmu_hv.c | 61 +++++++++++++++++++++---- arch/powerpc/kvm/book3s_hv.c | 30 +++++++----- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 4 ++ 6 files changed, 80 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index bc81842ea25a..c55f7e6affaa 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -156,7 +156,8 @@ void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep, unsigned long pte_index); extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr, unsigned long *nb_ret); -extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr); +extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr, + unsigned long gpa, bool dirty); extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel); extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 8a48e686a755..1443768a6588 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -301,11 +301,13 @@ struct kvmppc_vcore { * that a guest can register. 
*/ struct kvmppc_vpa { + unsigned long gpa; /* Current guest phys addr */ void *pinned_addr; /* Address in kernel linear mapping */ void *pinned_end; /* End of region */ unsigned long next_gpa; /* Guest phys addr for update */ unsigned long len; /* Number of bytes required */ u8 update_pending; /* 1 => update pinned_addr from next_gpa */ + bool dirty; /* true => area has been modified by kernel */ }; struct kvmppc_pte { diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index d87c90886c75..dbfd5498f440 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -477,6 +477,7 @@ int main(void) DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr)); DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar)); DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr)); + DEFINE(VCPU_VPA_DIRTY, offsetof(struct kvm_vcpu, arch.vpa.dirty)); #endif #ifdef CONFIG_PPC_BOOK3S DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id)); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index d641a6634b02..69efe0d6cedc 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -1099,11 +1099,30 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp) return ret; } +static void harvest_vpa_dirty(struct kvmppc_vpa *vpa, + struct kvm_memory_slot *memslot, + unsigned long *map) +{ + unsigned long gfn; + + if (!vpa->dirty || !vpa->pinned_addr) + return; + gfn = vpa->gpa >> PAGE_SHIFT; + if (gfn < memslot->base_gfn || + gfn >= memslot->base_gfn + memslot->npages) + return; + + vpa->dirty = false; + if (map) + __set_bit_le(gfn - memslot->base_gfn, map); +} + long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long *map) { unsigned long i; unsigned long *rmapp; + struct kvm_vcpu *vcpu; preempt_disable(); rmapp = memslot->arch.rmap; @@ -1112,6 +1131,15 @@ long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, __set_bit_le(i, map); ++rmapp; } + + /* Harvest dirty bits from VPA and DTL updates */ + /* Note: we never modify the SLB shadow buffer areas */ + kvm_for_each_vcpu(i, vcpu, kvm) { + spin_lock(&vcpu->arch.vpa_update_lock); + harvest_vpa_dirty(&vcpu->arch.vpa, memslot, map); + harvest_vpa_dirty(&vcpu->arch.dtl, memslot, map); + spin_unlock(&vcpu->arch.vpa_update_lock); + } preempt_enable(); return 0; } @@ -1123,7 +1151,7 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, unsigned long gfn = gpa >> PAGE_SHIFT; struct page *page, *pages[1]; int npages; - unsigned long hva, psize, offset; + unsigned long hva, offset; unsigned long pa; unsigned long *physp; int srcu_idx; @@ -1155,14 +1183,9 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, } srcu_read_unlock(&kvm->srcu, srcu_idx); - psize = PAGE_SIZE; - if (PageHuge(page)) { - page = compound_head(page); - psize <<= compound_order(page); - } - offset = gpa & (psize - 1); + offset = gpa & (PAGE_SIZE - 1); if (nb_ret) - *nb_ret = psize - offset; + *nb_ret = PAGE_SIZE - offset; return page_address(page) + offset; err: @@ -1170,11 +1193,31 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, return NULL; } -void kvmppc_unpin_guest_page(struct kvm *kvm, void *va) +void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, + bool dirty) { struct page *page = virt_to_page(va); + struct kvm_memory_slot *memslot; + unsigned long gfn; + unsigned long *rmap; + int srcu_idx; put_page(page); 
+ + if (!dirty || !kvm->arch.using_mmu_notifiers) + return; + + /* We need to mark this page dirty in the rmap chain */ + gfn = gpa >> PAGE_SHIFT; + srcu_idx = srcu_read_lock(&kvm->srcu); + memslot = gfn_to_memslot(kvm, gfn); + if (memslot) { + rmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; + lock_rmap(rmap); + *rmap |= KVMPPC_RMAP_CHANGED; + unlock_rmap(rmap); + } + srcu_read_unlock(&kvm->srcu, srcu_idx); } /* diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 1e521baf9a7d..5af0f2979833 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -259,7 +259,7 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, len = ((struct reg_vpa *)va)->length.hword; else len = ((struct reg_vpa *)va)->length.word; - kvmppc_unpin_guest_page(kvm, va); + kvmppc_unpin_guest_page(kvm, va, vpa, false); /* Check length */ if (len > nb || len < sizeof(struct reg_vpa)) @@ -359,13 +359,13 @@ static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap) va = NULL; nb = 0; if (gpa) - va = kvmppc_pin_guest_page(kvm, vpap->next_gpa, &nb); + va = kvmppc_pin_guest_page(kvm, gpa, &nb); spin_lock(&vcpu->arch.vpa_update_lock); if (gpa == vpap->next_gpa) break; /* sigh... unpin that one and try again */ if (va) - kvmppc_unpin_guest_page(kvm, va); + kvmppc_unpin_guest_page(kvm, va, gpa, false); } vpap->update_pending = 0; @@ -375,12 +375,15 @@ static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap) * has changed the mappings underlying guest memory, * so unregister the region. */ - kvmppc_unpin_guest_page(kvm, va); + kvmppc_unpin_guest_page(kvm, va, gpa, false); va = NULL; } if (vpap->pinned_addr) - kvmppc_unpin_guest_page(kvm, vpap->pinned_addr); + kvmppc_unpin_guest_page(kvm, vpap->pinned_addr, vpap->gpa, + vpap->dirty); + vpap->gpa = gpa; vpap->pinned_addr = va; + vpap->dirty = false; if (va) vpap->pinned_end = va + vpap->len; } @@ -472,6 +475,7 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, /* order writing *dt vs. 
writing vpa->dtl_idx */ smp_wmb(); vpa->dtl_idx = ++vcpu->arch.dtl_index; + vcpu->arch.dtl.dirty = true; } int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) @@ -913,15 +917,19 @@ out: return ERR_PTR(err); } +static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa) +{ + if (vpa->pinned_addr) + kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa, + vpa->dirty); +} + void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) { spin_lock(&vcpu->arch.vpa_update_lock); - if (vcpu->arch.dtl.pinned_addr) - kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.dtl.pinned_addr); - if (vcpu->arch.slb_shadow.pinned_addr) - kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.slb_shadow.pinned_addr); - if (vcpu->arch.vpa.pinned_addr) - kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.vpa.pinned_addr); + unpin_vpa(vcpu->kvm, &vcpu->arch.dtl); + unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow); + unpin_vpa(vcpu->kvm, &vcpu->arch.vpa); spin_unlock(&vcpu->arch.vpa_update_lock); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vcpu); diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index e33d11f1b977..0f23bb851711 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -260,6 +260,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) lwz r5, LPPACA_YIELDCOUNT(r3) addi r5, r5, 1 stw r5, LPPACA_YIELDCOUNT(r3) + li r6, 1 + stb r6, VCPU_VPA_DIRTY(r4) 25: /* Load up DAR and DSISR */ ld r5, VCPU_DAR(r4) @@ -1018,6 +1020,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) lwz r3, LPPACA_YIELDCOUNT(r8) addi r3, r3, 1 stw r3, LPPACA_YIELDCOUNT(r8) + li r3, 1 + stb r3, VCPU_VPA_DIRTY(r9) 25: /* Save PMU registers if requested */ /* r8 and cr0.eq are live here */ From 8175e5b79c38a1d85225da516fa1a0ecbf2fdbca Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 15 Apr 2013 10:42:33 +0200 Subject: [PATCH 148/204] KVM: Add KVM_IRQCHIP_NUM_PINS in addition to KVM_IOAPIC_NUM_PINS The concept of routing interrupt lines to an irqchip is nothing that is IOAPIC specific. Every irqchip has a maximum number of pins that can be linked to irq lines. So let's add a new define that allows us to reuse generic code for non-IOAPIC platforms. Signed-off-by: Alexander Graf Acked-by: Michael S. 
Tsirkin --- arch/x86/include/asm/kvm_host.h | 2 ++ include/linux/kvm_host.h | 2 +- virt/kvm/irq_comm.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 18635ae42a8e..14337fa464bc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -43,6 +43,8 @@ #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 +#define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS + #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 93a50054d46c..bf3b1dcb8b3d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -307,7 +307,7 @@ struct kvm_kernel_irq_routing_entry { #ifdef __KVM_HAVE_IOAPIC struct kvm_irq_routing_table { - int chip[KVM_NR_IRQCHIPS][KVM_IOAPIC_NUM_PINS]; + int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS]; struct kvm_kernel_irq_routing_entry *rt_entries; u32 nr_rt_entries; /* diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 25ab48007adb..7c0071de9e85 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -480,7 +480,7 @@ int kvm_set_irq_routing(struct kvm *kvm, new->nr_rt_entries = nr_rt_entries; for (i = 0; i < 3; i++) - for (j = 0; j < KVM_IOAPIC_NUM_PINS; j++) + for (j = 0; j < KVM_IRQCHIP_NUM_PINS; j++) new->chip[i][j] = -1; for (i = 0; i < nr; ++i) { From a725d56a02ec3582bb5b9756f261fdc6962c79ee Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Wed, 17 Apr 2013 13:29:30 +0200 Subject: [PATCH 149/204] KVM: Introduce CONFIG_HAVE_KVM_IRQ_ROUTING Quite a bit of code in KVM has been conditionalized on the availability of IOAPIC emulation. However, most of it is generically applicable to platforms that don't have an IOAPIC, but a different type of irq chip. Make code that relies only on IRQ routing, not on an APIC itself, depend on CONFIG_HAVE_KVM_IRQ_ROUTING, so that we can reuse it later. Signed-off-by: Alexander Graf Acked-by: Michael S.
Tsirkin --- arch/x86/kvm/Kconfig | 1 + include/linux/kvm_host.h | 6 +++--- virt/kvm/Kconfig | 3 +++ virt/kvm/eventfd.c | 6 +++--- virt/kvm/kvm_main.c | 2 +- 5 files changed, 11 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 586f00059805..9d50efdb719d 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -29,6 +29,7 @@ config KVM select MMU_NOTIFIER select ANON_INODES select HAVE_KVM_IRQCHIP + select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_EVENTFD select KVM_APIC_ARCHITECTURE select KVM_ASYNC_PF diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index bf3b1dcb8b3d..4215d4f3d86f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -304,7 +304,7 @@ struct kvm_kernel_irq_routing_entry { struct hlist_node link; }; -#ifdef __KVM_HAVE_IOAPIC +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING struct kvm_irq_routing_table { int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS]; @@ -432,7 +432,7 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); int __must_check vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); -#ifdef __KVM_HAVE_IOAPIC +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING int kvm_irqfd_init(void); void kvm_irqfd_exit(void); #else @@ -957,7 +957,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) } #endif -#ifdef KVM_CAP_IRQ_ROUTING +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING #define KVM_MAX_IRQ_ROUTES 1024 diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index d01b24b72c61..779262f59e25 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -6,6 +6,9 @@ config HAVE_KVM config HAVE_KVM_IRQCHIP bool +config HAVE_KVM_IRQ_ROUTING + bool + config HAVE_KVM_EVENTFD bool select EVENTFD diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index c5d43ffbf1f3..64ee720b75c7 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -35,7 +35,7 @@ #include "iodev.h" -#ifdef __KVM_HAVE_IOAPIC +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING /* * -------------------------------------------------------------------- * irqfd: Allows an fd to be used to inject an interrupt to the guest @@ -433,7 +433,7 @@ fail: void kvm_eventfd_init(struct kvm *kvm) { -#ifdef __KVM_HAVE_IOAPIC +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING spin_lock_init(&kvm->irqfds.lock); INIT_LIST_HEAD(&kvm->irqfds.items); INIT_LIST_HEAD(&kvm->irqfds.resampler_list); @@ -442,7 +442,7 @@ kvm_eventfd_init(struct kvm *kvm) INIT_LIST_HEAD(&kvm->ioeventfds); } -#ifdef __KVM_HAVE_IOAPIC +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING /* * shutdown any irqfd's that match fd+gsi */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index aaac1a7a9ea8..2c3b226bc13b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2404,7 +2404,7 @@ static long kvm_dev_ioctl_check_extension_generic(long arg) case KVM_CAP_SIGNAL_MSI: #endif return 1; -#ifdef KVM_CAP_IRQ_ROUTING +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING case KVM_CAP_IRQ_ROUTING: return KVM_MAX_IRQ_ROUTES; #endif From 948a902c842aaef49af9e48b00469229b04a43a9 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 15 Apr 2013 10:49:31 +0200 Subject: [PATCH 150/204] KVM: Drop __KVM_HAVE_IOAPIC condition on irq routing We have a capability enquire system that allows user space to ask kvm whether a feature is available. The point behind this system is that we can have different kernel configurations with different capabilities and user space can adjust accordingly. Because features can always be non existent, we can drop any #ifdefs on CAP defines that could be used generically, like the irq routing bits. 
Signed-off-by: Alexander Graf Acked-by: Michael S. Tsirkin --- include/uapi/linux/kvm.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 74d0ff3dfd66..c741902c9e0b 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -579,9 +579,7 @@ struct kvm_ppc_smmu_info { #ifdef __KVM_HAVE_PIT #define KVM_CAP_REINJECT_CONTROL 24 #endif -#ifdef __KVM_HAVE_IOAPIC #define KVM_CAP_IRQ_ROUTING 25 -#endif #define KVM_CAP_IRQ_INJECT_STATUS 26 #ifdef __KVM_HAVE_DEVICE_ASSIGNMENT #define KVM_CAP_DEVICE_DEASSIGNMENT 27 From 7eee2efdc6978aa70ab4046394128c1a10bc2d80 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 15 Apr 2013 10:50:54 +0200 Subject: [PATCH 151/204] KVM: Remove kvm_get_intr_delivery_bitmask The prototype has been stale for a while; I can't spot any real function definition behind it. Let's just remove it. Signed-off-by: Alexander Graf Acked-by: Michael S. Tsirkin --- include/linux/kvm_host.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4215d4f3d86f..a7bfe9d3aa5e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -719,11 +719,6 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, bool mask); -#ifdef __KVM_HAVE_IOAPIC -void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, - union kvm_ioapic_redirect_entry *entry, - unsigned long *deliver_bitmask); -#endif int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status); int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level); From aa8d5944b8b2809e574581abbf41894089b7def2 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 15 Apr 2013 21:12:53 +0200 Subject: [PATCH 152/204] KVM: Move irq routing to generic code The IRQ routing set ioctl lives in the hacky device assignment code inside of KVM today. This is definitely the wrong place for it. Move it to the much more natural kvm_main.c. Signed-off-by: Alexander Graf Acked-by: Michael S.
Tsirkin --- virt/kvm/assigned-dev.c | 30 ------------------------------ virt/kvm/kvm_main.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index f4c7f591b5d8..8db43701016f 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -983,36 +983,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, goto out; break; } -#ifdef KVM_CAP_IRQ_ROUTING - case KVM_SET_GSI_ROUTING: { - struct kvm_irq_routing routing; - struct kvm_irq_routing __user *urouting; - struct kvm_irq_routing_entry *entries; - - r = -EFAULT; - if (copy_from_user(&routing, argp, sizeof(routing))) - goto out; - r = -EINVAL; - if (routing.nr >= KVM_MAX_IRQ_ROUTES) - goto out; - if (routing.flags) - goto out; - r = -ENOMEM; - entries = vmalloc(routing.nr * sizeof(*entries)); - if (!entries) - goto out; - r = -EFAULT; - urouting = argp; - if (copy_from_user(entries, urouting->entries, - routing.nr * sizeof(*entries))) - goto out_free_irq_routing; - r = kvm_set_irq_routing(kvm, entries, routing.nr, - routing.flags); - out_free_irq_routing: - vfree(entries); - break; - } -#endif /* KVM_CAP_IRQ_ROUTING */ #ifdef __KVM_HAVE_MSIX case KVM_ASSIGN_SET_MSIX_NR: { struct kvm_assigned_msix_nr entry_nr; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2c3b226bc13b..b6f33547b3db 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2274,6 +2274,36 @@ static long kvm_vm_ioctl(struct file *filp, break; } #endif +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING + case KVM_SET_GSI_ROUTING: { + struct kvm_irq_routing routing; + struct kvm_irq_routing __user *urouting; + struct kvm_irq_routing_entry *entries; + + r = -EFAULT; + if (copy_from_user(&routing, argp, sizeof(routing))) + goto out; + r = -EINVAL; + if (routing.nr >= KVM_MAX_IRQ_ROUTES) + goto out; + if (routing.flags) + goto out; + r = -ENOMEM; + entries = vmalloc(routing.nr * sizeof(*entries)); + if (!entries) + goto out; + r = -EFAULT; + urouting = argp; + if (copy_from_user(entries, urouting->entries, + routing.nr * sizeof(*entries))) + goto out_free_irq_routing; + r = kvm_set_irq_routing(kvm, entries, routing.nr, + routing.flags); + out_free_irq_routing: + vfree(entries); + break; + } +#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); if (r == -ENOTTY) From 1c9f8520bda73c07fed9bcdb307854b45a3a60c4 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 15 Apr 2013 23:04:10 +0200 Subject: [PATCH 153/204] KVM: Extract generic irqchip logic into irqchip.c The current irq_comm.c file contains pieces of code that are generic across different irqchip implementations, as well as code that is fully IOAPIC specific. Split the generic bits out into irqchip.c. Signed-off-by: Alexander Graf Acked-by: Michael S. Tsirkin --- arch/x86/kvm/Makefile | 2 +- include/trace/events/kvm.h | 12 ++- virt/kvm/irq_comm.c | 118 ---------------------------- virt/kvm/irqchip.c | 152 +++++++++++++++++++++++++++++++++++++ 4 files changed, 163 insertions(+), 121 deletions(-) create mode 100644 virt/kvm/irqchip.c diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 04d30401c5cb..a797b8e43ade 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. 
kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ coalesced_mmio.o irq_comm.o eventfd.o \ - assigned-dev.o) + assigned-dev.o irqchip.o) kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 19911dddaeb7..7005d1109ec9 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -37,7 +37,7 @@ TRACE_EVENT(kvm_userspace_exit, __entry->errno < 0 ? -__entry->errno : __entry->reason) ); -#if defined(__KVM_HAVE_IRQ_LINE) +#if defined(CONFIG_HAVE_KVM_IRQCHIP) TRACE_EVENT(kvm_set_irq, TP_PROTO(unsigned int gsi, int level, int irq_source_id), TP_ARGS(gsi, level, irq_source_id), @@ -122,6 +122,10 @@ TRACE_EVENT(kvm_msi_set_irq, {KVM_IRQCHIP_PIC_SLAVE, "PIC slave"}, \ {KVM_IRQCHIP_IOAPIC, "IOAPIC"} +#endif /* defined(__KVM_HAVE_IOAPIC) */ + +#if defined(CONFIG_HAVE_KVM_IRQCHIP) + TRACE_EVENT(kvm_ack_irq, TP_PROTO(unsigned int irqchip, unsigned int pin), TP_ARGS(irqchip, pin), @@ -136,14 +140,18 @@ TRACE_EVENT(kvm_ack_irq, __entry->pin = pin; ), +#ifdef kvm_irqchips TP_printk("irqchip %s pin %u", __print_symbolic(__entry->irqchip, kvm_irqchips), __entry->pin) +#else + TP_printk("irqchip %d pin %u", __entry->irqchip, __entry->pin) +#endif ); +#endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */ -#endif /* defined(__KVM_HAVE_IOAPIC) */ #define KVM_TRACE_MMIO_READ_UNSATISFIED 0 #define KVM_TRACE_MMIO_READ 1 diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 7c0071de9e85..d5008f4aade7 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -151,59 +151,6 @@ static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e, return -EWOULDBLOCK; } -int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) -{ - struct kvm_kernel_irq_routing_entry route; - - if (!irqchip_in_kernel(kvm) || msi->flags != 0) - return -EINVAL; - - route.msi.address_lo = msi->address_lo; - route.msi.address_hi = msi->address_hi; - route.msi.data = msi->data; - - return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, false); -} - -/* - * Return value: - * < 0 Interrupt was ignored (masked or not delivered for other reasons) - * = 0 Interrupt was coalesced (previous irq is still pending) - * > 0 Number of CPUs interrupt was delivered to - */ -int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, - bool line_status) -{ - struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS]; - int ret = -1, i = 0; - struct kvm_irq_routing_table *irq_rt; - - trace_kvm_set_irq(irq, level, irq_source_id); - - /* Not possible to detect if the guest uses the PIC or the - * IOAPIC. So set the bit in both. The guest will ignore - * writes to the unused one. - */ - rcu_read_lock(); - irq_rt = rcu_dereference(kvm->irq_routing); - if (irq < irq_rt->nr_rt_entries) - hlist_for_each_entry(e, &irq_rt->map[irq], link) - irq_set[i++] = *e; - rcu_read_unlock(); - - while(i--) { - int r; - r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level, - line_status); - if (r < 0) - continue; - - ret = r + ((ret < 0) ? 0 : ret); - } - - return ret; -} - /* * Deliver an IRQ in an atomic context if we can, or return a failure, * user can retry in a process context. 
@@ -241,63 +188,6 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) return ret; } -bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin) -{ - struct kvm_irq_ack_notifier *kian; - int gsi; - - rcu_read_lock(); - gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; - if (gsi != -1) - hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, - link) - if (kian->gsi == gsi) { - rcu_read_unlock(); - return true; - } - - rcu_read_unlock(); - - return false; -} -EXPORT_SYMBOL_GPL(kvm_irq_has_notifier); - -void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) -{ - struct kvm_irq_ack_notifier *kian; - int gsi; - - trace_kvm_ack_irq(irqchip, pin); - - rcu_read_lock(); - gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; - if (gsi != -1) - hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, - link) - if (kian->gsi == gsi) - kian->irq_acked(kian); - rcu_read_unlock(); -} - -void kvm_register_irq_ack_notifier(struct kvm *kvm, - struct kvm_irq_ack_notifier *kian) -{ - mutex_lock(&kvm->irq_lock); - hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list); - mutex_unlock(&kvm->irq_lock); - kvm_vcpu_request_scan_ioapic(kvm); -} - -void kvm_unregister_irq_ack_notifier(struct kvm *kvm, - struct kvm_irq_ack_notifier *kian) -{ - mutex_lock(&kvm->irq_lock); - hlist_del_init_rcu(&kian->link); - mutex_unlock(&kvm->irq_lock); - synchronize_rcu(); - kvm_vcpu_request_scan_ioapic(kvm); -} - int kvm_request_irq_source_id(struct kvm *kvm) { unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; @@ -381,13 +271,6 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, rcu_read_unlock(); } -void kvm_free_irq_routing(struct kvm *kvm) -{ - /* Called only during vm destruction. Nobody can use the pointer - at this stage */ - kfree(kvm->irq_routing); -} - static int setup_routing_entry(struct kvm_irq_routing_table *rt, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) @@ -451,7 +334,6 @@ out: return r; } - int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *ue, unsigned nr, diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c new file mode 100644 index 000000000000..12f7f26af4f8 --- /dev/null +++ b/virt/kvm/irqchip.c @@ -0,0 +1,152 @@ +/* + * irqchip.c: Common API for in kernel interrupt controllers + * Copyright (c) 2007, Intel Corporation. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * Copyright (c) 2013, Alexander Graf + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * This file is derived from virt/kvm/irq_comm.c. 
+ * + * Authors: + * Yaozu (Eddie) Dong + * Alexander Graf + */ + +#include +#include +#include +#include +#include "irq.h" + +bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin) +{ + struct kvm_irq_ack_notifier *kian; + int gsi; + + rcu_read_lock(); + gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; + if (gsi != -1) + hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, + link) + if (kian->gsi == gsi) { + rcu_read_unlock(); + return true; + } + + rcu_read_unlock(); + + return false; +} +EXPORT_SYMBOL_GPL(kvm_irq_has_notifier); + +void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) +{ + struct kvm_irq_ack_notifier *kian; + int gsi; + + trace_kvm_ack_irq(irqchip, pin); + + rcu_read_lock(); + gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; + if (gsi != -1) + hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list, + link) + if (kian->gsi == gsi) + kian->irq_acked(kian); + rcu_read_unlock(); +} + +void kvm_register_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian) +{ + mutex_lock(&kvm->irq_lock); + hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list); + mutex_unlock(&kvm->irq_lock); +#ifdef __KVM_HAVE_IOAPIC + kvm_vcpu_request_scan_ioapic(kvm); +#endif +} + +void kvm_unregister_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian) +{ + mutex_lock(&kvm->irq_lock); + hlist_del_init_rcu(&kian->link); + mutex_unlock(&kvm->irq_lock); + synchronize_rcu(); +#ifdef __KVM_HAVE_IOAPIC + kvm_vcpu_request_scan_ioapic(kvm); +#endif +} + +int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) +{ + struct kvm_kernel_irq_routing_entry route; + + if (!irqchip_in_kernel(kvm) || msi->flags != 0) + return -EINVAL; + + route.msi.address_lo = msi->address_lo; + route.msi.address_hi = msi->address_hi; + route.msi.data = msi->data; + + return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, false); +} + +/* + * Return value: + * < 0 Interrupt was ignored (masked or not delivered for other reasons) + * = 0 Interrupt was coalesced (previous irq is still pending) + * > 0 Number of CPUs interrupt was delivered to + */ +int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, + bool line_status) +{ + struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS]; + int ret = -1, i = 0; + struct kvm_irq_routing_table *irq_rt; + + trace_kvm_set_irq(irq, level, irq_source_id); + + /* Not possible to detect if the guest uses the PIC or the + * IOAPIC. So set the bit in both. The guest will ignore + * writes to the unused one. + */ + rcu_read_lock(); + irq_rt = rcu_dereference(kvm->irq_routing); + if (irq < irq_rt->nr_rt_entries) + hlist_for_each_entry(e, &irq_rt->map[irq], link) + irq_set[i++] = *e; + rcu_read_unlock(); + + while(i--) { + int r; + r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level, + line_status); + if (r < 0) + continue; + + ret = r + ((ret < 0) ? 0 : ret); + } + + return ret; +} + +void kvm_free_irq_routing(struct kvm *kvm) +{ + /* Called only during vm destruction. Nobody can use the pointer + at this stage */ + kfree(kvm->irq_routing); +} From e8cde0939d8ebe9c8ebe5a4bdf71af51a0d56053 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 15 Apr 2013 23:23:21 +0200 Subject: [PATCH 154/204] KVM: Move irq routing setup to irqchip.c Setting up IRQ routes is nothing IOAPIC specific. Extract everything that really is generic code into irqchip.c and only leave the ioapic specific bits to irq_comm.c. 
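A hedged userspace sketch of the interface this series generalizes (vmfd is assumed to be an existing VM fd, and the MSI address/data values are placeholders): one MSI route is installed with KVM_SET_GSI_ROUTING, which lands in the generic kvm_set_irq_routing() shown above:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int install_msi_route(int vmfd)
{
	struct {
		struct kvm_irq_routing hdr;
		struct kvm_irq_routing_entry entry;
	} r;

	memset(&r, 0, sizeof(r));	/* hdr.flags and entry.flags must be 0 */
	r.hdr.nr = 1;			/* one entry follows the header */
	r.entry.gsi = 0;
	r.entry.type = KVM_IRQ_ROUTING_MSI;
	r.entry.u.msi.address_lo = 0xfee00000;	/* placeholder MSI address */
	r.entry.u.msi.data = 0x41;		/* placeholder MSI data */

	return ioctl(vmfd, KVM_SET_GSI_ROUTING, &r.hdr);
}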
Signed-off-by: Alexander Graf Acked-by: Michael S. Tsirkin --- include/linux/kvm_host.h | 3 ++ virt/kvm/irq_comm.c | 76 ++--------------------------------- virt/kvm/irqchip.c | 85 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 91 insertions(+), 73 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a7bfe9d3aa5e..dcef724f4ba6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -961,6 +961,9 @@ int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *entries, unsigned nr, unsigned flags); +int kvm_set_routing_entry(struct kvm_irq_routing_table *rt, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue); void kvm_free_irq_routing(struct kvm *kvm); int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi); diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index d5008f4aade7..e2e6b4473a96 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -271,27 +271,14 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, rcu_read_unlock(); } -static int setup_routing_entry(struct kvm_irq_routing_table *rt, - struct kvm_kernel_irq_routing_entry *e, - const struct kvm_irq_routing_entry *ue) +int kvm_set_routing_entry(struct kvm_irq_routing_table *rt, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) { int r = -EINVAL; int delta; unsigned max_pin; - struct kvm_kernel_irq_routing_entry *ei; - /* - * Do not allow GSI to be mapped to the same irqchip more than once. - * Allow only one to one mapping between GSI and MSI. - */ - hlist_for_each_entry(ei, &rt->map[ue->gsi], link) - if (ei->type == KVM_IRQ_ROUTING_MSI || - ue->type == KVM_IRQ_ROUTING_MSI || - ue->u.irqchip.irqchip == ei->irqchip.irqchip) - return r; - - e->gsi = ue->gsi; - e->type = ue->type; switch (ue->type) { case KVM_IRQ_ROUTING_IRQCHIP: delta = 0; @@ -328,68 +315,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt, goto out; } - hlist_add_head(&e->link, &rt->map[e->gsi]); r = 0; out: return r; } -int kvm_set_irq_routing(struct kvm *kvm, - const struct kvm_irq_routing_entry *ue, - unsigned nr, - unsigned flags) -{ - struct kvm_irq_routing_table *new, *old; - u32 i, j, nr_rt_entries = 0; - int r; - - for (i = 0; i < nr; ++i) { - if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES) - return -EINVAL; - nr_rt_entries = max(nr_rt_entries, ue[i].gsi); - } - - nr_rt_entries += 1; - - new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head)) - + (nr * sizeof(struct kvm_kernel_irq_routing_entry)), - GFP_KERNEL); - - if (!new) - return -ENOMEM; - - new->rt_entries = (void *)&new->map[nr_rt_entries]; - - new->nr_rt_entries = nr_rt_entries; - for (i = 0; i < 3; i++) - for (j = 0; j < KVM_IRQCHIP_NUM_PINS; j++) - new->chip[i][j] = -1; - - for (i = 0; i < nr; ++i) { - r = -EINVAL; - if (ue->flags) - goto out; - r = setup_routing_entry(new, &new->rt_entries[i], ue); - if (r) - goto out; - ++ue; - } - - mutex_lock(&kvm->irq_lock); - old = kvm->irq_routing; - kvm_irq_routing_update(kvm, new); - mutex_unlock(&kvm->irq_lock); - - synchronize_rcu(); - - new = old; - r = 0; - -out: - kfree(new); - return r; -} - #define IOAPIC_ROUTING_ENTRY(irq) \ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ .u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC, .u.irqchip.pin = (irq) } diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index 12f7f26af4f8..20dc9e4a8f6c 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -150,3 +150,88 @@ void kvm_free_irq_routing(struct kvm 
*kvm) at this stage */ kfree(kvm->irq_routing); } + +static int setup_routing_entry(struct kvm_irq_routing_table *rt, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) +{ + int r = -EINVAL; + struct kvm_kernel_irq_routing_entry *ei; + + /* + * Do not allow GSI to be mapped to the same irqchip more than once. + * Allow only one to one mapping between GSI and MSI. + */ + hlist_for_each_entry(ei, &rt->map[ue->gsi], link) + if (ei->type == KVM_IRQ_ROUTING_MSI || + ue->type == KVM_IRQ_ROUTING_MSI || + ue->u.irqchip.irqchip == ei->irqchip.irqchip) + return r; + + e->gsi = ue->gsi; + e->type = ue->type; + r = kvm_set_routing_entry(rt, e, ue); + if (r) + goto out; + + hlist_add_head(&e->link, &rt->map[e->gsi]); + r = 0; +out: + return r; +} + +int kvm_set_irq_routing(struct kvm *kvm, + const struct kvm_irq_routing_entry *ue, + unsigned nr, + unsigned flags) +{ + struct kvm_irq_routing_table *new, *old; + u32 i, j, nr_rt_entries = 0; + int r; + + for (i = 0; i < nr; ++i) { + if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES) + return -EINVAL; + nr_rt_entries = max(nr_rt_entries, ue[i].gsi); + } + + nr_rt_entries += 1; + + new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head)) + + (nr * sizeof(struct kvm_kernel_irq_routing_entry)), + GFP_KERNEL); + + if (!new) + return -ENOMEM; + + new->rt_entries = (void *)&new->map[nr_rt_entries]; + + new->nr_rt_entries = nr_rt_entries; + for (i = 0; i < KVM_NR_IRQCHIPS; i++) + for (j = 0; j < KVM_IRQCHIP_NUM_PINS; j++) + new->chip[i][j] = -1; + + for (i = 0; i < nr; ++i) { + r = -EINVAL; + if (ue->flags) + goto out; + r = setup_routing_entry(new, &new->rt_entries[i], ue); + if (r) + goto out; + ++ue; + } + + mutex_lock(&kvm->irq_lock); + old = kvm->irq_routing; + kvm_irq_routing_update(kvm, new); + mutex_unlock(&kvm->irq_lock); + + synchronize_rcu(); + + new = old; + r = 0; + +out: + kfree(new); + return r; +} From 7df35f549606e8a9004a77ef31dc80dfa893a590 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 16 Apr 2013 12:12:49 +0200 Subject: [PATCH 155/204] KVM: Move irqfd resample cap handling to generic code Now that we have most irqfd code completely platform agnostic, let's move irqfd's resample capability return to generic code as well. Signed-off-by: Alexander Graf Acked-by: Michael S. 
Tsirkin --- arch/x86/kvm/x86.c | 1 - virt/kvm/kvm_main.c | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2a434bf3918d..f6c3f03623a1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2522,7 +2522,6 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PCI_2_3: case KVM_CAP_KVMCLOCK_CTRL: case KVM_CAP_READONLY_MEM: - case KVM_CAP_IRQFD_RESAMPLE: r = 1; break; case KVM_CAP_COALESCED_MMIO: diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b6f33547b3db..f9492f3847d6 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2432,6 +2432,9 @@ static long kvm_dev_ioctl_check_extension_generic(long arg) case KVM_CAP_INTERNAL_ERROR_DATA: #ifdef CONFIG_HAVE_KVM_MSI case KVM_CAP_SIGNAL_MSI: +#endif +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING + case KVM_CAP_IRQFD_RESAMPLE: #endif return 1; #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING From 852b6d57dc7fa378019786fa84727036e56839ea Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Fri, 12 Apr 2013 14:08:42 +0000 Subject: [PATCH 156/204] kvm: add device control API Currently, devices that are emulated inside KVM are configured in a hardcoded manner based on an assumption that any given architecture only has one way to do it. If there's any need to access device state, it is done through inflexible one-purpose-only IOCTLs (e.g. KVM_GET/SET_LAPIC). Defining new IOCTLs for every little thing is cumbersome and depletes a limited numberspace. This API provides a mechanism to instantiate a device of a certain type, returning an ID that can be used to set/get attributes of the device. Attributes may include configuration parameters (e.g. register base address), device state, operational commands, etc. It is similar to the ONE_REG API, except that it acts on devices rather than vcpus. Both device types and individual attributes can be tested without having to create the device or get/set the attribute, without the need for separately managing enumerated capabilities. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf --- Documentation/virtual/kvm/api.txt | 70 ++++++++++++ Documentation/virtual/kvm/devices/README | 1 + include/linux/kvm_host.h | 35 ++++++ include/uapi/linux/kvm.h | 27 +++++ virt/kvm/kvm_main.c | 129 +++++++++++++++++++++++ 5 files changed, 262 insertions(+) create mode 100644 Documentation/virtual/kvm/devices/README diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index a1f2200e43d0..66b58e494935 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2189,6 +2189,76 @@ header; first `n_valid' valid entries with contents from the data written, then `n_invalid' invalid entries, invalidating any previously valid entries found. +4.79 KVM_CREATE_DEVICE + +Capability: KVM_CAP_DEVICE_CTRL +Type: vm ioctl +Parameters: struct kvm_create_device (in/out) +Returns: 0 on success, -1 on error +Errors: + ENODEV: The device type is unknown or unsupported + EEXIST: Device already created, and this type of device may not + be instantiated multiple times + + Other error conditions may be defined by individual device types or + have their standard meanings. + +Creates an emulated device in the kernel. The file descriptor returned +in fd can be used with KVM_SET/GET/HAS_DEVICE_ATTR. + +If the KVM_CREATE_DEVICE_TEST flag is set, only test whether the +device type is supported (not necessarily whether it can be created +in the current vm). + +Individual devices should not define flags. 
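An illustrative calling sketch (vmfd and the device type constant are assumptions; at this point in the series the type switch in kvm_ioctl_create_device() is still empty, so every create fails with ENODEV until a later patch registers a type such as the MPIC):

	struct kvm_create_device cd = {
		.type = KVM_DEV_TYPE_FSL_MPIC_20,	/* hypothetical here */
		.flags = KVM_CREATE_DEVICE_TEST,	/* probe only */
	};

	if (ioctl(vmfd, KVM_CREATE_DEVICE, &cd) == 0) {
		struct kvm_device_attr attr = { 0 };	/* placeholder group/attr */

		cd.flags = 0;				/* now really create it */
		ioctl(vmfd, KVM_CREATE_DEVICE, &cd);
		/* 0 means the device implements this attribute */
		ioctl(cd.fd, KVM_HAS_DEVICE_ATTR, &attr);
	}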
Attributes should be used +for specifying any behavior that is not implied by the device type +number. + +struct kvm_create_device { + __u32 type; /* in: KVM_DEV_TYPE_xxx */ + __u32 fd; /* out: device handle */ + __u32 flags; /* in: KVM_CREATE_DEVICE_xxx */ +}; + +4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR + +Capability: KVM_CAP_DEVICE_CTRL +Type: device ioctl +Parameters: struct kvm_device_attr +Returns: 0 on success, -1 on error +Errors: + ENXIO: The group or attribute is unknown/unsupported for this device + EPERM: The attribute cannot (currently) be accessed this way + (e.g. read-only attribute, or attribute that only makes + sense when the device is in a different state) + + Other error conditions may be defined by individual device types. + +Gets/sets a specified piece of device configuration and/or state. The +semantics are device-specific. See individual device documentation in +the "devices" directory. As with ONE_REG, the size of the data +transferred is defined by the particular attribute. + +struct kvm_device_attr { + __u32 flags; /* no flags currently defined */ + __u32 group; /* device-defined */ + __u64 attr; /* group-defined */ + __u64 addr; /* userspace address of attr data */ +}; + +4.81 KVM_HAS_DEVICE_ATTR + +Capability: KVM_CAP_DEVICE_CTRL +Type: device ioctl +Parameters: struct kvm_device_attr +Returns: 0 on success, -1 on error +Errors: + ENXIO: The group or attribute is unknown/unsupported for this device + +Tests whether a device supports a particular attribute. A successful +return indicates the attribute is implemented. It does not necessarily +indicate that the attribute can be read or written in the device's +current state. "addr" is ignored. 4.77 KVM_ARM_VCPU_INIT diff --git a/Documentation/virtual/kvm/devices/README b/Documentation/virtual/kvm/devices/README new file mode 100644 index 000000000000..34a69834124a --- /dev/null +++ b/Documentation/virtual/kvm/devices/README @@ -0,0 +1 @@ +This directory contains specific device bindings for KVM_CAP_DEVICE_CTRL. diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index dcef724f4ba6..6dab6b5b3e3b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1064,6 +1064,41 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) extern bool kvm_rebooting; +struct kvm_device_ops; + +struct kvm_device { + struct kvm_device_ops *ops; + struct kvm *kvm; + atomic_t users; + void *private; +}; + +/* create, destroy, and name are mandatory */ +struct kvm_device_ops { + const char *name; + int (*create)(struct kvm_device *dev, u32 type); + + /* + * Destroy is responsible for freeing dev. + * + * Destroy may be called before or after destructors are called + * on emulated I/O regions, depending on whether a reference is + * held by a vcpu or other kvm component that gets destroyed + * after the emulated I/O. 
+ */ + void (*destroy)(struct kvm_device *dev); + + int (*set_attr)(struct kvm_device *dev, struct kvm_device_attr *attr); + int (*get_attr)(struct kvm_device *dev, struct kvm_device_attr *attr); + int (*has_attr)(struct kvm_device *dev, struct kvm_device_attr *attr); + long (*ioctl)(struct kvm_device *dev, unsigned int ioctl, + unsigned long arg); +}; + +void kvm_device_get(struct kvm_device *dev); +void kvm_device_put(struct kvm_device *dev); +struct kvm_device *kvm_device_from_filp(struct file *filp); + #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index c741902c9e0b..38a0be0c199f 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -666,6 +666,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_PPC_EPR 86 #define KVM_CAP_ARM_PSCI 87 #define KVM_CAP_ARM_SET_DEVICE_ADDR 88 +#define KVM_CAP_DEVICE_CTRL 89 #ifdef KVM_CAP_IRQ_ROUTING @@ -818,6 +819,24 @@ struct kvm_arm_device_addr { __u64 addr; }; +/* + * Device control API, available with KVM_CAP_DEVICE_CTRL + */ +#define KVM_CREATE_DEVICE_TEST 1 + +struct kvm_create_device { + __u32 type; /* in: KVM_DEV_TYPE_xxx */ + __u32 fd; /* out: device handle */ + __u32 flags; /* in: KVM_CREATE_DEVICE_xxx */ +}; + +struct kvm_device_attr { + __u32 flags; /* no flags currently defined */ + __u32 group; /* device-defined */ + __u64 attr; /* group-defined */ + __u64 addr; /* userspace address of attr data */ +}; + /* * ioctls for VM fds */ @@ -906,6 +925,14 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_ARM_SET_DEVICE_ADDR */ #define KVM_ARM_SET_DEVICE_ADDR _IOW(KVMIO, 0xab, struct kvm_arm_device_addr) +/* ioctl for vm fd */ +#define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device) + +/* ioctls for fds returned by KVM_CREATE_DEVICE */ +#define KVM_SET_DEVICE_ATTR _IOW(KVMIO, 0xe1, struct kvm_device_attr) +#define KVM_GET_DEVICE_ATTR _IOW(KVMIO, 0xe2, struct kvm_device_attr) +#define KVM_HAS_DEVICE_ATTR _IOW(KVMIO, 0xe3, struct kvm_device_attr) + /* * ioctls for vcpu fds */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f9492f3847d6..5f0d78c2537b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2159,6 +2159,117 @@ out: } #endif +static int kvm_device_ioctl_attr(struct kvm_device *dev, + int (*accessor)(struct kvm_device *dev, + struct kvm_device_attr *attr), + unsigned long arg) +{ + struct kvm_device_attr attr; + + if (!accessor) + return -EPERM; + + if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) + return -EFAULT; + + return accessor(dev, &attr); +} + +static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + struct kvm_device *dev = filp->private_data; + + switch (ioctl) { + case KVM_SET_DEVICE_ATTR: + return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg); + case KVM_GET_DEVICE_ATTR: + return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg); + case KVM_HAS_DEVICE_ATTR: + return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg); + default: + if (dev->ops->ioctl) + return dev->ops->ioctl(dev, ioctl, arg); + + return -ENOTTY; + } +} + +void kvm_device_get(struct kvm_device *dev) +{ + atomic_inc(&dev->users); +} + +void kvm_device_put(struct kvm_device *dev) +{ + if (atomic_dec_and_test(&dev->users)) + dev->ops->destroy(dev); +} + +static int kvm_device_release(struct inode *inode, struct file *filp) +{ + struct kvm_device *dev = filp->private_data; + struct kvm *kvm = dev->kvm; + + 
kvm_device_put(dev); + kvm_put_kvm(kvm); + return 0; +} + +static const struct file_operations kvm_device_fops = { + .unlocked_ioctl = kvm_device_ioctl, + .release = kvm_device_release, +}; + +struct kvm_device *kvm_device_from_filp(struct file *filp) +{ + if (filp->f_op != &kvm_device_fops) + return NULL; + + return filp->private_data; +} + +static int kvm_ioctl_create_device(struct kvm *kvm, + struct kvm_create_device *cd) +{ + struct kvm_device_ops *ops = NULL; + struct kvm_device *dev; + bool test = cd->flags & KVM_CREATE_DEVICE_TEST; + int ret; + + switch (cd->type) { + default: + return -ENODEV; + } + + if (test) + return 0; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + + dev->ops = ops; + dev->kvm = kvm; + atomic_set(&dev->users, 1); + + ret = ops->create(dev, cd->type); + if (ret < 0) { + kfree(dev); + return ret; + } + + ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR); + if (ret < 0) { + ops->destroy(dev); + return ret; + } + + kvm_get_kvm(kvm); + cd->fd = ret; + return 0; +} + static long kvm_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2304,6 +2415,24 @@ static long kvm_vm_ioctl(struct file *filp, break; } #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ + case KVM_CREATE_DEVICE: { + struct kvm_create_device cd; + + r = -EFAULT; + if (copy_from_user(&cd, argp, sizeof(cd))) + goto out; + + r = kvm_ioctl_create_device(kvm, &cd); + if (r) + goto out; + + r = -EFAULT; + if (copy_to_user(argp, &cd, sizeof(cd))) + goto out; + + r = 0; + break; + } default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); if (r == -ENOTTY) From b823f98f890c9b069d64f8af0996029b1f1af00e Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Fri, 12 Apr 2013 14:08:43 +0000 Subject: [PATCH 157/204] kvm/ppc/mpic: import hw/openpic.c from QEMU This is QEMU's hw/openpic.c from commit abd8d4a4d6dfea7ddea72f095f993e1de941614e ("Update version for 1.4.0-rc0"), run through Lindent with no other changes to ease merging future changes between Linux and QEMU. Remaining style issues (including those introduced by Lindent) will be fixed in a later patch. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf --- arch/powerpc/kvm/mpic.c | 1686 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 1686 insertions(+) create mode 100644 arch/powerpc/kvm/mpic.c diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c new file mode 100644 index 000000000000..57655b988288 --- /dev/null +++ b/arch/powerpc/kvm/mpic.c @@ -0,0 +1,1686 @@ +/* + * OpenPIC emulation + * + * Copyright (c) 2004 Jocelyn Mayer + * 2011 Alexander Graf + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +/* + * + * Based on OpenPic implementations: + * - Intel GW80314 I/O companion chip developer's manual + * - Motorola MPC8245 & MPC8540 user manuals. + * - Motorola MCP750 (aka Raven) programmer manual. + * - Motorola Harrier programmer manuel + * + * Serial interrupts, as implemented in Raven chipset are not supported yet. + * + */ +#include "hw.h" +#include "ppc/mac.h" +#include "pci/pci.h" +#include "openpic.h" +#include "sysbus.h" +#include "pci/msi.h" +#include "qemu/bitops.h" +#include "ppc.h" + +//#define DEBUG_OPENPIC + +#ifdef DEBUG_OPENPIC +static const int debug_openpic = 1; +#else +static const int debug_openpic = 0; +#endif + +#define DPRINTF(fmt, ...) do { \ + if (debug_openpic) { \ + printf(fmt , ## __VA_ARGS__); \ + } \ + } while (0) + +#define MAX_CPU 32 +#define MAX_SRC 256 +#define MAX_TMR 4 +#define MAX_IPI 4 +#define MAX_MSI 8 +#define MAX_IRQ (MAX_SRC + MAX_IPI + MAX_TMR) +#define VID 0x03 /* MPIC version ID */ + +/* OpenPIC capability flags */ +#define OPENPIC_FLAG_IDR_CRIT (1 << 0) +#define OPENPIC_FLAG_ILR (2 << 0) + +/* OpenPIC address map */ +#define OPENPIC_GLB_REG_START 0x0 +#define OPENPIC_GLB_REG_SIZE 0x10F0 +#define OPENPIC_TMR_REG_START 0x10F0 +#define OPENPIC_TMR_REG_SIZE 0x220 +#define OPENPIC_MSI_REG_START 0x1600 +#define OPENPIC_MSI_REG_SIZE 0x200 +#define OPENPIC_SUMMARY_REG_START 0x3800 +#define OPENPIC_SUMMARY_REG_SIZE 0x800 +#define OPENPIC_SRC_REG_START 0x10000 +#define OPENPIC_SRC_REG_SIZE (MAX_SRC * 0x20) +#define OPENPIC_CPU_REG_START 0x20000 +#define OPENPIC_CPU_REG_SIZE 0x100 + ((MAX_CPU - 1) * 0x1000) + +/* Raven */ +#define RAVEN_MAX_CPU 2 +#define RAVEN_MAX_EXT 48 +#define RAVEN_MAX_IRQ 64 +#define RAVEN_MAX_TMR MAX_TMR +#define RAVEN_MAX_IPI MAX_IPI + +/* Interrupt definitions */ +#define RAVEN_FE_IRQ (RAVEN_MAX_EXT) /* Internal functional IRQ */ +#define RAVEN_ERR_IRQ (RAVEN_MAX_EXT + 1) /* Error IRQ */ +#define RAVEN_TMR_IRQ (RAVEN_MAX_EXT + 2) /* First timer IRQ */ +#define RAVEN_IPI_IRQ (RAVEN_TMR_IRQ + RAVEN_MAX_TMR) /* First IPI IRQ */ +/* First doorbell IRQ */ +#define RAVEN_DBL_IRQ (RAVEN_IPI_IRQ + (RAVEN_MAX_CPU * RAVEN_MAX_IPI)) + +typedef struct FslMpicInfo { + int max_ext; +} FslMpicInfo; + +static FslMpicInfo fsl_mpic_20 = { + .max_ext = 12, +}; + +static FslMpicInfo fsl_mpic_42 = { + .max_ext = 12, +}; + +#define FRR_NIRQ_SHIFT 16 +#define FRR_NCPU_SHIFT 8 +#define FRR_VID_SHIFT 0 + +#define VID_REVISION_1_2 2 +#define VID_REVISION_1_3 3 + +#define VIR_GENERIC 0x00000000 /* Generic Vendor ID */ + +#define GCR_RESET 0x80000000 +#define GCR_MODE_PASS 0x00000000 +#define GCR_MODE_MIXED 0x20000000 +#define GCR_MODE_PROXY 0x60000000 + +#define TBCR_CI 0x80000000 /* count inhibit */ +#define TCCR_TOG 0x80000000 /* toggles when decrement to zero */ + +#define IDR_EP_SHIFT 31 +#define IDR_EP_MASK (1 << IDR_EP_SHIFT) +#define IDR_CI0_SHIFT 30 +#define IDR_CI1_SHIFT 29 +#define IDR_P1_SHIFT 1 +#define IDR_P0_SHIFT 0 + +#define ILR_INTTGT_MASK 0x000000ff +#define ILR_INTTGT_INT 0x00 +#define ILR_INTTGT_CINT 0x01 /* critical */ +#define ILR_INTTGT_MCP 0x02 /* machine check */ + +/* The currently supported INTTGT values happen to be the same as QEMU's + * openpic output codes, but don't depend on this. The output codes + * could change (unlikely, but...) 
or support could be added for + * more INTTGT values. + */ +static const int inttgt_output[][2] = { + {ILR_INTTGT_INT, OPENPIC_OUTPUT_INT}, + {ILR_INTTGT_CINT, OPENPIC_OUTPUT_CINT}, + {ILR_INTTGT_MCP, OPENPIC_OUTPUT_MCK}, +}; + +static int inttgt_to_output(int inttgt) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(inttgt_output); i++) { + if (inttgt_output[i][0] == inttgt) { + return inttgt_output[i][1]; + } + } + + fprintf(stderr, "%s: unsupported inttgt %d\n", __func__, inttgt); + return OPENPIC_OUTPUT_INT; +} + +static int output_to_inttgt(int output) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(inttgt_output); i++) { + if (inttgt_output[i][1] == output) { + return inttgt_output[i][0]; + } + } + + abort(); +} + +#define MSIIR_OFFSET 0x140 +#define MSIIR_SRS_SHIFT 29 +#define MSIIR_SRS_MASK (0x7 << MSIIR_SRS_SHIFT) +#define MSIIR_IBS_SHIFT 24 +#define MSIIR_IBS_MASK (0x1f << MSIIR_IBS_SHIFT) + +static int get_current_cpu(void) +{ + CPUState *cpu_single_cpu; + + if (!cpu_single_env) { + return -1; + } + + cpu_single_cpu = ENV_GET_CPU(cpu_single_env); + return cpu_single_cpu->cpu_index; +} + +static uint32_t openpic_cpu_read_internal(void *opaque, hwaddr addr, int idx); +static void openpic_cpu_write_internal(void *opaque, hwaddr addr, + uint32_t val, int idx); + +typedef enum IRQType { + IRQ_TYPE_NORMAL = 0, + IRQ_TYPE_FSLINT, /* FSL internal interrupt -- level only */ + IRQ_TYPE_FSLSPECIAL, /* FSL timer/IPI interrupt, edge, no polarity */ +} IRQType; + +typedef struct IRQQueue { + /* Round up to the nearest 64 IRQs so that the queue length + * won't change when moving between 32 and 64 bit hosts. + */ + unsigned long queue[BITS_TO_LONGS((MAX_IRQ + 63) & ~63)]; + int next; + int priority; +} IRQQueue; + +typedef struct IRQSource { + uint32_t ivpr; /* IRQ vector/priority register */ + uint32_t idr; /* IRQ destination register */ + uint32_t destmask; /* bitmap of CPU destinations */ + int last_cpu; + int output; /* IRQ level, e.g. 
OPENPIC_OUTPUT_INT */ + int pending; /* TRUE if IRQ is pending */ + IRQType type; + bool level:1; /* level-triggered */ + bool nomask:1; /* critical interrupts ignore mask on some FSL MPICs */ +} IRQSource; + +#define IVPR_MASK_SHIFT 31 +#define IVPR_MASK_MASK (1 << IVPR_MASK_SHIFT) +#define IVPR_ACTIVITY_SHIFT 30 +#define IVPR_ACTIVITY_MASK (1 << IVPR_ACTIVITY_SHIFT) +#define IVPR_MODE_SHIFT 29 +#define IVPR_MODE_MASK (1 << IVPR_MODE_SHIFT) +#define IVPR_POLARITY_SHIFT 23 +#define IVPR_POLARITY_MASK (1 << IVPR_POLARITY_SHIFT) +#define IVPR_SENSE_SHIFT 22 +#define IVPR_SENSE_MASK (1 << IVPR_SENSE_SHIFT) + +#define IVPR_PRIORITY_MASK (0xF << 16) +#define IVPR_PRIORITY(_ivprr_) ((int)(((_ivprr_) & IVPR_PRIORITY_MASK) >> 16)) +#define IVPR_VECTOR(opp, _ivprr_) ((_ivprr_) & (opp)->vector_mask) + +/* IDR[EP/CI] are only for FSL MPIC prior to v4.0 */ +#define IDR_EP 0x80000000 /* external pin */ +#define IDR_CI 0x40000000 /* critical interrupt */ + +typedef struct IRQDest { + int32_t ctpr; /* CPU current task priority */ + IRQQueue raised; + IRQQueue servicing; + qemu_irq *irqs; + + /* Count of IRQ sources asserting on non-INT outputs */ + uint32_t outputs_active[OPENPIC_OUTPUT_NB]; +} IRQDest; + +typedef struct OpenPICState { + SysBusDevice busdev; + MemoryRegion mem; + + /* Behavior control */ + FslMpicInfo *fsl; + uint32_t model; + uint32_t flags; + uint32_t nb_irqs; + uint32_t vid; + uint32_t vir; /* Vendor identification register */ + uint32_t vector_mask; + uint32_t tfrr_reset; + uint32_t ivpr_reset; + uint32_t idr_reset; + uint32_t brr1; + uint32_t mpic_mode_mask; + + /* Sub-regions */ + MemoryRegion sub_io_mem[6]; + + /* Global registers */ + uint32_t frr; /* Feature reporting register */ + uint32_t gcr; /* Global configuration register */ + uint32_t pir; /* Processor initialization register */ + uint32_t spve; /* Spurious vector register */ + uint32_t tfrr; /* Timer frequency reporting register */ + /* Source registers */ + IRQSource src[MAX_IRQ]; + /* Local registers per output pin */ + IRQDest dst[MAX_CPU]; + uint32_t nb_cpus; + /* Timer registers */ + struct { + uint32_t tccr; /* Global timer current count register */ + uint32_t tbcr; /* Global timer base count register */ + } timers[MAX_TMR]; + /* Shared MSI registers */ + struct { + uint32_t msir; /* Shared Message Signaled Interrupt Register */ + } msi[MAX_MSI]; + uint32_t max_irq; + uint32_t irq_ipi0; + uint32_t irq_tim0; + uint32_t irq_msi; +} OpenPICState; + +static inline void IRQ_setbit(IRQQueue * q, int n_IRQ) +{ + set_bit(n_IRQ, q->queue); +} + +static inline void IRQ_resetbit(IRQQueue * q, int n_IRQ) +{ + clear_bit(n_IRQ, q->queue); +} + +static inline int IRQ_testbit(IRQQueue * q, int n_IRQ) +{ + return test_bit(n_IRQ, q->queue); +} + +static void IRQ_check(OpenPICState * opp, IRQQueue * q) +{ + int irq = -1; + int next = -1; + int priority = -1; + + for (;;) { + irq = find_next_bit(q->queue, opp->max_irq, irq + 1); + if (irq == opp->max_irq) { + break; + } + + DPRINTF("IRQ_check: irq %d set ivpr_pr=%d pr=%d\n", + irq, IVPR_PRIORITY(opp->src[irq].ivpr), priority); + + if (IVPR_PRIORITY(opp->src[irq].ivpr) > priority) { + next = irq; + priority = IVPR_PRIORITY(opp->src[irq].ivpr); + } + } + + q->next = next; + q->priority = priority; +} + +static int IRQ_get_next(OpenPICState * opp, IRQQueue * q) +{ + /* XXX: optimize */ + IRQ_check(opp, q); + + return q->next; +} + +static void IRQ_local_pipe(OpenPICState * opp, int n_CPU, int n_IRQ, + bool active, bool was_active) +{ + IRQDest *dst; + IRQSource *src; + int priority; + + 
dst = &opp->dst[n_CPU]; + src = &opp->src[n_IRQ]; + + DPRINTF("%s: IRQ %d active %d was %d\n", + __func__, n_IRQ, active, was_active); + + if (src->output != OPENPIC_OUTPUT_INT) { + DPRINTF("%s: output %d irq %d active %d was %d count %d\n", + __func__, src->output, n_IRQ, active, was_active, + dst->outputs_active[src->output]); + + /* On Freescale MPIC, critical interrupts ignore priority, + * IACK, EOI, etc. Before MPIC v4.1 they also ignore + * masking. + */ + if (active) { + if (!was_active + && dst->outputs_active[src->output]++ == 0) { + DPRINTF + ("%s: Raise OpenPIC output %d cpu %d irq %d\n", + __func__, src->output, n_CPU, n_IRQ); + qemu_irq_raise(dst->irqs[src->output]); + } + } else { + if (was_active + && --dst->outputs_active[src->output] == 0) { + DPRINTF + ("%s: Lower OpenPIC output %d cpu %d irq %d\n", + __func__, src->output, n_CPU, n_IRQ); + qemu_irq_lower(dst->irqs[src->output]); + } + } + + return; + } + + priority = IVPR_PRIORITY(src->ivpr); + + /* Even if the interrupt doesn't have enough priority, + * it is still raised, in case ctpr is lowered later. + */ + if (active) { + IRQ_setbit(&dst->raised, n_IRQ); + } else { + IRQ_resetbit(&dst->raised, n_IRQ); + } + + IRQ_check(opp, &dst->raised); + + if (active && priority <= dst->ctpr) { + DPRINTF + ("%s: IRQ %d priority %d too low for ctpr %d on CPU %d\n", + __func__, n_IRQ, priority, dst->ctpr, n_CPU); + active = 0; + } + + if (active) { + if (IRQ_get_next(opp, &dst->servicing) >= 0 && + priority <= dst->servicing.priority) { + DPRINTF + ("%s: IRQ %d is hidden by servicing IRQ %d on CPU %d\n", + __func__, n_IRQ, dst->servicing.next, n_CPU); + } else { + DPRINTF + ("%s: Raise OpenPIC INT output cpu %d irq %d/%d\n", + __func__, n_CPU, n_IRQ, dst->raised.next); + qemu_irq_raise(opp->dst[n_CPU]. + irqs[OPENPIC_OUTPUT_INT]); + } + } else { + IRQ_get_next(opp, &dst->servicing); + if (dst->raised.priority > dst->ctpr && + dst->raised.priority > dst->servicing.priority) { + DPRINTF + ("%s: IRQ %d inactive, IRQ %d prio %d above %d/%d, CPU %d\n", + __func__, n_IRQ, dst->raised.next, + dst->raised.priority, dst->ctpr, + dst->servicing.priority, n_CPU); + /* IRQ line stays asserted */ + } else { + DPRINTF + ("%s: IRQ %d inactive, current prio %d/%d, CPU %d\n", + __func__, n_IRQ, dst->ctpr, + dst->servicing.priority, n_CPU); + qemu_irq_lower(opp->dst[n_CPU]. + irqs[OPENPIC_OUTPUT_INT]); + } + } +} + +/* update pic state because registers for n_IRQ have changed value */ +static void openpic_update_irq(OpenPICState * opp, int n_IRQ) +{ + IRQSource *src; + bool active, was_active; + int i; + + src = &opp->src[n_IRQ]; + active = src->pending; + + if ((src->ivpr & IVPR_MASK_MASK) && !src->nomask) { + /* Interrupt source is disabled */ + DPRINTF("%s: IRQ %d is disabled\n", __func__, n_IRQ); + active = false; + } + + was_active = ! !(src->ivpr & IVPR_ACTIVITY_MASK); + + /* + * We don't have a similar check for already-active because + * ctpr may have changed and we need to withdraw the interrupt. 
+ */ + if (!active && !was_active) { + DPRINTF("%s: IRQ %d is already inactive\n", __func__, n_IRQ); + return; + } + + if (active) { + src->ivpr |= IVPR_ACTIVITY_MASK; + } else { + src->ivpr &= ~IVPR_ACTIVITY_MASK; + } + + if (src->destmask == 0) { + /* No target */ + DPRINTF("%s: IRQ %d has no target\n", __func__, n_IRQ); + return; + } + + if (src->destmask == (1 << src->last_cpu)) { + /* Only one CPU is allowed to receive this IRQ */ + IRQ_local_pipe(opp, src->last_cpu, n_IRQ, active, was_active); + } else if (!(src->ivpr & IVPR_MODE_MASK)) { + /* Directed delivery mode */ + for (i = 0; i < opp->nb_cpus; i++) { + if (src->destmask & (1 << i)) { + IRQ_local_pipe(opp, i, n_IRQ, active, + was_active); + } + } + } else { + /* Distributed delivery mode */ + for (i = src->last_cpu + 1; i != src->last_cpu; i++) { + if (i == opp->nb_cpus) { + i = 0; + } + if (src->destmask & (1 << i)) { + IRQ_local_pipe(opp, i, n_IRQ, active, + was_active); + src->last_cpu = i; + break; + } + } + } +} + +static void openpic_set_irq(void *opaque, int n_IRQ, int level) +{ + OpenPICState *opp = opaque; + IRQSource *src; + + if (n_IRQ >= MAX_IRQ) { + fprintf(stderr, "%s: IRQ %d out of range\n", __func__, n_IRQ); + abort(); + } + + src = &opp->src[n_IRQ]; + DPRINTF("openpic: set irq %d = %d ivpr=0x%08x\n", + n_IRQ, level, src->ivpr); + if (src->level) { + /* level-sensitive irq */ + src->pending = level; + openpic_update_irq(opp, n_IRQ); + } else { + /* edge-sensitive irq */ + if (level) { + src->pending = 1; + openpic_update_irq(opp, n_IRQ); + } + + if (src->output != OPENPIC_OUTPUT_INT) { + /* Edge-triggered interrupts shouldn't be used + * with non-INT delivery, but just in case, + * try to make it do something sane rather than + * cause an interrupt storm. This is close to + * what you'd probably see happen in real hardware. + */ + src->pending = 0; + openpic_update_irq(opp, n_IRQ); + } + } +} + +static void openpic_reset(DeviceState * d) +{ + OpenPICState *opp = FROM_SYSBUS(typeof(*opp), SYS_BUS_DEVICE(d)); + int i; + + opp->gcr = GCR_RESET; + /* Initialise controller registers */ + opp->frr = ((opp->nb_irqs - 1) << FRR_NIRQ_SHIFT) | + ((opp->nb_cpus - 1) << FRR_NCPU_SHIFT) | + (opp->vid << FRR_VID_SHIFT); + + opp->pir = 0; + opp->spve = -1 & opp->vector_mask; + opp->tfrr = opp->tfrr_reset; + /* Initialise IRQ sources */ + for (i = 0; i < opp->max_irq; i++) { + opp->src[i].ivpr = opp->ivpr_reset; + opp->src[i].idr = opp->idr_reset; + + switch (opp->src[i].type) { + case IRQ_TYPE_NORMAL: + opp->src[i].level = + ! 
!(opp->ivpr_reset & IVPR_SENSE_MASK); + break; + + case IRQ_TYPE_FSLINT: + opp->src[i].ivpr |= IVPR_POLARITY_MASK; + break; + + case IRQ_TYPE_FSLSPECIAL: + break; + } + } + /* Initialise IRQ destinations */ + for (i = 0; i < MAX_CPU; i++) { + opp->dst[i].ctpr = 15; + memset(&opp->dst[i].raised, 0, sizeof(IRQQueue)); + opp->dst[i].raised.next = -1; + memset(&opp->dst[i].servicing, 0, sizeof(IRQQueue)); + opp->dst[i].servicing.next = -1; + } + /* Initialise timers */ + for (i = 0; i < MAX_TMR; i++) { + opp->timers[i].tccr = 0; + opp->timers[i].tbcr = TBCR_CI; + } + /* Go out of RESET state */ + opp->gcr = 0; +} + +static inline uint32_t read_IRQreg_idr(OpenPICState * opp, int n_IRQ) +{ + return opp->src[n_IRQ].idr; +} + +static inline uint32_t read_IRQreg_ilr(OpenPICState * opp, int n_IRQ) +{ + if (opp->flags & OPENPIC_FLAG_ILR) { + return output_to_inttgt(opp->src[n_IRQ].output); + } + + return 0xffffffff; +} + +static inline uint32_t read_IRQreg_ivpr(OpenPICState * opp, int n_IRQ) +{ + return opp->src[n_IRQ].ivpr; +} + +static inline void write_IRQreg_idr(OpenPICState * opp, int n_IRQ, uint32_t val) +{ + IRQSource *src = &opp->src[n_IRQ]; + uint32_t normal_mask = (1UL << opp->nb_cpus) - 1; + uint32_t crit_mask = 0; + uint32_t mask = normal_mask; + int crit_shift = IDR_EP_SHIFT - opp->nb_cpus; + int i; + + if (opp->flags & OPENPIC_FLAG_IDR_CRIT) { + crit_mask = mask << crit_shift; + mask |= crit_mask | IDR_EP; + } + + src->idr = val & mask; + DPRINTF("Set IDR %d to 0x%08x\n", n_IRQ, src->idr); + + if (opp->flags & OPENPIC_FLAG_IDR_CRIT) { + if (src->idr & crit_mask) { + if (src->idr & normal_mask) { + DPRINTF + ("%s: IRQ configured for multiple output types, using " + "critical\n", __func__); + } + + src->output = OPENPIC_OUTPUT_CINT; + src->nomask = true; + src->destmask = 0; + + for (i = 0; i < opp->nb_cpus; i++) { + int n_ci = IDR_CI0_SHIFT - i; + + if (src->idr & (1UL << n_ci)) { + src->destmask |= 1UL << i; + } + } + } else { + src->output = OPENPIC_OUTPUT_INT; + src->nomask = false; + src->destmask = src->idr & normal_mask; + } + } else { + src->destmask = src->idr; + } +} + +static inline void write_IRQreg_ilr(OpenPICState * opp, int n_IRQ, uint32_t val) +{ + if (opp->flags & OPENPIC_FLAG_ILR) { + IRQSource *src = &opp->src[n_IRQ]; + + src->output = inttgt_to_output(val & ILR_INTTGT_MASK); + DPRINTF("Set ILR %d to 0x%08x, output %d\n", n_IRQ, src->idr, + src->output); + + /* TODO: on MPIC v4.0 only, set nomask for non-INT */ + } +} + +static inline void write_IRQreg_ivpr(OpenPICState * opp, int n_IRQ, + uint32_t val) +{ + uint32_t mask; + + /* NOTE when implementing newer FSL MPIC models: starting with v4.0, + * the polarity bit is read-only on internal interrupts. + */ + mask = IVPR_MASK_MASK | IVPR_PRIORITY_MASK | IVPR_SENSE_MASK | + IVPR_POLARITY_MASK | opp->vector_mask; + + /* ACTIVITY bit is read-only */ + opp->src[n_IRQ].ivpr = + (opp->src[n_IRQ].ivpr & IVPR_ACTIVITY_MASK) | (val & mask); + + /* For FSL internal interrupts, The sense bit is reserved and zero, + * and the interrupt is always level-triggered. Timers and IPIs + * have no sense or polarity bits, and are edge-triggered. + */ + switch (opp->src[n_IRQ].type) { + case IRQ_TYPE_NORMAL: + opp->src[n_IRQ].level = + ! 
!(opp->src[n_IRQ].ivpr & IVPR_SENSE_MASK); + break; + + case IRQ_TYPE_FSLINT: + opp->src[n_IRQ].ivpr &= ~IVPR_SENSE_MASK; + break; + + case IRQ_TYPE_FSLSPECIAL: + opp->src[n_IRQ].ivpr &= ~(IVPR_POLARITY_MASK | IVPR_SENSE_MASK); + break; + } + + openpic_update_irq(opp, n_IRQ); + DPRINTF("Set IVPR %d to 0x%08x -> 0x%08x\n", n_IRQ, val, + opp->src[n_IRQ].ivpr); +} + +static void openpic_gcr_write(OpenPICState * opp, uint64_t val) +{ + bool mpic_proxy = false; + + if (val & GCR_RESET) { + openpic_reset(&opp->busdev.qdev); + return; + } + + opp->gcr &= ~opp->mpic_mode_mask; + opp->gcr |= val & opp->mpic_mode_mask; + + /* Set external proxy mode */ + if ((val & opp->mpic_mode_mask) == GCR_MODE_PROXY) { + mpic_proxy = true; + } + + ppce500_set_mpic_proxy(mpic_proxy); +} + +static void openpic_gbl_write(void *opaque, hwaddr addr, uint64_t val, + unsigned len) +{ + OpenPICState *opp = opaque; + IRQDest *dst; + int idx; + + DPRINTF("%s: addr %#" HWADDR_PRIx " <= %08" PRIx64 "\n", + __func__, addr, val); + if (addr & 0xF) { + return; + } + switch (addr) { + case 0x00: /* Block Revision Register1 (BRR1) is Readonly */ + break; + case 0x40: + case 0x50: + case 0x60: + case 0x70: + case 0x80: + case 0x90: + case 0xA0: + case 0xB0: + openpic_cpu_write_internal(opp, addr, val, get_current_cpu()); + break; + case 0x1000: /* FRR */ + break; + case 0x1020: /* GCR */ + openpic_gcr_write(opp, val); + break; + case 0x1080: /* VIR */ + break; + case 0x1090: /* PIR */ + for (idx = 0; idx < opp->nb_cpus; idx++) { + if ((val & (1 << idx)) && !(opp->pir & (1 << idx))) { + DPRINTF + ("Raise OpenPIC RESET output for CPU %d\n", + idx); + dst = &opp->dst[idx]; + qemu_irq_raise(dst->irqs[OPENPIC_OUTPUT_RESET]); + } else if (!(val & (1 << idx)) + && (opp->pir & (1 << idx))) { + DPRINTF + ("Lower OpenPIC RESET output for CPU %d\n", + idx); + dst = &opp->dst[idx]; + qemu_irq_lower(dst->irqs[OPENPIC_OUTPUT_RESET]); + } + } + opp->pir = val; + break; + case 0x10A0: /* IPI_IVPR */ + case 0x10B0: + case 0x10C0: + case 0x10D0: + { + int idx; + idx = (addr - 0x10A0) >> 4; + write_IRQreg_ivpr(opp, opp->irq_ipi0 + idx, val); + } + break; + case 0x10E0: /* SPVE */ + opp->spve = val & opp->vector_mask; + break; + default: + break; + } +} + +static uint64_t openpic_gbl_read(void *opaque, hwaddr addr, unsigned len) +{ + OpenPICState *opp = opaque; + uint32_t retval; + + DPRINTF("%s: addr %#" HWADDR_PRIx "\n", __func__, addr); + retval = 0xFFFFFFFF; + if (addr & 0xF) { + return retval; + } + switch (addr) { + case 0x1000: /* FRR */ + retval = opp->frr; + break; + case 0x1020: /* GCR */ + retval = opp->gcr; + break; + case 0x1080: /* VIR */ + retval = opp->vir; + break; + case 0x1090: /* PIR */ + retval = 0x00000000; + break; + case 0x00: /* Block Revision Register1 (BRR1) */ + retval = opp->brr1; + break; + case 0x40: + case 0x50: + case 0x60: + case 0x70: + case 0x80: + case 0x90: + case 0xA0: + case 0xB0: + retval = + openpic_cpu_read_internal(opp, addr, get_current_cpu()); + break; + case 0x10A0: /* IPI_IVPR */ + case 0x10B0: + case 0x10C0: + case 0x10D0: + { + int idx; + idx = (addr - 0x10A0) >> 4; + retval = read_IRQreg_ivpr(opp, opp->irq_ipi0 + idx); + } + break; + case 0x10E0: /* SPVE */ + retval = opp->spve; + break; + default: + break; + } + DPRINTF("%s: => 0x%08x\n", __func__, retval); + + return retval; +} + +static void openpic_tmr_write(void *opaque, hwaddr addr, uint64_t val, + unsigned len) +{ + OpenPICState *opp = opaque; + int idx; + + addr += 0x10f0; + + DPRINTF("%s: addr %#" HWADDR_PRIx " <= %08" PRIx64 "\n", + 
__func__, addr, val); + if (addr & 0xF) { + return; + } + + if (addr == 0x10f0) { + /* TFRR */ + opp->tfrr = val; + return; + } + + idx = (addr >> 6) & 0x3; + addr = addr & 0x30; + + switch (addr & 0x30) { + case 0x00: /* TCCR */ + break; + case 0x10: /* TBCR */ + if ((opp->timers[idx].tccr & TCCR_TOG) != 0 && + (val & TBCR_CI) == 0 && + (opp->timers[idx].tbcr & TBCR_CI) != 0) { + opp->timers[idx].tccr &= ~TCCR_TOG; + } + opp->timers[idx].tbcr = val; + break; + case 0x20: /* TVPR */ + write_IRQreg_ivpr(opp, opp->irq_tim0 + idx, val); + break; + case 0x30: /* TDR */ + write_IRQreg_idr(opp, opp->irq_tim0 + idx, val); + break; + } +} + +static uint64_t openpic_tmr_read(void *opaque, hwaddr addr, unsigned len) +{ + OpenPICState *opp = opaque; + uint32_t retval = -1; + int idx; + + DPRINTF("%s: addr %#" HWADDR_PRIx "\n", __func__, addr); + if (addr & 0xF) { + goto out; + } + idx = (addr >> 6) & 0x3; + if (addr == 0x0) { + /* TFRR */ + retval = opp->tfrr; + goto out; + } + switch (addr & 0x30) { + case 0x00: /* TCCR */ + retval = opp->timers[idx].tccr; + break; + case 0x10: /* TBCR */ + retval = opp->timers[idx].tbcr; + break; + case 0x20: /* TIPV */ + retval = read_IRQreg_ivpr(opp, opp->irq_tim0 + idx); + break; + case 0x30: /* TIDE (TIDR) */ + retval = read_IRQreg_idr(opp, opp->irq_tim0 + idx); + break; + } + +out: + DPRINTF("%s: => 0x%08x\n", __func__, retval); + + return retval; +} + +static void openpic_src_write(void *opaque, hwaddr addr, uint64_t val, + unsigned len) +{ + OpenPICState *opp = opaque; + int idx; + + DPRINTF("%s: addr %#" HWADDR_PRIx " <= %08" PRIx64 "\n", + __func__, addr, val); + + addr = addr & 0xffff; + idx = addr >> 5; + + switch (addr & 0x1f) { + case 0x00: + write_IRQreg_ivpr(opp, idx, val); + break; + case 0x10: + write_IRQreg_idr(opp, idx, val); + break; + case 0x18: + write_IRQreg_ilr(opp, idx, val); + break; + } +} + +static uint64_t openpic_src_read(void *opaque, uint64_t addr, unsigned len) +{ + OpenPICState *opp = opaque; + uint32_t retval; + int idx; + + DPRINTF("%s: addr %#" HWADDR_PRIx "\n", __func__, addr); + retval = 0xFFFFFFFF; + + addr = addr & 0xffff; + idx = addr >> 5; + + switch (addr & 0x1f) { + case 0x00: + retval = read_IRQreg_ivpr(opp, idx); + break; + case 0x10: + retval = read_IRQreg_idr(opp, idx); + break; + case 0x18: + retval = read_IRQreg_ilr(opp, idx); + break; + } + + DPRINTF("%s: => 0x%08x\n", __func__, retval); + return retval; +} + +static void openpic_msi_write(void *opaque, hwaddr addr, uint64_t val, + unsigned size) +{ + OpenPICState *opp = opaque; + int idx = opp->irq_msi; + int srs, ibs; + + DPRINTF("%s: addr %#" HWADDR_PRIx " <= 0x%08" PRIx64 "\n", + __func__, addr, val); + if (addr & 0xF) { + return; + } + + switch (addr) { + case MSIIR_OFFSET: + srs = val >> MSIIR_SRS_SHIFT; + idx += srs; + ibs = (val & MSIIR_IBS_MASK) >> MSIIR_IBS_SHIFT; + opp->msi[srs].msir |= 1 << ibs; + openpic_set_irq(opp, idx, 1); + break; + default: + /* most registers are read-only, thus ignored */ + break; + } +} + +static uint64_t openpic_msi_read(void *opaque, hwaddr addr, unsigned size) +{ + OpenPICState *opp = opaque; + uint64_t r = 0; + int i, srs; + + DPRINTF("%s: addr %#" HWADDR_PRIx "\n", __func__, addr); + if (addr & 0xF) { + return -1; + } + + srs = addr >> 4; + + switch (addr) { + case 0x00: + case 0x10: + case 0x20: + case 0x30: + case 0x40: + case 0x50: + case 0x60: + case 0x70: /* MSIRs */ + r = opp->msi[srs].msir; + /* Clear on read */ + opp->msi[srs].msir = 0; + openpic_set_irq(opp, opp->irq_msi + srs, 0); + break; + case 0x120: /* MSISR 
*/ + for (i = 0; i < MAX_MSI; i++) { + r |= (opp->msi[i].msir ? 1 : 0) << i; + } + break; + } + + return r; +} + +static uint64_t openpic_summary_read(void *opaque, hwaddr addr, unsigned size) +{ + uint64_t r = 0; + + DPRINTF("%s: addr %#" HWADDR_PRIx "\n", __func__, addr); + + /* TODO: EISR/EIMR */ + + return r; +} + +static void openpic_summary_write(void *opaque, hwaddr addr, uint64_t val, + unsigned size) +{ + DPRINTF("%s: addr %#" HWADDR_PRIx " <= 0x%08" PRIx64 "\n", + __func__, addr, val); + + /* TODO: EISR/EIMR */ +} + +static void openpic_cpu_write_internal(void *opaque, hwaddr addr, + uint32_t val, int idx) +{ + OpenPICState *opp = opaque; + IRQSource *src; + IRQDest *dst; + int s_IRQ, n_IRQ; + + DPRINTF("%s: cpu %d addr %#" HWADDR_PRIx " <= 0x%08x\n", __func__, idx, + addr, val); + + if (idx < 0) { + return; + } + + if (addr & 0xF) { + return; + } + dst = &opp->dst[idx]; + addr &= 0xFF0; + switch (addr) { + case 0x40: /* IPIDR */ + case 0x50: + case 0x60: + case 0x70: + idx = (addr - 0x40) >> 4; + /* we use IDE as mask which CPUs to deliver the IPI to still. */ + opp->src[opp->irq_ipi0 + idx].destmask |= val; + openpic_set_irq(opp, opp->irq_ipi0 + idx, 1); + openpic_set_irq(opp, opp->irq_ipi0 + idx, 0); + break; + case 0x80: /* CTPR */ + dst->ctpr = val & 0x0000000F; + + DPRINTF("%s: set CPU %d ctpr to %d, raised %d servicing %d\n", + __func__, idx, dst->ctpr, dst->raised.priority, + dst->servicing.priority); + + if (dst->raised.priority <= dst->ctpr) { + DPRINTF + ("%s: Lower OpenPIC INT output cpu %d due to ctpr\n", + __func__, idx); + qemu_irq_lower(dst->irqs[OPENPIC_OUTPUT_INT]); + } else if (dst->raised.priority > dst->servicing.priority) { + DPRINTF("%s: Raise OpenPIC INT output cpu %d irq %d\n", + __func__, idx, dst->raised.next); + qemu_irq_raise(dst->irqs[OPENPIC_OUTPUT_INT]); + } + + break; + case 0x90: /* WHOAMI */ + /* Read-only register */ + break; + case 0xA0: /* IACK */ + /* Read-only register */ + break; + case 0xB0: /* EOI */ + DPRINTF("EOI\n"); + s_IRQ = IRQ_get_next(opp, &dst->servicing); + + if (s_IRQ < 0) { + DPRINTF("%s: EOI with no interrupt in service\n", + __func__); + break; + } + + IRQ_resetbit(&dst->servicing, s_IRQ); + /* Set up next servicing IRQ */ + s_IRQ = IRQ_get_next(opp, &dst->servicing); + /* Check queued interrupts. 
*/ + n_IRQ = IRQ_get_next(opp, &dst->raised); + src = &opp->src[n_IRQ]; + if (n_IRQ != -1 && + (s_IRQ == -1 || + IVPR_PRIORITY(src->ivpr) > dst->servicing.priority)) { + DPRINTF("Raise OpenPIC INT output cpu %d irq %d\n", + idx, n_IRQ); + qemu_irq_raise(opp->dst[idx].irqs[OPENPIC_OUTPUT_INT]); + } + break; + default: + break; + } +} + +static void openpic_cpu_write(void *opaque, hwaddr addr, uint64_t val, + unsigned len) +{ + openpic_cpu_write_internal(opaque, addr, val, (addr & 0x1f000) >> 12); +} + +static uint32_t openpic_iack(OpenPICState * opp, IRQDest * dst, int cpu) +{ + IRQSource *src; + int retval, irq; + + DPRINTF("Lower OpenPIC INT output\n"); + qemu_irq_lower(dst->irqs[OPENPIC_OUTPUT_INT]); + + irq = IRQ_get_next(opp, &dst->raised); + DPRINTF("IACK: irq=%d\n", irq); + + if (irq == -1) { + /* No more interrupt pending */ + return opp->spve; + } + + src = &opp->src[irq]; + if (!(src->ivpr & IVPR_ACTIVITY_MASK) || + !(IVPR_PRIORITY(src->ivpr) > dst->ctpr)) { + fprintf(stderr, "%s: bad raised IRQ %d ctpr %d ivpr 0x%08x\n", + __func__, irq, dst->ctpr, src->ivpr); + openpic_update_irq(opp, irq); + retval = opp->spve; + } else { + /* IRQ enter servicing state */ + IRQ_setbit(&dst->servicing, irq); + retval = IVPR_VECTOR(opp, src->ivpr); + } + + if (!src->level) { + /* edge-sensitive IRQ */ + src->ivpr &= ~IVPR_ACTIVITY_MASK; + src->pending = 0; + IRQ_resetbit(&dst->raised, irq); + } + + if ((irq >= opp->irq_ipi0) && (irq < (opp->irq_ipi0 + MAX_IPI))) { + src->destmask &= ~(1 << cpu); + if (src->destmask && !src->level) { + /* trigger on CPUs that didn't know about it yet */ + openpic_set_irq(opp, irq, 1); + openpic_set_irq(opp, irq, 0); + /* if all CPUs knew about it, set active bit again */ + src->ivpr |= IVPR_ACTIVITY_MASK; + } + } + + return retval; +} + +static uint32_t openpic_cpu_read_internal(void *opaque, hwaddr addr, int idx) +{ + OpenPICState *opp = opaque; + IRQDest *dst; + uint32_t retval; + + DPRINTF("%s: cpu %d addr %#" HWADDR_PRIx "\n", __func__, idx, addr); + retval = 0xFFFFFFFF; + + if (idx < 0) { + return retval; + } + + if (addr & 0xF) { + return retval; + } + dst = &opp->dst[idx]; + addr &= 0xFF0; + switch (addr) { + case 0x80: /* CTPR */ + retval = dst->ctpr; + break; + case 0x90: /* WHOAMI */ + retval = idx; + break; + case 0xA0: /* IACK */ + retval = openpic_iack(opp, dst, idx); + break; + case 0xB0: /* EOI */ + retval = 0; + break; + default: + break; + } + DPRINTF("%s: => 0x%08x\n", __func__, retval); + + return retval; +} + +static uint64_t openpic_cpu_read(void *opaque, hwaddr addr, unsigned len) +{ + return openpic_cpu_read_internal(opaque, addr, (addr & 0x1f000) >> 12); +} + +static const MemoryRegionOps openpic_glb_ops_le = { + .write = openpic_gbl_write, + .read = openpic_gbl_read, + .endianness = DEVICE_LITTLE_ENDIAN, + .impl = { + .min_access_size = 4, + .max_access_size = 4, + }, +}; + +static const MemoryRegionOps openpic_glb_ops_be = { + .write = openpic_gbl_write, + .read = openpic_gbl_read, + .endianness = DEVICE_BIG_ENDIAN, + .impl = { + .min_access_size = 4, + .max_access_size = 4, + }, +}; + +static const MemoryRegionOps openpic_tmr_ops_le = { + .write = openpic_tmr_write, + .read = openpic_tmr_read, + .endianness = DEVICE_LITTLE_ENDIAN, + .impl = { + .min_access_size = 4, + .max_access_size = 4, + }, +}; + +static const MemoryRegionOps openpic_tmr_ops_be = { + .write = openpic_tmr_write, + .read = openpic_tmr_read, + .endianness = DEVICE_BIG_ENDIAN, + .impl = { + .min_access_size = 4, + .max_access_size = 4, + }, +}; + +static const 
MemoryRegionOps openpic_cpu_ops_le = {
+	.write = openpic_cpu_write,
+	.read = openpic_cpu_read,
+	.endianness = DEVICE_LITTLE_ENDIAN,
+	.impl = {
+		.min_access_size = 4,
+		.max_access_size = 4,
+	},
+};
+
+static const MemoryRegionOps openpic_cpu_ops_be = {
+	.write = openpic_cpu_write,
+	.read = openpic_cpu_read,
+	.endianness = DEVICE_BIG_ENDIAN,
+	.impl = {
+		.min_access_size = 4,
+		.max_access_size = 4,
+	},
+};
+
+static const MemoryRegionOps openpic_src_ops_le = {
+	.write = openpic_src_write,
+	.read = openpic_src_read,
+	.endianness = DEVICE_LITTLE_ENDIAN,
+	.impl = {
+		.min_access_size = 4,
+		.max_access_size = 4,
+	},
+};
+
+static const MemoryRegionOps openpic_src_ops_be = {
+	.write = openpic_src_write,
+	.read = openpic_src_read,
+	.endianness = DEVICE_BIG_ENDIAN,
+	.impl = {
+		.min_access_size = 4,
+		.max_access_size = 4,
+	},
+};
+
+static const MemoryRegionOps openpic_msi_ops_be = {
+	.read = openpic_msi_read,
+	.write = openpic_msi_write,
+	.endianness = DEVICE_BIG_ENDIAN,
+	.impl = {
+		.min_access_size = 4,
+		.max_access_size = 4,
+	},
+};
+
+static const MemoryRegionOps openpic_summary_ops_be = {
+	.read = openpic_summary_read,
+	.write = openpic_summary_write,
+	.endianness = DEVICE_BIG_ENDIAN,
+	.impl = {
+		.min_access_size = 4,
+		.max_access_size = 4,
+	},
+};
+
+static void openpic_save_IRQ_queue(QEMUFile * f, IRQQueue * q)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(q->queue); i++) {
+		/* Always put the lower half of a 64-bit long first, in case we
+		 * restore on a 32-bit host. The least significant bits correspond
+		 * to lower IRQ numbers in the bitmap.
+		 */
+		qemu_put_be32(f, (uint32_t) q->queue[i]);
+#if LONG_MAX > 0x7FFFFFFF
+		qemu_put_be32(f, (uint32_t) (q->queue[i] >> 32));
+#endif
+	}
+
+	qemu_put_sbe32s(f, &q->next);
+	qemu_put_sbe32s(f, &q->priority);
+}
+
+static void openpic_save(QEMUFile * f, void *opaque)
+{
+	OpenPICState *opp = (OpenPICState *) opaque;
+	unsigned int i;
+
+	qemu_put_be32s(f, &opp->gcr);
+	qemu_put_be32s(f, &opp->vir);
+	qemu_put_be32s(f, &opp->pir);
+	qemu_put_be32s(f, &opp->spve);
+	qemu_put_be32s(f, &opp->tfrr);
+
+	qemu_put_be32s(f, &opp->nb_cpus);
+
+	for (i = 0; i < opp->nb_cpus; i++) {
+		qemu_put_sbe32s(f, &opp->dst[i].ctpr);
+		openpic_save_IRQ_queue(f, &opp->dst[i].raised);
+		openpic_save_IRQ_queue(f, &opp->dst[i].servicing);
+		qemu_put_buffer(f, (uint8_t *) & opp->dst[i].outputs_active,
+				sizeof(opp->dst[i].outputs_active));
+	}
+
+	for (i = 0; i < MAX_TMR; i++) {
+		qemu_put_be32s(f, &opp->timers[i].tccr);
+		qemu_put_be32s(f, &opp->timers[i].tbcr);
+	}
+
+	for (i = 0; i < opp->max_irq; i++) {
+		qemu_put_be32s(f, &opp->src[i].ivpr);
+		qemu_put_be32s(f, &opp->src[i].idr);
+		qemu_put_be32s(f, &opp->src[i].destmask);
+		qemu_put_sbe32s(f, &opp->src[i].last_cpu);
+		qemu_put_sbe32s(f, &opp->src[i].pending);
+	}
+}
+
+static void openpic_load_IRQ_queue(QEMUFile * f, IRQQueue * q)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(q->queue); i++) {
+		unsigned long val;
+
+		val = qemu_get_be32(f);
+#if LONG_MAX > 0x7FFFFFFF
+		val <<= 32;
+		val |= qemu_get_be32(f);
+#endif
+
+		q->queue[i] = val;
+	}
+
+	qemu_get_sbe32s(f, &q->next);
+	qemu_get_sbe32s(f, &q->priority);
+}
+
+static int openpic_load(QEMUFile * f, void *opaque, int version_id)
+{
+	OpenPICState *opp = (OpenPICState *) opaque;
+	unsigned int i;
+
+	if (version_id != 1) {
+		return -EINVAL;
+	}
+
+	qemu_get_be32s(f, &opp->gcr);
+	qemu_get_be32s(f, &opp->vir);
+	qemu_get_be32s(f, &opp->pir);
+	qemu_get_be32s(f, &opp->spve);
+	qemu_get_be32s(f, &opp->tfrr);
+
+
qemu_get_be32s(f, &opp->nb_cpus); + + for (i = 0; i < opp->nb_cpus; i++) { + qemu_get_sbe32s(f, &opp->dst[i].ctpr); + openpic_load_IRQ_queue(f, &opp->dst[i].raised); + openpic_load_IRQ_queue(f, &opp->dst[i].servicing); + qemu_get_buffer(f, (uint8_t *) & opp->dst[i].outputs_active, + sizeof(opp->dst[i].outputs_active)); + } + + for (i = 0; i < MAX_TMR; i++) { + qemu_get_be32s(f, &opp->timers[i].tccr); + qemu_get_be32s(f, &opp->timers[i].tbcr); + } + + for (i = 0; i < opp->max_irq; i++) { + uint32_t val; + + val = qemu_get_be32(f); + write_IRQreg_idr(opp, i, val); + val = qemu_get_be32(f); + write_IRQreg_ivpr(opp, i, val); + + qemu_get_be32s(f, &opp->src[i].ivpr); + qemu_get_be32s(f, &opp->src[i].idr); + qemu_get_be32s(f, &opp->src[i].destmask); + qemu_get_sbe32s(f, &opp->src[i].last_cpu); + qemu_get_sbe32s(f, &opp->src[i].pending); + } + + return 0; +} + +typedef struct MemReg { + const char *name; + MemoryRegionOps const *ops; + hwaddr start_addr; + ram_addr_t size; +} MemReg; + +static void fsl_common_init(OpenPICState * opp) +{ + int i; + int virq = MAX_SRC; + + opp->vid = VID_REVISION_1_2; + opp->vir = VIR_GENERIC; + opp->vector_mask = 0xFFFF; + opp->tfrr_reset = 0; + opp->ivpr_reset = IVPR_MASK_MASK; + opp->idr_reset = 1 << 0; + opp->max_irq = MAX_IRQ; + + opp->irq_ipi0 = virq; + virq += MAX_IPI; + opp->irq_tim0 = virq; + virq += MAX_TMR; + + assert(virq <= MAX_IRQ); + + opp->irq_msi = 224; + + msi_supported = true; + for (i = 0; i < opp->fsl->max_ext; i++) { + opp->src[i].level = false; + } + + /* Internal interrupts, including message and MSI */ + for (i = 16; i < MAX_SRC; i++) { + opp->src[i].type = IRQ_TYPE_FSLINT; + opp->src[i].level = true; + } + + /* timers and IPIs */ + for (i = MAX_SRC; i < virq; i++) { + opp->src[i].type = IRQ_TYPE_FSLSPECIAL; + opp->src[i].level = false; + } +} + +static void map_list(OpenPICState * opp, const MemReg * list, int *count) +{ + while (list->name) { + assert(*count < ARRAY_SIZE(opp->sub_io_mem)); + + memory_region_init_io(&opp->sub_io_mem[*count], list->ops, opp, + list->name, list->size); + + memory_region_add_subregion(&opp->mem, list->start_addr, + &opp->sub_io_mem[*count]); + + (*count)++; + list++; + } +} + +static int openpic_init(SysBusDevice * dev) +{ + OpenPICState *opp = FROM_SYSBUS(typeof(*opp), dev); + int i, j; + int list_count = 0; + static const MemReg list_le[] = { + {"glb", &openpic_glb_ops_le, + OPENPIC_GLB_REG_START, OPENPIC_GLB_REG_SIZE}, + {"tmr", &openpic_tmr_ops_le, + OPENPIC_TMR_REG_START, OPENPIC_TMR_REG_SIZE}, + {"src", &openpic_src_ops_le, + OPENPIC_SRC_REG_START, OPENPIC_SRC_REG_SIZE}, + {"cpu", &openpic_cpu_ops_le, + OPENPIC_CPU_REG_START, OPENPIC_CPU_REG_SIZE}, + {NULL} + }; + static const MemReg list_be[] = { + {"glb", &openpic_glb_ops_be, + OPENPIC_GLB_REG_START, OPENPIC_GLB_REG_SIZE}, + {"tmr", &openpic_tmr_ops_be, + OPENPIC_TMR_REG_START, OPENPIC_TMR_REG_SIZE}, + {"src", &openpic_src_ops_be, + OPENPIC_SRC_REG_START, OPENPIC_SRC_REG_SIZE}, + {"cpu", &openpic_cpu_ops_be, + OPENPIC_CPU_REG_START, OPENPIC_CPU_REG_SIZE}, + {NULL} + }; + static const MemReg list_fsl[] = { + {"msi", &openpic_msi_ops_be, + OPENPIC_MSI_REG_START, OPENPIC_MSI_REG_SIZE}, + {"summary", &openpic_summary_ops_be, + OPENPIC_SUMMARY_REG_START, OPENPIC_SUMMARY_REG_SIZE}, + {NULL} + }; + + memory_region_init(&opp->mem, "openpic", 0x40000); + + switch (opp->model) { + case OPENPIC_MODEL_FSL_MPIC_20: + default: + opp->fsl = &fsl_mpic_20; + opp->brr1 = 0x00400200; + opp->flags |= OPENPIC_FLAG_IDR_CRIT; + opp->nb_irqs = 80; + opp->mpic_mode_mask = 
GCR_MODE_MIXED;
+
+		fsl_common_init(opp);
+		map_list(opp, list_be, &list_count);
+		map_list(opp, list_fsl, &list_count);
+
+		break;
+
+	case OPENPIC_MODEL_FSL_MPIC_42:
+		opp->fsl = &fsl_mpic_42;
+		opp->brr1 = 0x00400402;
+		opp->flags |= OPENPIC_FLAG_ILR;
+		opp->nb_irqs = 196;
+		opp->mpic_mode_mask = GCR_MODE_PROXY;
+
+		fsl_common_init(opp);
+		map_list(opp, list_be, &list_count);
+		map_list(opp, list_fsl, &list_count);
+
+		break;
+
+	case OPENPIC_MODEL_RAVEN:
+		opp->nb_irqs = RAVEN_MAX_EXT;
+		opp->vid = VID_REVISION_1_3;
+		opp->vir = VIR_GENERIC;
+		opp->vector_mask = 0xFF;
+		opp->tfrr_reset = 4160000;
+		opp->ivpr_reset = IVPR_MASK_MASK | IVPR_MODE_MASK;
+		opp->idr_reset = 0;
+		opp->max_irq = RAVEN_MAX_IRQ;
+		opp->irq_ipi0 = RAVEN_IPI_IRQ;
+		opp->irq_tim0 = RAVEN_TMR_IRQ;
+		opp->brr1 = -1;
+		opp->mpic_mode_mask = GCR_MODE_MIXED;
+
+		/* Only UP supported today */
+		if (opp->nb_cpus != 1) {
+			return -EINVAL;
+		}
+
+		map_list(opp, list_le, &list_count);
+		break;
+	}
+
+	for (i = 0; i < opp->nb_cpus; i++) {
+		opp->dst[i].irqs = g_new(qemu_irq, OPENPIC_OUTPUT_NB);
+		for (j = 0; j < OPENPIC_OUTPUT_NB; j++) {
+			sysbus_init_irq(dev, &opp->dst[i].irqs[j]);
+		}
+	}
+
+	register_savevm(&opp->busdev.qdev, "openpic", 0, 2,
+			openpic_save, openpic_load, opp);
+
+	sysbus_init_mmio(dev, &opp->mem);
+	qdev_init_gpio_in(&dev->qdev, openpic_set_irq, opp->max_irq);
+
+	return 0;
+}
+
+static Property openpic_properties[] = {
+	DEFINE_PROP_UINT32("model", OpenPICState, model,
+			   OPENPIC_MODEL_FSL_MPIC_20),
+	DEFINE_PROP_UINT32("nb_cpus", OpenPICState, nb_cpus, 1),
+	DEFINE_PROP_END_OF_LIST(),
+};
+
+static void openpic_class_init(ObjectClass * klass, void *data)
+{
+	DeviceClass *dc = DEVICE_CLASS(klass);
+	SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);
+
+	k->init = openpic_init;
+	dc->props = openpic_properties;
+	dc->reset = openpic_reset;
+}
+
+static const TypeInfo openpic_info = {
+	.name = "openpic",
+	.parent = TYPE_SYS_BUS_DEVICE,
+	.instance_size = sizeof(OpenPICState),
+	.class_init = openpic_class_init,
+};
+
+static void openpic_register_types(void)
+{
+	type_register_static(&openpic_info);
+}
+
+type_init(openpic_register_types)

From 6dd830a09a245c068aad2d10ff6d35c5d81cf2b6 Mon Sep 17 00:00:00 2001
From: Scott Wood
Date: Fri, 12 Apr 2013 14:08:44 +0000
Subject: [PATCH 158/204] kvm/ppc/mpic: remove some obviously unneeded code

Remove the parts of the code that are obviously QEMU- or Raven-specific
before fixing style issues, so that fewer style issues are left to fix.

Signed-off-by: Scott Wood
Signed-off-by: Alexander Graf
---
 arch/powerpc/kvm/mpic.c | 344 ----------------------------------------
 1 file changed, 344 deletions(-)

diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
index 57655b988288..d6d70a473d7f 100644
--- a/arch/powerpc/kvm/mpic.c
+++ b/arch/powerpc/kvm/mpic.c
@@ -22,39 +22,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-/*
- *
- * Based on OpenPic implementations:
- * - Intel GW80314 I/O companion chip developer's manual
- * - Motorola MPC8245 & MPC8540 user manuals.
- * - Motorola MCP750 (aka Raven) programmer manual.
- * - Motorola Harrier programmer manuel
- *
- * Serial interrupts, as implemented in Raven chipset are not supported yet.
- * - */ -#include "hw.h" -#include "ppc/mac.h" -#include "pci/pci.h" -#include "openpic.h" -#include "sysbus.h" -#include "pci/msi.h" -#include "qemu/bitops.h" -#include "ppc.h" - -//#define DEBUG_OPENPIC - -#ifdef DEBUG_OPENPIC -static const int debug_openpic = 1; -#else -static const int debug_openpic = 0; -#endif - -#define DPRINTF(fmt, ...) do { \ - if (debug_openpic) { \ - printf(fmt , ## __VA_ARGS__); \ - } \ - } while (0) #define MAX_CPU 32 #define MAX_SRC 256 @@ -82,21 +49,6 @@ static const int debug_openpic = 0; #define OPENPIC_CPU_REG_START 0x20000 #define OPENPIC_CPU_REG_SIZE 0x100 + ((MAX_CPU - 1) * 0x1000) -/* Raven */ -#define RAVEN_MAX_CPU 2 -#define RAVEN_MAX_EXT 48 -#define RAVEN_MAX_IRQ 64 -#define RAVEN_MAX_TMR MAX_TMR -#define RAVEN_MAX_IPI MAX_IPI - -/* Interrupt definitions */ -#define RAVEN_FE_IRQ (RAVEN_MAX_EXT) /* Internal functional IRQ */ -#define RAVEN_ERR_IRQ (RAVEN_MAX_EXT + 1) /* Error IRQ */ -#define RAVEN_TMR_IRQ (RAVEN_MAX_EXT + 2) /* First timer IRQ */ -#define RAVEN_IPI_IRQ (RAVEN_TMR_IRQ + RAVEN_MAX_TMR) /* First IPI IRQ */ -/* First doorbell IRQ */ -#define RAVEN_DBL_IRQ (RAVEN_IPI_IRQ + (RAVEN_MAX_CPU * RAVEN_MAX_IPI)) - typedef struct FslMpicInfo { int max_ext; } FslMpicInfo; @@ -138,44 +90,6 @@ static FslMpicInfo fsl_mpic_42 = { #define ILR_INTTGT_CINT 0x01 /* critical */ #define ILR_INTTGT_MCP 0x02 /* machine check */ -/* The currently supported INTTGT values happen to be the same as QEMU's - * openpic output codes, but don't depend on this. The output codes - * could change (unlikely, but...) or support could be added for - * more INTTGT values. - */ -static const int inttgt_output[][2] = { - {ILR_INTTGT_INT, OPENPIC_OUTPUT_INT}, - {ILR_INTTGT_CINT, OPENPIC_OUTPUT_CINT}, - {ILR_INTTGT_MCP, OPENPIC_OUTPUT_MCK}, -}; - -static int inttgt_to_output(int inttgt) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(inttgt_output); i++) { - if (inttgt_output[i][0] == inttgt) { - return inttgt_output[i][1]; - } - } - - fprintf(stderr, "%s: unsupported inttgt %d\n", __func__, inttgt); - return OPENPIC_OUTPUT_INT; -} - -static int output_to_inttgt(int output) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(inttgt_output); i++) { - if (inttgt_output[i][1] == output) { - return inttgt_output[i][0]; - } - } - - abort(); -} - #define MSIIR_OFFSET 0x140 #define MSIIR_SRS_SHIFT 29 #define MSIIR_SRS_MASK (0x7 << MSIIR_SRS_SHIFT) @@ -1265,228 +1179,36 @@ static uint64_t openpic_cpu_read(void *opaque, hwaddr addr, unsigned len) return openpic_cpu_read_internal(opaque, addr, (addr & 0x1f000) >> 12); } -static const MemoryRegionOps openpic_glb_ops_le = { - .write = openpic_gbl_write, - .read = openpic_gbl_read, - .endianness = DEVICE_LITTLE_ENDIAN, - .impl = { - .min_access_size = 4, - .max_access_size = 4, - }, -}; - static const MemoryRegionOps openpic_glb_ops_be = { .write = openpic_gbl_write, .read = openpic_gbl_read, - .endianness = DEVICE_BIG_ENDIAN, - .impl = { - .min_access_size = 4, - .max_access_size = 4, - }, -}; - -static const MemoryRegionOps openpic_tmr_ops_le = { - .write = openpic_tmr_write, - .read = openpic_tmr_read, - .endianness = DEVICE_LITTLE_ENDIAN, - .impl = { - .min_access_size = 4, - .max_access_size = 4, - }, }; static const MemoryRegionOps openpic_tmr_ops_be = { .write = openpic_tmr_write, .read = openpic_tmr_read, - .endianness = DEVICE_BIG_ENDIAN, - .impl = { - .min_access_size = 4, - .max_access_size = 4, - }, -}; - -static const MemoryRegionOps openpic_cpu_ops_le = { - .write = openpic_cpu_write, - .read = openpic_cpu_read, - .endianness = 
DEVICE_LITTLE_ENDIAN,
-	.impl = {
-		.min_access_size = 4,
-		.max_access_size = 4,
-	},
-};
-
 static const MemoryRegionOps openpic_cpu_ops_be = {
 	.write = openpic_cpu_write,
 	.read = openpic_cpu_read,
-	.endianness = DEVICE_BIG_ENDIAN,
-	.impl = {
-		.min_access_size = 4,
-		.max_access_size = 4,
-	},
 };
 
-static const MemoryRegionOps openpic_src_ops_le = {
-	.write = openpic_src_write,
-	.read = openpic_src_read,
-	.endianness = DEVICE_LITTLE_ENDIAN,
-	.impl = {
-		.min_access_size = 4,
-		.max_access_size = 4,
-	},
-};
-
 static const MemoryRegionOps openpic_src_ops_be = {
 	.write = openpic_src_write,
 	.read = openpic_src_read,
-	.endianness = DEVICE_BIG_ENDIAN,
-	.impl = {
-		.min_access_size = 4,
-		.max_access_size = 4,
-	},
 };
 
 static const MemoryRegionOps openpic_msi_ops_be = {
 	.read = openpic_msi_read,
 	.write = openpic_msi_write,
-	.endianness = DEVICE_BIG_ENDIAN,
-	.impl = {
-		.min_access_size = 4,
-		.max_access_size = 4,
-	},
 };
 
 static const MemoryRegionOps openpic_summary_ops_be = {
 	.read = openpic_summary_read,
 	.write = openpic_summary_write,
-	.endianness = DEVICE_BIG_ENDIAN,
-	.impl = {
-		.min_access_size = 4,
-		.max_access_size = 4,
-	},
 };
 
-static void openpic_save_IRQ_queue(QEMUFile * f, IRQQueue * q)
-{
-	unsigned int i;
-
-	for (i = 0; i < ARRAY_SIZE(q->queue); i++) {
-		/* Always put the lower half of a 64-bit long first, in case we
-		 * restore on a 32-bit host. The least significant bits correspond
-		 * to lower IRQ numbers in the bitmap.
-		 */
-		qemu_put_be32(f, (uint32_t) q->queue[i]);
-#if LONG_MAX > 0x7FFFFFFF
-		qemu_put_be32(f, (uint32_t) (q->queue[i] >> 32));
-#endif
-	}
-
-	qemu_put_sbe32s(f, &q->next);
-	qemu_put_sbe32s(f, &q->priority);
-}
-
-static void openpic_save(QEMUFile * f, void *opaque)
-{
-	OpenPICState *opp = (OpenPICState *) opaque;
-	unsigned int i;
-
-	qemu_put_be32s(f, &opp->gcr);
-	qemu_put_be32s(f, &opp->vir);
-	qemu_put_be32s(f, &opp->pir);
-	qemu_put_be32s(f, &opp->spve);
-	qemu_put_be32s(f, &opp->tfrr);
-
-	qemu_put_be32s(f, &opp->nb_cpus);
-
-	for (i = 0; i < opp->nb_cpus; i++) {
-		qemu_put_sbe32s(f, &opp->dst[i].ctpr);
-		openpic_save_IRQ_queue(f, &opp->dst[i].raised);
-		openpic_save_IRQ_queue(f, &opp->dst[i].servicing);
-		qemu_put_buffer(f, (uint8_t *) & opp->dst[i].outputs_active,
-				sizeof(opp->dst[i].outputs_active));
-	}
-
-	for (i = 0; i < MAX_TMR; i++) {
-		qemu_put_be32s(f, &opp->timers[i].tccr);
-		qemu_put_be32s(f, &opp->timers[i].tbcr);
-	}
-
-	for (i = 0; i < opp->max_irq; i++) {
-		qemu_put_be32s(f, &opp->src[i].ivpr);
-		qemu_put_be32s(f, &opp->src[i].idr);
-		qemu_put_be32s(f, &opp->src[i].destmask);
-		qemu_put_sbe32s(f, &opp->src[i].last_cpu);
-		qemu_put_sbe32s(f, &opp->src[i].pending);
-	}
-}
-
-static void openpic_load_IRQ_queue(QEMUFile * f, IRQQueue * q)
-{
-	unsigned int i;
-
-	for (i = 0; i < ARRAY_SIZE(q->queue); i++) {
-		unsigned long val;
-
-		val = qemu_get_be32(f);
-#if LONG_MAX > 0x7FFFFFFF
-		val <<= 32;
-		val |= qemu_get_be32(f);
-#endif
-
-		q->queue[i] = val;
-	}
-
-	qemu_get_sbe32s(f, &q->next);
-	qemu_get_sbe32s(f, &q->priority);
-}
-
-static int openpic_load(QEMUFile * f, void *opaque, int version_id)
-{
-	OpenPICState *opp = (OpenPICState *) opaque;
-	unsigned int i;
-
-	if (version_id != 1) {
-		return -EINVAL;
-	}
-
-	qemu_get_be32s(f, &opp->gcr);
-	qemu_get_be32s(f, &opp->vir);
-	qemu_get_be32s(f, &opp->pir);
-	qemu_get_be32s(f, &opp->spve);
-	qemu_get_be32s(f, &opp->tfrr);
-
-	qemu_get_be32s(f, &opp->nb_cpus);
-
-	for (i = 0; i < opp->nb_cpus; i++) {
-		qemu_get_sbe32s(f, &opp->dst[i].ctpr);
-		openpic_load_IRQ_queue(f, &opp->dst[i].raised);
-		openpic_load_IRQ_queue(f, &opp->dst[i].servicing);
-		qemu_get_buffer(f, (uint8_t *) & opp->dst[i].outputs_active,
-				sizeof(opp->dst[i].outputs_active));
-	}
-
-	for (i = 0; i < MAX_TMR; i++) {
-		qemu_get_be32s(f, &opp->timers[i].tccr);
-		qemu_get_be32s(f, &opp->timers[i].tbcr);
-	}
-
-	for (i = 0; i < opp->max_irq; i++) {
-		uint32_t val;
-
-		val = qemu_get_be32(f);
-		write_IRQreg_idr(opp, i, val);
-		val = qemu_get_be32(f);
-		write_IRQreg_ivpr(opp, i, val);
-
-		qemu_get_be32s(f, &opp->src[i].ivpr);
-		qemu_get_be32s(f, &opp->src[i].idr);
-		qemu_get_be32s(f, &opp->src[i].destmask);
-		qemu_get_sbe32s(f, &opp->src[i].last_cpu);
-		qemu_get_sbe32s(f, &opp->src[i].pending);
-	}
-
-	return 0;
-}
-
 typedef struct MemReg {
 	const char *name;
 	MemoryRegionOps const *ops;
@@ -1614,73 +1336,7 @@ static int openpic_init(SysBusDevice * dev)
 		map_list(opp, list_fsl, &list_count);
 
 		break;
-
-	case OPENPIC_MODEL_RAVEN:
-		opp->nb_irqs = RAVEN_MAX_EXT;
-		opp->vid = VID_REVISION_1_3;
-		opp->vir = VIR_GENERIC;
-		opp->vector_mask = 0xFF;
-		opp->tfrr_reset = 4160000;
-		opp->ivpr_reset = IVPR_MASK_MASK | IVPR_MODE_MASK;
-		opp->idr_reset = 0;
-		opp->max_irq = RAVEN_MAX_IRQ;
-		opp->irq_ipi0 = RAVEN_IPI_IRQ;
-		opp->irq_tim0 = RAVEN_TMR_IRQ;
-		opp->brr1 = -1;
-		opp->mpic_mode_mask = GCR_MODE_MIXED;
-
-		/* Only UP supported today */
-		if (opp->nb_cpus != 1) {
-			return -EINVAL;
-		}
-
-		map_list(opp, list_le, &list_count);
-		break;
 	}
 
-	for (i = 0; i < opp->nb_cpus; i++) {
-		opp->dst[i].irqs = g_new(qemu_irq, OPENPIC_OUTPUT_NB);
-		for (j = 0; j < OPENPIC_OUTPUT_NB; j++) {
-			sysbus_init_irq(dev, &opp->dst[i].irqs[j]);
-		}
-	}
-
-	register_savevm(&opp->busdev.qdev, "openpic", 0, 2,
-			openpic_save, openpic_load, opp);
-
-	sysbus_init_mmio(dev, &opp->mem);
-	qdev_init_gpio_in(&dev->qdev, openpic_set_irq, opp->max_irq);
-
 	return 0;
 }
-
-static Property openpic_properties[] = {
-	DEFINE_PROP_UINT32("model", OpenPICState, model,
-			   OPENPIC_MODEL_FSL_MPIC_20),
-	DEFINE_PROP_UINT32("nb_cpus", OpenPICState, nb_cpus, 1),
-	DEFINE_PROP_END_OF_LIST(),
-};
-
-static void openpic_class_init(ObjectClass * klass, void *data)
-{
-	DeviceClass *dc = DEVICE_CLASS(klass);
-	SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);
-
-	k->init = openpic_init;
-	dc->props = openpic_properties;
-	dc->reset = openpic_reset;
-}
-
-static const TypeInfo openpic_info = {
-	.name = "openpic",
-	.parent = TYPE_SYS_BUS_DEVICE,
-	.instance_size = sizeof(OpenPICState),
-	.class_init = openpic_class_init,
-};
-
-static void openpic_register_types(void)
-{
-	type_register_static(&openpic_info);
-}
-
-type_init(openpic_register_types)

From f0f5c481a91c56f1ee5b3809bf3943115143b1a7 Mon Sep 17 00:00:00 2001
From: Scott Wood
Date: Fri, 12 Apr 2013 14:08:45 +0000
Subject: [PATCH 159/204] kvm/ppc/mpic: adapt to kernel style and environment

Remove braces that Linux style doesn't permit, remove the space after
'*' that Lindent added, keep error/debug strings contiguous, and so on.
Substitute type names, debug prints, and other environment details.
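
One example of the brace removal, taken from the diff below:

  before:
	if (!cpu_single_env) {
		return -1;
	}
  after:
	if (!cpu_single_env)
		return -1;
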
Signed-off-by: Scott Wood Signed-off-by: Alexander Graf --- arch/powerpc/kvm/mpic.c | 445 +++++++++++++++++++--------------------- 1 file changed, 208 insertions(+), 237 deletions(-) diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c index d6d70a473d7f..1df67aed7a91 100644 --- a/arch/powerpc/kvm/mpic.c +++ b/arch/powerpc/kvm/mpic.c @@ -42,22 +42,22 @@ #define OPENPIC_TMR_REG_SIZE 0x220 #define OPENPIC_MSI_REG_START 0x1600 #define OPENPIC_MSI_REG_SIZE 0x200 -#define OPENPIC_SUMMARY_REG_START 0x3800 -#define OPENPIC_SUMMARY_REG_SIZE 0x800 +#define OPENPIC_SUMMARY_REG_START 0x3800 +#define OPENPIC_SUMMARY_REG_SIZE 0x800 #define OPENPIC_SRC_REG_START 0x10000 #define OPENPIC_SRC_REG_SIZE (MAX_SRC * 0x20) #define OPENPIC_CPU_REG_START 0x20000 -#define OPENPIC_CPU_REG_SIZE 0x100 + ((MAX_CPU - 1) * 0x1000) +#define OPENPIC_CPU_REG_SIZE (0x100 + ((MAX_CPU - 1) * 0x1000)) -typedef struct FslMpicInfo { +struct fsl_mpic_info { int max_ext; -} FslMpicInfo; +}; -static FslMpicInfo fsl_mpic_20 = { +static struct fsl_mpic_info fsl_mpic_20 = { .max_ext = 12, }; -static FslMpicInfo fsl_mpic_42 = { +static struct fsl_mpic_info fsl_mpic_42 = { .max_ext = 12, }; @@ -100,44 +100,43 @@ static int get_current_cpu(void) { CPUState *cpu_single_cpu; - if (!cpu_single_env) { + if (!cpu_single_env) return -1; - } cpu_single_cpu = ENV_GET_CPU(cpu_single_env); return cpu_single_cpu->cpu_index; } -static uint32_t openpic_cpu_read_internal(void *opaque, hwaddr addr, int idx); -static void openpic_cpu_write_internal(void *opaque, hwaddr addr, +static uint32_t openpic_cpu_read_internal(void *opaque, gpa_t addr, int idx); +static void openpic_cpu_write_internal(void *opaque, gpa_t addr, uint32_t val, int idx); -typedef enum IRQType { +enum irq_type { IRQ_TYPE_NORMAL = 0, IRQ_TYPE_FSLINT, /* FSL internal interrupt -- level only */ IRQ_TYPE_FSLSPECIAL, /* FSL timer/IPI interrupt, edge, no polarity */ -} IRQType; +}; -typedef struct IRQQueue { +struct irq_queue { /* Round up to the nearest 64 IRQs so that the queue length * won't change when moving between 32 and 64 bit hosts. */ unsigned long queue[BITS_TO_LONGS((MAX_IRQ + 63) & ~63)]; int next; int priority; -} IRQQueue; +}; -typedef struct IRQSource { +struct irq_source { uint32_t ivpr; /* IRQ vector/priority register */ uint32_t idr; /* IRQ destination register */ uint32_t destmask; /* bitmap of CPU destinations */ int last_cpu; int output; /* IRQ level, e.g. 
OPENPIC_OUTPUT_INT */ int pending; /* TRUE if IRQ is pending */ - IRQType type; + enum irq_type type; bool level:1; /* level-triggered */ - bool nomask:1; /* critical interrupts ignore mask on some FSL MPICs */ -} IRQSource; + bool nomask:1; /* critical interrupts ignore mask on some FSL MPICs */ +}; #define IVPR_MASK_SHIFT 31 #define IVPR_MASK_MASK (1 << IVPR_MASK_SHIFT) @@ -158,22 +157,19 @@ typedef struct IRQSource { #define IDR_EP 0x80000000 /* external pin */ #define IDR_CI 0x40000000 /* critical interrupt */ -typedef struct IRQDest { +struct irq_dest { int32_t ctpr; /* CPU current task priority */ - IRQQueue raised; - IRQQueue servicing; + struct irq_queue raised; + struct irq_queue servicing; qemu_irq *irqs; /* Count of IRQ sources asserting on non-INT outputs */ uint32_t outputs_active[OPENPIC_OUTPUT_NB]; -} IRQDest; - -typedef struct OpenPICState { - SysBusDevice busdev; - MemoryRegion mem; +}; +struct openpic { /* Behavior control */ - FslMpicInfo *fsl; + struct fsl_mpic_info *fsl; uint32_t model; uint32_t flags; uint32_t nb_irqs; @@ -186,9 +182,6 @@ typedef struct OpenPICState { uint32_t brr1; uint32_t mpic_mode_mask; - /* Sub-regions */ - MemoryRegion sub_io_mem[6]; - /* Global registers */ uint32_t frr; /* Feature reporting register */ uint32_t gcr; /* Global configuration register */ @@ -196,9 +189,9 @@ typedef struct OpenPICState { uint32_t spve; /* Spurious vector register */ uint32_t tfrr; /* Timer frequency reporting register */ /* Source registers */ - IRQSource src[MAX_IRQ]; + struct irq_source src[MAX_IRQ]; /* Local registers per output pin */ - IRQDest dst[MAX_CPU]; + struct irq_dest dst[MAX_CPU]; uint32_t nb_cpus; /* Timer registers */ struct { @@ -213,24 +206,24 @@ typedef struct OpenPICState { uint32_t irq_ipi0; uint32_t irq_tim0; uint32_t irq_msi; -} OpenPICState; +}; -static inline void IRQ_setbit(IRQQueue * q, int n_IRQ) +static inline void IRQ_setbit(struct irq_queue *q, int n_IRQ) { set_bit(n_IRQ, q->queue); } -static inline void IRQ_resetbit(IRQQueue * q, int n_IRQ) +static inline void IRQ_resetbit(struct irq_queue *q, int n_IRQ) { clear_bit(n_IRQ, q->queue); } -static inline int IRQ_testbit(IRQQueue * q, int n_IRQ) +static inline int IRQ_testbit(struct irq_queue *q, int n_IRQ) { return test_bit(n_IRQ, q->queue); } -static void IRQ_check(OpenPICState * opp, IRQQueue * q) +static void IRQ_check(struct openpic *opp, struct irq_queue *q) { int irq = -1; int next = -1; @@ -238,11 +231,10 @@ static void IRQ_check(OpenPICState * opp, IRQQueue * q) for (;;) { irq = find_next_bit(q->queue, opp->max_irq, irq + 1); - if (irq == opp->max_irq) { + if (irq == opp->max_irq) break; - } - DPRINTF("IRQ_check: irq %d set ivpr_pr=%d pr=%d\n", + pr_debug("IRQ_check: irq %d set ivpr_pr=%d pr=%d\n", irq, IVPR_PRIORITY(opp->src[irq].ivpr), priority); if (IVPR_PRIORITY(opp->src[irq].ivpr) > priority) { @@ -255,7 +247,7 @@ static void IRQ_check(OpenPICState * opp, IRQQueue * q) q->priority = priority; } -static int IRQ_get_next(OpenPICState * opp, IRQQueue * q) +static int IRQ_get_next(struct openpic *opp, struct irq_queue *q) { /* XXX: optimize */ IRQ_check(opp, q); @@ -263,21 +255,21 @@ static int IRQ_get_next(OpenPICState * opp, IRQQueue * q) return q->next; } -static void IRQ_local_pipe(OpenPICState * opp, int n_CPU, int n_IRQ, +static void IRQ_local_pipe(struct openpic *opp, int n_CPU, int n_IRQ, bool active, bool was_active) { - IRQDest *dst; - IRQSource *src; + struct irq_dest *dst; + struct irq_source *src; int priority; dst = &opp->dst[n_CPU]; src = &opp->src[n_IRQ]; - 
DPRINTF("%s: IRQ %d active %d was %d\n", + pr_debug("%s: IRQ %d active %d was %d\n", __func__, n_IRQ, active, was_active); if (src->output != OPENPIC_OUTPUT_INT) { - DPRINTF("%s: output %d irq %d active %d was %d count %d\n", + pr_debug("%s: output %d irq %d active %d was %d count %d\n", __func__, src->output, n_IRQ, active, was_active, dst->outputs_active[src->output]); @@ -286,19 +278,17 @@ static void IRQ_local_pipe(OpenPICState * opp, int n_CPU, int n_IRQ, * masking. */ if (active) { - if (!was_active - && dst->outputs_active[src->output]++ == 0) { - DPRINTF - ("%s: Raise OpenPIC output %d cpu %d irq %d\n", - __func__, src->output, n_CPU, n_IRQ); + if (!was_active && + dst->outputs_active[src->output]++ == 0) { + pr_debug("%s: Raise OpenPIC output %d cpu %d irq %d\n", + __func__, src->output, n_CPU, n_IRQ); qemu_irq_raise(dst->irqs[src->output]); } } else { - if (was_active - && --dst->outputs_active[src->output] == 0) { - DPRINTF - ("%s: Lower OpenPIC output %d cpu %d irq %d\n", - __func__, src->output, n_CPU, n_IRQ); + if (was_active && + --dst->outputs_active[src->output] == 0) { + pr_debug("%s: Lower OpenPIC output %d cpu %d irq %d\n", + __func__, src->output, n_CPU, n_IRQ); qemu_irq_lower(dst->irqs[src->output]); } } @@ -311,31 +301,27 @@ static void IRQ_local_pipe(OpenPICState * opp, int n_CPU, int n_IRQ, /* Even if the interrupt doesn't have enough priority, * it is still raised, in case ctpr is lowered later. */ - if (active) { + if (active) IRQ_setbit(&dst->raised, n_IRQ); - } else { + else IRQ_resetbit(&dst->raised, n_IRQ); - } IRQ_check(opp, &dst->raised); if (active && priority <= dst->ctpr) { - DPRINTF - ("%s: IRQ %d priority %d too low for ctpr %d on CPU %d\n", - __func__, n_IRQ, priority, dst->ctpr, n_CPU); + pr_debug("%s: IRQ %d priority %d too low for ctpr %d on CPU %d\n", + __func__, n_IRQ, priority, dst->ctpr, n_CPU); active = 0; } if (active) { if (IRQ_get_next(opp, &dst->servicing) >= 0 && priority <= dst->servicing.priority) { - DPRINTF - ("%s: IRQ %d is hidden by servicing IRQ %d on CPU %d\n", - __func__, n_IRQ, dst->servicing.next, n_CPU); + pr_debug("%s: IRQ %d is hidden by servicing IRQ %d on CPU %d\n", + __func__, n_IRQ, dst->servicing.next, n_CPU); } else { - DPRINTF - ("%s: Raise OpenPIC INT output cpu %d irq %d/%d\n", - __func__, n_CPU, n_IRQ, dst->raised.next); + pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d/%d\n", + __func__, n_CPU, n_IRQ, dst->raised.next); qemu_irq_raise(opp->dst[n_CPU]. irqs[OPENPIC_OUTPUT_INT]); } @@ -343,17 +329,15 @@ static void IRQ_local_pipe(OpenPICState * opp, int n_CPU, int n_IRQ, IRQ_get_next(opp, &dst->servicing); if (dst->raised.priority > dst->ctpr && dst->raised.priority > dst->servicing.priority) { - DPRINTF - ("%s: IRQ %d inactive, IRQ %d prio %d above %d/%d, CPU %d\n", - __func__, n_IRQ, dst->raised.next, - dst->raised.priority, dst->ctpr, - dst->servicing.priority, n_CPU); + pr_debug("%s: IRQ %d inactive, IRQ %d prio %d above %d/%d, CPU %d\n", + __func__, n_IRQ, dst->raised.next, + dst->raised.priority, dst->ctpr, + dst->servicing.priority, n_CPU); /* IRQ line stays asserted */ } else { - DPRINTF - ("%s: IRQ %d inactive, current prio %d/%d, CPU %d\n", - __func__, n_IRQ, dst->ctpr, - dst->servicing.priority, n_CPU); + pr_debug("%s: IRQ %d inactive, current prio %d/%d, CPU %d\n", + __func__, n_IRQ, dst->ctpr, + dst->servicing.priority, n_CPU); qemu_irq_lower(opp->dst[n_CPU]. 
irqs[OPENPIC_OUTPUT_INT]); } @@ -361,9 +345,9 @@ static void IRQ_local_pipe(OpenPICState * opp, int n_CPU, int n_IRQ, } /* update pic state because registers for n_IRQ have changed value */ -static void openpic_update_irq(OpenPICState * opp, int n_IRQ) +static void openpic_update_irq(struct openpic *opp, int n_IRQ) { - IRQSource *src; + struct irq_source *src; bool active, was_active; int i; @@ -372,30 +356,29 @@ static void openpic_update_irq(OpenPICState * opp, int n_IRQ) if ((src->ivpr & IVPR_MASK_MASK) && !src->nomask) { /* Interrupt source is disabled */ - DPRINTF("%s: IRQ %d is disabled\n", __func__, n_IRQ); + pr_debug("%s: IRQ %d is disabled\n", __func__, n_IRQ); active = false; } - was_active = ! !(src->ivpr & IVPR_ACTIVITY_MASK); + was_active = !!(src->ivpr & IVPR_ACTIVITY_MASK); /* * We don't have a similar check for already-active because * ctpr may have changed and we need to withdraw the interrupt. */ if (!active && !was_active) { - DPRINTF("%s: IRQ %d is already inactive\n", __func__, n_IRQ); + pr_debug("%s: IRQ %d is already inactive\n", __func__, n_IRQ); return; } - if (active) { + if (active) src->ivpr |= IVPR_ACTIVITY_MASK; - } else { + else src->ivpr &= ~IVPR_ACTIVITY_MASK; - } if (src->destmask == 0) { /* No target */ - DPRINTF("%s: IRQ %d has no target\n", __func__, n_IRQ); + pr_debug("%s: IRQ %d has no target\n", __func__, n_IRQ); return; } @@ -413,9 +396,9 @@ static void openpic_update_irq(OpenPICState * opp, int n_IRQ) } else { /* Distributed delivery mode */ for (i = src->last_cpu + 1; i != src->last_cpu; i++) { - if (i == opp->nb_cpus) { + if (i == opp->nb_cpus) i = 0; - } + if (src->destmask & (1 << i)) { IRQ_local_pipe(opp, i, n_IRQ, active, was_active); @@ -428,16 +411,16 @@ static void openpic_update_irq(OpenPICState * opp, int n_IRQ) static void openpic_set_irq(void *opaque, int n_IRQ, int level) { - OpenPICState *opp = opaque; - IRQSource *src; + struct openpic *opp = opaque; + struct irq_source *src; if (n_IRQ >= MAX_IRQ) { - fprintf(stderr, "%s: IRQ %d out of range\n", __func__, n_IRQ); + pr_err("%s: IRQ %d out of range\n", __func__, n_IRQ); abort(); } src = &opp->src[n_IRQ]; - DPRINTF("openpic: set irq %d = %d ivpr=0x%08x\n", + pr_debug("openpic: set irq %d = %d ivpr=0x%08x\n", n_IRQ, level, src->ivpr); if (src->level) { /* level-sensitive irq */ @@ -463,9 +446,9 @@ static void openpic_set_irq(void *opaque, int n_IRQ, int level) } } -static void openpic_reset(DeviceState * d) +static void openpic_reset(DeviceState *d) { - OpenPICState *opp = FROM_SYSBUS(typeof(*opp), SYS_BUS_DEVICE(d)); + struct openpic *opp = FROM_SYSBUS(typeof(*opp), SYS_BUS_DEVICE(d)); int i; opp->gcr = GCR_RESET; @@ -485,7 +468,7 @@ static void openpic_reset(DeviceState * d) switch (opp->src[i].type) { case IRQ_TYPE_NORMAL: opp->src[i].level = - ! 
!(opp->ivpr_reset & IVPR_SENSE_MASK); + !!(opp->ivpr_reset & IVPR_SENSE_MASK); break; case IRQ_TYPE_FSLINT: @@ -499,9 +482,9 @@ static void openpic_reset(DeviceState * d) /* Initialise IRQ destinations */ for (i = 0; i < MAX_CPU; i++) { opp->dst[i].ctpr = 15; - memset(&opp->dst[i].raised, 0, sizeof(IRQQueue)); + memset(&opp->dst[i].raised, 0, sizeof(struct irq_queue)); opp->dst[i].raised.next = -1; - memset(&opp->dst[i].servicing, 0, sizeof(IRQQueue)); + memset(&opp->dst[i].servicing, 0, sizeof(struct irq_queue)); opp->dst[i].servicing.next = -1; } /* Initialise timers */ @@ -513,28 +496,28 @@ static void openpic_reset(DeviceState * d) opp->gcr = 0; } -static inline uint32_t read_IRQreg_idr(OpenPICState * opp, int n_IRQ) +static inline uint32_t read_IRQreg_idr(struct openpic *opp, int n_IRQ) { return opp->src[n_IRQ].idr; } -static inline uint32_t read_IRQreg_ilr(OpenPICState * opp, int n_IRQ) +static inline uint32_t read_IRQreg_ilr(struct openpic *opp, int n_IRQ) { - if (opp->flags & OPENPIC_FLAG_ILR) { + if (opp->flags & OPENPIC_FLAG_ILR) return output_to_inttgt(opp->src[n_IRQ].output); - } return 0xffffffff; } -static inline uint32_t read_IRQreg_ivpr(OpenPICState * opp, int n_IRQ) +static inline uint32_t read_IRQreg_ivpr(struct openpic *opp, int n_IRQ) { return opp->src[n_IRQ].ivpr; } -static inline void write_IRQreg_idr(OpenPICState * opp, int n_IRQ, uint32_t val) +static inline void write_IRQreg_idr(struct openpic *opp, int n_IRQ, + uint32_t val) { - IRQSource *src = &opp->src[n_IRQ]; + struct irq_source *src = &opp->src[n_IRQ]; uint32_t normal_mask = (1UL << opp->nb_cpus) - 1; uint32_t crit_mask = 0; uint32_t mask = normal_mask; @@ -547,14 +530,13 @@ static inline void write_IRQreg_idr(OpenPICState * opp, int n_IRQ, uint32_t val) } src->idr = val & mask; - DPRINTF("Set IDR %d to 0x%08x\n", n_IRQ, src->idr); + pr_debug("Set IDR %d to 0x%08x\n", n_IRQ, src->idr); if (opp->flags & OPENPIC_FLAG_IDR_CRIT) { if (src->idr & crit_mask) { if (src->idr & normal_mask) { - DPRINTF - ("%s: IRQ configured for multiple output types, using " - "critical\n", __func__); + pr_debug("%s: IRQ configured for multiple output types, using critical\n", + __func__); } src->output = OPENPIC_OUTPUT_CINT; @@ -564,9 +546,8 @@ static inline void write_IRQreg_idr(OpenPICState * opp, int n_IRQ, uint32_t val) for (i = 0; i < opp->nb_cpus; i++) { int n_ci = IDR_CI0_SHIFT - i; - if (src->idr & (1UL << n_ci)) { + if (src->idr & (1UL << n_ci)) src->destmask |= 1UL << i; - } } } else { src->output = OPENPIC_OUTPUT_INT; @@ -578,20 +559,21 @@ static inline void write_IRQreg_idr(OpenPICState * opp, int n_IRQ, uint32_t val) } } -static inline void write_IRQreg_ilr(OpenPICState * opp, int n_IRQ, uint32_t val) +static inline void write_IRQreg_ilr(struct openpic *opp, int n_IRQ, + uint32_t val) { if (opp->flags & OPENPIC_FLAG_ILR) { - IRQSource *src = &opp->src[n_IRQ]; + struct irq_source *src = &opp->src[n_IRQ]; src->output = inttgt_to_output(val & ILR_INTTGT_MASK); - DPRINTF("Set ILR %d to 0x%08x, output %d\n", n_IRQ, src->idr, + pr_debug("Set ILR %d to 0x%08x, output %d\n", n_IRQ, src->idr, src->output); /* TODO: on MPIC v4.0 only, set nomask for non-INT */ } } -static inline void write_IRQreg_ivpr(OpenPICState * opp, int n_IRQ, +static inline void write_IRQreg_ivpr(struct openpic *opp, int n_IRQ, uint32_t val) { uint32_t mask; @@ -613,7 +595,7 @@ static inline void write_IRQreg_ivpr(OpenPICState * opp, int n_IRQ, switch (opp->src[n_IRQ].type) { case IRQ_TYPE_NORMAL: opp->src[n_IRQ].level = - ! 
!(opp->src[n_IRQ].ivpr & IVPR_SENSE_MASK); + !!(opp->src[n_IRQ].ivpr & IVPR_SENSE_MASK); break; case IRQ_TYPE_FSLINT: @@ -626,11 +608,11 @@ static inline void write_IRQreg_ivpr(OpenPICState * opp, int n_IRQ, } openpic_update_irq(opp, n_IRQ); - DPRINTF("Set IVPR %d to 0x%08x -> 0x%08x\n", n_IRQ, val, + pr_debug("Set IVPR %d to 0x%08x -> 0x%08x\n", n_IRQ, val, opp->src[n_IRQ].ivpr); } -static void openpic_gcr_write(OpenPICState * opp, uint64_t val) +static void openpic_gcr_write(struct openpic *opp, uint64_t val) { bool mpic_proxy = false; @@ -643,27 +625,26 @@ static void openpic_gcr_write(OpenPICState * opp, uint64_t val) opp->gcr |= val & opp->mpic_mode_mask; /* Set external proxy mode */ - if ((val & opp->mpic_mode_mask) == GCR_MODE_PROXY) { + if ((val & opp->mpic_mode_mask) == GCR_MODE_PROXY) mpic_proxy = true; - } ppce500_set_mpic_proxy(mpic_proxy); } -static void openpic_gbl_write(void *opaque, hwaddr addr, uint64_t val, +static void openpic_gbl_write(void *opaque, gpa_t addr, uint64_t val, unsigned len) { - OpenPICState *opp = opaque; - IRQDest *dst; + struct openpic *opp = opaque; + struct irq_dest *dst; int idx; - DPRINTF("%s: addr %#" HWADDR_PRIx " <= %08" PRIx64 "\n", + pr_debug("%s: addr %#" HWADDR_PRIx " <= %08" PRIx64 "\n", __func__, addr, val); - if (addr & 0xF) { + if (addr & 0xF) return; - } + switch (addr) { - case 0x00: /* Block Revision Register1 (BRR1) is Readonly */ + case 0x00: /* Block Revision Register1 (BRR1) is Readonly */ break; case 0x40: case 0x50: @@ -685,16 +666,14 @@ static void openpic_gbl_write(void *opaque, hwaddr addr, uint64_t val, case 0x1090: /* PIR */ for (idx = 0; idx < opp->nb_cpus; idx++) { if ((val & (1 << idx)) && !(opp->pir & (1 << idx))) { - DPRINTF - ("Raise OpenPIC RESET output for CPU %d\n", - idx); + pr_debug("Raise OpenPIC RESET output for CPU %d\n", + idx); dst = &opp->dst[idx]; qemu_irq_raise(dst->irqs[OPENPIC_OUTPUT_RESET]); - } else if (!(val & (1 << idx)) - && (opp->pir & (1 << idx))) { - DPRINTF - ("Lower OpenPIC RESET output for CPU %d\n", - idx); + } else if (!(val & (1 << idx)) && + (opp->pir & (1 << idx))) { + pr_debug("Lower OpenPIC RESET output for CPU %d\n", + idx); dst = &opp->dst[idx]; qemu_irq_lower(dst->irqs[OPENPIC_OUTPUT_RESET]); } @@ -704,13 +683,12 @@ static void openpic_gbl_write(void *opaque, hwaddr addr, uint64_t val, case 0x10A0: /* IPI_IVPR */ case 0x10B0: case 0x10C0: - case 0x10D0: - { - int idx; - idx = (addr - 0x10A0) >> 4; - write_IRQreg_ivpr(opp, opp->irq_ipi0 + idx, val); - } + case 0x10D0: { + int idx; + idx = (addr - 0x10A0) >> 4; + write_IRQreg_ivpr(opp, opp->irq_ipi0 + idx, val); break; + } case 0x10E0: /* SPVE */ opp->spve = val & opp->vector_mask; break; @@ -719,16 +697,16 @@ static void openpic_gbl_write(void *opaque, hwaddr addr, uint64_t val, } } -static uint64_t openpic_gbl_read(void *opaque, hwaddr addr, unsigned len) +static uint64_t openpic_gbl_read(void *opaque, gpa_t addr, unsigned len) { - OpenPICState *opp = opaque; + struct openpic *opp = opaque; uint32_t retval; - DPRINTF("%s: addr %#" HWADDR_PRIx "\n", __func__, addr); + pr_debug("%s: addr %#" HWADDR_PRIx "\n", __func__, addr); retval = 0xFFFFFFFF; - if (addr & 0xF) { + if (addr & 0xF) return retval; - } + switch (addr) { case 0x1000: /* FRR */ retval = opp->frr; @@ -772,24 +750,23 @@ static uint64_t openpic_gbl_read(void *opaque, hwaddr addr, unsigned len) default: break; } - DPRINTF("%s: => 0x%08x\n", __func__, retval); + pr_debug("%s: => 0x%08x\n", __func__, retval); return retval; } -static void openpic_tmr_write(void *opaque, 
hwaddr addr, uint64_t val, +static void openpic_tmr_write(void *opaque, gpa_t addr, uint64_t val, unsigned len) { - OpenPICState *opp = opaque; + struct openpic *opp = opaque; int idx; addr += 0x10f0; - DPRINTF("%s: addr %#" HWADDR_PRIx " <= %08" PRIx64 "\n", + pr_debug("%s: addr %#" HWADDR_PRIx " <= %08" PRIx64 "\n", __func__, addr, val); - if (addr & 0xF) { + if (addr & 0xF) return; - } if (addr == 0x10f0) { /* TFRR */ @@ -806,9 +783,9 @@ static void openpic_tmr_write(void *opaque, hwaddr addr, uint64_t val, case 0x10: /* TBCR */ if ((opp->timers[idx].tccr & TCCR_TOG) != 0 && (val & TBCR_CI) == 0 && - (opp->timers[idx].tbcr & TBCR_CI) != 0) { + (opp->timers[idx].tbcr & TBCR_CI) != 0) opp->timers[idx].tccr &= ~TCCR_TOG; - } + opp->timers[idx].tbcr = val; break; case 0x20: /* TVPR */ @@ -820,16 +797,16 @@ static void openpic_tmr_write(void *opaque, hwaddr addr, uint64_t val, } } -static uint64_t openpic_tmr_read(void *opaque, hwaddr addr, unsigned len) +static uint64_t openpic_tmr_read(void *opaque, gpa_t addr, unsigned len) { - OpenPICState *opp = opaque; + struct openpic *opp = opaque; uint32_t retval = -1; int idx; - DPRINTF("%s: addr %#" HWADDR_PRIx "\n", __func__, addr); - if (addr & 0xF) { + pr_debug("%s: addr %#" HWADDR_PRIx "\n", __func__, addr); + if (addr & 0xF) goto out; - } + idx = (addr >> 6) & 0x3; if (addr == 0x0) { /* TFRR */ @@ -852,18 +829,18 @@ static uint64_t openpic_tmr_read(void *opaque, hwaddr addr, unsigned len) } out: - DPRINTF("%s: => 0x%08x\n", __func__, retval); + pr_debug("%s: => 0x%08x\n", __func__, retval); return retval; } -static void openpic_src_write(void *opaque, hwaddr addr, uint64_t val, +static void openpic_src_write(void *opaque, gpa_t addr, uint64_t val, unsigned len) { - OpenPICState *opp = opaque; + struct openpic *opp = opaque; int idx; - DPRINTF("%s: addr %#" HWADDR_PRIx " <= %08" PRIx64 "\n", + pr_debug("%s: addr %#" HWADDR_PRIx " <= %08" PRIx64 "\n", __func__, addr, val); addr = addr & 0xffff; @@ -884,11 +861,11 @@ static void openpic_src_write(void *opaque, hwaddr addr, uint64_t val, static uint64_t openpic_src_read(void *opaque, uint64_t addr, unsigned len) { - OpenPICState *opp = opaque; + struct openpic *opp = opaque; uint32_t retval; int idx; - DPRINTF("%s: addr %#" HWADDR_PRIx "\n", __func__, addr); + pr_debug("%s: addr %#" HWADDR_PRIx "\n", __func__, addr); retval = 0xFFFFFFFF; addr = addr & 0xffff; @@ -906,22 +883,21 @@ static uint64_t openpic_src_read(void *opaque, uint64_t addr, unsigned len) break; } - DPRINTF("%s: => 0x%08x\n", __func__, retval); + pr_debug("%s: => 0x%08x\n", __func__, retval); return retval; } -static void openpic_msi_write(void *opaque, hwaddr addr, uint64_t val, +static void openpic_msi_write(void *opaque, gpa_t addr, uint64_t val, unsigned size) { - OpenPICState *opp = opaque; + struct openpic *opp = opaque; int idx = opp->irq_msi; int srs, ibs; - DPRINTF("%s: addr %#" HWADDR_PRIx " <= 0x%08" PRIx64 "\n", + pr_debug("%s: addr %#" HWADDR_PRIx " <= 0x%08" PRIx64 "\n", __func__, addr, val); - if (addr & 0xF) { + if (addr & 0xF) return; - } switch (addr) { case MSIIR_OFFSET: @@ -937,16 +913,15 @@ static void openpic_msi_write(void *opaque, hwaddr addr, uint64_t val, } } -static uint64_t openpic_msi_read(void *opaque, hwaddr addr, unsigned size) +static uint64_t openpic_msi_read(void *opaque, gpa_t addr, unsigned size) { - OpenPICState *opp = opaque; + struct openpic *opp = opaque; uint64_t r = 0; int i, srs; - DPRINTF("%s: addr %#" HWADDR_PRIx "\n", __func__, addr); - if (addr & 0xF) { + pr_debug("%s: addr %#" 
HWADDR_PRIx "\n", __func__, addr); + if (addr & 0xF) return -1; - } srs = addr >> 4; @@ -965,53 +940,51 @@ static uint64_t openpic_msi_read(void *opaque, hwaddr addr, unsigned size) openpic_set_irq(opp, opp->irq_msi + srs, 0); break; case 0x120: /* MSISR */ - for (i = 0; i < MAX_MSI; i++) { + for (i = 0; i < MAX_MSI; i++) r |= (opp->msi[i].msir ? 1 : 0) << i; - } break; } return r; } -static uint64_t openpic_summary_read(void *opaque, hwaddr addr, unsigned size) +static uint64_t openpic_summary_read(void *opaque, gpa_t addr, unsigned size) { uint64_t r = 0; - DPRINTF("%s: addr %#" HWADDR_PRIx "\n", __func__, addr); + pr_debug("%s: addr %#" HWADDR_PRIx "\n", __func__, addr); /* TODO: EISR/EIMR */ return r; } -static void openpic_summary_write(void *opaque, hwaddr addr, uint64_t val, +static void openpic_summary_write(void *opaque, gpa_t addr, uint64_t val, unsigned size) { - DPRINTF("%s: addr %#" HWADDR_PRIx " <= 0x%08" PRIx64 "\n", + pr_debug("%s: addr %#" HWADDR_PRIx " <= 0x%08" PRIx64 "\n", __func__, addr, val); /* TODO: EISR/EIMR */ } -static void openpic_cpu_write_internal(void *opaque, hwaddr addr, +static void openpic_cpu_write_internal(void *opaque, gpa_t addr, uint32_t val, int idx) { - OpenPICState *opp = opaque; - IRQSource *src; - IRQDest *dst; + struct openpic *opp = opaque; + struct irq_source *src; + struct irq_dest *dst; int s_IRQ, n_IRQ; - DPRINTF("%s: cpu %d addr %#" HWADDR_PRIx " <= 0x%08x\n", __func__, idx, + pr_debug("%s: cpu %d addr %#" HWADDR_PRIx " <= 0x%08x\n", __func__, idx, addr, val); - if (idx < 0) { + if (idx < 0) return; - } - if (addr & 0xF) { + if (addr & 0xF) return; - } + dst = &opp->dst[idx]; addr &= 0xFF0; switch (addr) { @@ -1028,17 +1001,16 @@ static void openpic_cpu_write_internal(void *opaque, hwaddr addr, case 0x80: /* CTPR */ dst->ctpr = val & 0x0000000F; - DPRINTF("%s: set CPU %d ctpr to %d, raised %d servicing %d\n", + pr_debug("%s: set CPU %d ctpr to %d, raised %d servicing %d\n", __func__, idx, dst->ctpr, dst->raised.priority, dst->servicing.priority); if (dst->raised.priority <= dst->ctpr) { - DPRINTF - ("%s: Lower OpenPIC INT output cpu %d due to ctpr\n", - __func__, idx); + pr_debug("%s: Lower OpenPIC INT output cpu %d due to ctpr\n", + __func__, idx); qemu_irq_lower(dst->irqs[OPENPIC_OUTPUT_INT]); } else if (dst->raised.priority > dst->servicing.priority) { - DPRINTF("%s: Raise OpenPIC INT output cpu %d irq %d\n", + pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d\n", __func__, idx, dst->raised.next); qemu_irq_raise(dst->irqs[OPENPIC_OUTPUT_INT]); } @@ -1051,11 +1023,11 @@ static void openpic_cpu_write_internal(void *opaque, hwaddr addr, /* Read-only register */ break; case 0xB0: /* EOI */ - DPRINTF("EOI\n"); + pr_debug("EOI\n"); s_IRQ = IRQ_get_next(opp, &dst->servicing); if (s_IRQ < 0) { - DPRINTF("%s: EOI with no interrupt in service\n", + pr_debug("%s: EOI with no interrupt in service\n", __func__); break; } @@ -1069,7 +1041,7 @@ static void openpic_cpu_write_internal(void *opaque, hwaddr addr, if (n_IRQ != -1 && (s_IRQ == -1 || IVPR_PRIORITY(src->ivpr) > dst->servicing.priority)) { - DPRINTF("Raise OpenPIC INT output cpu %d irq %d\n", + pr_debug("Raise OpenPIC INT output cpu %d irq %d\n", idx, n_IRQ); qemu_irq_raise(opp->dst[idx].irqs[OPENPIC_OUTPUT_INT]); } @@ -1079,32 +1051,32 @@ static void openpic_cpu_write_internal(void *opaque, hwaddr addr, } } -static void openpic_cpu_write(void *opaque, hwaddr addr, uint64_t val, +static void openpic_cpu_write(void *opaque, gpa_t addr, uint64_t val, unsigned len) { 
openpic_cpu_write_internal(opaque, addr, val, (addr & 0x1f000) >> 12); } -static uint32_t openpic_iack(OpenPICState * opp, IRQDest * dst, int cpu) +static uint32_t openpic_iack(struct openpic *opp, struct irq_dest *dst, + int cpu) { - IRQSource *src; + struct irq_source *src; int retval, irq; - DPRINTF("Lower OpenPIC INT output\n"); + pr_debug("Lower OpenPIC INT output\n"); qemu_irq_lower(dst->irqs[OPENPIC_OUTPUT_INT]); irq = IRQ_get_next(opp, &dst->raised); - DPRINTF("IACK: irq=%d\n", irq); + pr_debug("IACK: irq=%d\n", irq); - if (irq == -1) { + if (irq == -1) /* No more interrupt pending */ return opp->spve; - } src = &opp->src[irq]; if (!(src->ivpr & IVPR_ACTIVITY_MASK) || !(IVPR_PRIORITY(src->ivpr) > dst->ctpr)) { - fprintf(stderr, "%s: bad raised IRQ %d ctpr %d ivpr 0x%08x\n", + pr_err("%s: bad raised IRQ %d ctpr %d ivpr 0x%08x\n", __func__, irq, dst->ctpr, src->ivpr); openpic_update_irq(opp, irq); retval = opp->spve; @@ -1135,22 +1107,21 @@ static uint32_t openpic_iack(OpenPICState * opp, IRQDest * dst, int cpu) return retval; } -static uint32_t openpic_cpu_read_internal(void *opaque, hwaddr addr, int idx) +static uint32_t openpic_cpu_read_internal(void *opaque, gpa_t addr, int idx) { - OpenPICState *opp = opaque; - IRQDest *dst; + struct openpic *opp = opaque; + struct irq_dest *dst; uint32_t retval; - DPRINTF("%s: cpu %d addr %#" HWADDR_PRIx "\n", __func__, idx, addr); + pr_debug("%s: cpu %d addr %#" HWADDR_PRIx "\n", __func__, idx, addr); retval = 0xFFFFFFFF; - if (idx < 0) { + if (idx < 0) return retval; - } - if (addr & 0xF) { + if (addr & 0xF) return retval; - } + dst = &opp->dst[idx]; addr &= 0xFF0; switch (addr) { @@ -1169,54 +1140,54 @@ static uint32_t openpic_cpu_read_internal(void *opaque, hwaddr addr, int idx) default: break; } - DPRINTF("%s: => 0x%08x\n", __func__, retval); + pr_debug("%s: => 0x%08x\n", __func__, retval); return retval; } -static uint64_t openpic_cpu_read(void *opaque, hwaddr addr, unsigned len) +static uint64_t openpic_cpu_read(void *opaque, gpa_t addr, unsigned len) { return openpic_cpu_read_internal(opaque, addr, (addr & 0x1f000) >> 12); } -static const MemoryRegionOps openpic_glb_ops_be = { +static const struct kvm_io_device_ops openpic_glb_ops_be = { .write = openpic_gbl_write, .read = openpic_gbl_read, }; -static const MemoryRegionOps openpic_tmr_ops_be = { +static const struct kvm_io_device_ops openpic_tmr_ops_be = { .write = openpic_tmr_write, .read = openpic_tmr_read, }; -static const MemoryRegionOps openpic_cpu_ops_be = { +static const struct kvm_io_device_ops openpic_cpu_ops_be = { .write = openpic_cpu_write, .read = openpic_cpu_read, }; -static const MemoryRegionOps openpic_src_ops_be = { +static const struct kvm_io_device_ops openpic_src_ops_be = { .write = openpic_src_write, .read = openpic_src_read, }; -static const MemoryRegionOps openpic_msi_ops_be = { +static const struct kvm_io_device_ops openpic_msi_ops_be = { .read = openpic_msi_read, .write = openpic_msi_write, }; -static const MemoryRegionOps openpic_summary_ops_be = { +static const struct kvm_io_device_ops openpic_summary_ops_be = { .read = openpic_summary_read, .write = openpic_summary_write, }; -typedef struct MemReg { +struct mem_reg { const char *name; - MemoryRegionOps const *ops; - hwaddr start_addr; - ram_addr_t size; -} MemReg; + const struct kvm_io_device_ops *ops; + gpa_t start_addr; + int size; +}; -static void fsl_common_init(OpenPICState * opp) +static void fsl_common_init(struct openpic *opp) { int i; int virq = MAX_SRC; @@ -1239,9 +1210,8 @@ static void 
fsl_common_init(OpenPICState * opp) opp->irq_msi = 224; msi_supported = true; - for (i = 0; i < opp->fsl->max_ext; i++) { + for (i = 0; i < opp->fsl->max_ext; i++) opp->src[i].level = false; - } /* Internal interrupts, including message and MSI */ for (i = 16; i < MAX_SRC; i++) { @@ -1256,7 +1226,8 @@ static void fsl_common_init(OpenPICState * opp) } } -static void map_list(OpenPICState * opp, const MemReg * list, int *count) +static void map_list(struct openpic *opp, const struct mem_reg *list, + int *count) { while (list->name) { assert(*count < ARRAY_SIZE(opp->sub_io_mem)); @@ -1272,12 +1243,12 @@ static void map_list(OpenPICState * opp, const MemReg * list, int *count) } } -static int openpic_init(SysBusDevice * dev) +static int openpic_init(SysBusDevice *dev) { - OpenPICState *opp = FROM_SYSBUS(typeof(*opp), dev); + struct openpic *opp = FROM_SYSBUS(typeof(*opp), dev); int i, j; int list_count = 0; - static const MemReg list_le[] = { + static const struct mem_reg list_le[] = { {"glb", &openpic_glb_ops_le, OPENPIC_GLB_REG_START, OPENPIC_GLB_REG_SIZE}, {"tmr", &openpic_tmr_ops_le, @@ -1288,7 +1259,7 @@ static int openpic_init(SysBusDevice * dev) OPENPIC_CPU_REG_START, OPENPIC_CPU_REG_SIZE}, {NULL} }; - static const MemReg list_be[] = { + static const struct mem_reg list_be[] = { {"glb", &openpic_glb_ops_be, OPENPIC_GLB_REG_START, OPENPIC_GLB_REG_SIZE}, {"tmr", &openpic_tmr_ops_be, @@ -1299,7 +1270,7 @@ static int openpic_init(SysBusDevice * dev) OPENPIC_CPU_REG_START, OPENPIC_CPU_REG_SIZE}, {NULL} }; - static const MemReg list_fsl[] = { + static const struct mem_reg list_fsl[] = { {"msi", &openpic_msi_ops_be, OPENPIC_MSI_REG_START, OPENPIC_MSI_REG_SIZE}, {"summary", &openpic_summary_ops_be, From 5df554ad5b7522ea62b0ff9d5be35183494efc21 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Fri, 12 Apr 2013 14:08:46 +0000 Subject: [PATCH 160/204] kvm/ppc/mpic: in-kernel MPIC emulation Hook the MPIC code up to the KVM interfaces, add locking, etc. Signed-off-by: Scott Wood [agraf: add stub function for kvmppc_mpic_set_epr, non-booke, 64bit] Signed-off-by: Alexander Graf --- Documentation/virtual/kvm/devices/mpic.txt | 37 + arch/powerpc/include/asm/kvm_host.h | 8 +- arch/powerpc/include/asm/kvm_ppc.h | 17 + arch/powerpc/include/uapi/asm/kvm.h | 8 + arch/powerpc/kvm/Kconfig | 9 + arch/powerpc/kvm/Makefile | 2 + arch/powerpc/kvm/booke.c | 8 +- arch/powerpc/kvm/mpic.c | 800 +++++++++++++++------ arch/powerpc/kvm/powerpc.c | 12 +- include/linux/kvm_host.h | 2 + include/uapi/linux/kvm.h | 3 + virt/kvm/kvm_main.c | 6 + 12 files changed, 693 insertions(+), 219 deletions(-) create mode 100644 Documentation/virtual/kvm/devices/mpic.txt diff --git a/Documentation/virtual/kvm/devices/mpic.txt b/Documentation/virtual/kvm/devices/mpic.txt new file mode 100644 index 000000000000..ce98e3264fa8 --- /dev/null +++ b/Documentation/virtual/kvm/devices/mpic.txt @@ -0,0 +1,37 @@ +MPIC interrupt controller +========================= + +Device types supported: + KVM_DEV_TYPE_FSL_MPIC_20 Freescale MPIC v2.0 + KVM_DEV_TYPE_FSL_MPIC_42 Freescale MPIC v4.2 + +Only one MPIC instance, of any type, may be instantiated. The created +MPIC will act as the system interrupt controller, connecting to each +vcpu's interrupt inputs. + +Groups: + KVM_DEV_MPIC_GRP_MISC + Attributes: + KVM_DEV_MPIC_BASE_ADDR (rw, 64-bit) + Base address of the 256 KiB MPIC register space. Must be + naturally aligned. A value of zero disables the mapping. + Reset value is zero. 
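+
+        As an illustrative sketch only (not part of the ABI description),
+        userspace can create the device and set its base address via the
+        device control API. This assumes <linux/kvm.h> and <sys/ioctl.h>
+        are included, vm_fd is an open VM file descriptor, and error
+        checking is omitted:
+
+            struct kvm_create_device cd = {
+                    .type = KVM_DEV_TYPE_FSL_MPIC_20,
+            };
+            __u64 base = 0xe0040000;        /* example base address */
+            struct kvm_device_attr attr = {
+                    .group = KVM_DEV_MPIC_GRP_MISC,
+                    .attr = KVM_DEV_MPIC_BASE_ADDR,
+                    .addr = (__u64)(unsigned long)&base,
+            };
+
+            ioctl(vm_fd, KVM_CREATE_DEVICE, &cd); /* device fd in cd.fd */
+            ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);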
+ + KVM_DEV_MPIC_GRP_REGISTER (rw, 32-bit) + Access an MPIC register, as if the access were made from the guest. + "attr" is the byte offset into the MPIC register space. Accesses + must be 4-byte aligned. + + MSIs may be signaled by using this attribute group to write + to the relevant MSIIR. + + KVM_DEV_MPIC_GRP_IRQ_ACTIVE (rw, 32-bit) + IRQ input line for each standard openpic source. 0 is inactive and 1 + is active, regardless of interrupt sense. + + For edge-triggered interrupts: Writing 1 is considered an activating + edge, and writing 0 is ignored. Reading returns 1 if a previously + signaled edge has not been acknowledged, and 0 otherwise. + + "attr" is the IRQ number. IRQ numbers for standard sources are the + byte offset of the relevant IVPR from EIVPR0, divided by 32. diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 1443768a6588..153c8c2b0f88 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -361,6 +361,11 @@ struct kvmppc_slb { #define KVMPPC_BOOKE_MAX_IAC 4 #define KVMPPC_BOOKE_MAX_DAC 2 +/* KVMPPC_EPR_USER takes precedence over KVMPPC_EPR_KERNEL */ +#define KVMPPC_EPR_NONE 0 /* EPR not supported */ +#define KVMPPC_EPR_USER 1 /* exit to userspace to fill EPR */ +#define KVMPPC_EPR_KERNEL 2 /* in-kernel irqchip */ + struct kvmppc_booke_debug_reg { u32 dbcr0; u32 dbcr1; @@ -526,7 +531,7 @@ struct kvm_vcpu_arch { u8 sane; u8 cpu_type; u8 hcall_needed; - u8 epr_enabled; + u8 epr_flags; /* KVMPPC_EPR_xxx */ u8 epr_needed; u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */ @@ -593,5 +598,6 @@ struct kvm_vcpu_arch { #define KVM_MMIO_REG_FQPR 0x0060 #define __KVM_HAVE_ARCH_WQP +#define __KVM_HAVE_CREATE_DEVICE #endif /* __POWERPC_KVM_HOST_H__ */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index bcc68b1afc66..3810f9c7616c 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -164,6 +164,8 @@ extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu); extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *); +int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq); + /* * Cuts out inst bits with ordering according to spec. * That means the leftmost bit is zero. All given bits are included. 
@@ -245,6 +247,9 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *); void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid); +struct openpic; +void kvmppc_mpic_put(struct openpic *opp); + #ifdef CONFIG_KVM_BOOK3S_64_HV static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) { @@ -270,6 +275,18 @@ static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr) #endif } +#ifdef CONFIG_KVM_MPIC + +void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu); + +#else + +static inline void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu) +{ +} + +#endif /* CONFIG_KVM_MPIC */ + int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, struct kvm_config_tlb *cfg); int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 41d59d8bdfe8..02ad96606860 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -382,6 +382,14 @@ struct kvm_get_htab_header { __u16 n_invalid; }; +/* Device control API: PPC-specific devices */ +#define KVM_DEV_MPIC_GRP_MISC 1 +#define KVM_DEV_MPIC_BASE_ADDR 0 /* 64-bit */ + +#define KVM_DEV_MPIC_GRP_REGISTER 2 /* 32-bit */ +#define KVM_DEV_MPIC_GRP_IRQ_ACTIVE 3 /* 32-bit */ + +/* One-Reg API: PPC-specific registers */ #define KVM_REG_PPC_HIOR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1) #define KVM_REG_PPC_IAC1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x2) #define KVM_REG_PPC_IAC2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3) diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 448952035b6e..f47e95e0b6de 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -151,6 +151,15 @@ config KVM_E500MC If unsure, say N. +config KVM_MPIC + bool "KVM in-kernel MPIC emulation" + depends on KVM + help + Enable support for emulating MPIC devices inside the + host kernel, rather than relying on userspace to emulate. + Currently, support is limited to certain versions of + Freescale's MPIC implementation. + source drivers/vhost/Kconfig endif # VIRTUALIZATION diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index b772eded8c26..4a2277a221bb 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -103,6 +103,8 @@ kvm-book3s_32-objs := \ book3s_32_mmu.o kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs) +kvm-objs-$(CONFIG_KVM_MPIC) += mpic.o + kvm-objs := $(kvm-objs-m) $(kvm-objs-y) obj-$(CONFIG_KVM_440) += kvm.o diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 02756537cd90..4da11ed48c59 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -346,7 +346,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, keep_irq = true; } - if ((priority == BOOKE_IRQPRIO_EXTERNAL) && vcpu->arch.epr_enabled) + if ((priority == BOOKE_IRQPRIO_EXTERNAL) && vcpu->arch.epr_flags) update_epr = true; switch (priority) { @@ -427,8 +427,10 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, set_guest_esr(vcpu, vcpu->arch.queued_esr); if (update_dear == true) set_guest_dear(vcpu, vcpu->arch.queued_dear); - if (update_epr == true) - kvm_make_request(KVM_REQ_EPR_EXIT, vcpu); + if (update_epr == true) { + if (vcpu->arch.epr_flags & KVMPPC_EPR_USER) + kvm_make_request(KVM_REQ_EPR_EXIT, vcpu); + } new_msr &= msr_mask; #if defined(CONFIG_64BIT) diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c index 1df67aed7a91..cb451b91e342 100644 --- a/arch/powerpc/kvm/mpic.c +++ b/arch/powerpc/kvm/mpic.c @@ -23,6 +23,19 @@ * THE SOFTWARE. 
*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "iodev.h" + #define MAX_CPU 32 #define MAX_SRC 256 #define MAX_TMR 4 @@ -36,6 +49,7 @@ #define OPENPIC_FLAG_ILR (2 << 0) /* OpenPIC address map */ +#define OPENPIC_REG_SIZE 0x40000 #define OPENPIC_GLB_REG_START 0x0 #define OPENPIC_GLB_REG_SIZE 0x10F0 #define OPENPIC_TMR_REG_START 0x10F0 @@ -89,6 +103,7 @@ static struct fsl_mpic_info fsl_mpic_42 = { #define ILR_INTTGT_INT 0x00 #define ILR_INTTGT_CINT 0x01 /* critical */ #define ILR_INTTGT_MCP 0x02 /* machine check */ +#define NUM_OUTPUTS 3 #define MSIIR_OFFSET 0x140 #define MSIIR_SRS_SHIFT 29 @@ -98,18 +113,19 @@ static struct fsl_mpic_info fsl_mpic_42 = { static int get_current_cpu(void) { - CPUState *cpu_single_cpu; - - if (!cpu_single_env) - return -1; - - cpu_single_cpu = ENV_GET_CPU(cpu_single_env); - return cpu_single_cpu->cpu_index; +#if defined(CONFIG_KVM) && defined(CONFIG_BOOKE) + struct kvm_vcpu *vcpu = current->thread.kvm_vcpu; + return vcpu ? vcpu->vcpu_id : -1; +#else + /* XXX */ + return -1; +#endif } -static uint32_t openpic_cpu_read_internal(void *opaque, gpa_t addr, int idx); -static void openpic_cpu_write_internal(void *opaque, gpa_t addr, - uint32_t val, int idx); +static int openpic_cpu_write_internal(void *opaque, gpa_t addr, + u32 val, int idx); +static int openpic_cpu_read_internal(void *opaque, gpa_t addr, + u32 *ptr, int idx); enum irq_type { IRQ_TYPE_NORMAL = 0, @@ -131,7 +147,7 @@ struct irq_source { uint32_t idr; /* IRQ destination register */ uint32_t destmask; /* bitmap of CPU destinations */ int last_cpu; - int output; /* IRQ level, e.g. OPENPIC_OUTPUT_INT */ + int output; /* IRQ level, e.g. ILR_INTTGT_INT */ int pending; /* TRUE if IRQ is pending */ enum irq_type type; bool level:1; /* level-triggered */ @@ -158,16 +174,27 @@ struct irq_source { #define IDR_CI 0x40000000 /* critical interrupt */ struct irq_dest { + struct kvm_vcpu *vcpu; + int32_t ctpr; /* CPU current task priority */ struct irq_queue raised; struct irq_queue servicing; - qemu_irq *irqs; /* Count of IRQ sources asserting on non-INT outputs */ - uint32_t outputs_active[OPENPIC_OUTPUT_NB]; + uint32_t outputs_active[NUM_OUTPUTS]; }; struct openpic { + struct kvm *kvm; + struct kvm_device *dev; + struct kvm_io_device mmio; + struct list_head mmio_regions; + atomic_t users; + bool mmio_mapped; + + gpa_t reg_base; + spinlock_t lock; + /* Behavior control */ struct fsl_mpic_info *fsl; uint32_t model; @@ -208,6 +235,47 @@ struct openpic { uint32_t irq_msi; }; + +static void mpic_irq_raise(struct openpic *opp, struct irq_dest *dst, + int output) +{ + struct kvm_interrupt irq = { + .irq = KVM_INTERRUPT_SET_LEVEL, + }; + + if (!dst->vcpu) { + pr_debug("%s: destination cpu %d does not exist\n", + __func__, (int)(dst - &opp->dst[0])); + return; + } + + pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->vcpu_id, + output); + + if (output != ILR_INTTGT_INT) /* TODO */ + return; + + kvm_vcpu_ioctl_interrupt(dst->vcpu, &irq); +} + +static void mpic_irq_lower(struct openpic *opp, struct irq_dest *dst, + int output) +{ + if (!dst->vcpu) { + pr_debug("%s: destination cpu %d does not exist\n", + __func__, (int)(dst - &opp->dst[0])); + return; + } + + pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->vcpu_id, + output); + + if (output != ILR_INTTGT_INT) /* TODO */ + return; + + kvmppc_core_dequeue_external(dst->vcpu); +} + static inline void IRQ_setbit(struct irq_queue *q, int n_IRQ) { set_bit(n_IRQ, q->queue); @@ -268,7 +336,7 @@ 
static void IRQ_local_pipe(struct openpic *opp, int n_CPU, int n_IRQ, pr_debug("%s: IRQ %d active %d was %d\n", __func__, n_IRQ, active, was_active); - if (src->output != OPENPIC_OUTPUT_INT) { + if (src->output != ILR_INTTGT_INT) { pr_debug("%s: output %d irq %d active %d was %d count %d\n", __func__, src->output, n_IRQ, active, was_active, dst->outputs_active[src->output]); @@ -282,14 +350,14 @@ static void IRQ_local_pipe(struct openpic *opp, int n_CPU, int n_IRQ, dst->outputs_active[src->output]++ == 0) { pr_debug("%s: Raise OpenPIC output %d cpu %d irq %d\n", __func__, src->output, n_CPU, n_IRQ); - qemu_irq_raise(dst->irqs[src->output]); + mpic_irq_raise(opp, dst, src->output); } } else { if (was_active && --dst->outputs_active[src->output] == 0) { pr_debug("%s: Lower OpenPIC output %d cpu %d irq %d\n", __func__, src->output, n_CPU, n_IRQ); - qemu_irq_lower(dst->irqs[src->output]); + mpic_irq_lower(opp, dst, src->output); } } @@ -322,8 +390,7 @@ static void IRQ_local_pipe(struct openpic *opp, int n_CPU, int n_IRQ, } else { pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d/%d\n", __func__, n_CPU, n_IRQ, dst->raised.next); - qemu_irq_raise(opp->dst[n_CPU]. - irqs[OPENPIC_OUTPUT_INT]); + mpic_irq_raise(opp, dst, ILR_INTTGT_INT); } } else { IRQ_get_next(opp, &dst->servicing); @@ -338,8 +405,7 @@ static void IRQ_local_pipe(struct openpic *opp, int n_CPU, int n_IRQ, pr_debug("%s: IRQ %d inactive, current prio %d/%d, CPU %d\n", __func__, n_IRQ, dst->ctpr, dst->servicing.priority, n_CPU); - qemu_irq_lower(opp->dst[n_CPU]. - irqs[OPENPIC_OUTPUT_INT]); + mpic_irq_lower(opp, dst, ILR_INTTGT_INT); } } } @@ -415,8 +481,8 @@ static void openpic_set_irq(void *opaque, int n_IRQ, int level) struct irq_source *src; if (n_IRQ >= MAX_IRQ) { - pr_err("%s: IRQ %d out of range\n", __func__, n_IRQ); - abort(); + WARN_ONCE(1, "%s: IRQ %d out of range\n", __func__, n_IRQ); + return; } src = &opp->src[n_IRQ]; @@ -433,7 +499,7 @@ static void openpic_set_irq(void *opaque, int n_IRQ, int level) openpic_update_irq(opp, n_IRQ); } - if (src->output != OPENPIC_OUTPUT_INT) { + if (src->output != ILR_INTTGT_INT) { /* Edge-triggered interrupts shouldn't be used * with non-INT delivery, but just in case, * try to make it do something sane rather than @@ -446,15 +512,13 @@ static void openpic_set_irq(void *opaque, int n_IRQ, int level) } } -static void openpic_reset(DeviceState *d) +static void openpic_reset(struct openpic *opp) { - struct openpic *opp = FROM_SYSBUS(typeof(*opp), SYS_BUS_DEVICE(d)); int i; opp->gcr = GCR_RESET; /* Initialise controller registers */ opp->frr = ((opp->nb_irqs - 1) << FRR_NIRQ_SHIFT) | - ((opp->nb_cpus - 1) << FRR_NCPU_SHIFT) | (opp->vid << FRR_VID_SHIFT); opp->pir = 0; @@ -504,7 +568,7 @@ static inline uint32_t read_IRQreg_idr(struct openpic *opp, int n_IRQ) static inline uint32_t read_IRQreg_ilr(struct openpic *opp, int n_IRQ) { if (opp->flags & OPENPIC_FLAG_ILR) - return output_to_inttgt(opp->src[n_IRQ].output); + return opp->src[n_IRQ].output; return 0xffffffff; } @@ -539,7 +603,7 @@ static inline void write_IRQreg_idr(struct openpic *opp, int n_IRQ, __func__); } - src->output = OPENPIC_OUTPUT_CINT; + src->output = ILR_INTTGT_CINT; src->nomask = true; src->destmask = 0; @@ -550,7 +614,7 @@ static inline void write_IRQreg_idr(struct openpic *opp, int n_IRQ, src->destmask |= 1UL << i; } } else { - src->output = OPENPIC_OUTPUT_INT; + src->output = ILR_INTTGT_INT; src->nomask = false; src->destmask = src->idr & normal_mask; } @@ -565,7 +629,7 @@ static inline void write_IRQreg_ilr(struct 
openpic *opp, int n_IRQ, if (opp->flags & OPENPIC_FLAG_ILR) { struct irq_source *src = &opp->src[n_IRQ]; - src->output = inttgt_to_output(val & ILR_INTTGT_MASK); + src->output = val & ILR_INTTGT_MASK; pr_debug("Set ILR %d to 0x%08x, output %d\n", n_IRQ, src->idr, src->output); @@ -614,34 +678,23 @@ static inline void write_IRQreg_ivpr(struct openpic *opp, int n_IRQ, static void openpic_gcr_write(struct openpic *opp, uint64_t val) { - bool mpic_proxy = false; - if (val & GCR_RESET) { - openpic_reset(&opp->busdev.qdev); + openpic_reset(opp); return; } opp->gcr &= ~opp->mpic_mode_mask; opp->gcr |= val & opp->mpic_mode_mask; - - /* Set external proxy mode */ - if ((val & opp->mpic_mode_mask) == GCR_MODE_PROXY) - mpic_proxy = true; - - ppce500_set_mpic_proxy(mpic_proxy); } -static void openpic_gbl_write(void *opaque, gpa_t addr, uint64_t val, - unsigned len) +static int openpic_gbl_write(void *opaque, gpa_t addr, u32 val) { struct openpic *opp = opaque; - struct irq_dest *dst; - int idx; + int err = 0; - pr_debug("%s: addr %#" HWADDR_PRIx " <= %08" PRIx64 "\n", - __func__, addr, val); + pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val); if (addr & 0xF) - return; + return 0; switch (addr) { case 0x00: /* Block Revision Register1 (BRR1) is Readonly */ @@ -654,7 +707,8 @@ static void openpic_gbl_write(void *opaque, gpa_t addr, uint64_t val, case 0x90: case 0xA0: case 0xB0: - openpic_cpu_write_internal(opp, addr, val, get_current_cpu()); + err = openpic_cpu_write_internal(opp, addr, val, + get_current_cpu()); break; case 0x1000: /* FRR */ break; @@ -664,21 +718,11 @@ static void openpic_gbl_write(void *opaque, gpa_t addr, uint64_t val, case 0x1080: /* VIR */ break; case 0x1090: /* PIR */ - for (idx = 0; idx < opp->nb_cpus; idx++) { - if ((val & (1 << idx)) && !(opp->pir & (1 << idx))) { - pr_debug("Raise OpenPIC RESET output for CPU %d\n", - idx); - dst = &opp->dst[idx]; - qemu_irq_raise(dst->irqs[OPENPIC_OUTPUT_RESET]); - } else if (!(val & (1 << idx)) && - (opp->pir & (1 << idx))) { - pr_debug("Lower OpenPIC RESET output for CPU %d\n", - idx); - dst = &opp->dst[idx]; - qemu_irq_lower(dst->irqs[OPENPIC_OUTPUT_RESET]); - } - } - opp->pir = val; + /* + * This register is used to reset a CPU core -- + * let userspace handle it. 
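+ * Declining the write with -ENXIO below makes kvm_mpic_write() fail, so the access is punted back to userspace as an unhandled MMIO.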
+ */ + err = -ENXIO; break; case 0x10A0: /* IPI_IVPR */ case 0x10B0: @@ -695,21 +739,25 @@ static void openpic_gbl_write(void *opaque, gpa_t addr, uint64_t val, default: break; } + + return err; } -static uint64_t openpic_gbl_read(void *opaque, gpa_t addr, unsigned len) +static int openpic_gbl_read(void *opaque, gpa_t addr, u32 *ptr) { struct openpic *opp = opaque; - uint32_t retval; + u32 retval; + int err = 0; - pr_debug("%s: addr %#" HWADDR_PRIx "\n", __func__, addr); + pr_debug("%s: addr %#llx\n", __func__, addr); retval = 0xFFFFFFFF; if (addr & 0xF) - return retval; + goto out; switch (addr) { case 0x1000: /* FRR */ retval = opp->frr; + retval |= (opp->nb_cpus - 1) << FRR_NCPU_SHIFT; break; case 0x1020: /* GCR */ retval = opp->gcr; @@ -731,8 +779,8 @@ static uint64_t openpic_gbl_read(void *opaque, gpa_t addr, unsigned len) case 0x90: case 0xA0: case 0xB0: - retval = - openpic_cpu_read_internal(opp, addr, get_current_cpu()); + err = openpic_cpu_read_internal(opp, addr, + &retval, get_current_cpu()); break; case 0x10A0: /* IPI_IVPR */ case 0x10B0: @@ -750,28 +798,28 @@ static uint64_t openpic_gbl_read(void *opaque, gpa_t addr, unsigned len) default: break; } - pr_debug("%s: => 0x%08x\n", __func__, retval); - return retval; +out: + pr_debug("%s: => 0x%08x\n", __func__, retval); + *ptr = retval; + return err; } -static void openpic_tmr_write(void *opaque, gpa_t addr, uint64_t val, - unsigned len) +static int openpic_tmr_write(void *opaque, gpa_t addr, u32 val) { struct openpic *opp = opaque; int idx; addr += 0x10f0; - pr_debug("%s: addr %#" HWADDR_PRIx " <= %08" PRIx64 "\n", - __func__, addr, val); + pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val); if (addr & 0xF) - return; + return 0; if (addr == 0x10f0) { /* TFRR */ opp->tfrr = val; - return; + return 0; } idx = (addr >> 6) & 0x3; @@ -795,15 +843,17 @@ static void openpic_tmr_write(void *opaque, gpa_t addr, uint64_t val, write_IRQreg_idr(opp, opp->irq_tim0 + idx, val); break; } + + return 0; } -static uint64_t openpic_tmr_read(void *opaque, gpa_t addr, unsigned len) +static int openpic_tmr_read(void *opaque, gpa_t addr, u32 *ptr) { struct openpic *opp = opaque; uint32_t retval = -1; int idx; - pr_debug("%s: addr %#" HWADDR_PRIx "\n", __func__, addr); + pr_debug("%s: addr %#llx\n", __func__, addr); if (addr & 0xF) goto out; @@ -813,6 +863,7 @@ static uint64_t openpic_tmr_read(void *opaque, gpa_t addr, unsigned len) retval = opp->tfrr; goto out; } + switch (addr & 0x30) { case 0x00: /* TCCR */ retval = opp->timers[idx].tccr; @@ -830,18 +881,16 @@ static uint64_t openpic_tmr_read(void *opaque, gpa_t addr, unsigned len) out: pr_debug("%s: => 0x%08x\n", __func__, retval); - - return retval; + *ptr = retval; + return 0; } -static void openpic_src_write(void *opaque, gpa_t addr, uint64_t val, - unsigned len) +static int openpic_src_write(void *opaque, gpa_t addr, u32 val) { struct openpic *opp = opaque; int idx; - pr_debug("%s: addr %#" HWADDR_PRIx " <= %08" PRIx64 "\n", - __func__, addr, val); + pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val); addr = addr & 0xffff; idx = addr >> 5; @@ -857,15 +906,17 @@ static void openpic_src_write(void *opaque, gpa_t addr, uint64_t val, write_IRQreg_ilr(opp, idx, val); break; } + + return 0; } -static uint64_t openpic_src_read(void *opaque, uint64_t addr, unsigned len) +static int openpic_src_read(void *opaque, gpa_t addr, u32 *ptr) { struct openpic *opp = opaque; uint32_t retval; int idx; - pr_debug("%s: addr %#" HWADDR_PRIx "\n", __func__, addr); + pr_debug("%s: addr %#llx\n", 
__func__, addr); retval = 0xFFFFFFFF; addr = addr & 0xffff; @@ -884,20 +935,19 @@ static uint64_t openpic_src_read(void *opaque, uint64_t addr, unsigned len) } pr_debug("%s: => 0x%08x\n", __func__, retval); - return retval; + *ptr = retval; + return 0; } -static void openpic_msi_write(void *opaque, gpa_t addr, uint64_t val, - unsigned size) +static int openpic_msi_write(void *opaque, gpa_t addr, u32 val) { struct openpic *opp = opaque; int idx = opp->irq_msi; int srs, ibs; - pr_debug("%s: addr %#" HWADDR_PRIx " <= 0x%08" PRIx64 "\n", - __func__, addr, val); + pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val); if (addr & 0xF) - return; + return 0; switch (addr) { case MSIIR_OFFSET: @@ -911,17 +961,19 @@ static void openpic_msi_write(void *opaque, gpa_t addr, uint64_t val, /* most registers are read-only, thus ignored */ break; } + + return 0; } -static uint64_t openpic_msi_read(void *opaque, gpa_t addr, unsigned size) +static int openpic_msi_read(void *opaque, gpa_t addr, u32 *ptr) { struct openpic *opp = opaque; - uint64_t r = 0; + uint32_t r = 0; int i, srs; - pr_debug("%s: addr %#" HWADDR_PRIx "\n", __func__, addr); + pr_debug("%s: addr %#llx\n", __func__, addr); if (addr & 0xF) - return -1; + return -ENXIO; srs = addr >> 4; @@ -945,45 +997,47 @@ static uint64_t openpic_msi_read(void *opaque, gpa_t addr, unsigned size) break; } - return r; + pr_debug("%s: => 0x%08x\n", __func__, r); + *ptr = r; + return 0; } -static uint64_t openpic_summary_read(void *opaque, gpa_t addr, unsigned size) +static int openpic_summary_read(void *opaque, gpa_t addr, u32 *ptr) { - uint64_t r = 0; + uint32_t r = 0; - pr_debug("%s: addr %#" HWADDR_PRIx "\n", __func__, addr); + pr_debug("%s: addr %#llx\n", __func__, addr); /* TODO: EISR/EIMR */ - return r; + *ptr = r; + return 0; } -static void openpic_summary_write(void *opaque, gpa_t addr, uint64_t val, - unsigned size) +static int openpic_summary_write(void *opaque, gpa_t addr, u32 val) { - pr_debug("%s: addr %#" HWADDR_PRIx " <= 0x%08" PRIx64 "\n", - __func__, addr, val); + pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val); /* TODO: EISR/EIMR */ + return 0; } -static void openpic_cpu_write_internal(void *opaque, gpa_t addr, - uint32_t val, int idx) +static int openpic_cpu_write_internal(void *opaque, gpa_t addr, + u32 val, int idx) { struct openpic *opp = opaque; struct irq_source *src; struct irq_dest *dst; int s_IRQ, n_IRQ; - pr_debug("%s: cpu %d addr %#" HWADDR_PRIx " <= 0x%08x\n", __func__, idx, + pr_debug("%s: cpu %d addr %#llx <= 0x%08x\n", __func__, idx, addr, val); if (idx < 0) - return; + return 0; if (addr & 0xF) - return; + return 0; dst = &opp->dst[idx]; addr &= 0xFF0; @@ -1008,11 +1062,11 @@ static void openpic_cpu_write_internal(void *opaque, gpa_t addr, if (dst->raised.priority <= dst->ctpr) { pr_debug("%s: Lower OpenPIC INT output cpu %d due to ctpr\n", __func__, idx); - qemu_irq_lower(dst->irqs[OPENPIC_OUTPUT_INT]); + mpic_irq_lower(opp, dst, ILR_INTTGT_INT); } else if (dst->raised.priority > dst->servicing.priority) { pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d\n", __func__, idx, dst->raised.next); - qemu_irq_raise(dst->irqs[OPENPIC_OUTPUT_INT]); + mpic_irq_raise(opp, dst, ILR_INTTGT_INT); } break; @@ -1043,18 +1097,22 @@ static void openpic_cpu_write_internal(void *opaque, gpa_t addr, IVPR_PRIORITY(src->ivpr) > dst->servicing.priority)) { pr_debug("Raise OpenPIC INT output cpu %d irq %d\n", idx, n_IRQ); - qemu_irq_raise(opp->dst[idx].irqs[OPENPIC_OUTPUT_INT]); + mpic_irq_raise(opp, dst, ILR_INTTGT_INT); } break; 
default: break; } + + return 0; } -static void openpic_cpu_write(void *opaque, gpa_t addr, uint64_t val, - unsigned len) +static int openpic_cpu_write(void *opaque, gpa_t addr, u32 val) { - openpic_cpu_write_internal(opaque, addr, val, (addr & 0x1f000) >> 12); + struct openpic *opp = opaque; + + return openpic_cpu_write_internal(opp, addr, val, + (addr & 0x1f000) >> 12); } static uint32_t openpic_iack(struct openpic *opp, struct irq_dest *dst, @@ -1064,7 +1122,7 @@ static uint32_t openpic_iack(struct openpic *opp, struct irq_dest *dst, int retval, irq; pr_debug("Lower OpenPIC INT output\n"); - qemu_irq_lower(dst->irqs[OPENPIC_OUTPUT_INT]); + mpic_irq_lower(opp, dst, ILR_INTTGT_INT); irq = IRQ_get_next(opp, &dst->raised); pr_debug("IACK: irq=%d\n", irq); @@ -1107,20 +1165,21 @@ static uint32_t openpic_iack(struct openpic *opp, struct irq_dest *dst, return retval; } -static uint32_t openpic_cpu_read_internal(void *opaque, gpa_t addr, int idx) +static int openpic_cpu_read_internal(void *opaque, gpa_t addr, + u32 *ptr, int idx) { struct openpic *opp = opaque; struct irq_dest *dst; uint32_t retval; - pr_debug("%s: cpu %d addr %#" HWADDR_PRIx "\n", __func__, idx, addr); + pr_debug("%s: cpu %d addr %#llx\n", __func__, idx, addr); retval = 0xFFFFFFFF; if (idx < 0) - return retval; + goto out; if (addr & 0xF) - return retval; + goto out; dst = &opp->dst[idx]; addr &= 0xFF0; @@ -1142,56 +1201,77 @@ static uint32_t openpic_cpu_read_internal(void *opaque, gpa_t addr, int idx) } pr_debug("%s: => 0x%08x\n", __func__, retval); - return retval; +out: + *ptr = retval; + return 0; } -static uint64_t openpic_cpu_read(void *opaque, gpa_t addr, unsigned len) +static int openpic_cpu_read(void *opaque, gpa_t addr, u32 *ptr) { - return openpic_cpu_read_internal(opaque, addr, (addr & 0x1f000) >> 12); + struct openpic *opp = opaque; + + return openpic_cpu_read_internal(opp, addr, ptr, + (addr & 0x1f000) >> 12); } -static const struct kvm_io_device_ops openpic_glb_ops_be = { - .write = openpic_gbl_write, - .read = openpic_gbl_read, -}; - -static const struct kvm_io_device_ops openpic_tmr_ops_be = { - .write = openpic_tmr_write, - .read = openpic_tmr_read, -}; - -static const struct kvm_io_device_ops openpic_cpu_ops_be = { - .write = openpic_cpu_write, - .read = openpic_cpu_read, -}; - -static const struct kvm_io_device_ops openpic_src_ops_be = { - .write = openpic_src_write, - .read = openpic_src_read, -}; - -static const struct kvm_io_device_ops openpic_msi_ops_be = { - .read = openpic_msi_read, - .write = openpic_msi_write, -}; - -static const struct kvm_io_device_ops openpic_summary_ops_be = { - .read = openpic_summary_read, - .write = openpic_summary_write, -}; - struct mem_reg { - const char *name; - const struct kvm_io_device_ops *ops; + struct list_head list; + int (*read)(void *opaque, gpa_t addr, u32 *ptr); + int (*write)(void *opaque, gpa_t addr, u32 val); gpa_t start_addr; int size; }; +static struct mem_reg openpic_gbl_mmio = { + .write = openpic_gbl_write, + .read = openpic_gbl_read, + .start_addr = OPENPIC_GLB_REG_START, + .size = OPENPIC_GLB_REG_SIZE, +}; + +static struct mem_reg openpic_tmr_mmio = { + .write = openpic_tmr_write, + .read = openpic_tmr_read, + .start_addr = OPENPIC_TMR_REG_START, + .size = OPENPIC_TMR_REG_SIZE, +}; + +static struct mem_reg openpic_cpu_mmio = { + .write = openpic_cpu_write, + .read = openpic_cpu_read, + .start_addr = OPENPIC_CPU_REG_START, + .size = OPENPIC_CPU_REG_SIZE, +}; + +static struct mem_reg openpic_src_mmio = { + .write = openpic_src_write, + .read = 
openpic_src_read, + .start_addr = OPENPIC_SRC_REG_START, + .size = OPENPIC_SRC_REG_SIZE, +}; + +static struct mem_reg openpic_msi_mmio = { + .read = openpic_msi_read, + .write = openpic_msi_write, + .start_addr = OPENPIC_MSI_REG_START, + .size = OPENPIC_MSI_REG_SIZE, +}; + +static struct mem_reg openpic_summary_mmio = { + .read = openpic_summary_read, + .write = openpic_summary_write, + .start_addr = OPENPIC_SUMMARY_REG_START, + .size = OPENPIC_SUMMARY_REG_SIZE, +}; + static void fsl_common_init(struct openpic *opp) { int i; int virq = MAX_SRC; + list_add(&openpic_msi_mmio.list, &opp->mmio_regions); + list_add(&openpic_summary_mmio.list, &opp->mmio_regions); + opp->vid = VID_REVISION_1_2; opp->vir = VIR_GENERIC; opp->vector_mask = 0xFFFF; @@ -1205,11 +1285,10 @@ static void fsl_common_init(struct openpic *opp) opp->irq_tim0 = virq; virq += MAX_TMR; - assert(virq <= MAX_IRQ); + BUG_ON(virq > MAX_IRQ); opp->irq_msi = 224; - msi_supported = true; for (i = 0; i < opp->fsl->max_ext; i++) opp->src[i].level = false; @@ -1226,63 +1305,352 @@ static void fsl_common_init(struct openpic *opp) } } -static void map_list(struct openpic *opp, const struct mem_reg *list, - int *count) +static int kvm_mpic_read_internal(struct openpic *opp, gpa_t addr, u32 *ptr) { - while (list->name) { - assert(*count < ARRAY_SIZE(opp->sub_io_mem)); + struct list_head *node; - memory_region_init_io(&opp->sub_io_mem[*count], list->ops, opp, - list->name, list->size); + list_for_each(node, &opp->mmio_regions) { + struct mem_reg *mr = list_entry(node, struct mem_reg, list); - memory_region_add_subregion(&opp->mem, list->start_addr, - &opp->sub_io_mem[*count]); + if (mr->start_addr > addr || addr >= mr->start_addr + mr->size) + continue; - (*count)++; - list++; + return mr->read(opp, addr - mr->start_addr, ptr); } + + return -ENXIO; } -static int openpic_init(SysBusDevice *dev) +static int kvm_mpic_write_internal(struct openpic *opp, gpa_t addr, u32 val) { - struct openpic *opp = FROM_SYSBUS(typeof(*opp), dev); - int i, j; - int list_count = 0; - static const struct mem_reg list_le[] = { - {"glb", &openpic_glb_ops_le, - OPENPIC_GLB_REG_START, OPENPIC_GLB_REG_SIZE}, - {"tmr", &openpic_tmr_ops_le, - OPENPIC_TMR_REG_START, OPENPIC_TMR_REG_SIZE}, - {"src", &openpic_src_ops_le, - OPENPIC_SRC_REG_START, OPENPIC_SRC_REG_SIZE}, - {"cpu", &openpic_cpu_ops_le, - OPENPIC_CPU_REG_START, OPENPIC_CPU_REG_SIZE}, - {NULL} - }; - static const struct mem_reg list_be[] = { - {"glb", &openpic_glb_ops_be, - OPENPIC_GLB_REG_START, OPENPIC_GLB_REG_SIZE}, - {"tmr", &openpic_tmr_ops_be, - OPENPIC_TMR_REG_START, OPENPIC_TMR_REG_SIZE}, - {"src", &openpic_src_ops_be, - OPENPIC_SRC_REG_START, OPENPIC_SRC_REG_SIZE}, - {"cpu", &openpic_cpu_ops_be, - OPENPIC_CPU_REG_START, OPENPIC_CPU_REG_SIZE}, - {NULL} - }; - static const struct mem_reg list_fsl[] = { - {"msi", &openpic_msi_ops_be, - OPENPIC_MSI_REG_START, OPENPIC_MSI_REG_SIZE}, - {"summary", &openpic_summary_ops_be, - OPENPIC_SUMMARY_REG_START, OPENPIC_SUMMARY_REG_SIZE}, - {NULL} - }; + struct list_head *node; - memory_region_init(&opp->mem, "openpic", 0x40000); + list_for_each(node, &opp->mmio_regions) { + struct mem_reg *mr = list_entry(node, struct mem_reg, list); + + if (mr->start_addr > addr || addr >= mr->start_addr + mr->size) + continue; + + return mr->write(opp, addr - mr->start_addr, val); + } + + return -ENXIO; +} + +static int kvm_mpic_read(struct kvm_io_device *this, gpa_t addr, + int len, void *ptr) +{ + struct openpic *opp = container_of(this, struct openpic, mmio); + int ret; + union { + 
u32 val; + u8 bytes[4]; + } u; + + if (addr & (len - 1)) { + pr_debug("%s: bad alignment %llx/%d\n", + __func__, addr, len); + return -EINVAL; + } + + spin_lock_irq(&opp->lock); + ret = kvm_mpic_read_internal(opp, addr - opp->reg_base, &u.val); + spin_unlock_irq(&opp->lock); + + /* + * Technically only 32-bit accesses are allowed, but be nice to + * people dumping registers a byte at a time -- it works in real + * hardware (reads only, not writes). + */ + if (len == 4) { + *(u32 *)ptr = u.val; + pr_debug("%s: addr %llx ret %d len 4 val %x\n", + __func__, addr, ret, u.val); + } else if (len == 1) { + *(u8 *)ptr = u.bytes[addr & 3]; + pr_debug("%s: addr %llx ret %d len 1 val %x\n", + __func__, addr, ret, u.bytes[addr & 3]); + } else { + pr_debug("%s: bad length %d\n", __func__, len); + return -EINVAL; + } + + return ret; +} + +static int kvm_mpic_write(struct kvm_io_device *this, gpa_t addr, + int len, const void *ptr) +{ + struct openpic *opp = container_of(this, struct openpic, mmio); + int ret; + + if (len != 4) { + pr_debug("%s: bad length %d\n", __func__, len); + return -EOPNOTSUPP; + } + if (addr & 3) { + pr_debug("%s: bad alignment %llx/%d\n", __func__, addr, len); + return -EOPNOTSUPP; + } + + spin_lock_irq(&opp->lock); + ret = kvm_mpic_write_internal(opp, addr - opp->reg_base, + *(const u32 *)ptr); + spin_unlock_irq(&opp->lock); + + pr_debug("%s: addr %llx ret %d val %x\n", + __func__, addr, ret, *(const u32 *)ptr); + + return ret; +} + +static void kvm_mpic_dtor(struct kvm_io_device *this) +{ + struct openpic *opp = container_of(this, struct openpic, mmio); + + opp->mmio_mapped = false; +} + +static const struct kvm_io_device_ops mpic_mmio_ops = { + .read = kvm_mpic_read, + .write = kvm_mpic_write, + .destructor = kvm_mpic_dtor, +}; + +static void map_mmio(struct openpic *opp) +{ + BUG_ON(opp->mmio_mapped); + opp->mmio_mapped = true; + + kvm_iodevice_init(&opp->mmio, &mpic_mmio_ops); + + kvm_io_bus_register_dev(opp->kvm, KVM_MMIO_BUS, + opp->reg_base, OPENPIC_REG_SIZE, + &opp->mmio); +} + +static void unmap_mmio(struct openpic *opp) +{ + BUG_ON(opp->mmio_mapped); + opp->mmio_mapped = false; + + kvm_io_bus_unregister_dev(opp->kvm, KVM_MMIO_BUS, &opp->mmio); +} + +static int set_base_addr(struct openpic *opp, struct kvm_device_attr *attr) +{ + u64 base; + + if (copy_from_user(&base, (u64 __user *)(long)attr->addr, sizeof(u64))) + return -EFAULT; + + if (base & 0x3ffff) { + pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx not aligned\n", + __func__, base); + return -EINVAL; + } + + if (base == opp->reg_base) + return 0; + + mutex_lock(&opp->kvm->slots_lock); + + unmap_mmio(opp); + opp->reg_base = base; + + pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx\n", + __func__, base); + + if (base == 0) + goto out; + + map_mmio(opp); + +out: + mutex_unlock(&opp->kvm->slots_lock); + return 0; +} + +#define ATTR_SET 0 +#define ATTR_GET 1 + +static int access_reg(struct openpic *opp, gpa_t addr, u32 *val, int type) +{ + int ret; + + if (addr & 3) + return -ENXIO; + + spin_lock_irq(&opp->lock); + + if (type == ATTR_SET) + ret = kvm_mpic_write_internal(opp, addr, *val); + else + ret = kvm_mpic_read_internal(opp, addr, val); + + spin_unlock_irq(&opp->lock); + + pr_debug("%s: type %d addr %llx val %x\n", __func__, type, addr, *val); + + return ret; +} + +static int mpic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + struct openpic *opp = dev->private; + u32 attr32; + + switch (attr->group) { + case KVM_DEV_MPIC_GRP_MISC: + switch (attr->attr) { + case 
KVM_DEV_MPIC_BASE_ADDR: + return set_base_addr(opp, attr); + } + + break; + + case KVM_DEV_MPIC_GRP_REGISTER: + if (get_user(attr32, (u32 __user *)(long)attr->addr)) + return -EFAULT; + + return access_reg(opp, attr->attr, &attr32, ATTR_SET); + + case KVM_DEV_MPIC_GRP_IRQ_ACTIVE: + if (attr->attr > MAX_SRC) + return -EINVAL; + + if (get_user(attr32, (u32 __user *)(long)attr->addr)) + return -EFAULT; + + if (attr32 != 0 && attr32 != 1) + return -EINVAL; + + spin_lock_irq(&opp->lock); + openpic_set_irq(opp, attr->attr, attr32); + spin_unlock_irq(&opp->lock); + return 0; + } + + return -ENXIO; +} + +static int mpic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + struct openpic *opp = dev->private; + u64 attr64; + u32 attr32; + int ret; + + switch (attr->group) { + case KVM_DEV_MPIC_GRP_MISC: + switch (attr->attr) { + case KVM_DEV_MPIC_BASE_ADDR: + mutex_lock(&opp->kvm->slots_lock); + attr64 = opp->reg_base; + mutex_unlock(&opp->kvm->slots_lock); + + if (copy_to_user((u64 __user *)(long)attr->addr, + &attr64, sizeof(u64))) + return -EFAULT; + + return 0; + } + + break; + + case KVM_DEV_MPIC_GRP_REGISTER: + ret = access_reg(opp, attr->attr, &attr32, ATTR_GET); + if (ret) + return ret; + + if (put_user(attr32, (u32 __user *)(long)attr->addr)) + return -EFAULT; + + return 0; + + case KVM_DEV_MPIC_GRP_IRQ_ACTIVE: + if (attr->attr > MAX_SRC) + return -EINVAL; + + spin_lock_irq(&opp->lock); + attr32 = opp->src[attr->attr].pending; + spin_unlock_irq(&opp->lock); + + if (put_user(attr32, (u32 __user *)(long)attr->addr)) + return -EFAULT; + + return 0; + } + + return -ENXIO; +} + +static int mpic_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_MPIC_GRP_MISC: + switch (attr->attr) { + case KVM_DEV_MPIC_BASE_ADDR: + return 0; + } + + break; + + case KVM_DEV_MPIC_GRP_REGISTER: + return 0; + + case KVM_DEV_MPIC_GRP_IRQ_ACTIVE: + if (attr->attr > MAX_SRC) + break; + + return 0; + } + + return -ENXIO; +} + +static void mpic_destroy(struct kvm_device *dev) +{ + struct openpic *opp = dev->private; + + if (opp->mmio_mapped) { + /* + * Normally we get unmapped by kvm_io_bus_destroy(), + * which happens before the VCPUs release their references. + * + * Thus, we should only get here if no VCPUs took a reference + * to us in the first place. 
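+ * The WARN_ON below flags the case where that assumption does not hold.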
+ */ + WARN_ON(opp->nb_cpus != 0); + unmap_mmio(opp); + } + + kfree(opp); +} + +static int mpic_create(struct kvm_device *dev, u32 type) +{ + struct openpic *opp; + int ret; + + opp = kzalloc(sizeof(struct openpic), GFP_KERNEL); + if (!opp) + return -ENOMEM; + + dev->private = opp; + opp->kvm = dev->kvm; + opp->dev = dev; + opp->model = type; + spin_lock_init(&opp->lock); + + INIT_LIST_HEAD(&opp->mmio_regions); + list_add(&openpic_gbl_mmio.list, &opp->mmio_regions); + list_add(&openpic_tmr_mmio.list, &opp->mmio_regions); + list_add(&openpic_src_mmio.list, &opp->mmio_regions); + list_add(&openpic_cpu_mmio.list, &opp->mmio_regions); switch (opp->model) { - case OPENPIC_MODEL_FSL_MPIC_20: - default: + case KVM_DEV_TYPE_FSL_MPIC_20: opp->fsl = &fsl_mpic_20; opp->brr1 = 0x00400200; opp->flags |= OPENPIC_FLAG_IDR_CRIT; @@ -1290,12 +1658,10 @@ static int openpic_init(SysBusDevice *dev) opp->mpic_mode_mask = GCR_MODE_MIXED; fsl_common_init(opp); - map_list(opp, list_be, &list_count); - map_list(opp, list_fsl, &list_count); break; - case OPENPIC_MODEL_FSL_MPIC_42: + case KVM_DEV_TYPE_FSL_MPIC_42: opp->fsl = &fsl_mpic_42; opp->brr1 = 0x00400402; opp->flags |= OPENPIC_FLAG_ILR; @@ -1303,11 +1669,27 @@ static int openpic_init(SysBusDevice *dev) opp->mpic_mode_mask = GCR_MODE_PROXY; fsl_common_init(opp); - map_list(opp, list_be, &list_count); - map_list(opp, list_fsl, &list_count); break; + + default: + ret = -ENODEV; + goto err; } + openpic_reset(opp); return 0; + +err: + kfree(opp); + return ret; } + +struct kvm_device_ops kvm_mpic_ops = { + .name = "kvm-mpic", + .create = mpic_create, + .destroy = mpic_destroy, + .set_attr = mpic_set_attr, + .get_attr = mpic_get_attr, + .has_attr = mpic_has_attr, +}; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 6b8108624851..88d69cf1f953 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -317,6 +317,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_ENABLE_CAP: case KVM_CAP_ONE_REG: case KVM_CAP_IOEVENTFD: + case KVM_CAP_DEVICE_CTRL: r = 1; break; #ifndef CONFIG_KVM_BOOK3S_64_HV @@ -762,7 +763,10 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, break; case KVM_CAP_PPC_EPR: r = 0; - vcpu->arch.epr_enabled = cap->args[0]; + if (cap->args[0]) + vcpu->arch.epr_flags |= KVMPPC_EPR_USER; + else + vcpu->arch.epr_flags &= ~KVMPPC_EPR_USER; break; #ifdef CONFIG_BOOKE case KVM_CAP_PPC_BOOKE_WATCHDOG: @@ -908,6 +912,7 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { + struct kvm *kvm __maybe_unused = filp->private_data; void __user *argp = (void __user *)arg; long r; @@ -926,7 +931,6 @@ long kvm_arch_vm_ioctl(struct file *filp, #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CREATE_SPAPR_TCE: { struct kvm_create_spapr_tce create_tce; - struct kvm *kvm = filp->private_data; r = -EFAULT; if (copy_from_user(&create_tce, argp, sizeof(create_tce))) @@ -938,7 +942,6 @@ long kvm_arch_vm_ioctl(struct file *filp, #ifdef CONFIG_KVM_BOOK3S_64_HV case KVM_ALLOCATE_RMA: { - struct kvm *kvm = filp->private_data; struct kvm_allocate_rma rma; r = kvm_vm_ioctl_allocate_rma(kvm, &rma); @@ -948,7 +951,6 @@ long kvm_arch_vm_ioctl(struct file *filp, } case KVM_PPC_ALLOCATE_HTAB: { - struct kvm *kvm = filp->private_data; u32 htab_order; r = -EFAULT; @@ -965,7 +967,6 @@ long kvm_arch_vm_ioctl(struct file *filp, } case KVM_PPC_GET_HTAB_FD: { - struct kvm *kvm = filp->private_data; struct kvm_get_htab_fd ghf; r = -EFAULT; @@ -978,7 
+979,6 @@ long kvm_arch_vm_ioctl(struct file *filp, #ifdef CONFIG_PPC_BOOK3S_64 case KVM_PPC_GET_SMMU_INFO: { - struct kvm *kvm = filp->private_data; struct kvm_ppc_smmu_info info; memset(&info, 0, sizeof(info)); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6dab6b5b3e3b..feffbdaf8986 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1099,6 +1099,8 @@ void kvm_device_get(struct kvm_device *dev); void kvm_device_put(struct kvm_device *dev); struct kvm_device *kvm_device_from_filp(struct file *filp); +extern struct kvm_device_ops kvm_mpic_ops; + #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 38a0be0c199f..4148becdc93f 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -837,6 +837,9 @@ struct kvm_device_attr { __u64 addr; /* userspace address of attr data */ }; +#define KVM_DEV_TYPE_FSL_MPIC_20 1 +#define KVM_DEV_TYPE_FSL_MPIC_42 2 + /* * ioctls for VM fds */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5f0d78c2537b..f6cd14d2f0d9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2238,6 +2238,12 @@ static int kvm_ioctl_create_device(struct kvm *kvm, int ret; switch (cd->type) { +#ifdef CONFIG_KVM_MPIC + case KVM_DEV_TYPE_FSL_MPIC_20: + case KVM_DEV_TYPE_FSL_MPIC_42: + ops = &kvm_mpic_ops; + break; +#endif default: return -ENODEV; } From eb1e4f43e0f47f2655372c7d32c43db9711c278e Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Fri, 12 Apr 2013 14:08:47 +0000 Subject: [PATCH 161/204] kvm/ppc/mpic: add KVM_CAP_IRQ_MPIC Enabling this capability connects the vcpu to the designated in-kernel MPIC. Using explicit connections between vcpus and irqchips allows for flexibility, but the main benefit at the moment is that it simplifies the code -- KVM doesn't need vm-global state to remember which MPIC object is associated with this vm, and it doesn't need to care about ordering between irqchip creation and vcpu creation. Signed-off-by: Scott Wood [agraf: add stub functions for kvmppc_mpic_{dis,}connect_vcpu] Signed-off-by: Alexander Graf --- Documentation/virtual/kvm/api.txt | 8 +++ arch/powerpc/include/asm/kvm_host.h | 9 ++++ arch/powerpc/include/asm/kvm_ppc.h | 15 +++++- arch/powerpc/kvm/booke.c | 4 ++ arch/powerpc/kvm/mpic.c | 82 ++++++++++++++++++++++++++--- arch/powerpc/kvm/powerpc.c | 30 +++++++++++ include/uapi/linux/kvm.h | 1 + 7 files changed, 141 insertions(+), 8 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 66b58e494935..149558b1e81e 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2744,3 +2744,11 @@ to receive the topmost interrupt vector. When disabled (args[0] == 0), behavior is as if this facility is unsupported. When this capability is enabled, KVM_EXIT_EPR can occur. + +6.6 KVM_CAP_IRQ_MPIC + +Architectures: ppc +Parameters: args[0] is the MPIC device fd + args[1] is the MPIC CPU number for this vcpu + +This capability connects the vcpu to an in-kernel MPIC device. 
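+
+A minimal sketch of the intended usage (vm_fd and vcpu_fd are assumed to
+come from the usual KVM_CREATE_VM/KVM_CREATE_VCPU sequence; error
+handling is omitted):
+
+  struct kvm_create_device cd = {
+          .type = KVM_DEV_TYPE_FSL_MPIC_42,
+  };
+  struct kvm_enable_cap cap = {
+          .cap = KVM_CAP_IRQ_MPIC,
+  };
+
+  ioctl(vm_fd, KVM_CREATE_DEVICE, &cd);   /* cd.fd now holds the MPIC fd */
+
+  cap.args[0] = cd.fd;                    /* MPIC device fd */
+  cap.args[1] = 0;                        /* connect as MPIC CPU 0 */
+  ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);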
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 153c8c2b0f88..c3f8ceffa412 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -377,6 +377,11 @@ struct kvmppc_booke_debug_reg { u64 dac[KVMPPC_BOOKE_MAX_DAC]; }; +#define KVMPPC_IRQ_DEFAULT 0 +#define KVMPPC_IRQ_MPIC 1 + +struct openpic; + struct kvm_vcpu_arch { ulong host_stack; u32 host_pid; @@ -558,6 +563,10 @@ struct kvm_vcpu_arch { unsigned long magic_page_pa; /* phys addr to map the magic page to */ unsigned long magic_page_ea; /* effect. addr to map the magic page to */ + int irq_type; /* one of KVM_IRQ_* */ + int irq_cpu_id; + struct openpic *mpic; /* KVM_IRQ_MPIC */ + #ifdef CONFIG_KVM_BOOK3S_64_HV struct kvm_vcpu_arch_shared shregs; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 3810f9c7616c..df9c80b37905 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -248,7 +248,6 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *); void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid); struct openpic; -void kvmppc_mpic_put(struct openpic *opp); #ifdef CONFIG_KVM_BOOK3S_64_HV static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) @@ -278,6 +277,9 @@ static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr) #ifdef CONFIG_KVM_MPIC void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu); +int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, + u32 cpu); +void kvmppc_mpic_disconnect_vcpu(struct openpic *opp, struct kvm_vcpu *vcpu); #else @@ -285,6 +287,17 @@ static inline void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu) { } +static inline int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, + struct kvm_vcpu *vcpu, u32 cpu) +{ + return -EINVAL; +} + +static inline void kvmppc_mpic_disconnect_vcpu(struct openpic *opp, + struct kvm_vcpu *vcpu) +{ +} + #endif /* CONFIG_KVM_MPIC */ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 4da11ed48c59..1020119226db 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -430,6 +430,10 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, if (update_epr == true) { if (vcpu->arch.epr_flags & KVMPPC_EPR_USER) kvm_make_request(KVM_REQ_EPR_EXIT, vcpu); + else if (vcpu->arch.epr_flags & KVMPPC_EPR_KERNEL) { + BUG_ON(vcpu->arch.irq_type != KVMPPC_IRQ_MPIC); + kvmppc_mpic_set_epr(vcpu); + } } new_msr &= msr_mask; diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c index cb451b91e342..10bc08a246fd 100644 --- a/arch/powerpc/kvm/mpic.c +++ b/arch/powerpc/kvm/mpic.c @@ -115,7 +115,7 @@ static int get_current_cpu(void) { #if defined(CONFIG_KVM) && defined(CONFIG_BOOKE) struct kvm_vcpu *vcpu = current->thread.kvm_vcpu; - return vcpu ? vcpu->vcpu_id : -1; + return vcpu ? 
vcpu->arch.irq_cpu_id : -1; #else /* XXX */ return -1; @@ -249,7 +249,7 @@ static void mpic_irq_raise(struct openpic *opp, struct irq_dest *dst, return; } - pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->vcpu_id, + pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->arch.irq_cpu_id, output); if (output != ILR_INTTGT_INT) /* TODO */ @@ -267,7 +267,7 @@ static void mpic_irq_lower(struct openpic *opp, struct irq_dest *dst, return; } - pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->vcpu_id, + pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->arch.irq_cpu_id, output); if (output != ILR_INTTGT_INT) /* TODO */ @@ -1165,6 +1165,20 @@ static uint32_t openpic_iack(struct openpic *opp, struct irq_dest *dst, return retval; } +void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu) +{ + struct openpic *opp = vcpu->arch.mpic; + int cpu = vcpu->arch.irq_cpu_id; + unsigned long flags; + + spin_lock_irqsave(&opp->lock, flags); + + if ((opp->gcr & opp->mpic_mode_mask) == GCR_MODE_PROXY) + kvmppc_set_epr(vcpu, openpic_iack(opp, &opp->dst[cpu], cpu)); + + spin_unlock_irqrestore(&opp->lock, flags); +} + static int openpic_cpu_read_internal(void *opaque, gpa_t addr, u32 *ptr, int idx) { @@ -1431,10 +1445,10 @@ static void map_mmio(struct openpic *opp) static void unmap_mmio(struct openpic *opp) { - BUG_ON(opp->mmio_mapped); - opp->mmio_mapped = false; - - kvm_io_bus_unregister_dev(opp->kvm, KVM_MMIO_BUS, &opp->mmio); + if (opp->mmio_mapped) { + opp->mmio_mapped = false; + kvm_io_bus_unregister_dev(opp->kvm, KVM_MMIO_BUS, &opp->mmio); + } } static int set_base_addr(struct openpic *opp, struct kvm_device_attr *attr) @@ -1693,3 +1707,57 @@ struct kvm_device_ops kvm_mpic_ops = { .get_attr = mpic_get_attr, .has_attr = mpic_has_attr, }; + +int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, + u32 cpu) +{ + struct openpic *opp = dev->private; + int ret = 0; + + if (dev->ops != &kvm_mpic_ops) + return -EPERM; + if (opp->kvm != vcpu->kvm) + return -EPERM; + if (cpu < 0 || cpu >= MAX_CPU) + return -EPERM; + + spin_lock_irq(&opp->lock); + + if (opp->dst[cpu].vcpu) { + ret = -EEXIST; + goto out; + } + if (vcpu->arch.irq_type) { + ret = -EBUSY; + goto out; + } + + opp->dst[cpu].vcpu = vcpu; + opp->nb_cpus = max(opp->nb_cpus, cpu + 1); + + vcpu->arch.mpic = opp; + vcpu->arch.irq_cpu_id = cpu; + vcpu->arch.irq_type = KVMPPC_IRQ_MPIC; + + /* This might need to be changed if GCR gets extended */ + if (opp->mpic_mode_mask == GCR_MODE_PROXY) + vcpu->arch.epr_flags |= KVMPPC_EPR_KERNEL; + + kvm_device_get(dev); +out: + spin_unlock_irq(&opp->lock); + return ret; +} + +/* + * This should only happen immediately before the mpic is destroyed, + * so we shouldn't need to worry about anything still trying to + * access the vcpu pointer. 
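+ * Hence opp->dst[].vcpu is cleared below without taking opp->lock.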
+ */ +void kvmppc_mpic_disconnect_vcpu(struct openpic *opp, struct kvm_vcpu *vcpu) +{ + BUG_ON(!opp->dst[vcpu->arch.irq_cpu_id].vcpu); + + opp->dst[vcpu->arch.irq_cpu_id].vcpu = NULL; + kvm_device_put(opp->dev); +} diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 88d69cf1f953..5d046bbdf11f 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -326,6 +327,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PPC_GET_PVINFO: #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) case KVM_CAP_SW_TLB: +#endif +#ifdef CONFIG_KVM_MPIC + case KVM_CAP_IRQ_MPIC: #endif r = 1; break; @@ -460,6 +464,13 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) tasklet_kill(&vcpu->arch.tasklet); kvmppc_remove_vcpu_debugfs(vcpu); + + switch (vcpu->arch.irq_type) { + case KVMPPC_IRQ_MPIC: + kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu); + break; + } + kvmppc_core_vcpu_free(vcpu); } @@ -786,6 +797,25 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, r = kvm_vcpu_ioctl_config_tlb(vcpu, &cfg); break; } +#endif +#ifdef CONFIG_KVM_MPIC + case KVM_CAP_IRQ_MPIC: { + struct file *filp; + struct kvm_device *dev; + + r = -EBADF; + filp = fget(cap->args[0]); + if (!filp) + break; + + r = -EPERM; + dev = kvm_device_from_filp(filp); + if (dev) + r = kvmppc_mpic_connect_vcpu(dev, vcpu, cap->args[1]); + + fput(filp); + break; + } #endif default: r = -EINVAL; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 4148becdc93f..0ebf59b50ed0 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -667,6 +667,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_ARM_PSCI 87 #define KVM_CAP_ARM_SET_DEVICE_ADDR 88 #define KVM_CAP_DEVICE_CTRL 89 +#define KVM_CAP_IRQ_MPIC 90 #ifdef KVM_CAP_IRQ_ROUTING From de9ba2f36368d21314860ee24893a6ffef01e548 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 16 Apr 2013 17:42:19 +0200 Subject: [PATCH 162/204] KVM: PPC: Support irq routing and irqfd for in-kernel MPIC Now that all the irq routing and irqfd pieces are generic, we can expose real irqchip support to all of KVM's internal helpers. This allows us to use irqfd with the in-kernel MPIC. Signed-off-by: Alexander Graf --- Documentation/virtual/kvm/devices/mpic.txt | 19 ++++ arch/powerpc/include/asm/kvm_host.h | 7 ++ arch/powerpc/include/uapi/asm/kvm.h | 1 + arch/powerpc/kvm/Kconfig | 3 + arch/powerpc/kvm/Makefile | 1 + arch/powerpc/kvm/irq.h | 17 ++++ arch/powerpc/kvm/mpic.c | 111 ++++++++++++++++++++- 7 files changed, 158 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/kvm/irq.h diff --git a/Documentation/virtual/kvm/devices/mpic.txt b/Documentation/virtual/kvm/devices/mpic.txt index ce98e3264fa8..ad0ac778f20a 100644 --- a/Documentation/virtual/kvm/devices/mpic.txt +++ b/Documentation/virtual/kvm/devices/mpic.txt @@ -35,3 +35,22 @@ Groups: "attr" is the IRQ number. IRQ numbers for standard sources are the byte offset of the relevant IVPR from EIVPR0, divided by 32. + +IRQ Routing: + + The MPIC emulation supports IRQ routing. Only a single MPIC device can + be instantiated. Once that device has been created, it's available as + irqchip id 0. + + This irqchip 0 has 256 interrupt pins, which expose the interrupts in + the main array of interrupt sources (a.k.a. "SRC" interrupts). 
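+
+  As an illustrative sketch (vm_fd is assumed to be the VM fd, pin 42 is
+  an arbitrary example, and KVM_IRQ_LINE wiring for PPC arrives in a
+  later patch of this series), a pulse on a SRC input can be driven
+  through the default GSI == pin routing:
+
+    struct kvm_irq_level irq = {
+            .irq = 42,          /* GSI; equal to the pin by default */
+            .level = 1,
+    };
+
+    ioctl(vm_fd, KVM_IRQ_LINE, &irq);   /* assert */
+    irq.level = 0;
+    ioctl(vm_fd, KVM_IRQ_LINE, &irq);   /* deassert */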
+ + The numbering is the same as the MPIC device tree binding -- based on + the register offset from the beginning of the sources array, without + regard to any subdivisions in chip documentation such as "internal" + or "external" interrupts. + + Default routes are established for these pins, with the GSI being equal + to the pin number. + + Access to non-SRC interrupts is not implemented through IRQ routing mechanisms. diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index c3f8ceffa412..13740a645a6d 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -44,6 +44,10 @@ #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #endif +/* These values are internal and can be increased later */ +#define KVM_NR_IRQCHIPS 1 +#define KVM_IRQCHIP_NUM_PINS 256 + #if !defined(CONFIG_KVM_440) #include @@ -256,6 +260,9 @@ struct kvm_arch { #ifdef CONFIG_PPC_BOOK3S_64 struct list_head spapr_tce_tables; #endif +#ifdef CONFIG_KVM_MPIC + struct openpic *mpic; +#endif }; /* diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 02ad96606860..ca871067a69b 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -25,6 +25,7 @@ /* Select powerpc specific features in */ #define __KVM_HAVE_SPAPR_TCE #define __KVM_HAVE_PPC_SMT +#define __KVM_HAVE_IRQCHIP struct kvm_regs { __u64 pc; diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index f47e95e0b6de..4bf10b520765 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -154,6 +154,9 @@ config KVM_E500MC config KVM_MPIC bool "KVM in-kernel MPIC emulation" depends on KVM + select HAVE_KVM_IRQCHIP + select HAVE_KVM_IRQ_ROUTING + select HAVE_KVM_MSI help Enable support for emulating MPIC devices inside the host kernel, rather than relying on userspace to emulate. diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 4a2277a221bb..4eada0c01082 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -104,6 +104,7 @@ kvm-book3s_32-objs := \ kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs) kvm-objs-$(CONFIG_KVM_MPIC) += mpic.o +kvm-objs-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(addprefix ../../../virt/kvm/, irqchip.o) kvm-objs := $(kvm-objs-m) $(kvm-objs-y) diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h new file mode 100644 index 000000000000..f1e27fdc8c2e --- /dev/null +++ b/arch/powerpc/kvm/irq.h @@ -0,0 +1,17 @@ +#ifndef __IRQ_H +#define __IRQ_H + +#include + +static inline int irqchip_in_kernel(struct kvm *kvm) +{ + int ret = 0; + +#ifdef CONFIG_KVM_MPIC + ret = ret || (kvm->arch.mpic != NULL); +#endif + smp_rmb(); + return ret; +} + +#endif diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c index 10bc08a246fd..89fe1d66a7fb 100644 --- a/arch/powerpc/kvm/mpic.c +++ b/arch/powerpc/kvm/mpic.c @@ -1076,7 +1076,9 @@ static int openpic_cpu_write_internal(void *opaque, gpa_t addr, case 0xA0: /* IACK */ /* Read-only register */ break; - case 0xB0: /* EOI */ + case 0xB0: { /* EOI */ + int notify_eoi; + pr_debug("EOI\n"); s_IRQ = IRQ_get_next(opp, &dst->servicing); @@ -1087,6 +1089,8 @@ static int openpic_cpu_write_internal(void *opaque, gpa_t addr, } IRQ_resetbit(&dst->servicing, s_IRQ); + /* Notify listeners that the IRQ is over */ + notify_eoi = s_IRQ; /* Set up next servicing IRQ */ s_IRQ = IRQ_get_next(opp, &dst->servicing); /* Check queued interrupts. 
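 * If a raised IRQ now outranks what is left in the servicing queue, the INT output is asserted again below.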
*/ @@ -1099,7 +1103,13 @@ static int openpic_cpu_write_internal(void *opaque, gpa_t addr, idx, n_IRQ); mpic_irq_raise(opp, dst, ILR_INTTGT_INT); } + + spin_unlock(&opp->lock); + kvm_notify_acked_irq(opp->kvm, 0, notify_eoi); + spin_lock(&opp->lock); + break; + } default: break; } @@ -1639,14 +1649,34 @@ static void mpic_destroy(struct kvm_device *dev) unmap_mmio(opp); } + dev->kvm->arch.mpic = NULL; kfree(opp); } +static int mpic_set_default_irq_routing(struct openpic *opp) +{ + struct kvm_irq_routing_entry *routing; + + /* Create a nop default map, so that dereferencing it still works */ + routing = kzalloc((sizeof(*routing)), GFP_KERNEL); + if (!routing) + return -ENOMEM; + + kvm_set_irq_routing(opp->kvm, routing, 0, 0); + + kfree(routing); + return 0; +} + static int mpic_create(struct kvm_device *dev, u32 type) { struct openpic *opp; int ret; + /* We only support one MPIC at a time for now */ + if (dev->kvm->arch.mpic) + return -EINVAL; + opp = kzalloc(sizeof(struct openpic), GFP_KERNEL); if (!opp) return -ENOMEM; @@ -1691,7 +1721,15 @@ static int mpic_create(struct kvm_device *dev, u32 type) goto err; } + ret = mpic_set_default_irq_routing(opp); + if (ret) + goto err; + openpic_reset(opp); + + smp_wmb(); + dev->kvm->arch.mpic = opp; + return 0; err: @@ -1761,3 +1799,74 @@ void kvmppc_mpic_disconnect_vcpu(struct openpic *opp, struct kvm_vcpu *vcpu) opp->dst[vcpu->arch.irq_cpu_id].vcpu = NULL; kvm_device_put(opp->dev); } + +/* + * Return value: + * < 0 Interrupt was ignored (masked or not delivered for other reasons) + * = 0 Interrupt was coalesced (previous irq is still pending) + * > 0 Number of CPUs interrupt was delivered to + */ +static int mpic_set_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) +{ + u32 irq = e->irqchip.pin; + struct openpic *opp = kvm->arch.mpic; + unsigned long flags; + + spin_lock_irqsave(&opp->lock, flags); + openpic_set_irq(opp, irq, level); + spin_unlock_irqrestore(&opp->lock, flags); + + /* All code paths we care about don't check for the return value */ + return 0; +} + +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, bool line_status) +{ + struct openpic *opp = kvm->arch.mpic; + unsigned long flags; + + spin_lock_irqsave(&opp->lock, flags); + + /* + * XXX We ignore the target address for now, as we only support + * a single MSI bank. 
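+ * Supporting more banks would mean decoding the target address to select the right bank's MSIIR.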
+ */ + openpic_msi_write(kvm->arch.mpic, MSIIR_OFFSET, e->msi.data); + spin_unlock_irqrestore(&opp->lock, flags); + + /* All code paths we care about don't check for the return value */ + return 0; +} + +int kvm_set_routing_entry(struct kvm_irq_routing_table *rt, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) +{ + int r = -EINVAL; + + switch (ue->type) { + case KVM_IRQ_ROUTING_IRQCHIP: + e->set = mpic_set_irq; + e->irqchip.irqchip = ue->u.irqchip.irqchip; + e->irqchip.pin = ue->u.irqchip.pin; + if (e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS) + goto out; + rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi; + break; + case KVM_IRQ_ROUTING_MSI: + e->set = kvm_set_msi; + e->msi.address_lo = ue->u.msi.address_lo; + e->msi.address_hi = ue->u.msi.address_hi; + e->msi.data = ue->u.msi.data; + break; + default: + goto out; + } + + r = 0; +out: + return r; +} From 5efdb4be598fc2af6937c3387586635ddf6fd2c8 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Wed, 17 Apr 2013 00:37:57 +0200 Subject: [PATCH 163/204] KVM: PPC: MPIC: Add support for KVM_IRQ_LINE Now that all pieces are in place for reusing generic irq infrastructure, we can copy x86's implementation of KVM_IRQ_LINE irq injection and simply reuse it for PPC, as it will work there just as well. Signed-off-by: Alexander Graf --- arch/powerpc/include/uapi/asm/kvm.h | 1 + arch/powerpc/kvm/powerpc.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index ca871067a69b..03c7819a44a3 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -26,6 +26,7 @@ #define __KVM_HAVE_SPAPR_TCE #define __KVM_HAVE_PPC_SMT #define __KVM_HAVE_IRQCHIP +#define __KVM_HAVE_IRQ_LINE struct kvm_regs { __u64 pc; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 5d046bbdf11f..d8e81e6c1afe 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -33,6 +33,7 @@ #include #include #include "timing.h" +#include "irq.h" #include "../mm/mmu_decl.h" #define CREATE_TRACE_POINTS @@ -939,6 +940,18 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) return 0; } +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, + bool line_status) +{ + if (!irqchip_in_kernel(kvm)) + return -ENXIO; + + irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + irq_event->irq, irq_event->level, + line_status); + return 0; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { From 447a03c02adc57933d328eb15aeae33160bc92e8 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Wed, 17 Apr 2013 01:54:26 +0200 Subject: [PATCH 164/204] KVM: PPC: MPIC: Restrict to e500 platforms The code as is doesn't make any sense on non-e500 platforms. Restrict it there, so that people don't get wrong ideas on what would actually work. This patch should get reverted as soon as it's possible to either run e500 guests on non-e500 hosts or the MPIC emulation gains support for non-e500 modes. 
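For illustration of the KVM_IRQ_LINE support added in the previous patch: with the default identity GSI routing set up by the in-kernel MPIC, userspace can drive guest interrupt pins with a plain vm ioctl. A minimal sketch (the vm_fd and the choice of pin 5 are hypothetical; error handling is reduced to a single check):

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/*
	 * Assert, then deassert, MPIC source pin 5 on an existing VM.
	 * With the default routing, GSI == pin number, so .irq = 5
	 * targets source 5 of the in-kernel MPIC created earlier with
	 * KVM_CREATE_DEVICE.
	 */
	static int pulse_irq(int vm_fd, uint32_t pin)
	{
		struct kvm_irq_level irq = { .irq = pin, .level = 1 };

		if (ioctl(vm_fd, KVM_IRQ_LINE, &irq) < 0)
			return -1;	/* -ENXIO: no in-kernel irqchip */

		irq.level = 0;
		return ioctl(vm_fd, KVM_IRQ_LINE, &irq);
	}
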
Signed-off-by: Alexander Graf --- arch/powerpc/kvm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 4bf10b520765..656e0bc29fe8 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -153,7 +153,7 @@ config KVM_E500MC config KVM_MPIC bool "KVM in-kernel MPIC emulation" - depends on KVM + depends on KVM && E500 select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_MSI From 22e64024fb83065664160d1c28a10aa98cb5f24c Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Thu, 25 Apr 2013 11:50:03 +0200 Subject: [PATCH 165/204] KVM: IA64: Carry non-ia64 changes into ia64 We changed a few things in non-ia64 code paths. This patch blindly applies the changes to the ia64 code as well, hoping it proves useful in case anyone revives the ia64 kvm code. Signed-off-by: Alexander Graf --- arch/ia64/include/asm/kvm_host.h | 1 + arch/ia64/kvm/Kconfig | 1 + arch/ia64/kvm/Makefile | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index cfa74983c675..989dd3fe8de1 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -26,6 +26,7 @@ #define KVM_USER_MEM_SLOTS 32 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 +#define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS /* define exit reasons from vmm to kvm*/ #define EXIT_REASON_VM_PANIC 0 diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig index 2cd225f8c68d..043183a06409 100644 --- a/arch/ia64/kvm/Kconfig +++ b/arch/ia64/kvm/Kconfig @@ -27,6 +27,7 @@ config KVM select PREEMPT_NOTIFIERS select ANON_INODES select HAVE_KVM_IRQCHIP + select HAVE_KVM_IRQ_ROUTING select KVM_APIC_ARCHITECTURE select KVM_MMIO ---help--- diff --git a/arch/ia64/kvm/Makefile b/arch/ia64/kvm/Makefile index db3d7c5d1071..511f64a78d56 100644 --- a/arch/ia64/kvm/Makefile +++ b/arch/ia64/kvm/Makefile @@ -49,7 +49,7 @@ ccflags-y := -Ivirt/kvm -Iarch/ia64/kvm/ asflags-y := -Ivirt/kvm -Iarch/ia64/kvm/ common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ - coalesced_mmio.o irq_comm.o assigned-dev.o) + coalesced_mmio.o irq_comm.o assigned-dev.o irqchip.o) ifeq ($(CONFIG_IOMMU_API),y) common-objs += $(addprefix ../../../virt/kvm/, iommu.o) From 07f0a7bdec5c4039cfb9b836482c45004d4c21cc Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Thu, 25 Apr 2013 14:11:23 +0000 Subject: [PATCH 166/204] kvm: destroy emulated devices on VM exit The hassle of getting refcounting right was greater than the hassle of keeping a list of devices to destroy on VM exit. 
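The ownership model this patch adopts is easy to see in isolation. Below is a distilled, userspace-compilable sketch of the same pattern, with stand-in types rather than the kernel's list_head machinery: the VM keeps a list of every device it created and tears them all down in one place, so no device has to count its users.

	#include <stdlib.h>

	/* Stand-in types; the kernel equivalents are kvm->devices and
	 * kvm_device.vm_node. A singly linked list is enough here. */
	struct device {
		void (*destroy)(struct device *d);
		struct device *next;
	};

	struct vm {
		struct device *devices;
	};

	static void vm_add_device(struct vm *vm, struct device *d)
	{
		d->next = vm->devices;
		vm->devices = d;
	}

	/* On VM exit, the VM unconditionally destroys everything it
	 * owns; there is exactly one owner and one teardown path. */
	static void vm_destroy(struct vm *vm)
	{
		while (vm->devices) {
			struct device *d = vm->devices;

			vm->devices = d->next;
			d->destroy(d);
		}
	}

	static void free_device(struct device *d) { free(d); }

	int main(void)
	{
		struct vm vm = { 0 };
		struct device *d = calloc(1, sizeof(*d));

		d->destroy = free_device;
		vm_add_device(&vm, d);
		vm_destroy(&vm);
		return 0;
	}
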
Signed-off-by: Scott Wood Signed-off-by: Alexander Graf --- arch/powerpc/kvm/mpic.c | 2 -- include/linux/kvm_host.h | 3 ++- virt/kvm/kvm_main.c | 29 ++++++++++++++++------------- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c index 89fe1d66a7fb..795ca0c9ae69 100644 --- a/arch/powerpc/kvm/mpic.c +++ b/arch/powerpc/kvm/mpic.c @@ -1781,7 +1781,6 @@ int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, if (opp->mpic_mode_mask == GCR_MODE_PROXY) vcpu->arch.epr_flags |= KVMPPC_EPR_KERNEL; - kvm_device_get(dev); out: spin_unlock_irq(&opp->lock); return ret; @@ -1797,7 +1796,6 @@ void kvmppc_mpic_disconnect_vcpu(struct openpic *opp, struct kvm_vcpu *vcpu) BUG_ON(!opp->dst[vcpu->arch.irq_cpu_id].vcpu); opp->dst[vcpu->arch.irq_cpu_id].vcpu = NULL; - kvm_device_put(opp->dev); } /* diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index feffbdaf8986..36c977694741 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -393,6 +393,7 @@ struct kvm { long mmu_notifier_count; #endif long tlbs_dirty; + struct list_head devices; }; #define kvm_err(fmt, ...) \ @@ -1069,8 +1070,8 @@ struct kvm_device_ops; struct kvm_device { struct kvm_device_ops *ops; struct kvm *kvm; - atomic_t users; void *private; + struct list_head vm_node; }; /* create, destroy, and name are mandatory */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f6cd14d2f0d9..5da9f02a2a67 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -504,6 +504,7 @@ static struct kvm *kvm_create_vm(unsigned long type) mutex_init(&kvm->irq_lock); mutex_init(&kvm->slots_lock); atomic_set(&kvm->users_count, 1); + INIT_LIST_HEAD(&kvm->devices); r = kvm_init_mmu_notifier(kvm); if (r) @@ -581,6 +582,19 @@ void kvm_free_physmem(struct kvm *kvm) kfree(kvm->memslots); } +static void kvm_destroy_devices(struct kvm *kvm) +{ + struct list_head *node, *tmp; + + list_for_each_safe(node, tmp, &kvm->devices) { + struct kvm_device *dev = + list_entry(node, struct kvm_device, vm_node); + + list_del(node); + dev->ops->destroy(dev); + } +} + static void kvm_destroy_vm(struct kvm *kvm) { int i; @@ -600,6 +614,7 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_arch_flush_shadow_all(kvm); #endif kvm_arch_destroy_vm(kvm); + kvm_destroy_devices(kvm); kvm_free_physmem(kvm); cleanup_srcu_struct(&kvm->srcu); kvm_arch_free_vm(kvm); @@ -2195,23 +2210,11 @@ static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, } } -void kvm_device_get(struct kvm_device *dev) -{ - atomic_inc(&dev->users); -} - -void kvm_device_put(struct kvm_device *dev) -{ - if (atomic_dec_and_test(&dev->users)) - dev->ops->destroy(dev); -} - static int kvm_device_release(struct inode *inode, struct file *filp) { struct kvm_device *dev = filp->private_data; struct kvm *kvm = dev->kvm; - kvm_device_put(dev); kvm_put_kvm(kvm); return 0; } @@ -2257,7 +2260,6 @@ static int kvm_ioctl_create_device(struct kvm *kvm, dev->ops = ops; dev->kvm = kvm; - atomic_set(&dev->users, 1); ret = ops->create(dev, cd->type); if (ret < 0) { @@ -2271,6 +2273,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm, return ret; } + list_add(&dev->vm_node, &kvm->devices); kvm_get_kvm(kvm); cd->fd = ret; return 0; From 91194919a6b07d70081fe185a79b129efee84fff Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Thu, 25 Apr 2013 14:11:24 +0000 Subject: [PATCH 167/204] kvm/ppc/mpic: Eliminate mmio_mapped We no longer need to keep track of this now that MPIC destruction always happens either during 
VM destruction (after MMIO has been destroyed) or during a failed creation (before the fd has been exposed to userspace, and thus before the MMIO region could have been registered). Signed-off-by: Scott Wood Signed-off-by: Alexander Graf --- arch/powerpc/kvm/mpic.c | 29 +---------------------------- 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c index 795ca0c9ae69..f3148f8cdc12 100644 --- a/arch/powerpc/kvm/mpic.c +++ b/arch/powerpc/kvm/mpic.c @@ -190,7 +190,6 @@ struct openpic { struct kvm_io_device mmio; struct list_head mmio_regions; atomic_t users; - bool mmio_mapped; gpa_t reg_base; spinlock_t lock; @@ -1428,24 +1427,13 @@ static int kvm_mpic_write(struct kvm_io_device *this, gpa_t addr, return ret; } -static void kvm_mpic_dtor(struct kvm_io_device *this) -{ - struct openpic *opp = container_of(this, struct openpic, mmio); - - opp->mmio_mapped = false; -} - static const struct kvm_io_device_ops mpic_mmio_ops = { .read = kvm_mpic_read, .write = kvm_mpic_write, - .destructor = kvm_mpic_dtor, }; static void map_mmio(struct openpic *opp) { - BUG_ON(opp->mmio_mapped); - opp->mmio_mapped = true; - kvm_iodevice_init(&opp->mmio, &mpic_mmio_ops); kvm_io_bus_register_dev(opp->kvm, KVM_MMIO_BUS, @@ -1455,10 +1443,7 @@ static void map_mmio(struct openpic *opp) static void unmap_mmio(struct openpic *opp) { - if (opp->mmio_mapped) { - opp->mmio_mapped = false; - kvm_io_bus_unregister_dev(opp->kvm, KVM_MMIO_BUS, &opp->mmio); - } + kvm_io_bus_unregister_dev(opp->kvm, KVM_MMIO_BUS, &opp->mmio); } static int set_base_addr(struct openpic *opp, struct kvm_device_attr *attr) @@ -1637,18 +1622,6 @@ static void mpic_destroy(struct kvm_device *dev) { struct openpic *opp = dev->private; - if (opp->mmio_mapped) { - /* - * Normally we get unmapped by kvm_io_bus_destroy(), - * which happens before the VCPUs release their references. - * - * Thus, we should only get here if no VCPUs took a reference - * to us in the first place. - */ - WARN_ON(opp->nb_cpus != 0); - unmap_mmio(opp); - } - dev->kvm->arch.mpic = NULL; kfree(opp); } From 8e591cb7204739efa8e15967ea334eb367039dde Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 17 Apr 2013 20:30:00 +0000 Subject: [PATCH 168/204] KVM: PPC: Book3S: Add infrastructure to implement kernel-side RTAS calls For pseries machine emulation, in order to move the interrupt controller code to the kernel, we need to intercept some RTAS calls in the kernel itself. This adds an infrastructure to allow in-kernel handlers to be registered for RTAS services by name. A new ioctl, KVM_PPC_RTAS_DEFINE_TOKEN, then allows userspace to associate token values with those service names. Then, when the guest requests an RTAS service with one of those token values, it will be handled by the relevant in-kernel handler rather than being passed up to userspace as at present. 
Signed-off-by: Michael Ellerman Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras [agraf: fix warning] Signed-off-by: Alexander Graf --- Documentation/virtual/kvm/api.txt | 19 +++ arch/powerpc/include/asm/hvcall.h | 3 + arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/include/asm/kvm_ppc.h | 4 + arch/powerpc/include/uapi/asm/kvm.h | 6 + arch/powerpc/kvm/Makefile | 1 + arch/powerpc/kvm/book3s_hv.c | 18 ++- arch/powerpc/kvm/book3s_pr.c | 1 + arch/powerpc/kvm/book3s_pr_papr.c | 7 ++ arch/powerpc/kvm/book3s_rtas.c | 182 ++++++++++++++++++++++++++++ arch/powerpc/kvm/powerpc.c | 8 ++ include/uapi/linux/kvm.h | 3 + 12 files changed, 252 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/kvm/book3s_rtas.c diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 149558b1e81e..fb308be8521b 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2341,6 +2341,25 @@ and distributor interface, the ioctl must be called after calling KVM_CREATE_IRQCHIP, but before calling KVM_RUN on any of the VCPUs. Calling this ioctl twice for any of the base addresses will return -EEXIST. +4.82 KVM_PPC_RTAS_DEFINE_TOKEN + +Capability: KVM_CAP_PPC_RTAS +Architectures: ppc +Type: vm ioctl +Parameters: struct kvm_rtas_token_args +Returns: 0 on success, -1 on error + +Defines a token value for a RTAS (Run Time Abstraction Services) +service in order to allow it to be handled in the kernel. The +argument struct gives the name of the service, which must be the name +of a service that has a kernel-side implementation. If the token +value is non-zero, it will be associated with that service, and +subsequent RTAS calls by the guest specifying that token will be +handled by the kernel. If the token value is 0, then any token +associated with the service will be forgotten, and subsequent RTAS +calls by the guest for that service will be passed to userspace to be +handled. + 5. The kvm_run structure ------------------------ diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 4bc2c3dad6ad..cf4df8e2139a 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -270,6 +270,9 @@ #define H_SET_MODE 0x31C #define MAX_HCALL_OPCODE H_SET_MODE +/* Platform specific hcalls, used by KVM */ +#define H_RTAS 0xf000 + #ifndef __ASSEMBLY__ /** diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 13740a645a6d..311f7e6f09e9 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -259,6 +259,7 @@ struct kvm_arch { #endif /* CONFIG_KVM_BOOK3S_64_HV */ #ifdef CONFIG_PPC_BOOK3S_64 struct list_head spapr_tce_tables; + struct list_head rtas_tokens; #endif #ifdef CONFIG_KVM_MPIC struct openpic *mpic; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index df9c80b37905..8a30eb7f2bec 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -166,6 +166,10 @@ extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *); int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq); +extern int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp); +extern int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu); +extern void kvmppc_rtas_tokens_free(struct kvm *kvm); + /* * Cuts out inst bits with ordering according to spec. * That means the leftmost bit is zero. All given bits are included. 
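As an aside on the "leftmost bit is zero" convention mentioned in the comment above: PowerPC documentation numbers bits from the most significant end. A tiny self-contained illustration (the helper name and the fixed 32-bit width are mine, not the kernel's helper):

	#include <stdint.h>
	#include <stdio.h>

	/* Extract bits msb..lsb of a 32-bit instruction word using IBM
	 * bit numbering, where bit 0 is the most significant bit and
	 * both end bits are included. */
	static uint32_t get_field_msb0(uint32_t inst, unsigned int msb,
				       unsigned int lsb)
	{
		unsigned int width = lsb - msb + 1;
		uint32_t mask = (width == 32) ? 0xffffffffu
					      : ((1u << width) - 1);

		return (inst >> (31 - lsb)) & mask;
	}

	int main(void)
	{
		/* The primary opcode lives in bits 0..5; for 0x7c0004ac
		 * (sync) this prints 31. */
		printf("opcode = %u\n", get_field_msb0(0x7c0004acu, 0, 5));
		return 0;
	}
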
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 03c7819a44a3..eb9e25c194ad 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -324,6 +324,12 @@ struct kvm_allocate_rma { __u64 rma_size; }; +/* for KVM_CAP_PPC_RTAS */ +struct kvm_rtas_token_args { + char name[120]; + __u64 token; /* Use a token of 0 to undefine a mapping */ +}; + struct kvm_book3e_206_tlb_entry { __u32 mas8; __u32 mas1; diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 4eada0c01082..3faf5c07329c 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -86,6 +86,7 @@ kvm-book3s_64-module-objs := \ emulate.o \ book3s.o \ book3s_64_vio.o \ + book3s_rtas.o \ $(kvm-book3s_64-objs-y) kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 5af0f2979833..f3d7af7981c7 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -483,7 +483,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) unsigned long req = kvmppc_get_gpr(vcpu, 3); unsigned long target, ret = H_SUCCESS; struct kvm_vcpu *tvcpu; - int idx; + int idx, rc; switch (req) { case H_ENTER: @@ -519,6 +519,19 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) kvmppc_get_gpr(vcpu, 5), kvmppc_get_gpr(vcpu, 6)); break; + case H_RTAS: + if (list_empty(&vcpu->kvm->arch.rtas_tokens)) + return RESUME_HOST; + + rc = kvmppc_rtas_hcall(vcpu); + + if (rc == -ENOENT) + return RESUME_HOST; + else if (rc == 0) + break; + + /* Send the error out to userspace via KVM_RUN */ + return rc; default: return RESUME_HOST; } @@ -1829,6 +1842,7 @@ int kvmppc_core_init_vm(struct kvm *kvm) cpumask_setall(&kvm->arch.need_tlb_flush); INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); + INIT_LIST_HEAD(&kvm->arch.rtas_tokens); kvm->arch.rma = NULL; @@ -1874,6 +1888,8 @@ void kvmppc_core_destroy_vm(struct kvm *kvm) kvm->arch.rma = NULL; } + kvmppc_rtas_tokens_free(kvm); + kvmppc_free_hpt(kvm); WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); } diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index c1cffa882a69..d09baf143500 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1296,6 +1296,7 @@ int kvmppc_core_init_vm(struct kvm *kvm) { #ifdef CONFIG_PPC64 INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); + INIT_LIST_HEAD(&kvm->arch.rtas_tokens); #endif if (firmware_has_feature(FW_FEATURE_SET_MODE)) { diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index ee02b30878ed..4efa4a4f3722 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -246,6 +246,13 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) clear_bit(KVM_REQ_UNHALT, &vcpu->requests); vcpu->stat.halt_wakeup++; return EMULATE_DONE; + case H_RTAS: + if (list_empty(&vcpu->kvm->arch.rtas_tokens)) + return RESUME_HOST; + if (kvmppc_rtas_hcall(vcpu)) + break; + kvmppc_set_gpr(vcpu, 3, 0); + return EMULATE_DONE; } return EMULATE_FAIL; diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c new file mode 100644 index 000000000000..6ad7050eb67d --- /dev/null +++ b/arch/powerpc/kvm/book3s_rtas.c @@ -0,0 +1,182 @@ +/* + * Copyright 2012 Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + +struct rtas_handler { + void (*handler)(struct kvm_vcpu *vcpu, struct rtas_args *args); + char *name; +}; + +static struct rtas_handler rtas_handlers[] = { }; + +struct rtas_token_definition { + struct list_head list; + struct rtas_handler *handler; + u64 token; +}; + +static int rtas_name_matches(char *s1, char *s2) +{ + struct kvm_rtas_token_args args; + return !strncmp(s1, s2, sizeof(args.name)); +} + +static int rtas_token_undefine(struct kvm *kvm, char *name) +{ + struct rtas_token_definition *d, *tmp; + + lockdep_assert_held(&kvm->lock); + + list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) { + if (rtas_name_matches(d->handler->name, name)) { + list_del(&d->list); + kfree(d); + return 0; + } + } + + /* It's not an error to undefine an undefined token */ + return 0; +} + +static int rtas_token_define(struct kvm *kvm, char *name, u64 token) +{ + struct rtas_token_definition *d; + struct rtas_handler *h = NULL; + bool found; + int i; + + lockdep_assert_held(&kvm->lock); + + list_for_each_entry(d, &kvm->arch.rtas_tokens, list) { + if (d->token == token) + return -EEXIST; + } + + found = false; + for (i = 0; i < ARRAY_SIZE(rtas_handlers); i++) { + h = &rtas_handlers[i]; + if (rtas_name_matches(h->name, name)) { + found = true; + break; + } + } + + if (!found) + return -ENOENT; + + d = kzalloc(sizeof(*d), GFP_KERNEL); + if (!d) + return -ENOMEM; + + d->handler = h; + d->token = token; + + list_add_tail(&d->list, &kvm->arch.rtas_tokens); + + return 0; +} + +int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp) +{ + struct kvm_rtas_token_args args; + int rc; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + mutex_lock(&kvm->lock); + + if (args.token) + rc = rtas_token_define(kvm, args.name, args.token); + else + rc = rtas_token_undefine(kvm, args.name); + + mutex_unlock(&kvm->lock); + + return rc; +} + +int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu) +{ + struct rtas_token_definition *d; + struct rtas_args args; + rtas_arg_t *orig_rets; + gpa_t args_phys; + int rc; + + /* r4 contains the guest physical address of the RTAS args */ + args_phys = kvmppc_get_gpr(vcpu, 4); + + rc = kvm_read_guest(vcpu->kvm, args_phys, &args, sizeof(args)); + if (rc) + goto fail; + + /* + * args->rets is a pointer into args->args. Now that we've + * copied args we need to fix it up to point into our copy, + * not the guest args. We also need to save the original + * value so we can restore it on the way out. + */ + orig_rets = args.rets; + args.rets = &args.args[args.nargs]; + + mutex_lock(&vcpu->kvm->lock); + + rc = -ENOENT; + list_for_each_entry(d, &vcpu->kvm->arch.rtas_tokens, list) { + if (d->token == args.token) { + d->handler->handler(vcpu, &args); + rc = 0; + break; + } + } + + mutex_unlock(&vcpu->kvm->lock); + + if (rc == 0) { + args.rets = orig_rets; + rc = kvm_write_guest(vcpu->kvm, args_phys, &args, sizeof(args)); + if (rc) + goto fail; + } + + return rc; + +fail: + /* + * We only get here if the guest has called RTAS with a bogus + * args pointer. That means we can't get to the args, and so we + * can't fail the RTAS call. So fail right out to userspace, + * which should kill the guest. 
+ */ + return rc; +} + +void kvmppc_rtas_tokens_free(struct kvm *kvm) +{ + struct rtas_token_definition *d, *tmp; + + lockdep_assert_held(&kvm->lock); + + list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) { + list_del(&d->list); + kfree(d); + } +} diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index d8e81e6c1afe..d4fd443ae7bd 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -341,6 +341,7 @@ int kvm_dev_ioctl_check_extension(long ext) #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CAP_SPAPR_TCE: case KVM_CAP_PPC_ALLOC_HTAB: + case KVM_CAP_PPC_RTAS: r = 1; break; #endif /* CONFIG_PPC_BOOK3S_64 */ @@ -986,6 +987,7 @@ long kvm_arch_vm_ioctl(struct file *filp, #ifdef CONFIG_KVM_BOOK3S_64_HV case KVM_ALLOCATE_RMA: { struct kvm_allocate_rma rma; + struct kvm *kvm = filp->private_data; r = kvm_vm_ioctl_allocate_rma(kvm, &rma); if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma))) @@ -1030,6 +1032,12 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; break; } + case KVM_PPC_RTAS_DEFINE_TOKEN: { + struct kvm *kvm = filp->private_data; + + r = kvm_vm_ioctl_rtas_define_token(kvm, argp); + break; + } #endif /* CONFIG_PPC_BOOK3S_64 */ default: r = -ENOTTY; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 0ebf59b50ed0..d4005192ad6e 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -668,6 +668,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_ARM_SET_DEVICE_ADDR 88 #define KVM_CAP_DEVICE_CTRL 89 #define KVM_CAP_IRQ_MPIC 90 +#define KVM_CAP_PPC_RTAS 91 #ifdef KVM_CAP_IRQ_ROUTING @@ -928,6 +929,8 @@ struct kvm_s390_ucas_mapping { #define KVM_PPC_GET_HTAB_FD _IOW(KVMIO, 0xaa, struct kvm_get_htab_fd) /* Available with KVM_CAP_ARM_SET_DEVICE_ADDR */ #define KVM_ARM_SET_DEVICE_ADDR _IOW(KVMIO, 0xab, struct kvm_arm_device_addr) +/* Available with KVM_CAP_PPC_RTAS */ +#define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO, 0xac, struct kvm_rtas_token_args) /* ioctl for vm fd */ #define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device) From bc5ad3f3701116e7db57268e6f89010ec714697e Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 17 Apr 2013 20:30:26 +0000 Subject: [PATCH 169/204] KVM: PPC: Book3S: Add kernel emulation for the XICS interrupt controller This adds in-kernel emulation of the XICS (eXternal Interrupt Controller Specification) interrupt controller specified by PAPR, for both HV and PR KVM guests. The XICS emulation supports up to 1048560 interrupt sources. Interrupt source numbers below 16 are reserved; 0 is used to mean no interrupt and 2 is used for IPIs. Internally these are represented in blocks of 1024, called ICS (interrupt controller source) entities, but that is not visible to userspace. Each vcpu gets one ICP (interrupt controller presentation) entity, used to store the per-vcpu state such as vcpu priority, pending interrupt state, IPI request, etc. This does not include any API or any way to connect vcpus to their ICP state; that will be added in later patches. This is based on an initial implementation by Michael Ellerman reworked by Benjamin Herrenschmidt and Paul Mackerras. 
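The two-level source numbering described above is a simple shift-and-mask split. A small sketch mirroring the KVMPPC_XICS_ICS_SHIFT constants that this patch adds in book3s_xics.h (the irq value is an arbitrary example):

	#include <stdint.h>
	#include <stdio.h>

	#define ICS_SHIFT	10			/* KVMPPC_XICS_ICS_SHIFT */
	#define SRC_MASK	((1u << ICS_SHIFT) - 1)	/* 1023 */

	int main(void)
	{
		uint32_t irq = 0x1234;			/* global source number */
		uint32_t icsid = irq >> ICS_SHIFT;	/* which ICS block */
		uint32_t src = irq & SRC_MASK;		/* index within that ICS */

		printf("irq %#x -> ICS %u, source %u\n", irq, icsid, src);
		/* 1024 ICS blocks x 1024 sources, minus the 16 reserved
		 * numbers, gives the 1048560 usable sources quoted in the
		 * commit message. */
		return 0;
	}
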
Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras [agraf: fix typo, add dependency on !KVM_MPIC] Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_book3s.h | 2 + arch/powerpc/include/asm/kvm_host.h | 11 + arch/powerpc/include/asm/kvm_ppc.h | 29 + arch/powerpc/kvm/Kconfig | 8 + arch/powerpc/kvm/Makefile | 3 + arch/powerpc/kvm/book3s.c | 2 +- arch/powerpc/kvm/book3s_hv.c | 9 + arch/powerpc/kvm/book3s_pr_papr.c | 14 + arch/powerpc/kvm/book3s_rtas.c | 54 +- arch/powerpc/kvm/book3s_xics.c | 946 ++++++++++++++++++++++++++ arch/powerpc/kvm/book3s_xics.h | 113 +++ arch/powerpc/kvm/powerpc.c | 3 + 12 files changed, 1192 insertions(+), 2 deletions(-) create mode 100644 arch/powerpc/kvm/book3s_xics.c create mode 100644 arch/powerpc/kvm/book3s_xics.h diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index c55f7e6affaa..349ed85c7d61 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -142,6 +142,8 @@ extern int kvmppc_mmu_hv_init(void); extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec); +extern void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu, + unsigned int vec); extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags); extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, bool upper, u32 val); diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 311f7e6f09e9..af326cde7cb6 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -192,6 +192,10 @@ struct kvmppc_linear_info { int type; }; +/* XICS components, defined in book3s_xics.c */ +struct kvmppc_xics; +struct kvmppc_icp; + /* * The reverse mapping array has one entry for each HPTE, * which stores the guest's view of the second word of the HPTE @@ -264,6 +268,9 @@ struct kvm_arch { #ifdef CONFIG_KVM_MPIC struct openpic *mpic; #endif +#ifdef CONFIG_KVM_XICS + struct kvmppc_xics *xics; +#endif }; /* @@ -387,6 +394,7 @@ struct kvmppc_booke_debug_reg { #define KVMPPC_IRQ_DEFAULT 0 #define KVMPPC_IRQ_MPIC 1 +#define KVMPPC_IRQ_XICS 2 struct openpic; @@ -574,6 +582,9 @@ struct kvm_vcpu_arch { int irq_type; /* one of KVM_IRQ_* */ int irq_cpu_id; struct openpic *mpic; /* KVM_IRQ_MPIC */ +#ifdef CONFIG_KVM_XICS + struct kvmppc_icp *icp; /* XICS presentation controller */ +#endif #ifdef CONFIG_KVM_BOOK3S_64_HV struct kvm_vcpu_arch_shared shregs; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 8a30eb7f2bec..6582eed321ba 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -130,6 +130,7 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm, extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, unsigned long porder); extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); + extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args); extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, @@ -169,6 +170,10 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq); extern int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp); extern int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu); extern void 
kvmppc_rtas_tokens_free(struct kvm *kvm); +extern int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, + u32 priority); +extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, + u32 *priority); /* * Cuts out inst bits with ordering according to spec. @@ -267,6 +272,30 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) static inline void kvm_linear_init(void) {} + +#endif + +#ifdef CONFIG_KVM_XICS +static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.irq_type == KVMPPC_IRQ_XICS; +} +extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu); +extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server); +extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args); +extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd); +#else +static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) + { return 0; } +static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { } +static inline int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, + unsigned long server) + { return -EINVAL; } +static inline int kvm_vm_ioctl_xics_irq(struct kvm *kvm, + struct kvm_irq_level *args) + { return -ENOTTY; } +static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) + { return 0; } #endif static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr) diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 656e0bc29fe8..eb643f862579 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -163,6 +163,14 @@ config KVM_MPIC Currently, support is limited to certain versions of Freescale's MPIC implementation. +config KVM_XICS + bool "KVM in-kernel XICS emulation" + depends on KVM_BOOK3S_64 && !KVM_MPIC + ---help--- + Include support for the XICS (eXternal Interrupt Controller + Specification) interrupt controller architecture used on + IBM POWER (pSeries) servers. 
+ source drivers/vhost/Kconfig endif # VIRTUALIZATION diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 3faf5c07329c..f9b87b540450 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -79,6 +79,9 @@ kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ book3s_hv_ras.o \ book3s_hv_builtin.o +kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \ + book3s_xics.o + kvm-book3s_64-module-objs := \ ../../../virt/kvm/kvm_main.o \ ../../../virt/kvm/eventfd.o \ diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 128ed3a856b9..1a4d787df507 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -104,7 +104,7 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec) return prio; } -static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu, +void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) { unsigned long old_pending = vcpu->arch.pending_exceptions; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index f3d7af7981c7..82ba00f68b07 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -532,6 +532,15 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) /* Send the error out to userspace via KVM_RUN */ return rc; + + case H_XIRR: + case H_CPPR: + case H_EOI: + case H_IPI: + if (kvmppc_xics_enabled(vcpu)) { + ret = kvmppc_xics_hcall(vcpu, req); + break; + } /* fallthrough */ default: return RESUME_HOST; } diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index 4efa4a4f3722..b24309c6c2d5 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -227,6 +227,13 @@ static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu) return EMULATE_DONE; } +static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) +{ + long rc = kvmppc_xics_hcall(vcpu, cmd); + kvmppc_set_gpr(vcpu, 3, rc); + return EMULATE_DONE; +} + int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) { switch (cmd) { @@ -246,6 +253,13 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) clear_bit(KVM_REQ_UNHALT, &vcpu->requests); vcpu->stat.halt_wakeup++; return EMULATE_DONE; + case H_XIRR: + case H_CPPR: + case H_EOI: + case H_IPI: + if (kvmppc_xics_enabled(vcpu)) + return kvmppc_h_pr_xics_hcall(vcpu, cmd); + break; case H_RTAS: if (list_empty(&vcpu->kvm->arch.rtas_tokens)) return RESUME_HOST; diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c index 6ad7050eb67d..77f9aa5f4ba5 100644 --- a/arch/powerpc/kvm/book3s_rtas.c +++ b/arch/powerpc/kvm/book3s_rtas.c @@ -17,13 +17,65 @@ #include #include +#ifdef CONFIG_KVM_XICS +static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) +{ + u32 irq, server, priority; + int rc; + + if (args->nargs != 3 || args->nret != 1) { + rc = -3; + goto out; + } + + irq = args->args[0]; + server = args->args[1]; + priority = args->args[2]; + + rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority); + if (rc) + rc = -3; +out: + args->rets[0] = rc; +} + +static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) +{ + u32 irq, server, priority; + int rc; + + if (args->nargs != 1 || args->nret != 3) { + rc = -3; + goto out; + } + + irq = args->args[0]; + + server = priority = 0; + rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority); + if (rc) { + rc = -3; + goto out; + } + + args->rets[1] = server; + args->rets[2] = priority; +out: + args->rets[0] = rc; +} +#endif /* CONFIG_KVM_XICS */ struct rtas_handler { void 
(*handler)(struct kvm_vcpu *vcpu, struct rtas_args *args);
 	char *name;
 };
 
-static struct rtas_handler rtas_handlers[] = { };
+static struct rtas_handler rtas_handlers[] = {
+#ifdef CONFIG_KVM_XICS
+	{ .name = "ibm,set-xive", .handler = kvm_rtas_set_xive },
+	{ .name = "ibm,get-xive", .handler = kvm_rtas_get_xive },
+#endif
+};
 
 struct rtas_token_definition {
 	struct list_head list;
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
new file mode 100644
index 000000000000..53af848116f2
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -0,0 +1,946 @@
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "book3s_xics.h"
+
+#if 1
+#define XICS_DBG(fmt...) do { } while (0)
+#else
+#define XICS_DBG(fmt...) trace_printk(fmt)
+#endif
+
+/*
+ * LOCKING
+ * =======
+ *
+ * Each ICS has a mutex protecting the information about the IRQ
+ * sources and avoiding simultaneous deliveries of the same interrupt.
+ *
+ * ICP operations are done via a single compare & swap transaction
+ * (most ICP state fits in the union kvmppc_icp_state)
+ */
+
+/*
+ * TODO
+ * ====
+ *
+ * - To speed up resends, keep a bitmap of "resend" set bits in the
+ *   ICS
+ *
+ * - Speed up server# -> ICP lookup (array ? hash table ?)
+ *
+ * - Make ICS lockless as well, or at least a per-interrupt lock or hashed
+ *   locks array to improve scalability
+ *
+ * - ioctl's to save/restore the entire state for snapshot & migration
+ */
+
+/* -- ICS routines -- */
+
+static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			    u32 new_irq);
+
+static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
+{
+	struct ics_irq_state *state;
+	struct kvmppc_ics *ics;
+	u16 src;
+
+	XICS_DBG("ics deliver %#x (level: %d)\n", irq, level);
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics) {
+		XICS_DBG("ics_deliver_irq: IRQ 0x%06x not found !\n", irq);
+		return -EINVAL;
+	}
+	state = &ics->irq_state[src];
+	if (!state->exists)
+		return -EINVAL;
+
+	/*
+	 * We set state->asserted locklessly. This should be fine as
+	 * we are the only setter, thus concurrent access is undefined
+	 * to begin with.
+	 */
+	if (level == KVM_INTERRUPT_SET_LEVEL)
+		state->asserted = 1;
+	else if (level == KVM_INTERRUPT_UNSET) {
+		state->asserted = 0;
+		return 0;
+	}
+
+	/* Attempt delivery */
+	icp_deliver_irq(xics, NULL, irq);
+
+	return 0;
+}
+
+static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
+			     struct kvmppc_icp *icp)
+{
+	int i;
+
+	mutex_lock(&ics->lock);
+
+	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+		struct ics_irq_state *state = &ics->irq_state[i];
+
+		if (!state->resend)
+			continue;
+
+		XICS_DBG("resend %#x prio %#x\n", state->number,
+			 state->priority);
+
+		mutex_unlock(&ics->lock);
+		icp_deliver_irq(xics, icp, state->number);
+		mutex_lock(&ics->lock);
+	}
+
+	mutex_unlock(&ics->lock);
+}
+
+int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_icp *icp;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u16 src;
+	bool deliver;
+
+	if (!xics)
+		return -ENODEV;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		return -EINVAL;
+	state = &ics->irq_state[src];
+
+	icp = kvmppc_xics_find_server(kvm, server);
+	if (!icp)
+		return -EINVAL;
+
+	mutex_lock(&ics->lock);
+
+	XICS_DBG("set_xive %#x server %#x prio %#x MP:%d RS:%d\n",
+		 irq, server, priority,
+		 state->masked_pending, state->resend);
+
+	state->server = server;
+	state->priority = priority;
+	deliver = false;
+	if ((state->masked_pending || state->resend) && priority != MASKED) {
+		state->masked_pending = 0;
+		deliver = true;
+	}
+
+	mutex_unlock(&ics->lock);
+
+	if (deliver)
+		icp_deliver_irq(xics, icp, irq);
+
+	return 0;
+}
+
+int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u16 src;
+
+	if (!xics)
+		return -ENODEV;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		return -EINVAL;
+	state = &ics->irq_state[src];
+
+	mutex_lock(&ics->lock);
+	*server = state->server;
+	*priority = state->priority;
+	mutex_unlock(&ics->lock);
+
+	return 0;
+}
+
+/* -- ICP routines, including hcalls -- */
+
+static inline bool icp_try_update(struct kvmppc_icp *icp,
+				  union kvmppc_icp_state old,
+				  union kvmppc_icp_state new,
+				  bool change_self)
+{
+	bool success;
+
+	/* Calculate new output value */
+	new.out_ee = (new.xisr && (new.pending_pri < new.cppr));
+
+	/* Attempt atomic update */
+	success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw;
+	if (!success)
+		goto bail;
+
+	XICS_DBG("UPD [%04x] - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
+		 icp->server_num,
+		 old.cppr, old.mfrr, old.pending_pri, old.xisr,
+		 old.need_resend, old.out_ee);
+	XICS_DBG("UPD - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
+		 new.cppr, new.mfrr, new.pending_pri, new.xisr,
+		 new.need_resend, new.out_ee);
+	/*
+	 * Check for output state update
+	 *
+	 * Note that this is racy since another processor could be updating
+	 * the state already. This is why we never clear the interrupt output
+	 * here, we only ever set it. The clear only happens prior to doing
+	 * an update and only by the processor itself. Currently we do it
+	 * in Accept (H_XIRR) and Up_Cppr (H_CPPR).
+	 *
+	 * We also do not try to figure out whether the EE state has changed,
+	 * we unconditionally set it if the new state calls for it for the
+	 * same reason.
+	 */
+	if (new.out_ee) {
+		kvmppc_book3s_queue_irqprio(icp->vcpu,
+					    BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+		if (!change_self)
+			kvm_vcpu_kick(icp->vcpu);
+	}
+ bail:
+	return success;
+}
+
+static void icp_check_resend(struct kvmppc_xics *xics,
+			     struct kvmppc_icp *icp)
+{
+	u32 icsid;
+
+	/* Order this load with the test for need_resend in the caller */
+	smp_rmb();
+	for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) {
+		struct kvmppc_ics *ics = xics->ics[icsid];
+
+		if (!test_and_clear_bit(icsid, icp->resend_map))
+			continue;
+		if (!ics)
+			continue;
+		ics_check_resend(xics, ics, icp);
+	}
+}
+
+static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
+			       u32 *reject)
+{
+	union kvmppc_icp_state old_state, new_state;
+	bool success;
+
+	XICS_DBG("try deliver %#x(P:%#x) to server %#x\n", irq, priority,
+		 icp->server_num);
+
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		*reject = 0;
+
+		/* See if we can deliver */
+		success = new_state.cppr > priority &&
+			new_state.mfrr > priority &&
+			new_state.pending_pri > priority;
+
+		/*
+		 * If we can, check for a rejection and perform the
+		 * delivery
+		 */
+		if (success) {
+			*reject = new_state.xisr;
+			new_state.xisr = irq;
+			new_state.pending_pri = priority;
+		} else {
+			/*
+			 * If we failed to deliver we set need_resend
+			 * so a subsequent CPPR state change causes us
+			 * to try a new delivery.
+			 */
+			new_state.need_resend = true;
+		}
+
+	} while (!icp_try_update(icp, old_state, new_state, false));
+
+	return success;
+}
+
+static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			    u32 new_irq)
+{
+	struct ics_irq_state *state;
+	struct kvmppc_ics *ics;
+	u32 reject;
+	u16 src;
+
+	/*
+	 * This is used both for initial delivery of an interrupt and
+	 * for subsequent rejection.
+	 *
+	 * Rejection can be racy vs. resends. We have evaluated the
+	 * rejection in an atomic ICP transaction which is now complete,
+	 * so potentially the ICP can already accept the interrupt again.
+	 *
+	 * So we need to retry the delivery. Essentially the reject path
+	 * boils down to a failed delivery. Always.
+	 *
+	 * Now the interrupt could also have moved to a different target,
+	 * thus we may need to re-do the ICP lookup as well
+	 */
+
+ again:
+	/* Get the ICS state and lock it */
+	ics = kvmppc_xics_find_ics(xics, new_irq, &src);
+	if (!ics) {
+		XICS_DBG("icp_deliver_irq: IRQ 0x%06x not found !\n", new_irq);
+		return;
+	}
+	state = &ics->irq_state[src];
+
+	/* Get a lock on the ICS */
+	mutex_lock(&ics->lock);
+
+	/* Get our server */
+	if (!icp || state->server != icp->server_num) {
+		icp = kvmppc_xics_find_server(xics->kvm, state->server);
+		if (!icp) {
+			pr_warn("icp_deliver_irq: IRQ 0x%06x server 0x%x not found !\n",
+				new_irq, state->server);
+			goto out;
+		}
+	}
+
+	/* Clear the resend bit of that interrupt */
+	state->resend = 0;
+
+	/*
+	 * If masked, bail out
+	 *
+	 * Note: PAPR doesn't mention anything about masked pending
+	 * when doing a resend, only when doing a delivery.
+	 *
+	 * However that would have the effect of losing a masked
+	 * interrupt that was rejected and isn't consistent with
+	 * the whole masked_pending business which is about not
+	 * losing interrupts that occur while masked.
+	 *
+	 * I don't differentiate normal deliveries and resends, this
+	 * implementation will differ from PAPR and not lose such
+	 * interrupts.
+	 */
+	if (state->priority == MASKED) {
+		XICS_DBG("irq %#x masked pending\n", new_irq);
+		state->masked_pending = 1;
+		goto out;
+	}
+
+	/*
+	 * Try the delivery, this will set the need_resend flag
+	 * in the ICP as part of the atomic transaction if the
+	 * delivery is not possible.
+	 *
+	 * Note that if successful, the new delivery might have itself
+	 * rejected an interrupt that was "delivered" before we took the
+	 * icp mutex.
+	 *
+	 * In this case we do the whole sequence all over again for the
+	 * new guy. We cannot assume that the rejected interrupt is less
+	 * favored than the new one, and thus doesn't need to be delivered,
+	 * because by the time we exit icp_try_to_deliver() the target
+	 * processor may well have already consumed & completed it, and thus
+	 * the rejected interrupt might actually be already acceptable.
+	 */
+	if (icp_try_to_deliver(icp, new_irq, state->priority, &reject)) {
+		/*
+		 * Delivery was successful, did we reject somebody else ?
+		 */
+		if (reject && reject != XICS_IPI) {
+			mutex_unlock(&ics->lock);
+			new_irq = reject;
+			goto again;
+		}
+	} else {
+		/*
+		 * We failed to deliver the interrupt we need to set the
+		 * resend map bit and mark the ICS state as needing a resend
+		 */
+		set_bit(ics->icsid, icp->resend_map);
+		state->resend = 1;
+
+		/*
+		 * If the need_resend flag got cleared in the ICP some time
+		 * between icp_try_to_deliver() atomic update and now, then
+		 * we know it might have missed the resend_map bit. So we
+		 * retry
+		 */
+		smp_mb();
+		if (!icp->state.need_resend) {
+			mutex_unlock(&ics->lock);
+			goto again;
+		}
+	}
+ out:
+	mutex_unlock(&ics->lock);
+}
+
+static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			  u8 new_cppr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	bool resend;
+
+	/*
+	 * This handles several related states in one operation:
+	 *
+	 * ICP State: Down_CPPR
+	 *
+	 * Load CPPR with new value and if the XISR is 0
+	 * then check for resends:
+	 *
+	 * ICP State: Resend
+	 *
+	 * If MFRR is more favored than CPPR, check for IPIs
+	 * and notify ICS of a potential resend. This is done
+	 * asynchronously (when used in real mode, we will have
+	 * to exit here).
+	 *
+	 * We do not handle the complete Check_IPI as documented
+	 * here. In the PAPR, this state will be used for both
+	 * Set_MFRR and Down_CPPR. However, we know that we aren't
+	 * changing the MFRR state here so we don't need to handle
+	 * the case of an MFRR causing a reject of a pending irq,
+	 * this will have been handled when the MFRR was set in the
+	 * first place.
+	 *
+	 * Thus we don't have to handle rejects, only resends.
+	 *
+	 * When implementing real mode for HV KVM, resend will lead to
+	 * a H_TOO_HARD return and the whole transaction will be handled
+	 * in virtual mode.
+ */ + do { + old_state = new_state = ACCESS_ONCE(icp->state); + + /* Down_CPPR */ + new_state.cppr = new_cppr; + + /* + * Cut down Resend / Check_IPI / IPI + * + * The logic is that we cannot have a pending interrupt + * trumped by an IPI at this point (see above), so we + * know that either the pending interrupt is already an + * IPI (in which case we don't care to override it) or + * it's either more favored than us or non existent + */ + if (new_state.mfrr < new_cppr && + new_state.mfrr <= new_state.pending_pri) { + WARN_ON(new_state.xisr != XICS_IPI && + new_state.xisr != 0); + new_state.pending_pri = new_state.mfrr; + new_state.xisr = XICS_IPI; + } + + /* Latch/clear resend bit */ + resend = new_state.need_resend; + new_state.need_resend = 0; + + } while (!icp_try_update(icp, old_state, new_state, true)); + + /* + * Now handle resend checks. Those are asynchronous to the ICP + * state update in HW (ie bus transactions) so we can handle them + * separately here too + */ + if (resend) + icp_check_resend(xics, icp); +} + +static noinline unsigned long h_xirr(struct kvm_vcpu *vcpu) +{ + union kvmppc_icp_state old_state, new_state; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 xirr; + + /* First, remove EE from the processor */ + kvmppc_book3s_dequeue_irqprio(icp->vcpu, + BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + + /* + * ICP State: Accept_Interrupt + * + * Return the pending interrupt (if any) along with the + * current CPPR, then clear the XISR & set CPPR to the + * pending priority + */ + do { + old_state = new_state = ACCESS_ONCE(icp->state); + + xirr = old_state.xisr | (((u32)old_state.cppr) << 24); + if (!old_state.xisr) + break; + new_state.cppr = new_state.pending_pri; + new_state.pending_pri = 0xff; + new_state.xisr = 0; + + } while (!icp_try_update(icp, old_state, new_state, true)); + + XICS_DBG("h_xirr vcpu %d xirr %#x\n", vcpu->vcpu_id, xirr); + + return xirr; +} + +static noinline int h_ipi(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr) +{ + union kvmppc_icp_state old_state, new_state; + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp; + u32 reject; + bool resend; + bool local; + + XICS_DBG("h_ipi vcpu %d to server %lu mfrr %#lx\n", + vcpu->vcpu_id, server, mfrr); + + icp = vcpu->arch.icp; + local = icp->server_num == server; + if (!local) { + icp = kvmppc_xics_find_server(vcpu->kvm, server); + if (!icp) + return H_PARAMETER; + } + + /* + * ICP state: Set_MFRR + * + * If the CPPR is more favored than the new MFRR, then + * nothing needs to be rejected as there can be no XISR to + * reject. If the MFRR is being made less favored then + * there might be a previously-rejected interrupt needing + * to be resent. 
+	 *
+	 * If the CPPR is less favored, then we might be replacing
+	 * an interrupt, and thus need to possibly reject it as in
+	 *
+	 * ICP state: Check_IPI
+	 */
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		/* Set_MFRR */
+		new_state.mfrr = mfrr;
+
+		/* Check_IPI */
+		reject = 0;
+		resend = false;
+		if (mfrr < new_state.cppr) {
+			/* Reject a pending interrupt if not an IPI */
+			if (mfrr <= new_state.pending_pri)
+				reject = new_state.xisr;
+			new_state.pending_pri = mfrr;
+			new_state.xisr = XICS_IPI;
+		}
+
+		if (mfrr > old_state.mfrr && mfrr > new_state.cppr) {
+			resend = new_state.need_resend;
+			new_state.need_resend = 0;
+		}
+	} while (!icp_try_update(icp, old_state, new_state, local));
+
+	/* Handle reject */
+	if (reject && reject != XICS_IPI)
+		icp_deliver_irq(xics, icp, reject);
+
+	/* Handle resend */
+	if (resend)
+		icp_check_resend(xics, icp);
+
+	return H_SUCCESS;
+}
+
+static noinline void h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 reject;
+
+	XICS_DBG("h_cppr vcpu %d cppr %#lx\n", vcpu->vcpu_id, cppr);
+
+	/*
+	 * ICP State: Set_CPPR
+	 *
+	 * We can safely compare the new value with the current
+	 * value outside of the transaction as the CPPR is only
+	 * ever changed by the processor on itself
+	 */
+	if (cppr > icp->state.cppr)
+		icp_down_cppr(xics, icp, cppr);
+	else if (cppr == icp->state.cppr)
+		return;
+
+	/*
+	 * ICP State: Up_CPPR
+	 *
+	 * The processor is raising its priority, this can result
+	 * in a rejection of a pending interrupt:
+	 *
+	 * ICP State: Reject_Current
+	 *
+	 * We can remove EE from the current processor, the update
+	 * transaction will set it again if needed
+	 */
+	kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+	do {
+		old_state = new_state = ACCESS_ONCE(icp->state);
+
+		reject = 0;
+		new_state.cppr = cppr;
+
+		if (cppr <= new_state.pending_pri) {
+			reject = new_state.xisr;
+			new_state.xisr = 0;
+			new_state.pending_pri = 0xff;
+		}
+
+	} while (!icp_try_update(icp, old_state, new_state, true));
+
+	/*
+	 * Check for rejects. They are handled by doing a new delivery
+	 * attempt (see comments in icp_deliver_irq).
+	 */
+	if (reject && reject != XICS_IPI)
+		icp_deliver_irq(xics, icp, reject);
+}
+
+static noinline int h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u32 irq = xirr & 0x00ffffff;
+	u16 src;
+
+	XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr);
+
+	/*
+	 * ICP State: EOI
+	 *
+	 * Note: If EOI is incorrectly used by SW to lower the CPPR
+	 * value (ie more favored), we do not check for rejection of
+	 * a pending interrupt, this is a SW error and PAPR specifies
+	 * that we don't have to deal with it.
+	 *
+	 * The sending of an EOI to the ICS is handled after the
+	 * CPPR update
+	 *
+	 * ICP State: Down_CPPR which we handle
+	 * in a separate function as it's shared with H_CPPR.
+	 */
+	icp_down_cppr(xics, icp, xirr >> 24);
+
+	/* IPIs have no EOI */
+	if (irq == XICS_IPI)
+		return H_SUCCESS;
+	/*
+	 * EOI handling: If the interrupt is still asserted, we need to
+	 * resend it. We can take a lockless "peek" at the ICS state here.
+ * + * "Message" interrupts will never have "asserted" set + */ + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) { + XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq); + return H_PARAMETER; + } + state = &ics->irq_state[src]; + + /* Still asserted, resend it */ + if (state->asserted) + icp_deliver_irq(xics, icp, irq); + + return H_SUCCESS; +} + +int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req) +{ + unsigned long res; + int rc = H_SUCCESS; + + /* Check if we have an ICP */ + if (!vcpu->arch.icp || !vcpu->kvm->arch.xics) + return H_HARDWARE; + + switch (req) { + case H_XIRR: + res = h_xirr(vcpu); + kvmppc_set_gpr(vcpu, 4, res); + break; + case H_CPPR: + h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4)); + break; + case H_EOI: + rc = h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4)); + break; + case H_IPI: + rc = h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5)); + break; + } + + return rc; +} + + +/* -- Initialisation code etc. -- */ + +static int xics_debug_show(struct seq_file *m, void *private) +{ + struct kvmppc_xics *xics = m->private; + struct kvm *kvm = xics->kvm; + struct kvm_vcpu *vcpu; + int icsid, i; + + if (!kvm) + return 0; + + seq_printf(m, "=========\nICP state\n=========\n"); + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct kvmppc_icp *icp = vcpu->arch.icp; + union kvmppc_icp_state state; + + if (!icp) + continue; + + state.raw = ACCESS_ONCE(icp->state.raw); + seq_printf(m, "cpu server %#lx XIRR:%#x PPRI:%#x CPPR:%#x MFRR:%#x OUT:%d NR:%d\n", + icp->server_num, state.xisr, + state.pending_pri, state.cppr, state.mfrr, + state.out_ee, state.need_resend); + } + + for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) { + struct kvmppc_ics *ics = xics->ics[icsid]; + + if (!ics) + continue; + + seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n", + icsid); + + mutex_lock(&ics->lock); + + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { + struct ics_irq_state *irq = &ics->irq_state[i]; + + seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x asserted %d resend %d masked pending %d\n", + irq->number, irq->server, irq->priority, + irq->saved_priority, irq->asserted, + irq->resend, irq->masked_pending); + + } + mutex_unlock(&ics->lock); + } + return 0; +} + +static int xics_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, xics_debug_show, inode->i_private); +} + +static const struct file_operations xics_debug_fops = { + .open = xics_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void xics_debugfs_init(struct kvmppc_xics *xics) +{ + char *name; + + name = kasprintf(GFP_KERNEL, "kvm-xics-%p", xics); + if (!name) { + pr_err("%s: no memory for name\n", __func__); + return; + } + + xics->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root, + xics, &xics_debug_fops); + + pr_debug("%s: created %s\n", __func__, name); + kfree(name); +} + +struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm, + struct kvmppc_xics *xics, int irq) +{ + struct kvmppc_ics *ics; + int i, icsid; + + icsid = irq >> KVMPPC_XICS_ICS_SHIFT; + + mutex_lock(&kvm->lock); + + /* ICS already exists - somebody else got here first */ + if (xics->ics[icsid]) + goto out; + + /* Create the ICS */ + ics = kzalloc(sizeof(struct kvmppc_ics), GFP_KERNEL); + if (!ics) + goto out; + + mutex_init(&ics->lock); + ics->icsid = icsid; + + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { + ics->irq_state[i].number = (icsid << KVMPPC_XICS_ICS_SHIFT) | i; + ics->irq_state[i].priority = MASKED; + 
ics->irq_state[i].saved_priority = MASKED; + } + smp_wmb(); + xics->ics[icsid] = ics; + + if (icsid > xics->max_icsid) + xics->max_icsid = icsid; + + out: + mutex_unlock(&kvm->lock); + return xics->ics[icsid]; +} + +int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num) +{ + struct kvmppc_icp *icp; + + if (!vcpu->kvm->arch.xics) + return -ENODEV; + + if (kvmppc_xics_find_server(vcpu->kvm, server_num)) + return -EEXIST; + + icp = kzalloc(sizeof(struct kvmppc_icp), GFP_KERNEL); + if (!icp) + return -ENOMEM; + + icp->vcpu = vcpu; + icp->server_num = server_num; + icp->state.mfrr = MASKED; + icp->state.pending_pri = MASKED; + vcpu->arch.icp = icp; + + XICS_DBG("created server for vcpu %d\n", vcpu->vcpu_id); + + return 0; +} + +/* -- ioctls -- */ + +int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args) +{ + struct kvmppc_xics *xics; + int r; + + /* locking against multiple callers? */ + + xics = kvm->arch.xics; + if (!xics) + return -ENODEV; + + switch (args->level) { + case KVM_INTERRUPT_SET: + case KVM_INTERRUPT_SET_LEVEL: + case KVM_INTERRUPT_UNSET: + r = ics_deliver_irq(xics, args->irq, args->level); + break; + default: + r = -EINVAL; + } + + return r; +} + +void kvmppc_xics_free(struct kvmppc_xics *xics) +{ + int i; + struct kvm *kvm = xics->kvm; + + debugfs_remove(xics->dentry); + + if (kvm) + kvm->arch.xics = NULL; + + for (i = 0; i <= xics->max_icsid; i++) + kfree(xics->ics[i]); + kfree(xics); +} + +int kvm_xics_create(struct kvm *kvm, u32 type) +{ + struct kvmppc_xics *xics; + int ret = 0; + + xics = kzalloc(sizeof(*xics), GFP_KERNEL); + if (!xics) + return -ENOMEM; + + xics->kvm = kvm; + + /* Already there ? */ + mutex_lock(&kvm->lock); + if (kvm->arch.xics) + ret = -EEXIST; + else + kvm->arch.xics = xics; + mutex_unlock(&kvm->lock); + + if (ret) + return ret; + + xics_debugfs_init(xics); + + return 0; +} + +void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) +{ + if (!vcpu->arch.icp) + return; + kfree(vcpu->arch.icp); + vcpu->arch.icp = NULL; + vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT; +} diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h new file mode 100644 index 000000000000..58ee190de5e5 --- /dev/null +++ b/arch/powerpc/kvm/book3s_xics.h @@ -0,0 +1,113 @@ +/* + * Copyright 2012 Michael Ellerman, IBM Corporation. + * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#ifndef _KVM_PPC_BOOK3S_XICS_H +#define _KVM_PPC_BOOK3S_XICS_H + +/* + * We use a two-level tree to store interrupt source information. + * There are up to 1024 ICS nodes, each of which can represent + * 1024 sources. + */ +#define KVMPPC_XICS_MAX_ICS_ID 1023 +#define KVMPPC_XICS_ICS_SHIFT 10 +#define KVMPPC_XICS_IRQ_PER_ICS (1 << KVMPPC_XICS_ICS_SHIFT) +#define KVMPPC_XICS_SRC_MASK (KVMPPC_XICS_IRQ_PER_ICS - 1) + +/* + * Interrupt source numbers below this are reserved, for example + * 0 is "no interrupt", and 2 is used for IPIs. 
+ */ +#define KVMPPC_XICS_FIRST_IRQ 16 +#define KVMPPC_XICS_NR_IRQS ((KVMPPC_XICS_MAX_ICS_ID + 1) * \ + KVMPPC_XICS_IRQ_PER_ICS) + +/* Priority value to use for disabling an interrupt */ +#define MASKED 0xff + +/* State for one irq source */ +struct ics_irq_state { + u32 number; + u32 server; + u8 priority; + u8 saved_priority; /* currently unused */ + u8 resend; + u8 masked_pending; + u8 asserted; /* Only for LSI */ + u8 exists; +}; + +/* Atomic ICP state, updated with a single compare & swap */ +union kvmppc_icp_state { + unsigned long raw; + struct { + u8 out_ee:1; + u8 need_resend:1; + u8 cppr; + u8 mfrr; + u8 pending_pri; + u32 xisr; + }; +}; + +/* One bit per ICS */ +#define ICP_RESEND_MAP_SIZE (KVMPPC_XICS_MAX_ICS_ID / BITS_PER_LONG + 1) + +struct kvmppc_icp { + struct kvm_vcpu *vcpu; + unsigned long server_num; + union kvmppc_icp_state state; + unsigned long resend_map[ICP_RESEND_MAP_SIZE]; +}; + +struct kvmppc_ics { + struct mutex lock; + u16 icsid; + struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS]; +}; + +struct kvmppc_xics { + struct kvm *kvm; + struct dentry *dentry; + u32 max_icsid; + struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1]; +}; + +static inline struct kvmppc_icp *kvmppc_xics_find_server(struct kvm *kvm, + u32 nr) +{ + struct kvm_vcpu *vcpu = NULL; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->arch.icp && nr == vcpu->arch.icp->server_num) + return vcpu->arch.icp; + } + return NULL; +} + +static inline struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics, + u32 irq, u16 *source) +{ + u32 icsid = irq >> KVMPPC_XICS_ICS_SHIFT; + u16 src = irq & KVMPPC_XICS_SRC_MASK; + struct kvmppc_ics *ics; + + if (source) + *source = src; + if (icsid > KVMPPC_XICS_MAX_ICS_ID) + return NULL; + ics = xics->ics[icsid]; + if (!ics) + return NULL; + return ics; +} + + +#endif /* _KVM_PPC_BOOK3S_XICS_H */ diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index d4fd443ae7bd..31084c6335c9 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -471,6 +471,9 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) case KVMPPC_IRQ_MPIC: kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu); break; + case KVMPPC_IRQ_XICS: + kvmppc_xics_free_icp(vcpu); + break; } kvmppc_core_vcpu_free(vcpu); From 54695c3088a74e25474db8eb6b490b45d1aeb0ca Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 17 Apr 2013 20:30:50 +0000 Subject: [PATCH 170/204] KVM: PPC: Book3S HV: Speed up wakeups of CPUs on HV KVM Currently, we wake up a CPU by sending a host IPI with smp_send_reschedule() to thread 0 of that core, which will take all threads out of the guest, and cause them to re-evaluate their interrupt status on the way back in. This adds a mechanism to differentiate real host IPIs from IPIs sent by KVM for guest threads to poke each other, in order to target the guest threads precisely when possible and avoid that global switch of the core to host state. We then use this new facility in the in-kernel XICS code. 
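In outline, the kick path added below prefers a direct poke of the target hardware thread's ICP over a host IPI (a simplified sketch of kvmppc_fast_vcpu_kick(); the hunk in book3s_hv.c below is the authoritative version):

	/* Wake a guest thread without kicking the whole core out of the guest */
	if (paca[real_cpu].kvm_hstate.xics_phys)
		xics_wake_cpu(real_cpu);	/* write IPI_PRIORITY to that thread's MFRR */
	else if (cpu_online(cpu))
		smp_send_reschedule(cpu);	/* fall back to a normal host IPI */

Host-originated IPIs are flagged with kvmppc_set_host_ipi() so that the guest exit path can distinguish them from these KVM-internal pokes.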
Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_book3s_asm.h | 8 +- arch/powerpc/include/asm/kvm_ppc.h | 29 ++++++ arch/powerpc/kernel/asm-offsets.c | 2 + arch/powerpc/kvm/book3s_hv.c | 26 +++++- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 102 ++++++++++++++++++---- arch/powerpc/kvm/book3s_xics.c | 2 +- arch/powerpc/sysdev/xics/icp-native.c | 8 ++ 7 files changed, 158 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index cdc3d2717cc6..9039d3c97eec 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -20,6 +20,11 @@ #ifndef __ASM_KVM_BOOK3S_ASM_H__ #define __ASM_KVM_BOOK3S_ASM_H__ +/* XICS ICP register offsets */ +#define XICS_XIRR 4 +#define XICS_MFRR 0xc +#define XICS_IPI 2 /* interrupt source # for IPIs */ + #ifdef __ASSEMBLY__ #ifdef CONFIG_KVM_BOOK3S_HANDLER @@ -81,10 +86,11 @@ struct kvmppc_host_state { #ifdef CONFIG_KVM_BOOK3S_64_HV u8 hwthread_req; u8 hwthread_state; - + u8 host_ipi; struct kvm_vcpu *kvm_vcpu; struct kvmppc_vcore *kvm_vcore; unsigned long xics_phys; + u32 saved_xirr; u64 dabr; u64 host_mmcr[3]; u32 host_pmc[8]; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 6582eed321ba..1589fd8bf063 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -264,6 +264,21 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) paca[cpu].kvm_hstate.xics_phys = addr; } +static inline u32 kvmppc_get_xics_latch(void) +{ + u32 xirr = get_paca()->kvm_hstate.saved_xirr; + + get_paca()->kvm_hstate.saved_xirr = 0; + + return xirr; +} + +static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi) +{ + paca[cpu].kvm_hstate.host_ipi = host_ipi; +} + +extern void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu); extern void kvm_linear_init(void); #else @@ -273,6 +288,18 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) static inline void kvm_linear_init(void) {} +static inline u32 kvmppc_get_xics_latch(void) +{ + return 0; +} + +static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi) +{} + +static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu) +{ + kvm_vcpu_kick(vcpu); +} #endif #ifdef CONFIG_KVM_XICS @@ -393,4 +420,6 @@ static inline ulong kvmppc_get_ea_indexed(struct kvm_vcpu *vcpu, int ra, int rb) return ea; } +extern void xics_wake_cpu(int cpu); + #endif /* __POWERPC_KVM_PPC_H__ */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index dbfd5498f440..a791229329cf 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -574,6 +574,8 @@ int main(void) HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu); HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore); HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys); + HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr); + HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi); HSTATE_FIELD(HSTATE_MMCR, host_mmcr); HSTATE_FIELD(HSTATE_PMC, host_pmc); HSTATE_FIELD(HSTATE_PURR, host_purr); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 82ba00f68b07..16191915e8d0 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -66,6 +66,31 @@ static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); +void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu) +{ + int me; + int cpu = vcpu->cpu; + wait_queue_head_t 
*wqp; + + wqp = kvm_arch_vcpu_wq(vcpu); + if (waitqueue_active(wqp)) { + wake_up_interruptible(wqp); + ++vcpu->stat.halt_wakeup; + } + + me = get_cpu(); + + /* CPU points to the first thread of the core */ + if (cpu != me && cpu >= 0 && cpu < nr_cpu_ids) { + int real_cpu = cpu + vcpu->arch.ptid; + if (paca[real_cpu].kvm_hstate.xics_phys) + xics_wake_cpu(real_cpu); + else if (cpu_online(cpu)) + smp_send_reschedule(cpu); + } + put_cpu(); +} + /* * We use the vcpu_load/put functions to measure stolen time. * Stolen time is counted as time when either the vcpu is able to @@ -985,7 +1010,6 @@ static void kvmppc_end_cede(struct kvm_vcpu *vcpu) } extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); -extern void xics_wake_cpu(int cpu); static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 0f23bb851711..56f8927b0ddf 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -79,10 +79,6 @@ _GLOBAL(kvmppc_hv_entry_trampoline) * * *****************************************************************************/ -#define XICS_XIRR 4 -#define XICS_QIRR 0xc -#define XICS_IPI 2 /* interrupt source # for IPIs */ - /* * We come in here when wakened from nap mode on a secondary hw thread. * Relocation is off and most register values are lost. @@ -122,7 +118,7 @@ kvm_start_guest: beq 27f 25: ld r5,HSTATE_XICS_PHYS(r13) li r0,0xff - li r6,XICS_QIRR + li r6,XICS_MFRR li r7,XICS_XIRR lwzcix r8,r5,r7 /* get and ack the interrupt */ sync @@ -678,17 +674,91 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) cmpwi r12,BOOK3S_INTERRUPT_SYSCALL beq hcall_try_real_mode - /* Check for mediated interrupts (could be done earlier really ...) */ + /* Only handle external interrupts here on arch 206 and later */ BEGIN_FTR_SECTION - cmpwi r12,BOOK3S_INTERRUPT_EXTERNAL - bne+ 1f - andi. r0,r11,MSR_EE - beq 1f - mfspr r5,SPRN_LPCR - andi. r0,r5,LPCR_MER + b ext_interrupt_to_host +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) + + /* External interrupt ? */ + cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL + bne+ ext_interrupt_to_host + + /* External interrupt, first check for host_ipi. If this is + * set, we know the host wants us out so let's do it now + */ + lbz r0, HSTATE_HOST_IPI(r13) + cmpwi r0, 0 + bne ext_interrupt_to_host + + /* Now read the interrupt from the ICP */ + ld r5, HSTATE_XICS_PHYS(r13) + li r7, XICS_XIRR + cmpdi r5, 0 + beq- ext_interrupt_to_host + lwzcix r3, r5, r7 + rlwinm. r0, r3, 0, 0xffffff + sync + bne 1f + + /* Nothing pending in the ICP, check for mediated interrupts + * and bounce it to the guest + */ + andi. r0, r11, MSR_EE + beq ext_interrupt_to_host /* shouldn't happen ?? */ + mfspr r5, SPRN_LPCR + andi. r0, r5, LPCR_MER bne bounce_ext_interrupt -1: -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + b ext_interrupt_to_host /* shouldn't happen ?? */ + +1: /* We found something in the ICP... + * + * If it's not an IPI, stash it in the PACA and return to + * the host, we don't (yet) handle directing real external + * interrupts directly to the guest + */ + cmpwi r0, XICS_IPI + bne ext_stash_for_host + + /* It's an IPI, clear the MFRR and EOI it */ + li r0, 0xff + li r6, XICS_MFRR + stbcix r0, r5, r6 /* clear the IPI */ + stwcix r3, r5, r7 /* EOI it */ + sync + + /* We need to re-check host IPI now in case it got set in the + * meantime. 
If it's clear, we bounce the interrupt to the + * guest + */ + lbz r0, HSTATE_HOST_IPI(r13) + cmpwi r0, 0 + bne- 1f + + /* Allright, looks like an IPI for the guest, we need to set MER */ + mfspr r8,SPRN_LPCR + ori r8,r8,LPCR_MER + mtspr SPRN_LPCR,r8 + + /* And if the guest EE is set, we can deliver immediately, else + * we return to the guest with MER set + */ + andi. r0, r11, MSR_EE + bne bounce_ext_interrupt + mr r4, r9 + b fast_guest_return + + /* We raced with the host, we need to resend that IPI, bummer */ +1: li r0, IPI_PRIORITY + stbcix r0, r5, r6 /* set the IPI */ + sync + b ext_interrupt_to_host + +ext_stash_for_host: + /* It's not an IPI and it's for the host, stash it in the PACA + * before exit, it will be picked up by the host ICP driver + */ + stw r3, HSTATE_SAVED_XIRR(r13) +ext_interrupt_to_host: guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ /* Save DEC */ @@ -831,7 +901,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) beq 44f ld r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */ li r0,IPI_PRIORITY - li r7,XICS_QIRR + li r7,XICS_MFRR stbcix r0,r7,r8 /* trigger the IPI */ 44: srdi. r3,r3,1 addi r6,r6,PACA_SIZE @@ -1630,7 +1700,7 @@ secondary_nap: beq 37f sync li r0, 0xff - li r6, XICS_QIRR + li r6, XICS_MFRR stbcix r0, r5, r6 /* clear the IPI */ stwcix r3, r5, r7 /* EOI it */ 37: sync diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 53af848116f2..1417e65b6bbd 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -227,7 +227,7 @@ static inline bool icp_try_update(struct kvmppc_icp *icp, kvmppc_book3s_queue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL); if (!change_self) - kvm_vcpu_kick(icp->vcpu); + kvmppc_fast_vcpu_kick(icp->vcpu); } bail: return success; diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c index 48861d3fcd07..20b328bb494d 100644 --- a/arch/powerpc/sysdev/xics/icp-native.c +++ b/arch/powerpc/sysdev/xics/icp-native.c @@ -51,6 +51,12 @@ static struct icp_ipl __iomem *icp_native_regs[NR_CPUS]; static inline unsigned int icp_native_get_xirr(void) { int cpu = smp_processor_id(); + unsigned int xirr; + + /* Handled an interrupt latched by KVM */ + xirr = kvmppc_get_xics_latch(); + if (xirr) + return xirr; return in_be32(&icp_native_regs[cpu]->xirr.word); } @@ -138,6 +144,7 @@ static unsigned int icp_native_get_irq(void) static void icp_native_cause_ipi(int cpu, unsigned long data) { + kvmppc_set_host_ipi(cpu, 1); icp_native_set_qirr(cpu, IPI_PRIORITY); } @@ -151,6 +158,7 @@ static irqreturn_t icp_native_ipi_action(int irq, void *dev_id) { int cpu = smp_processor_id(); + kvmppc_set_host_ipi(cpu, 0); icp_native_set_qirr(cpu, 0xff); return smp_ipi_demux(); From e7d26f285b4be9466c9e393139e1c9cffe4cedfc Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 17 Apr 2013 20:31:15 +0000 Subject: [PATCH 171/204] KVM: PPC: Book3S HV: Add support for real mode ICP in XICS emulation This adds an implementation of the XICS hypercalls in real mode for HV KVM, which allows us to avoid exiting the guest MMU context on all threads for a variety of operations such as fetching a pending interrupt, EOI of messages, IPIs, etc. 
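The real-mode handlers all follow one fallback contract: work that cannot be completed safely in real mode is recorded in icp->rm_action and the hypercall returns H_TOO_HARD, causing it to be redone by the virtual-mode code in book3s_xics.c. Condensed from the new book3s_hv_rm_xics.c added below:

	/* Any deferred work forces a fallback to the virtual-mode handler */
	static inline int check_too_hard(struct kvmppc_xics *xics,
					 struct kvmppc_icp *icp)
	{
		return (xics->real_mode_dbg || icp->rm_action) ?
			H_TOO_HARD : H_SUCCESS;
	}

On that fallback, kvmppc_xics_hcall() runs kvmppc_xics_rm_complete() to process the queued rm_action bits (vcpu kicks, resends, rejects) before returning to the guest.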
Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/kvm/Makefile | 5 +- arch/powerpc/kvm/book3s_hv_rm_xics.c | 406 ++++++++++++++++++++++++ arch/powerpc/kvm/book3s_hv_rmhandlers.S | 18 +- arch/powerpc/kvm/book3s_xics.c | 64 +++- arch/powerpc/kvm/book3s_xics.h | 16 + 5 files changed, 490 insertions(+), 19 deletions(-) create mode 100644 arch/powerpc/kvm/book3s_hv_rm_xics.c diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index f9b87b540450..422de3f4d46c 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -72,12 +72,15 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ book3s_hv.o \ book3s_hv_interrupts.o \ book3s_64_mmu_hv.o +kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \ + book3s_hv_rm_xics.o kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ book3s_hv_rmhandlers.o \ book3s_hv_rm_mmu.o \ book3s_64_vio_hv.o \ book3s_hv_ras.o \ - book3s_hv_builtin.o + book3s_hv_builtin.o \ + $(kvm-book3s_64-builtin-xics-objs-y) kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \ book3s_xics.o diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c new file mode 100644 index 000000000000..b4b0082f761c --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -0,0 +1,406 @@ +/* + * Copyright 2012 Michael Ellerman, IBM Corporation. + * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/kvm_host.h> +#include <linux/err.h> + +#include <asm/kvm_book3s.h> +#include <asm/kvm_ppc.h> +#include <asm/hvcall.h> +#include <asm/xics.h> +#include <asm/debug.h> +#include <asm/synch.h> +#include <asm/ppc-opcode.h> + +#include "book3s_xics.h" + +#define DEBUG_PASSUP + +static inline void rm_writeb(unsigned long paddr, u8 val) +{ + __asm__ __volatile__("sync; stbcix %0,0,%1" + : : "r" (val), "r" (paddr) : "memory"); +} + +static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, + struct kvm_vcpu *this_vcpu) +{ + struct kvmppc_icp *this_icp = this_vcpu->arch.icp; + unsigned long xics_phys; + int cpu; + + /* Mark the target VCPU as having an interrupt pending */ + vcpu->stat.queue_intr++; + set_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); + + /* Kick self ? Just set MER and return */ + if (vcpu == this_vcpu) { + mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_MER); + return; + } + + /* Check if the core is loaded, if not, too hard */ + cpu = vcpu->cpu; + if (cpu < 0 || cpu >= nr_cpu_ids) { + this_icp->rm_action |= XICS_RM_KICK_VCPU; + this_icp->rm_kick_target = vcpu; + return; + } + /* In SMT cpu will always point to thread 0, we adjust it */ + cpu += vcpu->arch.ptid; + + /* Not too hard, then poke the target */ + xics_phys = paca[cpu].kvm_hstate.xics_phys; + rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); +} + +static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu) +{ + /* Note: Only called on self !
*/ + clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, + &vcpu->arch.pending_exceptions); + mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER); +} + +static inline bool icp_rm_try_update(struct kvmppc_icp *icp, + union kvmppc_icp_state old, + union kvmppc_icp_state new) +{ + struct kvm_vcpu *this_vcpu = local_paca->kvm_hstate.kvm_vcpu; + bool success; + + /* Calculate new output value */ + new.out_ee = (new.xisr && (new.pending_pri < new.cppr)); + + /* Attempt atomic update */ + success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw; + if (!success) + goto bail; + + /* + * Check for output state update + * + * Note that this is racy since another processor could be updating + * the state already. This is why we never clear the interrupt output + * here, we only ever set it. The clear only happens prior to doing + * an update and only by the processor itself. Currently we do it + * in Accept (H_XIRR) and Up_Cppr (H_XPPR). + * + * We also do not try to figure out whether the EE state has changed, + * we unconditionally set it if the new state calls for it. The reason + * for that is that we opportunistically remove the pending interrupt + * flag when raising CPPR, so we need to set it back here if an + * interrupt is still pending. + */ + if (new.out_ee) + icp_rm_set_vcpu_irq(icp->vcpu, this_vcpu); + + /* Expose the state change for debug purposes */ + this_vcpu->arch.icp->rm_dbgstate = new; + this_vcpu->arch.icp->rm_dbgtgt = icp->vcpu; + + bail: + return success; +} + +static inline int check_too_hard(struct kvmppc_xics *xics, + struct kvmppc_icp *icp) +{ + return (xics->real_mode_dbg || icp->rm_action) ? H_TOO_HARD : H_SUCCESS; +} + +static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, + u8 new_cppr) +{ + union kvmppc_icp_state old_state, new_state; + bool resend; + + /* + * This handles several related states in one operation: + * + * ICP State: Down_CPPR + * + * Load CPPR with new value and if the XISR is 0 + * then check for resends: + * + * ICP State: Resend + * + * If MFRR is more favored than CPPR, check for IPIs + * and notify ICS of a potential resend. This is done + * asynchronously (when used in real mode, we will have + * to exit here). + * + * We do not handle the complete Check_IPI as documented + * here. In the PAPR, this state will be used for both + * Set_MFRR and Down_CPPR. However, we know that we aren't + * changing the MFRR state here so we don't need to handle + * the case of an MFRR causing a reject of a pending irq, + * this will have been handled when the MFRR was set in the + * first place. + * + * Thus we don't have to handle rejects, only resends. + * + * When implementing real mode for HV KVM, resend will lead to + * a H_TOO_HARD return and the whole transaction will be handled + * in virtual mode. 
+ */ + do { + old_state = new_state = ACCESS_ONCE(icp->state); + + /* Down_CPPR */ + new_state.cppr = new_cppr; + + /* + * Cut down Resend / Check_IPI / IPI + * + * The logic is that we cannot have a pending interrupt + * trumped by an IPI at this point (see above), so we + * know that either the pending interrupt is already an + * IPI (in which case we don't care to override it) or + * it's either more favored than us or non existent + */ + if (new_state.mfrr < new_cppr && + new_state.mfrr <= new_state.pending_pri) { + new_state.pending_pri = new_state.mfrr; + new_state.xisr = XICS_IPI; + } + + /* Latch/clear resend bit */ + resend = new_state.need_resend; + new_state.need_resend = 0; + + } while (!icp_rm_try_update(icp, old_state, new_state)); + + /* + * Now handle resend checks. Those are asynchronous to the ICP + * state update in HW (ie bus transactions) so we can handle them + * separately here as well. + */ + if (resend) + icp->rm_action |= XICS_RM_CHECK_RESEND; +} + + +unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) +{ + union kvmppc_icp_state old_state, new_state; + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 xirr; + + if (!xics || !xics->real_mode) + return H_TOO_HARD; + + /* First clear the interrupt */ + icp_rm_clr_vcpu_irq(icp->vcpu); + + /* + * ICP State: Accept_Interrupt + * + * Return the pending interrupt (if any) along with the + * current CPPR, then clear the XISR & set CPPR to the + * pending priority + */ + do { + old_state = new_state = ACCESS_ONCE(icp->state); + + xirr = old_state.xisr | (((u32)old_state.cppr) << 24); + if (!old_state.xisr) + break; + new_state.cppr = new_state.pending_pri; + new_state.pending_pri = 0xff; + new_state.xisr = 0; + + } while (!icp_rm_try_update(icp, old_state, new_state)); + + /* Return the result in GPR4 */ + vcpu->arch.gpr[4] = xirr; + + return check_too_hard(xics, icp); +} + +int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr) +{ + union kvmppc_icp_state old_state, new_state; + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp, *this_icp = vcpu->arch.icp; + u32 reject; + bool resend; + bool local; + + if (!xics || !xics->real_mode) + return H_TOO_HARD; + + local = this_icp->server_num == server; + if (local) + icp = this_icp; + else + icp = kvmppc_xics_find_server(vcpu->kvm, server); + if (!icp) + return H_PARAMETER; + + /* + * ICP state: Set_MFRR + * + * If the CPPR is more favored than the new MFRR, then + * nothing needs to be done as there can be no XISR to + * reject. 
+ * + * If the CPPR is less favored, then we might be replacing + * an interrupt, and thus need to possibly reject it as in + * + * ICP state: Check_IPI + */ + do { + old_state = new_state = ACCESS_ONCE(icp->state); + + /* Set_MFRR */ + new_state.mfrr = mfrr; + + /* Check_IPI */ + reject = 0; + resend = false; + if (mfrr < new_state.cppr) { + /* Reject a pending interrupt if not an IPI */ + if (mfrr <= new_state.pending_pri) + reject = new_state.xisr; + new_state.pending_pri = mfrr; + new_state.xisr = XICS_IPI; + } + + if (mfrr > old_state.mfrr && mfrr > new_state.cppr) { + resend = new_state.need_resend; + new_state.need_resend = 0; + } + } while (!icp_rm_try_update(icp, old_state, new_state)); + + /* Pass rejects to virtual mode */ + if (reject && reject != XICS_IPI) { + this_icp->rm_action |= XICS_RM_REJECT; + this_icp->rm_reject = reject; + } + + /* Pass resends to virtual mode */ + if (resend) + this_icp->rm_action |= XICS_RM_CHECK_RESEND; + + return check_too_hard(xics, this_icp); +} + +int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) +{ + union kvmppc_icp_state old_state, new_state; + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 reject; + + if (!xics || !xics->real_mode) + return H_TOO_HARD; + + /* + * ICP State: Set_CPPR + * + * We can safely compare the new value with the current + * value outside of the transaction as the CPPR is only + * ever changed by the processor on itself + */ + if (cppr > icp->state.cppr) { + icp_rm_down_cppr(xics, icp, cppr); + goto bail; + } else if (cppr == icp->state.cppr) + return H_SUCCESS; + + /* + * ICP State: Up_CPPR + * + * The processor is raising its priority, this can result + * in a rejection of a pending interrupt: + * + * ICP State: Reject_Current + * + * We can remove EE from the current processor, the update + * transaction will set it again if needed + */ + icp_rm_clr_vcpu_irq(icp->vcpu); + + do { + old_state = new_state = ACCESS_ONCE(icp->state); + + reject = 0; + new_state.cppr = cppr; + + if (cppr <= new_state.pending_pri) { + reject = new_state.xisr; + new_state.xisr = 0; + new_state.pending_pri = 0xff; + } + + } while (!icp_rm_try_update(icp, old_state, new_state)); + + /* Pass rejects to virtual mode */ + if (reject && reject != XICS_IPI) { + icp->rm_action |= XICS_RM_REJECT; + icp->rm_reject = reject; + } + bail: + return check_too_hard(xics, icp); +} + +int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +{ + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + struct kvmppc_ics *ics; + struct ics_irq_state *state; + u32 irq = xirr & 0x00ffffff; + u16 src; + + if (!xics || !xics->real_mode) + return H_TOO_HARD; + + /* + * ICP State: EOI + * + * Note: If EOI is incorrectly used by SW to lower the CPPR + * value (ie more favored), we do not check for rejection of + * a pending interrupt, this is a SW error and PAPR specifies + * that we don't have to deal with it. + * + * The sending of an EOI to the ICS is handled after the + * CPPR update + * + * ICP State: Down_CPPR which we handle + * in a separate function as it's shared with H_CPPR. + */ + icp_rm_down_cppr(xics, icp, xirr >> 24); + + /* IPIs have no EOI */ + if (irq == XICS_IPI) + goto bail; + /* + * EOI handling: If the interrupt is still asserted, we need to + * resend it. We can take a lockless "peek" at the ICS state here.
+ * + * "Message" interrupts will never have "asserted" set + */ + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) + goto bail; + state = &ics->irq_state[src]; + + /* Still asserted, resend it, we make it look like a reject */ + if (state->asserted) { + icp->rm_action |= XICS_RM_REJECT; + icp->rm_reject = irq; + } + bail: + return check_too_hard(xics, icp); +} diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 56f8927b0ddf..fd3b72d5dfe6 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1424,11 +1424,19 @@ hcall_real_table: .long 0 /* 0x58 */ .long 0 /* 0x5c */ .long 0 /* 0x60 */ - .long 0 /* 0x64 */ - .long 0 /* 0x68 */ - .long 0 /* 0x6c */ - .long 0 /* 0x70 */ - .long 0 /* 0x74 */ +#ifdef CONFIG_KVM_XICS + .long .kvmppc_rm_h_eoi - hcall_real_table + .long .kvmppc_rm_h_cppr - hcall_real_table + .long .kvmppc_rm_h_ipi - hcall_real_table + .long 0 /* 0x70 - H_IPOLL */ + .long .kvmppc_rm_h_xirr - hcall_real_table +#else + .long 0 /* 0x64 - H_EOI */ + .long 0 /* 0x68 - H_CPPR */ + .long 0 /* 0x6c - H_IPI */ + .long 0 /* 0x70 - H_IPOLL */ + .long 0 /* 0x74 - H_XIRR */ +#endif .long 0 /* 0x78 */ .long 0 /* 0x7c */ .long 0 /* 0x80 */ diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 1417e65b6bbd..7fd247cbd0d1 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -30,6 +30,9 @@ #define XICS_DBG(fmt...) trace_printk(fmt) #endif +#define ENABLE_REALMODE true +#define DEBUG_REALMODE false + /* * LOCKING * ======= @@ -220,8 +223,10 @@ static inline bool icp_try_update(struct kvmppc_icp *icp, * in Accept (H_XIRR) and Up_Cppr (H_XPPR). * * We also do not try to figure out whether the EE state has changed, - * we unconditionally set it if the new state calls for it for the - * same reason. + * we unconditionally set it if the new state calls for it. The reason + * for that is that we opportunistically remove the pending interrupt + * flag when raising CPPR, so we need to set it back here if an + * interrupt is still pending. 
*/ if (new.out_ee) { kvmppc_book3s_queue_irqprio(icp->vcpu, @@ -483,7 +488,7 @@ static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, icp_check_resend(xics, icp); } -static noinline unsigned long h_xirr(struct kvm_vcpu *vcpu) +static noinline unsigned long kvmppc_h_xirr(struct kvm_vcpu *vcpu) { union kvmppc_icp_state old_state, new_state; struct kvmppc_icp *icp = vcpu->arch.icp; @@ -517,8 +522,8 @@ static noinline unsigned long h_xirr(struct kvm_vcpu *vcpu) return xirr; } -static noinline int h_ipi(struct kvm_vcpu *vcpu, unsigned long server, - unsigned long mfrr) +static noinline int kvmppc_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr) { union kvmppc_icp_state old_state, new_state; struct kvmppc_xics *xics = vcpu->kvm->arch.xics; @@ -586,7 +591,7 @@ static noinline int h_ipi(struct kvm_vcpu *vcpu, unsigned long server, return H_SUCCESS; } -static noinline void h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) +static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) { union kvmppc_icp_state old_state, new_state; struct kvmppc_xics *xics = vcpu->kvm->arch.xics; @@ -643,7 +648,7 @@ static noinline void h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) icp_deliver_irq(xics, icp, reject); } -static noinline int h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) { struct kvmppc_xics *xics = vcpu->kvm->arch.xics; struct kvmppc_icp *icp = vcpu->arch.icp; @@ -693,29 +698,54 @@ static noinline int h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) return H_SUCCESS; } +static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) +{ + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + + XICS_DBG("XICS_RM: H_%x completing, act: %x state: %lx tgt: %p\n", + hcall, icp->rm_action, icp->rm_dbgstate.raw, icp->rm_dbgtgt); + + if (icp->rm_action & XICS_RM_KICK_VCPU) + kvmppc_fast_vcpu_kick(icp->rm_kick_target); + if (icp->rm_action & XICS_RM_CHECK_RESEND) + icp_check_resend(xics, icp); + if (icp->rm_action & XICS_RM_REJECT) + icp_deliver_irq(xics, icp, icp->rm_reject); + + icp->rm_action = 0; + + return H_SUCCESS; +} + int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req) { + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; unsigned long res; int rc = H_SUCCESS; /* Check if we have an ICP */ - if (!vcpu->arch.icp || !vcpu->kvm->arch.xics) + if (!xics || !vcpu->arch.icp) return H_HARDWARE; + /* Check for real mode returning too hard */ + if (xics->real_mode) + return kvmppc_xics_rm_complete(vcpu, req); + switch (req) { case H_XIRR: - res = h_xirr(vcpu); + res = kvmppc_h_xirr(vcpu); kvmppc_set_gpr(vcpu, 4, res); break; case H_CPPR: - h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4)); + kvmppc_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4)); break; case H_EOI: - rc = h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4)); + rc = kvmppc_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4)); break; case H_IPI: - rc = h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4), - kvmppc_get_gpr(vcpu, 5)); + rc = kvmppc_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5)); break; } @@ -933,6 +963,14 @@ int kvm_xics_create(struct kvm *kvm, u32 type) xics_debugfs_init(xics); +#ifdef CONFIG_KVM_BOOK3S_64_HV + if (cpu_has_feature(CPU_FTR_ARCH_206)) { + /* Enable real mode support */ + xics->real_mode = ENABLE_REALMODE; + xics->real_mode_dbg = DEBUG_REALMODE; + } +#endif /* CONFIG_KVM_BOOK3S_64_HV */ + return 0; } diff --git a/arch/powerpc/kvm/book3s_xics.h 
b/arch/powerpc/kvm/book3s_xics.h index 58ee190de5e5..c816c5a49c90 100644 --- a/arch/powerpc/kvm/book3s_xics.h +++ b/arch/powerpc/kvm/book3s_xics.h @@ -64,6 +64,20 @@ struct kvmppc_icp { unsigned long server_num; union kvmppc_icp_state state; unsigned long resend_map[ICP_RESEND_MAP_SIZE]; + + /* Real mode might find something too hard, here's the action + * it might request from virtual mode + */ +#define XICS_RM_KICK_VCPU 0x1 +#define XICS_RM_CHECK_RESEND 0x2 +#define XICS_RM_REJECT 0x4 + u32 rm_action; + struct kvm_vcpu *rm_kick_target; + u32 rm_reject; + + /* Debug stuff for real mode */ + union kvmppc_icp_state rm_dbgstate; + struct kvm_vcpu *rm_dbgtgt; }; struct kvmppc_ics { @@ -76,6 +90,8 @@ struct kvmppc_xics { struct kvm *kvm; struct dentry *dentry; u32 max_icsid; + bool real_mode; + bool real_mode_dbg; struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1]; }; From 4619ac88b72c43c622ef1eae3069de0e6f2cba9d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 17 Apr 2013 20:31:41 +0000 Subject: [PATCH 172/204] KVM: PPC: Book3S HV: Improve real-mode handling of external interrupts This streamlines our handling of external interrupts that come in while we're in the guest. First, when waking up a hardware thread that was napping, we split off the "napping due to H_CEDE" case earlier, and use the code that handles an external interrupt (0x500) in the guest to handle that too. Secondly, the code that handles those external interrupts now checks if any other thread is exiting to the host before bouncing an external interrupt to the guest, and also checks that there is actually an external interrupt pending for the guest before setting the LPCR MER bit (mediated external request). This also makes sure that we clear the "ceded" flag when we handle a wakeup from cede in real mode, and fixes a potential infinite loop in kvmppc_run_vcpu() which can occur if we ever end up with the ceded flag set but MSR[EE] off. 
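In rough C terms (illustrative only, with lpcr standing for the current SPRN_LPCR value; the actual change below is in the real-mode assembly, using rldicl/rldimi), the new MER logic is:

	/* Only advertise a mediated external request to the guest when an
	 * external interrupt is really pending for it.
	 */
	if (vcpu->arch.pending_exceptions &
	    (1ul << BOOK3S_IRQPRIO_EXTERNAL_LEVEL))
		lpcr |= LPCR_MER;
	else
		lpcr &= ~LPCR_MER;

The check against VCORE_ENTRY_EXIT similarly makes us head out to the host, rather than bounce an interrupt into the guest, when another thread of the core is already on its way out.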
Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/reg.h | 1 + arch/powerpc/kvm/book3s_hv.c | 5 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 138 +++++++++++++----------- 3 files changed, 80 insertions(+), 64 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index c9c67fc888c9..799322433620 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -290,6 +290,7 @@ #define LPCR_PECE1 0x00002000 /* decrementer can cause exit */ #define LPCR_PECE2 0x00001000 /* machine check etc can cause exit */ #define LPCR_MER 0x00000800 /* Mediated External Exception */ +#define LPCR_MER_SH 11 #define LPCR_LPES 0x0000000c #define LPCR_LPES0 0x00000008 /* LPAR Env selector 0 */ #define LPCR_LPES1 0x00000004 /* LPAR Env selector 1 */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 16191915e8d0..178521e81ce4 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1384,9 +1384,12 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) break; vc->runner = vcpu; n_ceded = 0; - list_for_each_entry(v, &vc->runnable_threads, arch.run_list) + list_for_each_entry(v, &vc->runnable_threads, arch.run_list) { if (!v->arch.pending_exceptions) n_ceded += v->arch.ceded; + else + v->arch.ceded = 0; + } if (n_ceded == vc->n_runnable) kvmppc_vcore_blocked(vc); else diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index fd3b72d5dfe6..b02f91e4c70d 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -97,50 +97,51 @@ kvm_start_guest: li r0,1 stb r0,PACA_NAPSTATELOST(r13) - /* get vcpu pointer, NULL if we have no vcpu to run */ - ld r4,HSTATE_KVM_VCPU(r13) - cmpdi cr1,r4,0 + /* were we napping due to cede? */ + lbz r0,HSTATE_NAPPING(r13) + cmpwi r0,0 + bne kvm_end_cede + + /* + * We weren't napping due to cede, so this must be a secondary + * thread being woken up to run a guest, or being woken up due + * to a stray IPI. (Or due to some machine check or hypervisor + * maintenance interrupt while the core is in KVM.) + */ /* Check the wake reason in SRR1 to see why we got here */ mfspr r3,SPRN_SRR1 rlwinm r3,r3,44-31,0x7 /* extract wake reason field */ cmpwi r3,4 /* was it an external interrupt? */ - bne 27f - - /* - * External interrupt - for now assume it is an IPI, since we - * should never get any other interrupts sent to offline threads. - * Only do this for secondary threads. - */ - beq cr1,25f - lwz r3,VCPU_PTID(r4) - cmpwi r3,0 - beq 27f -25: ld r5,HSTATE_XICS_PHYS(r13) - li r0,0xff - li r6,XICS_MFRR - li r7,XICS_XIRR + bne 27f /* if not */ + ld r5,HSTATE_XICS_PHYS(r13) + li r7,XICS_XIRR /* if it was an external interrupt, */ lwzcix r8,r5,r7 /* get and ack the interrupt */ sync clrldi. r9,r8,40 /* get interrupt source ID. */ - beq 27f /* none there? */ - cmpwi r9,XICS_IPI - bne 26f + beq 28f /* none there? */ + cmpwi r9,XICS_IPI /* was it an IPI? */ + bne 29f + li r0,0xff + li r6,XICS_MFRR stbcix r0,r5,r6 /* clear IPI */ -26: stwcix r8,r5,r7 /* EOI the interrupt */ + stwcix r8,r5,r7 /* EOI the interrupt */ + sync /* order loading of vcpu after that */ -27: /* XXX should handle hypervisor maintenance interrupts etc. 
here */ - - /* reload vcpu pointer after clearing the IPI */ + /* get vcpu pointer, NULL if we have no vcpu to run */ ld r4,HSTATE_KVM_VCPU(r13) cmpdi r4,0 /* if we have no vcpu to run, go back to sleep */ beq kvm_no_guest + b kvmppc_hv_entry - /* were we napping due to cede? */ - lbz r0,HSTATE_NAPPING(r13) - cmpwi r0,0 - bne kvm_end_cede +27: /* XXX should handle hypervisor maintenance interrupts etc. here */ + b kvm_no_guest +28: /* SRR1 said external but ICP said nope?? */ + b kvm_no_guest +29: /* External non-IPI interrupt to offline secondary thread? help?? */ + stw r8,HSTATE_SAVED_XIRR(r13) + b kvm_no_guest .global kvmppc_hv_entry kvmppc_hv_entry: @@ -483,20 +484,20 @@ toc_tlbie_lock: mtctr r6 mtxer r7 + ld r10, VCPU_PC(r4) + ld r11, VCPU_MSR(r4) kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */ ld r6, VCPU_SRR0(r4) ld r7, VCPU_SRR1(r4) - ld r10, VCPU_PC(r4) - ld r11, VCPU_MSR(r4) /* r11 = vcpu->arch.msr & ~MSR_HV */ + /* r11 = vcpu->arch.msr & ~MSR_HV */ rldicl r11, r11, 63 - MSR_HV_LG, 1 rotldi r11, r11, 1 + MSR_HV_LG ori r11, r11, MSR_ME /* Check if we can deliver an external or decrementer interrupt now */ ld r0,VCPU_PENDING_EXC(r4) - li r8,(1 << BOOK3S_IRQPRIO_EXTERNAL) - oris r8,r8,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h + lis r8,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h and r0,r0,r8 cmpdi cr1,r0,0 andi. r0,r11,MSR_EE @@ -524,10 +525,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) /* Move SRR0 and SRR1 into the respective regs */ 5: mtspr SPRN_SRR0, r6 mtspr SPRN_SRR1, r7 - li r0,0 - stb r0,VCPU_CEDED(r4) /* cancel cede */ fast_guest_return: + li r0,0 + stb r0,VCPU_CEDED(r4) /* cancel cede */ mtspr SPRN_HSRR0,r10 mtspr SPRN_HSRR1,r11 @@ -686,6 +687,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) /* External interrupt, first check for host_ipi. If this is * set, we know the host wants us out so let's do it now */ +do_ext_interrupt: lbz r0, HSTATE_HOST_IPI(r13) cmpwi r0, 0 bne ext_interrupt_to_host @@ -698,19 +700,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) lwzcix r3, r5, r7 rlwinm. r0, r3, 0, 0xffffff sync - bne 1f + beq 3f /* if nothing pending in the ICP */ - /* Nothing pending in the ICP, check for mediated interrupts - * and bounce it to the guest - */ - andi. r0, r11, MSR_EE - beq ext_interrupt_to_host /* shouldn't happen ?? */ - mfspr r5, SPRN_LPCR - andi. r0, r5, LPCR_MER - bne bounce_ext_interrupt - b ext_interrupt_to_host /* shouldn't happen ?? */ - -1: /* We found something in the ICP... + /* We found something in the ICP... * * If it's not an IPI, stash it in the PACA and return to * the host, we don't (yet) handle directing real external @@ -735,16 +727,33 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) bne- 1f /* Allright, looks like an IPI for the guest, we need to set MER */ - mfspr r8,SPRN_LPCR - ori r8,r8,LPCR_MER - mtspr SPRN_LPCR,r8 +3: + /* Check if any CPU is heading out to the host, if so head out too */ + ld r5, HSTATE_KVM_VCORE(r13) + lwz r0, VCORE_ENTRY_EXIT(r5) + cmpwi r0, 0x100 + bge ext_interrupt_to_host + + /* See if there is a pending interrupt for the guest */ + mfspr r8, SPRN_LPCR + ld r0, VCPU_PENDING_EXC(r9) + /* Insert EXTERNAL_LEVEL bit into LPCR at the MER bit position */ + rldicl. r0, r0, 64 - BOOK3S_IRQPRIO_EXTERNAL_LEVEL, 63 + rldimi r8, r0, LPCR_MER_SH, 63 - LPCR_MER_SH + beq 2f /* And if the guest EE is set, we can deliver immediately, else * we return to the guest with MER set */ andi. 
r0, r11, MSR_EE - bne bounce_ext_interrupt - mr r4, r9 + beq 2f + mtspr SPRN_SRR0, r10 + mtspr SPRN_SRR1, r11 + li r10, BOOK3S_INTERRUPT_EXTERNAL + li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ + rotldi r11, r11, 63 +2: mr r4, r9 + mtspr SPRN_LPCR, r8 b fast_guest_return /* We raced with the host, we need to resend that IPI, bummer */ @@ -1487,15 +1496,6 @@ ignore_hdec: mr r4,r9 b fast_guest_return -bounce_ext_interrupt: - mr r4,r9 - mtspr SPRN_SRR0,r10 - mtspr SPRN_SRR1,r11 - li r10,BOOK3S_INTERRUPT_EXTERNAL - li r11,(MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ - rotldi r11,r11,63 - b fast_guest_return - _GLOBAL(kvmppc_h_set_dabr) std r4,VCPU_DABR(r3) /* Work around P7 bug where DABR can get corrupted on mtspr */ @@ -1601,6 +1601,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) b . kvm_end_cede: + /* get vcpu pointer */ + ld r4, HSTATE_KVM_VCPU(r13) + /* Woken by external or decrementer interrupt */ ld r1, HSTATE_HOST_R1(r13) @@ -1640,6 +1643,16 @@ kvm_end_cede: li r0,0 stb r0,HSTATE_NAPPING(r13) + /* Check the wake reason in SRR1 to see why we got here */ + mfspr r3, SPRN_SRR1 + rlwinm r3, r3, 44-31, 0x7 /* extract wake reason field */ + cmpwi r3, 4 /* was it an external interrupt? */ + li r12, BOOK3S_INTERRUPT_EXTERNAL + mr r9, r4 + ld r10, VCPU_PC(r9) + ld r11, VCPU_MSR(r9) + beq do_ext_interrupt /* if so */ + /* see if any other thread is already exiting */ lwz r0,VCORE_ENTRY_EXIT(r5) cmpwi r0,0x100 @@ -1659,8 +1672,7 @@ kvm_cede_prodded: /* we've ceded but we want to give control to the host */ kvm_cede_exit: - li r3,H_TOO_HARD - blr + b hcall_real_fallback /* Try to handle a machine check in real mode */ machine_check_realmode: From d19bd86204f85d42873e07bb64a27587fc380b5b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 17 Apr 2013 20:32:04 +0000 Subject: [PATCH 173/204] KVM: PPC: Book3S: Add support for ibm,int-on/off RTAS calls This adds support for the ibm,int-on and ibm,int-off RTAS calls to the in-kernel XICS emulation and corrects the handling of the saved priority by the ibm,set-xive RTAS call. With this, ibm,int-off sets the specified interrupt's priority in its saved_priority field and sets the priority to 0xff (the least favoured value). ibm,int-on restores the saved_priority to the priority field, and ibm,set-xive sets both the priority and the saved_priority to the specified priority value. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_ppc.h | 2 + arch/powerpc/kvm/book3s_rtas.c | 40 ++++++++++++++ arch/powerpc/kvm/book3s_xics.c | 84 +++++++++++++++++++++++++----- arch/powerpc/kvm/book3s_xics.h | 2 +- 4 files changed, 113 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 1589fd8bf063..cfaa47995c0e 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -174,6 +174,8 @@ extern int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority); extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority); +extern int kvmppc_xics_int_on(struct kvm *kvm, u32 irq); +extern int kvmppc_xics_int_off(struct kvm *kvm, u32 irq); /* * Cuts out inst bits with ordering according to spec. 
diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c index 77f9aa5f4ba5..3219ba895246 100644 --- a/arch/powerpc/kvm/book3s_rtas.c +++ b/arch/powerpc/kvm/book3s_rtas.c @@ -63,6 +63,44 @@ static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) out: args->rets[0] = rc; } + +static void kvm_rtas_int_off(struct kvm_vcpu *vcpu, struct rtas_args *args) +{ + u32 irq; + int rc; + + if (args->nargs != 1 || args->nret != 1) { + rc = -3; + goto out; + } + + irq = args->args[0]; + + rc = kvmppc_xics_int_off(vcpu->kvm, irq); + if (rc) + rc = -3; +out: + args->rets[0] = rc; +} + +static void kvm_rtas_int_on(struct kvm_vcpu *vcpu, struct rtas_args *args) +{ + u32 irq; + int rc; + + if (args->nargs != 1 || args->nret != 1) { + rc = -3; + goto out; + } + + irq = args->args[0]; + + rc = kvmppc_xics_int_on(vcpu->kvm, irq); + if (rc) + rc = -3; +out: + args->rets[0] = rc; +} #endif /* CONFIG_KVM_XICS */ struct rtas_handler { @@ -74,6 +112,8 @@ static struct rtas_handler rtas_handlers[] = { #ifdef CONFIG_KVM_XICS { .name = "ibm,set-xive", .handler = kvm_rtas_set_xive }, { .name = "ibm,get-xive", .handler = kvm_rtas_get_xive }, + { .name = "ibm,int-off", .handler = kvm_rtas_int_off }, + { .name = "ibm,int-on", .handler = kvm_rtas_int_on }, #endif }; diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 7fd247cbd0d1..9fb2d3909c46 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -123,6 +123,28 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics, mutex_unlock(&ics->lock); } +static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics, + struct ics_irq_state *state, + u32 server, u32 priority, u32 saved_priority) +{ + bool deliver; + + mutex_lock(&ics->lock); + + state->server = server; + state->priority = priority; + state->saved_priority = saved_priority; + deliver = false; + if ((state->masked_pending || state->resend) && priority != MASKED) { + state->masked_pending = 0; + deliver = true; + } + + mutex_unlock(&ics->lock); + + return deliver; +} + int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority) { struct kvmppc_xics *xics = kvm->arch.xics; @@ -130,7 +152,6 @@ int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority) struct kvmppc_ics *ics; struct ics_irq_state *state; u16 src; - bool deliver; if (!xics) return -ENODEV; @@ -144,23 +165,11 @@ int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority) if (!icp) return -EINVAL; - mutex_lock(&ics->lock); - XICS_DBG("set_xive %#x server %#x prio %#x MP:%d RS:%d\n", irq, server, priority, state->masked_pending, state->resend); - state->server = server; - state->priority = priority; - deliver = false; - if ((state->masked_pending || state->resend) && priority != MASKED) { - state->masked_pending = 0; - deliver = true; - } - - mutex_unlock(&ics->lock); - - if (deliver) + if (write_xive(xics, ics, state, server, priority, priority)) icp_deliver_irq(xics, icp, irq); return 0; @@ -189,6 +198,53 @@ int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority) return 0; } +int kvmppc_xics_int_on(struct kvm *kvm, u32 irq) +{ + struct kvmppc_xics *xics = kvm->arch.xics; + struct kvmppc_icp *icp; + struct kvmppc_ics *ics; + struct ics_irq_state *state; + u16 src; + + if (!xics) + return -ENODEV; + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) + return -EINVAL; + state = &ics->irq_state[src]; + + icp = kvmppc_xics_find_server(kvm, 
state->server); + if (!icp) + return -EINVAL; + + if (write_xive(xics, ics, state, state->server, state->saved_priority, + state->saved_priority)) + icp_deliver_irq(xics, icp, irq); + + return 0; +} + +int kvmppc_xics_int_off(struct kvm *kvm, u32 irq) +{ + struct kvmppc_xics *xics = kvm->arch.xics; + struct kvmppc_ics *ics; + struct ics_irq_state *state; + u16 src; + + if (!xics) + return -ENODEV; + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) + return -EINVAL; + state = &ics->irq_state[src]; + + write_xive(xics, ics, state, state->server, MASKED, state->priority); + + return 0; +} + /* -- ICP routines, including hcalls -- */ static inline bool icp_try_update(struct kvmppc_icp *icp, diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h index c816c5a49c90..e4fdec3dde77 100644 --- a/arch/powerpc/kvm/book3s_xics.h +++ b/arch/powerpc/kvm/book3s_xics.h @@ -36,7 +36,7 @@ struct ics_irq_state { u32 number; u32 server; u8 priority; - u8 saved_priority; /* currently unused */ + u8 saved_priority; u8 resend; u8 masked_pending; u8 asserted; /* Only for LSI */ From 8b78645c93b5d469e8006d68dbc92edc2640c654 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 17 Apr 2013 20:32:26 +0000 Subject: [PATCH 174/204] KVM: PPC: Book3S: Facilities to save/restore XICS presentation ctrler state This adds the ability for userspace to save and restore the state of the XICS interrupt presentation controllers (ICPs) via the KVM_GET/SET_ONE_REG interface. Since there is one ICP per vcpu, we simply define a new 64-bit register in the ONE_REG space for the ICP state. The state includes the CPU priority setting, the pending IPI priority, and the priority and source number of any pending external interrupt. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- Documentation/virtual/kvm/api.txt | 1 + arch/powerpc/include/asm/kvm_ppc.h | 2 + arch/powerpc/include/uapi/asm/kvm.h | 12 ++++ arch/powerpc/kvm/book3s.c | 19 ++++++ arch/powerpc/kvm/book3s_xics.c | 90 +++++++++++++++++++++++++++++ 5 files changed, 124 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index fb308be8521b..c09d1832e935 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1808,6 +1808,7 @@ registers, find a list below: PPC | KVM_REG_PPC_TLB2PS | 32 PPC | KVM_REG_PPC_TLB3PS | 32 PPC | KVM_REG_PPC_EPTCFG | 32 + PPC | KVM_REG_PPC_ICP_STATE | 64 ARM registers are mapped using the lower 32 bits. 
The upper 16 of that is the register group type, or coprocessor number: diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index cfaa47995c0e..d7339df19259 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -313,6 +313,8 @@ extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu); extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server); extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args); extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd); +extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu); +extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval); #else static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) { return 0; } diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index eb9e25c194ad..427b9aca2a0f 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -390,6 +390,18 @@ struct kvm_get_htab_header { __u16 n_invalid; }; +/* Per-vcpu XICS interrupt controller state */ +#define KVM_REG_PPC_ICP_STATE (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c) + +#define KVM_REG_PPC_ICP_CPPR_SHIFT 56 /* current proc priority */ +#define KVM_REG_PPC_ICP_CPPR_MASK 0xff +#define KVM_REG_PPC_ICP_XISR_SHIFT 32 /* interrupt status field */ +#define KVM_REG_PPC_ICP_XISR_MASK 0xffffff +#define KVM_REG_PPC_ICP_MFRR_SHIFT 24 /* pending IPI priority */ +#define KVM_REG_PPC_ICP_MFRR_MASK 0xff +#define KVM_REG_PPC_ICP_PPRI_SHIFT 16 /* pending irq priority */ +#define KVM_REG_PPC_ICP_PPRI_MASK 0xff + /* Device control API: PPC-specific devices */ #define KVM_DEV_MPIC_GRP_MISC 1 #define KVM_DEV_MPIC_BASE_ADDR 0 /* 64-bit */ diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 1a4d787df507..700df6f1d32c 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -535,6 +535,15 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) &opcode, sizeof(u32)); break; } +#ifdef CONFIG_KVM_XICS + case KVM_REG_PPC_ICP_STATE: + if (!vcpu->arch.icp) { + r = -ENXIO; + break; + } + val = get_reg_val(reg->id, kvmppc_xics_get_icp(vcpu)); + break; +#endif /* CONFIG_KVM_XICS */ default: r = -EINVAL; break; @@ -597,6 +606,16 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) vcpu->arch.vscr.u[3] = set_reg_val(reg->id, val); break; #endif /* CONFIG_ALTIVEC */ +#ifdef CONFIG_KVM_XICS + case KVM_REG_PPC_ICP_STATE: + if (!vcpu->arch.icp) { + r = -ENXIO; + break; + } + r = kvmppc_xics_set_icp(vcpu, + set_reg_val(reg->id, val)); + break; +#endif /* CONFIG_KVM_XICS */ default: r = -EINVAL; break; diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 9fb2d3909c46..ee841ed8a690 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -954,6 +954,96 @@ int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num) return 0; } +u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu) +{ + struct kvmppc_icp *icp = vcpu->arch.icp; + union kvmppc_icp_state state; + + if (!icp) + return 0; + state = icp->state; + return ((u64)state.cppr << KVM_REG_PPC_ICP_CPPR_SHIFT) | + ((u64)state.xisr << KVM_REG_PPC_ICP_XISR_SHIFT) | + ((u64)state.mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT) | + ((u64)state.pending_pri << KVM_REG_PPC_ICP_PPRI_SHIFT); +} + +int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval) +{ + struct kvmppc_icp *icp = vcpu->arch.icp; + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; 
+ union kvmppc_icp_state old_state, new_state; + struct kvmppc_ics *ics; + u8 cppr, mfrr, pending_pri; + u32 xisr; + u16 src; + bool resend; + + if (!icp || !xics) + return -ENOENT; + + cppr = icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT; + xisr = (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) & + KVM_REG_PPC_ICP_XISR_MASK; + mfrr = icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT; + pending_pri = icpval >> KVM_REG_PPC_ICP_PPRI_SHIFT; + + /* Require the new state to be internally consistent */ + if (xisr == 0) { + if (pending_pri != 0xff) + return -EINVAL; + } else if (xisr == XICS_IPI) { + if (pending_pri != mfrr || pending_pri >= cppr) + return -EINVAL; + } else { + if (pending_pri >= mfrr || pending_pri >= cppr) + return -EINVAL; + ics = kvmppc_xics_find_ics(xics, xisr, &src); + if (!ics) + return -EINVAL; + } + + new_state.raw = 0; + new_state.cppr = cppr; + new_state.xisr = xisr; + new_state.mfrr = mfrr; + new_state.pending_pri = pending_pri; + + /* + * Deassert the CPU interrupt request. + * icp_try_update will reassert it if necessary. + */ + kvmppc_book3s_dequeue_irqprio(icp->vcpu, + BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + + /* + * Note that if we displace an interrupt from old_state.xisr, + * we don't mark it as rejected. We expect userspace to set + * the state of the interrupt sources to be consistent with + * the ICP states (either before or afterwards, which doesn't + * matter). We do handle resends due to CPPR becoming less + * favoured because that is necessary to end up with a + * consistent state in the situation where userspace restores + * the ICS states before the ICP states. + */ + do { + old_state = ACCESS_ONCE(icp->state); + + if (new_state.mfrr <= old_state.mfrr) { + resend = false; + new_state.need_resend = old_state.need_resend; + } else { + resend = old_state.need_resend; + new_state.need_resend = 0; + } + } while (!icp_try_update(icp, old_state, new_state, false)); + + if (resend) + icp_check_resend(xics, icp); + + return 0; +} + /* -- ioctls -- */ int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args) From cb0c8cda1301842498d1154bd6d0d1c44d1e7030 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sat, 27 Apr 2013 12:58:00 +0200 Subject: [PATCH 175/204] KVM: VMX: remove unprintable characters from comment Slipped in while copy&pasting from the SDM. Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5863adf71ede..0f0cb3110626 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7650,7 +7650,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) } /* - * If the “load IA32_EFER” VM-entry control is 1, the following checks + * If the load IA32_EFER VM-entry control is 1, the following checks * are performed on the field for the IA32_EFER MSR: * - Bits reserved in the IA32_EFER MSR must be 0. * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of From 6614c7d042eb1096d4eba253b4952bec349f8593 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 26 Apr 2013 00:22:01 +0200 Subject: [PATCH 176/204] kvm, svm: Fix typo in printk message It is "exit_int_info". It is actually EXITINTINFO in the official docs but we don't like screaming docs. 
Signed-off-by: Borislav Petkov Signed-off-by: Gleb Natapov --- arch/x86/kvm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d6713e18bbc1..15c9cccd716b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3473,7 +3473,7 @@ static int handle_exit(struct kvm_vcpu *vcpu) exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH && exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI) - printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " + printk(KERN_ERR "%s: unexpected exit_int_info 0x%x " "exit_code 0x%x\n", __func__, svm->vmcb->control.exit_int_info, exit_code); From 730dca42c1d363c939da18c1499c7327c66e2b37 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 28 Apr 2013 10:50:52 +0200 Subject: [PATCH 177/204] KVM: x86: Rework request for immediate exit The VMX implementation of enable_irq_window raised KVM_REQ_IMMEDIATE_EXIT after we checked it in vcpu_enter_guest. This caused infinite loops on vmentry. Fix it by letting enable_irq_window signal the need for an immediate exit via its return value and drop KVM_REQ_IMMEDIATE_EXIT. This issue only affects nested VMX scenarios. Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 3 ++- arch/x86/kvm/vmx.c | 15 ++++++++------- arch/x86/kvm/x86.c | 7 +++---- include/linux/kvm_host.h | 15 +++++++-------- 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 18635ae42a8e..111b4a0c3907 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -694,7 +694,7 @@ struct kvm_x86_ops { bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); void (*enable_nmi_window)(struct kvm_vcpu *vcpu); - void (*enable_irq_window)(struct kvm_vcpu *vcpu); + int (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); int (*vm_has_apicv)(struct kvm *kvm); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 15c9cccd716b..7f896cbe717f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3632,7 +3632,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) return ret; } -static void enable_irq_window(struct kvm_vcpu *vcpu) +static int enable_irq_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3646,6 +3646,7 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) svm_set_vintr(svm); svm_inject_irq(svm, 0x0); } + return 0; } static void enable_nmi_window(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0f0cb3110626..74c525e2c608 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4398,22 +4398,23 @@ static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) PIN_BASED_NMI_EXITING; } -static void enable_irq_window(struct kvm_vcpu *vcpu) +static int enable_irq_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; - if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { + + if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) /* * We get here if vmx_interrupt_allowed() said we can't - * inject to L1 now because L2 must run. Ask L2 to exit - * right after entry, so we can inject to L1 more promptly. + * inject to L1 now because L2 must run. 
The caller will have + * to make L2 exit right after entry, so we can inject to L1 + * more promptly. */ - kvm_make_request(KVM_REQ_IMMEDIATE_EXIT, vcpu); - return; - } + return -EBUSY; cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + return 0; } static void enable_nmi_window(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2a434bf3918d..c522260b5bbf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5692,7 +5692,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) int r; bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && vcpu->run->request_interrupt_window; - bool req_immediate_exit = 0; + bool req_immediate_exit = false; if (vcpu->requests) { if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) @@ -5734,8 +5734,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) record_steal_time(vcpu); if (kvm_check_request(KVM_REQ_NMI, vcpu)) process_nmi(vcpu); - req_immediate_exit = - kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu); if (kvm_check_request(KVM_REQ_PMU, vcpu)) kvm_handle_pmu_event(vcpu); if (kvm_check_request(KVM_REQ_PMI, vcpu)) @@ -5757,7 +5755,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->arch.nmi_pending) kvm_x86_ops->enable_nmi_window(vcpu); else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) - kvm_x86_ops->enable_irq_window(vcpu); + req_immediate_exit = + kvm_x86_ops->enable_irq_window(vcpu) != 0; if (kvm_lapic_enabled(vcpu)) { /* diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 93a50054d46c..7bde42470e37 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -119,14 +119,13 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_APF_HALT 12 #define KVM_REQ_STEAL_UPDATE 13 #define KVM_REQ_NMI 14 -#define KVM_REQ_IMMEDIATE_EXIT 15 -#define KVM_REQ_PMU 16 -#define KVM_REQ_PMI 17 -#define KVM_REQ_WATCHDOG 18 -#define KVM_REQ_MASTERCLOCK_UPDATE 19 -#define KVM_REQ_MCLOCK_INPROGRESS 20 -#define KVM_REQ_EPR_EXIT 21 -#define KVM_REQ_SCAN_IOAPIC 22 +#define KVM_REQ_PMU 15 +#define KVM_REQ_PMI 16 +#define KVM_REQ_WATCHDOG 17 +#define KVM_REQ_MASTERCLOCK_UPDATE 18 +#define KVM_REQ_MCLOCK_INPROGRESS 19 +#define KVM_REQ_EPR_EXIT 20 +#define KVM_REQ_SCAN_IOAPIC 21 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 From 2a5bab1004729f3302c776e53ee7c895b98bb1ce Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 16 Apr 2013 13:49:18 -0600 Subject: [PATCH 178/204] kvm: Allow build-time configuration of KVM device assignment We hope to at some point deprecate KVM legacy device assignment in favor of VFIO-based assignment. Towards that end, allow legacy device assignment to be deconfigured. Signed-off-by: Alex Williamson Reviewed-by: Alexander Graf Acked-by: Michael S. 
Tsirkin Signed-off-by: Gleb Natapov --- arch/ia64/include/uapi/asm/kvm.h | 1 - arch/ia64/kvm/Kconfig | 13 +++++++++++-- arch/ia64/kvm/Makefile | 6 +++--- arch/ia64/kvm/kvm-ia64.c | 2 -- arch/x86/include/uapi/asm/kvm.h | 1 - arch/x86/kvm/Kconfig | 13 +++++++++++-- arch/x86/kvm/Makefile | 5 +++-- arch/x86/kvm/x86.c | 6 ++++-- include/linux/kvm_host.h | 30 ++++++++---------------------- include/uapi/linux/kvm.h | 4 ---- 10 files changed, 40 insertions(+), 41 deletions(-) diff --git a/arch/ia64/include/uapi/asm/kvm.h b/arch/ia64/include/uapi/asm/kvm.h index ec6c6b301238..99503c284400 100644 --- a/arch/ia64/include/uapi/asm/kvm.h +++ b/arch/ia64/include/uapi/asm/kvm.h @@ -27,7 +27,6 @@ /* Select x86 specific features in */ #define __KVM_HAVE_IOAPIC #define __KVM_HAVE_IRQ_LINE -#define __KVM_HAVE_DEVICE_ASSIGNMENT /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig index 043183a06409..990b86420cc6 100644 --- a/arch/ia64/kvm/Kconfig +++ b/arch/ia64/kvm/Kconfig @@ -21,8 +21,6 @@ config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on BROKEN depends on HAVE_KVM && MODULES - # for device assignment: - depends on PCI depends on BROKEN select PREEMPT_NOTIFIERS select ANON_INODES @@ -51,6 +49,17 @@ config KVM_INTEL Provides support for KVM on Itanium 2 processors equipped with the VT extensions. +config KVM_DEVICE_ASSIGNMENT + bool "KVM legacy PCI device assignment support" + depends on KVM && PCI && IOMMU_API + default y + ---help--- + Provide support for legacy PCI device assignment through KVM. The + kernel now also supports a full featured userspace device driver + framework through VFIO, which supersedes much of this support. + + If unsure, say Y. + source drivers/vhost/Kconfig endif # VIRTUALIZATION diff --git a/arch/ia64/kvm/Makefile b/arch/ia64/kvm/Makefile index 511f64a78d56..1a4053789d01 100644 --- a/arch/ia64/kvm/Makefile +++ b/arch/ia64/kvm/Makefile @@ -49,10 +49,10 @@ ccflags-y := -Ivirt/kvm -Iarch/ia64/kvm/ asflags-y := -Ivirt/kvm -Iarch/ia64/kvm/ common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ - coalesced_mmio.o irq_comm.o assigned-dev.o irqchip.o) + coalesced_mmio.o irq_comm.o) -ifeq ($(CONFIG_IOMMU_API),y) -common-objs += $(addprefix ../../../virt/kvm/, iommu.o) +ifeq ($(CONFIG_KVM_DEVICE_ASSIGNMENT),y) +common-objs += $(addprefix ../../../virt/kvm/, assigned-dev.o iommu.o) endif kvm-objs := $(common-objs) kvm-ia64.o kvm_fw.o diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 032c54d63c3d..dcc560729c20 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1368,9 +1368,7 @@ void kvm_arch_sync_events(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_iommu_unmap_guest(kvm); -#ifdef KVM_CAP_DEVICE_ASSIGNMENT kvm_free_all_assigned_devices(kvm); -#endif kfree(kvm->arch.vioapic); kvm_release_vm_pages(kvm); } diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index a65ec29e6ffb..5d9a3033b3d7 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -29,7 +29,6 @@ #define __KVM_HAVE_PIT #define __KVM_HAVE_IOAPIC #define __KVM_HAVE_IRQ_LINE -#define __KVM_HAVE_DEVICE_ASSIGNMENT #define __KVM_HAVE_MSI #define __KVM_HAVE_USER_NMI #define __KVM_HAVE_GUEST_DEBUG diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 9d50efdb719d..a47a3e54b964 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -21,8 +21,6 @@ config KVM tristate "Kernel-based Virtual 
Machine (KVM) support" depends on HAVE_KVM depends on HIGH_RES_TIMERS - # for device assignment: - depends on PCI # for TASKSTATS/TASK_DELAY_ACCT: depends on NET select PREEMPT_NOTIFIERS @@ -83,6 +81,17 @@ config KVM_MMU_AUDIT This option adds a R/W kVM module parameter 'mmu_audit', which allows audit KVM MMU at runtime. +config KVM_DEVICE_ASSIGNMENT + bool "KVM legacy PCI device assignment support" + depends on KVM && PCI && IOMMU_API + default y + ---help--- + Provide support for legacy PCI device assignment through KVM. The + kernel now also supports a full featured userspace device driver + framework through VFIO, which supersedes much of this support. + + If unsure, say Y. + # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/vhost/Kconfig diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index a797b8e43ade..d609e1d84048 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,8 +7,9 @@ CFLAGS_vmx.o := -I. kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ coalesced_mmio.o irq_comm.o eventfd.o \ - assigned-dev.o irqchip.o) -kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) + irqchip.o) +kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(addprefix ../../../virt/kvm/, \ + assigned-dev.o iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 145b1c81011b..8747fef7fd59 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2501,7 +2501,6 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_USER_NMI: case KVM_CAP_REINJECT_CONTROL: case KVM_CAP_IRQ_INJECT_STATUS: - case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_IRQFD: case KVM_CAP_IOEVENTFD: case KVM_CAP_PIT2: @@ -2519,9 +2518,12 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: case KVM_CAP_GET_TSC_KHZ: - case KVM_CAP_PCI_2_3: case KVM_CAP_KVMCLOCK_CTRL: case KVM_CAP_READONLY_MEM: +#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT + case KVM_CAP_ASSIGN_DEV_IRQ: + case KVM_CAP_PCI_2_3: +#endif r = 1; break; case KVM_CAP_COALESCED_MMIO: diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 309774715897..996661eb8e99 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -667,7 +667,6 @@ static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu) int kvm_arch_init_vm(struct kvm *kvm, unsigned long type); void kvm_arch_destroy_vm(struct kvm *kvm); -void kvm_free_all_assigned_devices(struct kvm *kvm); void kvm_arch_sync_events(struct kvm *kvm); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); @@ -736,7 +735,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); /* For vcpu->arch.iommu_flags */ #define KVM_IOMMU_CACHE_COHERENCY 0x1 -#ifdef CONFIG_IOMMU_API +#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot); int kvm_iommu_map_guest(struct kvm *kvm); @@ -745,7 +744,7 @@ int kvm_assign_device(struct kvm *kvm, struct kvm_assigned_dev_kernel *assigned_dev); int kvm_deassign_device(struct kvm *kvm, struct kvm_assigned_dev_kernel *assigned_dev); -#else /* CONFIG_IOMMU_API */ +#else static inline int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) { @@ -757,28 +756,11 @@ static inline void kvm_iommu_unmap_pages(struct kvm *kvm, { } -static inline int 
kvm_iommu_map_guest(struct kvm *kvm) -{ - return -ENODEV; -} - static inline int kvm_iommu_unmap_guest(struct kvm *kvm) { return 0; } - -static inline int kvm_assign_device(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - return 0; -} - -static inline int kvm_deassign_device(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - return 0; -} -#endif /* CONFIG_IOMMU_API */ +#endif static inline void __guest_enter(void) { @@ -1032,11 +1014,13 @@ static inline bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return true; } #endif -#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT +#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, unsigned long arg); +void kvm_free_all_assigned_devices(struct kvm *kvm); + #else static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, @@ -1045,6 +1029,8 @@ static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, return -ENOTTY; } +static inline void kvm_free_all_assigned_devices(struct kvm *kvm) {} + #endif static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d4005192ad6e..965e5b52dee0 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -561,9 +561,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_MP_STATE 14 #define KVM_CAP_COALESCED_MMIO 15 #define KVM_CAP_SYNC_MMU 16 /* Changes to host mmap are reflected in guest */ -#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT #define KVM_CAP_DEVICE_ASSIGNMENT 17 -#endif #define KVM_CAP_IOMMU 18 #ifdef __KVM_HAVE_MSI #define KVM_CAP_DEVICE_MSI 20 @@ -581,9 +579,7 @@ struct kvm_ppc_smmu_info { #endif #define KVM_CAP_IRQ_ROUTING 25 #define KVM_CAP_IRQ_INJECT_STATUS 26 -#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT #define KVM_CAP_DEVICE_DEASSIGNMENT 27 -#endif #ifdef __KVM_HAVE_MSIX #define KVM_CAP_DEVICE_MSIX 28 #endif From cbf64358588ae45dcf0207dbc97fba783577d64a Mon Sep 17 00:00:00 2001 From: Chegu Vinod Date: Sat, 27 Apr 2013 18:31:04 -0700 Subject: [PATCH 179/204] KVM: x86: Increase the "hard" max VCPU limit KVM guests today use 8bit APIC ids allowing for 256 ID's. Reserving one ID for Broadcast interrupts should leave 255 ID's. In case of KVM there is no need for reserving another ID for IO-APIC so the hard max limit for VCPUS can be increased from 254 to 255. (This was confirmed by Gleb Natapov http://article.gmane.org/gmane.comp.emulators.kvm.devel/99713 ) Signed-off-by: Chegu Vinod Signed-off-by: Gleb Natapov --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 38f2cc0a5a23..ec14b7245a4e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -31,7 +31,7 @@ #include #include -#define KVM_MAX_VCPUS 254 +#define KVM_MAX_VCPUS 255 #define KVM_SOFT_MAX_VCPUS 160 #define KVM_USER_MEM_SLOTS 125 /* memory slots that are not exposed to userspace */ From 5a2892ce72e010e3cb96b438d7cdddce0c88e0e6 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 28 Apr 2013 09:24:41 +0200 Subject: [PATCH 180/204] KVM: nVMX: Skip PF interception check when queuing during nested run While a nested run is pending, vmx_queue_exception is only called to requeue exceptions that were previously picked up via vmx_cancel_injection. Therefore, we must not check for PF interception by L1, possibly causing a bogus nested vmexit. 
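For clarity, the condition the fix below establishes can be read as a single predicate: forward a #PF to L1 only when no nested entry is in flight. A sketch (hypothetical helper name, using the same fields and helpers as the patch):

    static bool pf_should_exit_to_l1(struct kvm_vcpu *vcpu, unsigned nr)
    {
            struct vcpu_vmx *vmx = to_vmx(vcpu);

            /* While nested_run_pending is set we are merely requeuing an
             * exception canceled by vmx_cancel_injection, so L1's PF
             * intercepts must not be consulted. */
            return nr == PF_VECTOR && is_guest_mode(vcpu) &&
                   !vmx->nested.nested_run_pending && nested_pf_handled(vcpu);
    }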
Signed-off-by: Jan Kiszka Signed-off-by: Gleb Natapov --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 74c525e2c608..e10217e99d2c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1917,7 +1917,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, u32 intr_info = nr | INTR_INFO_VALID_MASK; if (nr == PF_VECTOR && is_guest_mode(vcpu) && - nested_pf_handled(vcpu)) + !vmx->nested.nested_run_pending && nested_pf_handled(vcpu)) return; if (has_error_code) { From 210552c1bfe83122a480673660d5ca9821c944ae Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 5 Mar 2013 03:18:00 +0000 Subject: [PATCH 181/204] ARM: KVM: add support for minimal host vs guest profiling In order to be able to correctly profile what is happening on the host, we need to be able to identify when we're running on the guest, and log these events differently. Perf offers a simple way to register callbacks into KVM. Mimic what x86 does and enjoy being able to profile your KVM host. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 3 ++ arch/arm/kvm/Makefile | 2 +- arch/arm/kvm/arm.c | 4 ++ arch/arm/kvm/perf.c | 68 +++++++++++++++++++++++++++++++++ 4 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 arch/arm/kvm/perf.c diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 0c4e643d939e..78813b8fad32 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -207,4 +207,7 @@ static inline void __cpu_init_hyp_mode(unsigned long long pgd_ptr, kvm_call_hyp((void *)pgd_low, pgd_high, hyp_stack_ptr, vector_ptr); } +int kvm_perf_init(void); +int kvm_perf_teardown(void); + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index 8dc5e76cb789..53c5ed83d16f 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -18,6 +18,6 @@ kvm-arm-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o) obj-y += kvm-arm.o init.o interrupts.o obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o -obj-y += coproc.o coproc_a15.o mmio.o psci.o +obj-y += coproc.o coproc_a15.o mmio.o psci.o perf.o obj-$(CONFIG_KVM_ARM_VGIC) += vgic.o obj-$(CONFIG_KVM_ARM_TIMER) += arch_timer.o diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 1497e18a9e2a..7403f884a545 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -920,7 +920,10 @@ static int init_hyp_mode(void) if (err) goto out_free_mappings; + kvm_perf_init(); + kvm_info("Hyp mode initialized successfully\n"); + return 0; out_free_vfp: free_percpu(kvm_host_vfp_state); @@ -964,6 +967,7 @@ out_err: /* NOP: Compiling as a module not supported */ void kvm_arch_exit(void) { + kvm_perf_teardown(); } static int arm_init(void) diff --git a/arch/arm/kvm/perf.c b/arch/arm/kvm/perf.c new file mode 100644 index 000000000000..1a3849da0b4b --- /dev/null +++ b/arch/arm/kvm/perf.c @@ -0,0 +1,68 @@ +/* + * Based on the x86 implementation. + * + * Copyright (C) 2012 ARM Ltd. + * Author: Marc Zyngier + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include + +#include + +static int kvm_is_in_guest(void) +{ + return kvm_arm_get_running_vcpu() != NULL; +} + +static int kvm_is_user_mode(void) +{ + struct kvm_vcpu *vcpu; + + vcpu = kvm_arm_get_running_vcpu(); + + if (vcpu) + return !vcpu_mode_priv(vcpu); + + return 0; +} + +static unsigned long kvm_get_guest_ip(void) +{ + struct kvm_vcpu *vcpu; + + vcpu = kvm_arm_get_running_vcpu(); + + if (vcpu) + return *vcpu_pc(vcpu); + + return 0; +} + +static struct perf_guest_info_callbacks kvm_guest_cbs = { + .is_in_guest = kvm_is_in_guest, + .is_user_mode = kvm_is_user_mode, + .get_guest_ip = kvm_get_guest_ip, +}; + +int kvm_perf_init(void) +{ + return perf_register_guest_info_callbacks(&kvm_guest_cbs); +} + +int kvm_perf_teardown(void) +{ + return perf_unregister_guest_info_callbacks(&kvm_guest_cbs); +} From 372b7c1bc80510225ca91cba75bc0850a6e16c39 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 27 Mar 2013 15:56:11 +0000 Subject: [PATCH 182/204] ARM: KVM: arch_timer: use symbolic constants In clocksource/arm_arch_timer.h we define useful symbolic constants. Let's use them to make the KVM arch_timer code clearer. Signed-off-by: Mark Rutland Acked-by: Marc Zyngier Cc: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm/kvm/arch_timer.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/arm/kvm/arch_timer.c b/arch/arm/kvm/arch_timer.c index 6ac938d46297..c55b6089e923 100644 --- a/arch/arm/kvm/arch_timer.c +++ b/arch/arm/kvm/arch_timer.c @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -64,7 +65,7 @@ static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - timer->cntv_ctl |= 1 << 1; /* Mask the interrupt in the guest */ + timer->cntv_ctl |= ARCH_TIMER_CTRL_IT_MASK; kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, vcpu->arch.timer_cpu.irq->irq, vcpu->arch.timer_cpu.irq->level); @@ -133,8 +134,8 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) cycle_t cval, now; u64 ns; - /* Check if the timer is enabled and unmasked first */ - if ((timer->cntv_ctl & 3) != 1) + if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) || + !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE)) return; cval = timer->cntv_cval; From 6060df84cbe0d36b8e1415b68e3f67b77f27052a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 12 Apr 2013 19:12:01 +0100 Subject: [PATCH 183/204] ARM: KVM: simplify HYP mapping population The way we populate HYP mappings is a bit convoluted, to say the least. Passing a pointer around to keep track of the current PFN is quite odd, and we end up having two different PTE accessors for no good reason. Simplify the whole thing by unifying the two PTE accessors, passing a pgprot_t around, and moving the various validity checks to the upper layers.
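In essence, the two old accessors collapse into a single loop that writes consecutive pfns with a caller-chosen protection. A simplified sketch of that shape (hypothetical name; the real version is create_hyp_pte_mappings in the diff below):

    static void map_ptes(pmd_t *pmd, unsigned long start, unsigned long end,
                         unsigned long pfn, pgprot_t prot)
    {
            unsigned long addr;

            for (addr = start; addr < end; addr += PAGE_SIZE) {
                    pte_t *pte = pte_offset_kernel(pmd, addr);
                    /* One writer for both cases: the caller selects
                     * PAGE_HYP for kernel text or PAGE_HYP_DEVICE for I/O. */
                    kvm_set_pte(pte, pfn_pte(pfn, prot));
                    pfn++;
            }
    }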
Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/kvm/mmu.c | 102 +++++++++++++++++++-------------------------- 1 file changed, 42 insertions(+), 60 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 2f12e4056408..30033219df84 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -125,54 +125,34 @@ void free_hyp_pmds(void) } static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, - unsigned long end) -{ - pte_t *pte; - unsigned long addr; - struct page *page; - - for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { - unsigned long hyp_addr = KERN_TO_HYP(addr); - - pte = pte_offset_kernel(pmd, hyp_addr); - BUG_ON(!virt_addr_valid(addr)); - page = virt_to_page(addr); - kvm_set_pte(pte, mk_pte(page, PAGE_HYP)); - } -} - -static void create_hyp_io_pte_mappings(pmd_t *pmd, unsigned long start, - unsigned long end, - unsigned long *pfn_base) + unsigned long end, unsigned long pfn, + pgprot_t prot) { pte_t *pte; unsigned long addr; - for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { - unsigned long hyp_addr = KERN_TO_HYP(addr); - - pte = pte_offset_kernel(pmd, hyp_addr); - BUG_ON(pfn_valid(*pfn_base)); - kvm_set_pte(pte, pfn_pte(*pfn_base, PAGE_HYP_DEVICE)); - (*pfn_base)++; + for (addr = start; addr < end; addr += PAGE_SIZE) { + pte = pte_offset_kernel(pmd, addr); + kvm_set_pte(pte, pfn_pte(pfn, prot)); + pfn++; } } static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, - unsigned long end, unsigned long *pfn_base) + unsigned long end, unsigned long pfn, + pgprot_t prot) { pmd_t *pmd; pte_t *pte; unsigned long addr, next; for (addr = start; addr < end; addr = next) { - unsigned long hyp_addr = KERN_TO_HYP(addr); - pmd = pmd_offset(pud, hyp_addr); + pmd = pmd_offset(pud, addr); BUG_ON(pmd_sect(*pmd)); if (pmd_none(*pmd)) { - pte = pte_alloc_one_kernel(NULL, hyp_addr); + pte = pte_alloc_one_kernel(NULL, addr); if (!pte) { kvm_err("Cannot allocate Hyp pte\n"); return -ENOMEM; @@ -182,25 +162,17 @@ static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, next = pmd_addr_end(addr, end); - /* - * If pfn_base is NULL, we map kernel pages into HYP with the - * virtual address. Otherwise, this is considered an I/O - * mapping and we map the physical region starting at - * *pfn_base to [start, end[. 
- */ - if (!pfn_base) - create_hyp_pte_mappings(pmd, addr, next); - else - create_hyp_io_pte_mappings(pmd, addr, next, pfn_base); + create_hyp_pte_mappings(pmd, addr, next, pfn, prot); + pfn += (next - addr) >> PAGE_SHIFT; } return 0; } -static int __create_hyp_mappings(void *from, void *to, unsigned long *pfn_base) +static int __create_hyp_mappings(pgd_t *pgdp, + unsigned long start, unsigned long end, + unsigned long pfn, pgprot_t prot) { - unsigned long start = (unsigned long)from; - unsigned long end = (unsigned long)to; pgd_t *pgd; pud_t *pud; pmd_t *pmd; @@ -209,21 +181,14 @@ static int __create_hyp_mappings(void *from, void *to, unsigned long *pfn_base) if (start >= end) return -EINVAL; - /* Check for a valid kernel memory mapping */ - if (!pfn_base && (!virt_addr_valid(from) || !virt_addr_valid(to - 1))) - return -EINVAL; - /* Check for a valid kernel IO mapping */ - if (pfn_base && (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1))) - return -EINVAL; mutex_lock(&kvm_hyp_pgd_mutex); - for (addr = start; addr < end; addr = next) { - unsigned long hyp_addr = KERN_TO_HYP(addr); - pgd = hyp_pgd + pgd_index(hyp_addr); - pud = pud_offset(pgd, hyp_addr); + for (addr = start & PAGE_MASK; addr < end; addr = next) { + pgd = pgdp + pgd_index(addr); + pud = pud_offset(pgd, addr); if (pud_none_or_clear_bad(pud)) { - pmd = pmd_alloc_one(NULL, hyp_addr); + pmd = pmd_alloc_one(NULL, addr); if (!pmd) { kvm_err("Cannot allocate Hyp pmd\n"); err = -ENOMEM; @@ -233,9 +198,10 @@ static int __create_hyp_mappings(void *from, void *to, unsigned long *pfn_base) } next = pgd_addr_end(addr, end); - err = create_hyp_pmd_mappings(pud, addr, next, pfn_base); + err = create_hyp_pmd_mappings(pud, addr, next, pfn, prot); if (err) goto out; + pfn += (next - addr) >> PAGE_SHIFT; } out: mutex_unlock(&kvm_hyp_pgd_mutex); @@ -255,22 +221,38 @@ out: */ int create_hyp_mappings(void *from, void *to) { - return __create_hyp_mappings(from, to, NULL); + unsigned long phys_addr = virt_to_phys(from); + unsigned long start = KERN_TO_HYP((unsigned long)from); + unsigned long end = KERN_TO_HYP((unsigned long)to); + + /* Check for a valid kernel memory mapping */ + if (!virt_addr_valid(from) || !virt_addr_valid(to - 1)) + return -EINVAL; + + return __create_hyp_mappings(hyp_pgd, start, end, + __phys_to_pfn(phys_addr), PAGE_HYP); } /** * create_hyp_io_mappings - duplicate a kernel IO mapping into Hyp mode * @from: The kernel start VA of the range * @to: The kernel end VA of the range (exclusive) - * @addr: The physical start address which gets mapped + * @phys_addr: The physical start address which gets mapped * * The resulting HYP VA is the same as the kernel VA, modulo * HYP_PAGE_OFFSET. 
*/ -int create_hyp_io_mappings(void *from, void *to, phys_addr_t addr) +int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr) { - unsigned long pfn = __phys_to_pfn(addr); - return __create_hyp_mappings(from, to, &pfn); + unsigned long start = KERN_TO_HYP((unsigned long)from); + unsigned long end = KERN_TO_HYP((unsigned long)to); + + /* Check for a valid kernel IO mapping */ + if (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1)) + return -EINVAL; + + return __create_hyp_mappings(hyp_pgd, start, end, + __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE); } /** From 3562c76dcb9ce84853c835eec12a911bf3a8e2da Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 12 Apr 2013 19:12:02 +0100 Subject: [PATCH 184/204] ARM: KVM: fix HYP mapping limitations around zero The current code for creating HYP mapping doesn't like to wrap around zero, which prevents mapping anything into the last page of the virtual address space. It doesn't take much effort to remove this limitation, making the code more consistent with the rest of the kernel in the process. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/kvm/mmu.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 30033219df84..96d61daa23ba 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -131,11 +131,12 @@ static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, pte_t *pte; unsigned long addr; - for (addr = start; addr < end; addr += PAGE_SIZE) { + addr = start; + do { pte = pte_offset_kernel(pmd, addr); kvm_set_pte(pte, pfn_pte(pfn, prot)); pfn++; - } + } while (addr += PAGE_SIZE, addr != end); } static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, @@ -146,7 +147,8 @@ static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, pte_t *pte; unsigned long addr, next; - for (addr = start; addr < end; addr = next) { + addr = start; + do { pmd = pmd_offset(pud, addr); BUG_ON(pmd_sect(*pmd)); @@ -164,7 +166,7 @@ static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, create_hyp_pte_mappings(pmd, addr, next, pfn, prot); pfn += (next - addr) >> PAGE_SHIFT; - } + } while (addr = next, addr != end); return 0; } @@ -179,11 +181,10 @@ static int __create_hyp_mappings(pgd_t *pgdp, unsigned long addr, next; int err = 0; - if (start >= end) - return -EINVAL; - mutex_lock(&kvm_hyp_pgd_mutex); - for (addr = start & PAGE_MASK; addr < end; addr = next) { + addr = start & PAGE_MASK; + end = PAGE_ALIGN(end); + do { pgd = pgdp + pgd_index(addr); pud = pud_offset(pgd, addr); @@ -202,7 +203,7 @@ static int __create_hyp_mappings(pgd_t *pgdp, if (err) goto out; pfn += (next - addr) >> PAGE_SHIFT; - } + } while (addr = next, addr != end); out: mutex_unlock(&kvm_hyp_pgd_mutex); return err; @@ -216,8 +217,6 @@ out: * The same virtual address as the kernel virtual address is also used * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying * physical pages. - * - * Note: Wrapping around zero in the "to" address is not supported. */ int create_hyp_mappings(void *from, void *to) { From 2fb410596ca917fe6419be61ee5bc1782b17d947 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 12 Apr 2013 19:12:03 +0100 Subject: [PATCH 185/204] ARM: KVM: move to a KVM provided HYP idmap After the HYP page table rework, it is pretty easy to let the KVM code provide its own idmap, rather than expecting the kernel to provide it. It actually takes less code to do so.
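The idmap itself is nothing more than a mapping whose HYP VA equals its PA, which the generic helper can now express directly. A rough sketch (hypothetical wrapper name, modulo error reporting):

    /* Sketch: identity-map the HYP init text so that, once the MMU is
     * turned on, the code keeps executing at the same (VA == PA) address. */
    static int hyp_idmap_init_text(pgd_t *pgd)
    {
            unsigned long start = virt_to_phys(__hyp_idmap_text_start);
            unsigned long end = virt_to_phys(__hyp_idmap_text_end);

            return __create_hyp_mappings(pgd, start, end,
                                         __phys_to_pfn(start), PAGE_HYP);
    }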
Acked-by: Will Deacon Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/idmap.h | 1 - arch/arm/include/asm/kvm_mmu.h | 1 - arch/arm/kvm/mmu.c | 24 +++++++++++++++++++++++- arch/arm/mm/idmap.c | 32 +------------------------------- 4 files changed, 24 insertions(+), 34 deletions(-) diff --git a/arch/arm/include/asm/idmap.h b/arch/arm/include/asm/idmap.h index 1a66f907e5cc..bf863edb517d 100644 --- a/arch/arm/include/asm/idmap.h +++ b/arch/arm/include/asm/idmap.h @@ -8,7 +8,6 @@ #define __idmap __section(.idmap.text) noinline notrace extern pgd_t *idmap_pgd; -extern pgd_t *hyp_pgd; void setup_mm_for_reboot(void); diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 970f3b5fa109..3c71a1d4b7a3 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -21,7 +21,6 @@ #include #include -#include /* * We directly use the kernel VA for the HYP, as we can directly share diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 96d61daa23ba..bfc59279de1b 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -32,6 +32,7 @@ extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[]; +static pgd_t *hyp_pgd; static DEFINE_MUTEX(kvm_hyp_pgd_mutex); static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) @@ -715,12 +716,33 @@ phys_addr_t kvm_mmu_get_httbr(void) int kvm_mmu_init(void) { + unsigned long hyp_idmap_start = virt_to_phys(__hyp_idmap_text_start); + unsigned long hyp_idmap_end = virt_to_phys(__hyp_idmap_text_end); + int err; + + hyp_pgd = kzalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL); if (!hyp_pgd) { kvm_err("Hyp mode PGD not allocated\n"); - return -ENOMEM; + err = -ENOMEM; + goto out; + } + + /* Create the idmap in the boot page tables */ + err = __create_hyp_mappings(boot_hyp_pgd, + hyp_idmap_start, hyp_idmap_end, + __phys_to_pfn(hyp_idmap_start), + PAGE_HYP); + + if (err) { + kvm_err("Failed to idmap %lx-%lx\n", + hyp_idmap_start, hyp_idmap_end); + goto out; } return 0; +out: + kfree(hyp_pgd); + return err; } /** diff --git a/arch/arm/mm/idmap.c b/arch/arm/mm/idmap.c index 5ee505c937d1..83cb3ac27095 100644 --- a/arch/arm/mm/idmap.c +++ b/arch/arm/mm/idmap.c @@ -8,7 +8,6 @@ #include #include #include -#include pgd_t *idmap_pgd; @@ -83,37 +82,10 @@ static void identity_mapping_add(pgd_t *pgd, const char *text_start, } while (pgd++, addr = next, addr != end); } -#if defined(CONFIG_ARM_VIRT_EXT) && defined(CONFIG_ARM_LPAE) -pgd_t *hyp_pgd; - -extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[]; - -static int __init init_static_idmap_hyp(void) -{ - hyp_pgd = kzalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL); - if (!hyp_pgd) - return -ENOMEM; - - pr_info("Setting up static HYP identity map for 0x%p - 0x%p\n", - __hyp_idmap_text_start, __hyp_idmap_text_end); - identity_mapping_add(hyp_pgd, __hyp_idmap_text_start, - __hyp_idmap_text_end, PMD_SECT_AP1); - - return 0; -} -#else -static int __init init_static_idmap_hyp(void) -{ - return 0; -} -#endif - extern char __idmap_text_start[], __idmap_text_end[]; static int __init init_static_idmap(void) { - int ret; - idmap_pgd = pgd_alloc(&init_mm); if (!idmap_pgd) return -ENOMEM; @@ -123,12 +95,10 @@ static int __init init_static_idmap(void) identity_mapping_add(idmap_pgd, __idmap_text_start, __idmap_text_end, 0); - ret = init_static_idmap_hyp(); - /* Flush L1 for the hardware to see this page table content */ flush_cache_louis(); - return ret; + return 0; } early_initcall(init_static_idmap); From 0394e1f605208706e4e1999d06a4570b9f583b7f 
Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 12 Apr 2013 19:12:04 +0100 Subject: [PATCH 186/204] ARM: KVM: enforce maximum size for identity mapped code We're about to move to an init procedure where we rely on the fact that the init code fits in a single page. Make sure we align the idmap text on a vector alignment, and that the code is not bigger than a single page. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/kernel/vmlinux.lds.S | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index b571484e9f03..a871b8e00fca 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -20,7 +20,7 @@ VMLINUX_SYMBOL(__idmap_text_start) = .; \ *(.idmap.text) \ VMLINUX_SYMBOL(__idmap_text_end) = .; \ - ALIGN_FUNCTION(); \ + . = ALIGN(32); \ VMLINUX_SYMBOL(__hyp_idmap_text_start) = .; \ *(.hyp.idmap.text) \ VMLINUX_SYMBOL(__hyp_idmap_text_end) = .; @@ -315,3 +315,8 @@ SECTIONS */ ASSERT((__proc_info_end - __proc_info_begin), "missing CPU support") ASSERT((__arch_info_end - __arch_info_begin), "no machine record defined") +/* + * The HYP init code can't be more than a page long. + * The above comment applies as well. + */ +ASSERT(((__hyp_idmap_text_end - __hyp_idmap_text_start) <= PAGE_SIZE), "HYP init code too big") From 4f728276fbf1e043010485d7e9275082a1c3d650 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 12 Apr 2013 19:12:05 +0100 Subject: [PATCH 187/204] ARM: KVM: rework HYP page table freeing There is no point in freeing HYP page tables differently from Stage-2. They now have the same requirements, and should be dealt with the same way. Promote unmap_stage2_range to be The One True Way, and get rid of a number of nasty bugs in the process (good thing we never actually called free_hyp_pmds before...). 
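The unification rests on one convention worth spelling out: every entry installed in a table page takes a page reference, so emptiness is a page-count check rather than a scan. A sketch of the predicate (the patch's pte_empty/pmd_empty below are instances of it):

    static bool table_is_empty(void *table)
    {
            /* Only the allocation's own reference is left, i.e. no
             * entries remain and this level can be folded and freed. */
            return page_count(virt_to_page(table)) == 1;
    }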
Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_mmu.h | 2 +- arch/arm/kvm/arm.c | 2 +- arch/arm/kvm/mmu.c | 181 +++++++++++++++------------------ 3 files changed, 82 insertions(+), 103 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 3c71a1d4b7a3..92eb20d57942 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -32,7 +32,7 @@ int create_hyp_mappings(void *from, void *to); int create_hyp_io_mappings(void *from, void *to, phys_addr_t); -void free_hyp_pmds(void); +void free_hyp_pgds(void); int kvm_alloc_stage2_pgd(struct kvm *kvm); void kvm_free_stage2_pgd(struct kvm *kvm); diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 7403f884a545..16f164a5db86 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -928,7 +928,7 @@ static int init_hyp_mode(void) out_free_vfp: free_percpu(kvm_host_vfp_state); out_free_mappings: - free_hyp_pmds(); + free_hyp_pgds(); out_free_stack_pages: for_each_possible_cpu(cpu) free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index bfc59279de1b..7464824c17ef 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -72,56 +72,104 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) return p; } -static void free_ptes(pmd_t *pmd, unsigned long addr) +static void clear_pud_entry(pud_t *pud) { - pte_t *pte; - unsigned int i; + pmd_t *pmd_table = pmd_offset(pud, 0); + pud_clear(pud); + pmd_free(NULL, pmd_table); + put_page(virt_to_page(pud)); +} - for (i = 0; i < PTRS_PER_PMD; i++, addr += PMD_SIZE) { - if (!pmd_none(*pmd) && pmd_table(*pmd)) { - pte = pte_offset_kernel(pmd, addr); - pte_free_kernel(NULL, pte); - } - pmd++; +static void clear_pmd_entry(pmd_t *pmd) +{ + pte_t *pte_table = pte_offset_kernel(pmd, 0); + pmd_clear(pmd); + pte_free_kernel(NULL, pte_table); + put_page(virt_to_page(pmd)); +} + +static bool pmd_empty(pmd_t *pmd) +{ + struct page *pmd_page = virt_to_page(pmd); + return page_count(pmd_page) == 1; +} + +static void clear_pte_entry(pte_t *pte) +{ + if (pte_present(*pte)) { + kvm_set_pte(pte, __pte(0)); + put_page(virt_to_page(pte)); } } -static void free_hyp_pgd_entry(unsigned long addr) +static bool pte_empty(pte_t *pte) +{ + struct page *pte_page = virt_to_page(pte); + return page_count(pte_page) == 1; +} + +static void unmap_range(pgd_t *pgdp, unsigned long long start, u64 size) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; - unsigned long hyp_addr = KERN_TO_HYP(addr); + pte_t *pte; + unsigned long long addr = start, end = start + size; + u64 range; - pgd = hyp_pgd + pgd_index(hyp_addr); - pud = pud_offset(pgd, hyp_addr); + while (addr < end) { + pgd = pgdp + pgd_index(addr); + pud = pud_offset(pgd, addr); + if (pud_none(*pud)) { + addr += PUD_SIZE; + continue; + } - if (pud_none(*pud)) - return; - BUG_ON(pud_bad(*pud)); + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + addr += PMD_SIZE; + continue; + } - pmd = pmd_offset(pud, hyp_addr); - free_ptes(pmd, addr); - pmd_free(NULL, pmd); - pud_clear(pud); + pte = pte_offset_kernel(pmd, addr); + clear_pte_entry(pte); + range = PAGE_SIZE; + + /* If we emptied the pte, walk back up the ladder */ + if (pte_empty(pte)) { + clear_pmd_entry(pmd); + range = PMD_SIZE; + if (pmd_empty(pmd)) { + clear_pud_entry(pud); + range = PUD_SIZE; + } + } + + addr += range; + } } /** - * free_hyp_pmds - free a Hyp-mode level-2 tables and child level-3 tables + * free_hyp_pgds - free Hyp-mode page tables * - * Assumes this 
is a page table used strictly in Hyp-mode and therefore contains + * Assumes hyp_pgd is a page table used strictly in Hyp-mode and therefore contains * either mappings in the kernel memory area (above PAGE_OFFSET), or * device mappings in the vmalloc range (from VMALLOC_START to VMALLOC_END). */ -void free_hyp_pmds(void) +void free_hyp_pgds(void) { unsigned long addr; mutex_lock(&kvm_hyp_pgd_mutex); - for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE) - free_hyp_pgd_entry(addr); - for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE) - free_hyp_pgd_entry(addr); + + if (hyp_pgd) { + for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE) + unmap_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE); + for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE) + unmap_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE); + kfree(hyp_pgd); + } + mutex_unlock(&kvm_hyp_pgd_mutex); } @@ -136,6 +184,7 @@ static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, do { pte = pte_offset_kernel(pmd, addr); kvm_set_pte(pte, pfn_pte(pfn, prot)); + get_page(virt_to_page(pte)); pfn++; } while (addr += PAGE_SIZE, addr != end); } @@ -161,6 +210,7 @@ static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, return -ENOMEM; } pmd_populate_kernel(NULL, pmd, pte); + get_page(virt_to_page(pmd)); } next = pmd_addr_end(addr, end); @@ -197,6 +247,7 @@ static int __create_hyp_mappings(pgd_t *pgdp, goto out; } pud_populate(NULL, pud, pmd); + get_page(virt_to_page(pud)); } next = pgd_addr_end(addr, end); @@ -289,42 +340,6 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm) return 0; } -static void clear_pud_entry(pud_t *pud) -{ - pmd_t *pmd_table = pmd_offset(pud, 0); - pud_clear(pud); - pmd_free(NULL, pmd_table); - put_page(virt_to_page(pud)); -} - -static void clear_pmd_entry(pmd_t *pmd) -{ - pte_t *pte_table = pte_offset_kernel(pmd, 0); - pmd_clear(pmd); - pte_free_kernel(NULL, pte_table); - put_page(virt_to_page(pmd)); -} - -static bool pmd_empty(pmd_t *pmd) -{ - struct page *pmd_page = virt_to_page(pmd); - return page_count(pmd_page) == 1; -} - -static void clear_pte_entry(pte_t *pte) -{ - if (pte_present(*pte)) { - kvm_set_pte(pte, __pte(0)); - put_page(virt_to_page(pte)); - } -} - -static bool pte_empty(pte_t *pte) -{ - struct page *pte_page = virt_to_page(pte); - return page_count(pte_page) == 1; -} - /** * unmap_stage2_range -- Clear stage2 page table entries to unmap a range * @kvm: The VM pointer @@ -338,43 +353,7 @@ static bool pte_empty(pte_t *pte) */ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - phys_addr_t addr = start, end = start + size; - u64 range; - - while (addr < end) { - pgd = kvm->arch.pgd + pgd_index(addr); - pud = pud_offset(pgd, addr); - if (pud_none(*pud)) { - addr += PUD_SIZE; - continue; - } - - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) { - addr += PMD_SIZE; - continue; - } - - pte = pte_offset_kernel(pmd, addr); - clear_pte_entry(pte); - range = PAGE_SIZE; - - /* If we emptied the pte, walk back up the ladder */ - if (pte_empty(pte)) { - clear_pmd_entry(pmd); - range = PMD_SIZE; - if (pmd_empty(pmd)) { - clear_pud_entry(pud); - range = PUD_SIZE; - } - } - - addr += range; - } + unmap_range(kvm->arch.pgd, start, size); } /** @@ -741,7 +720,7 @@ int kvm_mmu_init(void) return 0; out: - kfree(hyp_pgd); + free_hyp_pgds(); return err; } From 5a677ce044f18a341ab942e23516e52ad89f7687 Mon Sep 17 00:00:00 2001 From: Marc Zyngier 
Date: Fri, 12 Apr 2013 19:12:06 +0100 Subject: [PATCH 188/204] ARM: KVM: switch to a dual-step HYP init code Our HYP init code suffers from two major design issues: - it cannot support CPU hotplug, as we tear down the idmap very early - it cannot perform a TLB invalidation when switching from init to runtime mappings, as pages are manipulated from PL1 exclusively The hotplug problem mandates that we keep two sets of page tables (boot and runtime). The TLB problem mandates that we're able to transition from one PGD to another while in HYP, invalidating the TLBs in the process. To be able to do this, we need to share a page between the two page tables. A page that will have the same VA in both configurations. All we need is a VA that has the following properties: - This VA can't be used to represent a kernel mapping. - This VA will not conflict with the physical address of the kernel text The vectors page seems to satisfy this requirement: - The kernel never maps anything else there - The kernel text being copied at the beginning of the physical memory, it is unlikely to use the last 64kB (I doubt we'll ever support KVM on a system with something like 4MB of RAM, but patches are very welcome). Let's call this VA the trampoline VA. Now, we map our init page at 3 locations: - idmap in the boot pgd - trampoline VA in the boot pgd - trampoline VA in the runtime pgd The init scenario is now the following: - We jump in HYP with four parameters: boot HYP pgd, runtime HYP pgd, runtime stack, runtime vectors - Enable the MMU with the boot pgd - Jump to a target into the trampoline page (remember, this is the same physical page!) - Now switch to the runtime pgd (same VA, and still the same physical page!) - Invalidate TLBs - Set stack and vectors - Profit! (or eret, if you only care about the code). Note that we keep the boot mapping permanently (it is not strictly an idmap anymore) to allow for CPU hotplug in later patches. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 31 +++++--- arch/arm/include/asm/kvm_mmu.h | 24 +++++- arch/arm/kvm/arm.c | 11 +-- arch/arm/kvm/init.S | 78 ++++++++++++++----- arch/arm/kvm/mmu.c | 132 +++++++++++++++++++++++--------- 5 files changed, 197 insertions(+), 79 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 78813b8fad32..6c2a35da867e 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -188,23 +188,30 @@ int kvm_arm_coproc_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, int exception_index); -static inline void __cpu_init_hyp_mode(unsigned long long pgd_ptr, +static inline void __cpu_init_hyp_mode(unsigned long long boot_pgd_ptr, + unsigned long long pgd_ptr, unsigned long hyp_stack_ptr, unsigned long vector_ptr) { - unsigned long pgd_low, pgd_high; - - pgd_low = (pgd_ptr & ((1ULL << 32) - 1)); - pgd_high = (pgd_ptr >> 32ULL); - /* - * Call initialization code, and switch to the full blown - * HYP code. The init code doesn't need to preserve these registers as - * r1-r3 and r12 are already callee save according to the AAPCS. - * Note that we slightly misuse the prototype by casing the pgd_low to - * a void *. + * Call initialization code, and switch to the full blown HYP + * code. The init code doesn't need to preserve these + * registers as r0-r3 are already callee saved according to + * the AAPCS. 
+ * Note that we slightly misuse the prototype by casing the + * stack pointer to a void *. + * + * We don't have enough registers to perform the full init in + * one go. Install the boot PGD first, and then install the + * runtime PGD, stack pointer and vectors. The PGDs are always + * passed as the third argument, in order to be passed into + * r2-r3 to the init code (yes, this is compliant with the + * PCS!). */ - kvm_call_hyp((void *)pgd_low, pgd_high, hyp_stack_ptr, vector_ptr); + + kvm_call_hyp(NULL, 0, boot_pgd_ptr); + + kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr); } int kvm_perf_init(void); diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 92eb20d57942..24b767a8cdb9 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -19,17 +19,29 @@ #ifndef __ARM_KVM_MMU_H__ #define __ARM_KVM_MMU_H__ -#include -#include +#include +#include /* * We directly use the kernel VA for the HYP, as we can directly share * the mapping (HTTBR "covers" TTBR1). */ -#define HYP_PAGE_OFFSET_MASK (~0UL) +#define HYP_PAGE_OFFSET_MASK UL(~0) #define HYP_PAGE_OFFSET PAGE_OFFSET #define KERN_TO_HYP(kva) (kva) +/* + * Our virtual mapping for the boot-time MMU-enable code. Must be + * shared across all the page-tables. Conveniently, we use the vectors + * page, where no kernel data will ever be shared with HYP. + */ +#define TRAMPOLINE_VA UL(CONFIG_VECTORS_BASE) + +#ifndef __ASSEMBLY__ + +#include +#include + int create_hyp_mappings(void *from, void *to); int create_hyp_io_mappings(void *from, void *to, phys_addr_t); void free_hyp_pgds(void); @@ -44,6 +56,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run); void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); phys_addr_t kvm_mmu_get_httbr(void); +phys_addr_t kvm_mmu_get_boot_httbr(void); +phys_addr_t kvm_get_idmap_vector(void); int kvm_mmu_init(void); void kvm_clear_hyp_idmap(void); @@ -113,4 +127,8 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn) } } +#define kvm_flush_dcache_to_poc(a,l) __cpuc_flush_dcache_area((a), (l)) + +#endif /* !__ASSEMBLY__ */ + #endif /* __ARM_KVM_MMU_H__ */ diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 16f164a5db86..fc47bd721ab0 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -787,20 +787,22 @@ long kvm_arch_vm_ioctl(struct file *filp, static void cpu_init_hyp_mode(void *vector) { + unsigned long long boot_pgd_ptr; unsigned long long pgd_ptr; unsigned long hyp_stack_ptr; unsigned long stack_page; unsigned long vector_ptr; /* Switch from the HYP stub to our own HYP init vector */ - __hyp_set_vectors((unsigned long)vector); + __hyp_set_vectors(kvm_get_idmap_vector()); + boot_pgd_ptr = (unsigned long long)kvm_mmu_get_boot_httbr(); pgd_ptr = (unsigned long long)kvm_mmu_get_httbr(); stack_page = __get_cpu_var(kvm_arm_hyp_stack_page); hyp_stack_ptr = stack_page + PAGE_SIZE; vector_ptr = (unsigned long)__kvm_hyp_vector; - __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr); + __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr); } /** @@ -853,11 +855,6 @@ static int init_hyp_mode(void) (void *)(long)init_phys_addr, 1); } - /* - * Unmap the identity mapping - */ - kvm_clear_hyp_idmap(); - /* * Map the Hyp-code called directly from the host */ diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S index 9f37a79b880b..f048338135f7 100644 --- a/arch/arm/kvm/init.S +++ b/arch/arm/kvm/init.S @@ -21,13 +21,33 @@ #include #include #include +#include 
/******************************************************************** * Hypervisor initialization * - should be called with: - * r0,r1 = Hypervisor pgd pointer - * r2 = top of Hyp stack (kernel VA) - * r3 = pointer to hyp vectors + * r0 = top of Hyp stack (kernel VA) + * r1 = pointer to hyp vectors + * r2,r3 = Hypervisor pgd pointer + * + * The init scenario is: + * - We jump in HYP with four parameters: boot HYP pgd, runtime HYP pgd, + * runtime stack, runtime vectors + * - Enable the MMU with the boot pgd + * - Jump to a target into the trampoline page (remember, this is the same + * physical page!) + * - Now switch to the runtime pgd (same VA, and still the same physical + * page!) + * - Invalidate TLBs + * - Set stack and vectors + * - Profit! (or eret, if you only care about the code). + * + * As we only have four registers available to pass parameters (and we + * need six), we split the init in two phases: + * - Phase 1: r0 = 0, r1 = 0, r2,r3 contain the boot PGD. + * Provides the basic HYP init, and enable the MMU. + * - Phase 2: r0 = ToS, r1 = vectors, r2,r3 contain the runtime PGD. + * Switches to the runtime PGD, set stack and vectors. */ .text @@ -47,22 +67,25 @@ __kvm_hyp_init: W(b) . __do_hyp_init: + cmp r0, #0 @ We have a SP? + bne phase2 @ Yes, second stage init + @ Set the HTTBR to point to the hypervisor PGD pointer passed - mcrr p15, 4, r0, r1, c2 + mcrr p15, 4, r2, r3, c2 @ Set the HTCR and VTCR to the same shareability and cacheability @ settings as the non-secure TTBCR and with T0SZ == 0. mrc p15, 4, r0, c2, c0, 2 @ HTCR - ldr r12, =HTCR_MASK - bic r0, r0, r12 + ldr r2, =HTCR_MASK + bic r0, r0, r2 mrc p15, 0, r1, c2, c0, 2 @ TTBCR and r1, r1, #(HTCR_MASK & ~TTBCR_T0SZ) orr r0, r0, r1 mcr p15, 4, r0, c2, c0, 2 @ HTCR mrc p15, 4, r1, c2, c1, 2 @ VTCR - ldr r12, =VTCR_MASK - bic r1, r1, r12 + ldr r2, =VTCR_MASK + bic r1, r1, r2 bic r0, r0, #(~VTCR_HTCR_SH) @ clear non-reusable HTCR bits orr r1, r0, r1 orr r1, r1, #(KVM_VTCR_SL0 | KVM_VTCR_T0SZ | KVM_VTCR_S) @@ -85,24 +108,41 @@ __do_hyp_init: @ - Memory alignment checks: enabled @ - MMU: enabled (this code must be run from an identity mapping) mrc p15, 4, r0, c1, c0, 0 @ HSCR - ldr r12, =HSCTLR_MASK - bic r0, r0, r12 + ldr r2, =HSCTLR_MASK + bic r0, r0, r2 mrc p15, 0, r1, c1, c0, 0 @ SCTLR - ldr r12, =(HSCTLR_EE | HSCTLR_FI | HSCTLR_I | HSCTLR_C) - and r1, r1, r12 - ARM( ldr r12, =(HSCTLR_M | HSCTLR_A) ) - THUMB( ldr r12, =(HSCTLR_M | HSCTLR_A | HSCTLR_TE) ) - orr r1, r1, r12 + ldr r2, =(HSCTLR_EE | HSCTLR_FI | HSCTLR_I | HSCTLR_C) + and r1, r1, r2 + ARM( ldr r2, =(HSCTLR_M | HSCTLR_A) ) + THUMB( ldr r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_TE) ) + orr r1, r1, r2 orr r0, r0, r1 isb mcr p15, 4, r0, c1, c0, 0 @ HSCR - isb - @ Set stack pointer and return to the kernel - mov sp, r2 + @ End of init phase-1 + eret + +phase2: + @ Set stack pointer + mov sp, r0 @ Set HVBAR to point to the HYP vectors - mcr p15, 4, r3, c12, c0, 0 @ HVBAR + mcr p15, 4, r1, c12, c0, 0 @ HVBAR + + @ Jump to the trampoline page + ldr r0, =TRAMPOLINE_VA + adr r1, target + bfi r0, r1, #0, #PAGE_SHIFT + mov pc, r0 + +target: @ We're now in the trampoline code, switch page tables + mcrr p15, 4, r2, r3, c2 + isb + + @ Invalidate the old TLBs + mcr p15, 4, r0, c8, c7, 0 @ TLBIALLH + dsb eret diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 7464824c17ef..4646c17f571c 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -32,9 +32,15 @@ extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[]; +static pgd_t *boot_hyp_pgd; static pgd_t *hyp_pgd; 
static DEFINE_MUTEX(kvm_hyp_pgd_mutex); +static void *init_bounce_page; +static unsigned long hyp_idmap_start; +static unsigned long hyp_idmap_end; +static phys_addr_t hyp_idmap_vector; + static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) { kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); @@ -152,9 +158,12 @@ static void unmap_range(pgd_t *pgdp, unsigned long long start, u64 size) /** * free_hyp_pgds - free Hyp-mode page tables * - * Assumes hyp_pgd is a page table used strictly in Hyp-mode and therefore contains - * either mappings in the kernel memory area (above PAGE_OFFSET), or - * device mappings in the vmalloc range (from VMALLOC_START to VMALLOC_END). + * Assumes hyp_pgd is a page table used strictly in Hyp-mode and + * therefore contains either mappings in the kernel memory area (above + * PAGE_OFFSET), or device mappings in the vmalloc range (from + * VMALLOC_START to VMALLOC_END). + * + * boot_hyp_pgd should only map two pages for the init code. */ void free_hyp_pgds(void) { @@ -162,6 +171,12 @@ void free_hyp_pgds(void) mutex_lock(&kvm_hyp_pgd_mutex); + if (boot_hyp_pgd) { + unmap_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE); + unmap_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); + kfree(boot_hyp_pgd); + } + if (hyp_pgd) { for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE) unmap_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE); @@ -170,6 +185,7 @@ void free_hyp_pgds(void) kfree(hyp_pgd); } + kfree(init_bounce_page); mutex_unlock(&kvm_hyp_pgd_mutex); } @@ -185,6 +201,7 @@ static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, pte = pte_offset_kernel(pmd, addr); kvm_set_pte(pte, pfn_pte(pfn, prot)); get_page(virt_to_page(pte)); + kvm_flush_dcache_to_poc(pte, sizeof(*pte)); pfn++; } while (addr += PAGE_SIZE, addr != end); } @@ -211,6 +228,7 @@ static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, } pmd_populate_kernel(NULL, pmd, pte); get_page(virt_to_page(pmd)); + kvm_flush_dcache_to_poc(pmd, sizeof(*pmd)); } next = pmd_addr_end(addr, end); @@ -248,6 +266,7 @@ static int __create_hyp_mappings(pgd_t *pgdp, } pud_populate(NULL, pud, pmd); get_page(virt_to_page(pud)); + kvm_flush_dcache_to_poc(pud, sizeof(*pud)); } next = pgd_addr_end(addr, end); @@ -689,18 +708,64 @@ void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) phys_addr_t kvm_mmu_get_httbr(void) { - VM_BUG_ON(!virt_addr_valid(hyp_pgd)); return virt_to_phys(hyp_pgd); } +phys_addr_t kvm_mmu_get_boot_httbr(void) +{ + return virt_to_phys(boot_hyp_pgd); +} + +phys_addr_t kvm_get_idmap_vector(void) +{ + return hyp_idmap_vector; +} + int kvm_mmu_init(void) { - unsigned long hyp_idmap_start = virt_to_phys(__hyp_idmap_text_start); - unsigned long hyp_idmap_end = virt_to_phys(__hyp_idmap_text_end); int err; + hyp_idmap_start = virt_to_phys(__hyp_idmap_text_start); + hyp_idmap_end = virt_to_phys(__hyp_idmap_text_end); + hyp_idmap_vector = virt_to_phys(__kvm_hyp_init); + + if ((hyp_idmap_start ^ hyp_idmap_end) & PAGE_MASK) { + /* + * Our init code is crossing a page boundary. Allocate + * a bounce page, copy the code over and use that. + */ + size_t len = __hyp_idmap_text_end - __hyp_idmap_text_start; + phys_addr_t phys_base; + + init_bounce_page = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!init_bounce_page) { + kvm_err("Couldn't allocate HYP init bounce page\n"); + err = -ENOMEM; + goto out; + } + + memcpy(init_bounce_page, __hyp_idmap_text_start, len); + /* + * Warning: the code we just copied to the bounce page + * must be flushed to the point of coherency. 
+ * Otherwise, the data may be sitting in L2, and HYP + * mode won't be able to observe it as it runs with + * caches off at that point. + */ + kvm_flush_dcache_to_poc(init_bounce_page, len); + + phys_base = virt_to_phys(init_bounce_page); + hyp_idmap_vector += phys_base - hyp_idmap_start; + hyp_idmap_start = phys_base; + hyp_idmap_end = phys_base + len; + + kvm_info("Using HYP init bounce page @%lx\n", + (unsigned long)phys_base); + } + hyp_pgd = kzalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL); - if (!hyp_pgd) { + boot_hyp_pgd = kzalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL); + if (!hyp_pgd || !boot_hyp_pgd) { kvm_err("Hyp mode PGD not allocated\n"); err = -ENOMEM; goto out; @@ -718,39 +783,30 @@ int kvm_mmu_init(void) goto out; } + /* Map the very same page at the trampoline VA */ + err = __create_hyp_mappings(boot_hyp_pgd, + TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE, + __phys_to_pfn(hyp_idmap_start), + PAGE_HYP); + if (err) { + kvm_err("Failed to map trampoline @%lx into boot HYP pgd\n", + TRAMPOLINE_VA); + goto out; + } + + /* Map the same page again into the runtime page tables */ + err = __create_hyp_mappings(hyp_pgd, + TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE, + __phys_to_pfn(hyp_idmap_start), + PAGE_HYP); + if (err) { + kvm_err("Failed to map trampoline @%lx into runtime HYP pgd\n", + TRAMPOLINE_VA); + goto out; + } + return 0; out: free_hyp_pgds(); return err; } - -/** - * kvm_clear_idmap - remove all idmaps from the hyp pgd - * - * Free the underlying pmds for all pgds in range and clear the pgds (but - * don't free them) afterwards. - */ -void kvm_clear_hyp_idmap(void) -{ - unsigned long addr, end; - unsigned long next; - pgd_t *pgd = hyp_pgd; - pud_t *pud; - pmd_t *pmd; - - addr = virt_to_phys(__hyp_idmap_text_start); - end = virt_to_phys(__hyp_idmap_text_end); - - pgd += pgd_index(addr); - do { - next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) - continue; - pud = pud_offset(pgd, addr); - pmd = pmd_offset(pud, addr); - - pud_clear(pud); - kvm_clean_pmd_entry(pmd); - pmd_free(NULL, (pmd_t *)((unsigned long)pmd & PAGE_MASK)); - } while (pgd++, addr = next, addr < end); -} From d157f4a5155f4fbd0d1da66b3d2f504c13bd194d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 12 Apr 2013 19:12:07 +0100 Subject: [PATCH 189/204] ARM: KVM: perform HYP initialization for hotplugged CPUs Now that we have the necessary infrastructure to boot a hotplugged CPU at any point in time, wire a CPU notifier that will perform the HYP init for the incoming CPU. Note that this depends on the platform code and/or firmware to boot the incoming CPU with HYP mode enabled and return to the kernel by following the normal boot path (HYP stub installed).
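As an aside, the two switch cases in the notifier below could equally be written with the usual freeze-masking idiom. A sketch of the equivalent callback (hypothetical name, same behaviour):

    static int hyp_cpu_notify(struct notifier_block *self,
                              unsigned long action, void *cpu)
    {
            /* Masking out CPU_TASKS_FROZEN folds CPU_STARTING_FROZEN
             * into CPU_STARTING, covering both cases at once. */
            if ((action & ~CPU_TASKS_FROZEN) == CPU_STARTING)
                    cpu_init_hyp_mode(NULL);
            return NOTIFY_OK;
    }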
Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_mmu.h | 1 + arch/arm/kvm/arm.c | 49 +++++++++++++++++++++++----------- arch/arm/kvm/mmu.c | 35 +++++++++++++++++++----- 3 files changed, 63 insertions(+), 22 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 24b767a8cdb9..472ac7091003 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -44,6 +44,7 @@ int create_hyp_mappings(void *from, void *to); int create_hyp_io_mappings(void *from, void *to, phys_addr_t); +void free_boot_hyp_pgd(void); void free_hyp_pgds(void); int kvm_alloc_stage2_pgd(struct kvm *kvm); diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index fc47bd721ab0..6ea2aed0d29f 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -16,6 +16,7 @@ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ +#include #include #include #include @@ -785,7 +786,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } } -static void cpu_init_hyp_mode(void *vector) +static void cpu_init_hyp_mode(void *dummy) { unsigned long long boot_pgd_ptr; unsigned long long pgd_ptr; @@ -805,12 +806,28 @@ static void cpu_init_hyp_mode(void *vector) __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr); } +static int hyp_init_cpu_notify(struct notifier_block *self, + unsigned long action, void *cpu) +{ + switch (action) { + case CPU_STARTING: + case CPU_STARTING_FROZEN: + cpu_init_hyp_mode(NULL); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block hyp_init_cpu_nb = { + .notifier_call = hyp_init_cpu_notify, +}; + /** * Inits Hyp-mode on all online CPUs */ static int init_hyp_mode(void) { - phys_addr_t init_phys_addr; int cpu; int err = 0; @@ -842,19 +859,6 @@ static int init_hyp_mode(void) per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page; } - /* - * Execute the init code on each CPU. - * - * Note: The stack is not mapped yet, so don't do anything else than - * initializing the hypervisor mode on each CPU using a local stack - * space for temporary storage. - */ - init_phys_addr = virt_to_phys(__kvm_hyp_init); - for_each_online_cpu(cpu) { - smp_call_function_single(cpu, cpu_init_hyp_mode, - (void *)(long)init_phys_addr, 1); - } - /* * Map the Hyp-code called directly from the host */ @@ -899,6 +903,11 @@ static int init_hyp_mode(void) } } + /* + * Execute the init code on each CPU. + */ + on_each_cpu(cpu_init_hyp_mode, NULL, 1); + /* * Init HYP view of VGIC */ @@ -917,6 +926,10 @@ static int init_hyp_mode(void) if (err) goto out_free_mappings; +#ifndef CONFIG_HOTPLUG_CPU + free_boot_hyp_pgd(); +#endif + kvm_perf_init(); kvm_info("Hyp mode initialized successfully\n"); @@ -955,6 +968,12 @@ int kvm_arch_init(void *opaque) if (err) goto out_err; + err = register_cpu_notifier(&hyp_init_cpu_nb); + if (err) { + kvm_err("Cannot register HYP init CPU notifier (%d)\n", err); + goto out_err; + } + kvm_coproc_table_init(); return 0; out_err: diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 4646c17f571c..965706578f13 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -155,6 +155,31 @@ static void unmap_range(pgd_t *pgdp, unsigned long long start, u64 size) } } +/** + * free_boot_hyp_pgd - free HYP boot page tables + * + * Free the HYP boot page tables. The bounce page is also freed. 
+ */ +void free_boot_hyp_pgd(void) +{ + mutex_lock(&kvm_hyp_pgd_mutex); + + if (boot_hyp_pgd) { + unmap_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE); + unmap_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); + kfree(boot_hyp_pgd); + boot_hyp_pgd = NULL; + } + + if (hyp_pgd) + unmap_range(hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); + + kfree(init_bounce_page); + init_bounce_page = NULL; + + mutex_unlock(&kvm_hyp_pgd_mutex); +} + /** * free_hyp_pgds - free Hyp-mode page tables * @@ -169,13 +194,9 @@ void free_hyp_pgds(void) { unsigned long addr; - mutex_lock(&kvm_hyp_pgd_mutex); + free_boot_hyp_pgd(); - if (boot_hyp_pgd) { - unmap_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE); - unmap_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); - kfree(boot_hyp_pgd); - } + mutex_lock(&kvm_hyp_pgd_mutex); if (hyp_pgd) { for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE) @@ -183,9 +204,9 @@ void free_hyp_pgds(void) for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE) unmap_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE); kfree(hyp_pgd); + hyp_pgd = NULL; } - kfree(init_bounce_page); mutex_unlock(&kvm_hyp_pgd_mutex); } From 17b1e31f9201fc102ee3ddd409988e47753e22f9 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 8 Apr 2013 16:47:18 +0100 Subject: [PATCH 190/204] ARM: KVM: add architecture specific hook for capabilities Most of the capabilities are common to both arm and arm64, but we still need to handle the exceptions. Introduce kvm_arch_dev_ioctl_check_extension, which both architectures implement (in the 32bit case, it just returns 0). Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 5 +++++ arch/arm/kvm/arm.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 6c2a35da867e..dcfcbf59fceb 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -214,6 +214,11 @@ static inline void __cpu_init_hyp_mode(unsigned long long boot_pgd_ptr, kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr); } +static inline int kvm_arch_dev_ioctl_check_extension(long ext) +{ + return 0; +} + int kvm_perf_init(void); int kvm_perf_teardown(void); diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 6ea2aed0d29f..cc67cafc5b87 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -206,7 +206,7 @@ int kvm_dev_ioctl_check_extension(long ext) r = KVM_MAX_VCPUS; break; default: - r = 0; + r = kvm_arch_dev_ioctl_check_extension(ext); break; } return r; From 3de50da6901521f9e520b8eb47d092779512e83c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 8 Apr 2013 16:47:19 +0100 Subject: [PATCH 191/204] ARM: KVM: promote vfp_host pointer to generic host cpu context We use the vfp_host pointer to store the host VFP context, should the guest start using VFP itself. Actually, we can use this pointer in a more generic way to store CPU specific data, and arm64 is using it to dump the whole host state before switching to the guest. Simply rename the vfp_host field to host_cpu_context, and the corresponding type to kvm_cpu_context_t. No change in functionality.
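The rename is mechanical, but the underlying pattern is worth spelling out: a dynamically allocated per-CPU area, with each CPU's slice visited once at setup time and the local slice fetched on the hot path. A minimal sketch, assuming nothing beyond the generic percpu API; struct demo_ctxt and the function names are hypothetical.

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/string.h>

struct demo_ctxt {
	unsigned long regs[32];
};

static struct demo_ctxt __percpu *demo_state;

static int demo_alloc(void)
{
	int cpu;

	demo_state = alloc_percpu(struct demo_ctxt);
	if (!demo_state)
		return -ENOMEM;

	/* Visit each CPU's slice once, e.g. to clear or map it. */
	for_each_possible_cpu(cpu) {
		struct demo_ctxt *ctxt = per_cpu_ptr(demo_state, cpu);

		memset(ctxt, 0, sizeof(*ctxt));
	}
	return 0;
}

/* Hot path: this CPU's slice (caller must have preemption disabled). */
static struct demo_ctxt *demo_current(void)
{
	return this_cpu_ptr(demo_state);
}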
Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 8 +++++--- arch/arm/kernel/asm-offsets.c | 2 +- arch/arm/kvm/arm.c | 28 ++++++++++++++-------------- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index dcfcbf59fceb..57cb786a6203 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -87,7 +87,7 @@ struct kvm_vcpu_fault_info { u32 hyp_pc; /* PC when exception was taken from Hyp mode */ }; -typedef struct vfp_hard_struct kvm_kernel_vfp_t; +typedef struct vfp_hard_struct kvm_cpu_context_t; struct kvm_vcpu_arch { struct kvm_regs regs; @@ -105,8 +105,10 @@ struct kvm_vcpu_arch { struct kvm_vcpu_fault_info fault; /* Floating point registers (VFP and Advanced SIMD/NEON) */ - kvm_kernel_vfp_t vfp_guest; - kvm_kernel_vfp_t *vfp_host; + struct vfp_hard_struct vfp_guest; + + /* Host FP context */ + kvm_cpu_context_t *host_cpu_context; /* VGIC state */ struct vgic_cpu vgic_cpu; diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index ee1ac39a58f0..92562a2e9793 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -154,7 +154,7 @@ int main(void) DEFINE(VCPU_MIDR, offsetof(struct kvm_vcpu, arch.midr)); DEFINE(VCPU_CP15, offsetof(struct kvm_vcpu, arch.cp15)); DEFINE(VCPU_VFP_GUEST, offsetof(struct kvm_vcpu, arch.vfp_guest)); - DEFINE(VCPU_VFP_HOST, offsetof(struct kvm_vcpu, arch.vfp_host)); + DEFINE(VCPU_VFP_HOST, offsetof(struct kvm_vcpu, arch.host_cpu_context)); DEFINE(VCPU_REGS, offsetof(struct kvm_vcpu, arch.regs)); DEFINE(VCPU_USR_REGS, offsetof(struct kvm_vcpu, arch.regs.usr_regs)); DEFINE(VCPU_SVC_REGS, offsetof(struct kvm_vcpu, arch.regs.svc_regs)); diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index cc67cafc5b87..089c0a40514c 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -49,7 +49,7 @@ __asm__(".arch_extension virt"); #endif static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); -static kvm_kernel_vfp_t __percpu *kvm_host_vfp_state; +static kvm_cpu_context_t __percpu *kvm_host_cpu_state; static unsigned long hyp_default_vectors; /* Per-CPU variable containing the currently running vcpu. 
*/ @@ -317,7 +317,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { vcpu->cpu = cpu; - vcpu->arch.vfp_host = this_cpu_ptr(kvm_host_vfp_state); + vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state); /* * Check whether this vcpu requires the cache to be flushed on @@ -882,24 +882,24 @@ static int init_hyp_mode(void) } /* - * Map the host VFP structures + * Map the host CPU structures */ - kvm_host_vfp_state = alloc_percpu(kvm_kernel_vfp_t); - if (!kvm_host_vfp_state) { + kvm_host_cpu_state = alloc_percpu(kvm_cpu_context_t); + if (!kvm_host_cpu_state) { err = -ENOMEM; - kvm_err("Cannot allocate host VFP state\n"); + kvm_err("Cannot allocate host CPU state\n"); goto out_free_mappings; } for_each_possible_cpu(cpu) { - kvm_kernel_vfp_t *vfp; + kvm_cpu_context_t *cpu_ctxt; - vfp = per_cpu_ptr(kvm_host_vfp_state, cpu); - err = create_hyp_mappings(vfp, vfp + 1); + cpu_ctxt = per_cpu_ptr(kvm_host_cpu_state, cpu); + err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1); if (err) { - kvm_err("Cannot map host VFP state: %d\n", err); - goto out_free_vfp; + kvm_err("Cannot map host CPU state: %d\n", err); + goto out_free_context; } } @@ -913,7 +913,7 @@ static int init_hyp_mode(void) */ err = kvm_vgic_hyp_init(); if (err) - goto out_free_vfp; + goto out_free_context; #ifdef CONFIG_KVM_ARM_VGIC vgic_present = true; @@ -935,8 +935,8 @@ static int init_hyp_mode(void) kvm_info("Hyp mode initialized successfully\n"); return 0; -out_free_vfp: - free_percpu(kvm_host_vfp_state); +out_free_context: + free_percpu(kvm_host_cpu_state); out_free_mappings: free_hyp_pgds(); out_free_stack_pages: From aa404ddf952fa59c07575529ce93435538a3aebe Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 22 Apr 2013 18:57:46 -0700 Subject: [PATCH 192/204] KVM: ARM: Fix API documentation for ONE_REG encoding Unless I'm mistaken, the size field was encoded 4 bits off and a wrong value was used for 64-bit FP registers. Signed-off-by: Christoffer Dall --- Documentation/virtual/kvm/api.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index c09d1832e935..59fd9e6a5366 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1814,22 +1814,22 @@ ARM registers are mapped using the lower 32 bits. The upper 16 of that is the register group type, or coprocessor number: ARM core registers have the following id bit patterns: - 0x4002 0000 0010 + 0x4020 0000 0010 ARM 32-bit CP15 registers have the following id bit patterns: - 0x4002 0000 000F + 0x4020 0000 000F ARM 64-bit CP15 registers have the following id bit patterns: - 0x4003 0000 000F + 0x4030 0000 000F ARM CCSIDR registers are demultiplexed by CSSELR value: - 0x4002 0000 0011 00 + 0x4020 0000 0011 00 ARM 32-bit VFP control registers have the following id bit patterns: - 0x4002 0000 0012 1 + 0x4020 0000 0012 1 ARM 64-bit FP registers have the following id bit patterns: - 0x4002 0000 0012 0 + 0x4030 0000 0012 0 4.69 KVM_GET_ONE_REG From d21a1c83c7595e387545632e44cd7797b76e19cc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 23 Apr 2013 18:30:36 +0200 Subject: [PATCH 193/204] ARM: KVM: define KVM_ARM_MAX_VCPUS unconditionally The CONFIG_KVM_ARM_MAX_VCPUS symbol is needed in order to build the kernel/context_tracking.c code, which includes the vgic data structures implicitly through the kvm headers.
Defining the symbol to zero on builds without KVM resolves this build error: In file included from include/linux/kvm_host.h:33:0, from kernel/context_tracking.c:18: arch/arm/include/asm/kvm_host.h:28:23: warning: "CONFIG_KVM_ARM_MAX_VCPUS" is not defined [-Wundef] #define KVM_MAX_VCPUS CONFIG_KVM_ARM_MAX_VCPUS ^ arch/arm/include/asm/kvm_vgic.h:34:24: note: in expansion of macro 'KVM_MAX_VCPUS' #define VGIC_MAX_CPUS KVM_MAX_VCPUS ^ arch/arm/include/asm/kvm_vgic.h:38:6: note: in expansion of macro 'VGIC_MAX_CPUS' #if (VGIC_MAX_CPUS > 8) ^ In file included from arch/arm/include/asm/kvm_host.h:41:0, from include/linux/kvm_host.h:33, from kernel/context_tracking.c:18: arch/arm/include/asm/kvm_vgic.h:59:11: error: 'CONFIG_KVM_ARM_MAX_VCPUS' undeclared here (not in a function) } percpu[VGIC_MAX_CPUS]; ^ Signed-off-by: Arnd Bergmann Cc: Marc Zyngier Cc: Christoffer Dall Cc: Will Deacon Signed-off-by: Christoffer Dall --- arch/arm/kvm/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 49dd64e579c2..370e1a8af6ac 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -41,9 +41,9 @@ config KVM_ARM_HOST Provides host support for ARM processors. config KVM_ARM_MAX_VCPUS - int "Number maximum supported virtual CPUs per VM" - depends on KVM_ARM_HOST - default 4 + int "Number maximum supported virtual CPUs per VM" if KVM_ARM_HOST + default 4 if KVM_ARM_HOST + default 0 help Static number of max supported virtual CPUs per VM. From df759217386fcff9c5e45547d1578270fd592a1b Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 23 Apr 2013 14:26:31 -0700 Subject: [PATCH 194/204] KVM: ARM: Fix spelling in error message s/unkown/unknown/ Signed-off-by: Christoffer Dall --- arch/arm/kvm/handle_exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c index 26ad17310a1e..3d74a0be47db 100644 --- a/arch/arm/kvm/handle_exit.c +++ b/arch/arm/kvm/handle_exit.c @@ -115,7 +115,7 @@ static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu) if (hsr_ec >= ARRAY_SIZE(arm_exit_handlers) || !arm_exit_handlers[hsr_ec]) { - kvm_err("Unkown exception class: hsr: %#08x\n", + kvm_err("Unknown exception class: hsr: %#08x\n", (unsigned int)kvm_vcpu_get_hsr(vcpu)); BUG(); } From d4e071ce6acf8d5eddb7615a953193a8b0ad7c38 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Wed, 17 Apr 2013 12:52:01 +0200 Subject: [PATCH 195/204] ARM: KVM: iterate over all CPUs for CPU compatibility check kvm_target_cpu() checks the compatibility of the used CPU with KVM, which is currently limited to ARM Cortex-A15 cores. However, by calling it only once on any random CPU, it assumes that all cores are the same, which is not necessarily the case (for example in big.LITTLE). [ I cut some of the commit message and changed the formatting of the code slightly to pass checkpatch and look more like the rest of the kvm/arm init code - Christoffer ] Signed-off-by: Andre Przywara Signed-off-by: Christoffer Dall --- arch/arm/kvm/arm.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 089c0a40514c..5bc99b452b04 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -947,21 +947,30 @@ out_err: return err; } +static void check_kvm_target_cpu(void *ret) +{ + *(int *)ret = kvm_target_cpu(); +} + /** * Initialize Hyp-mode and memory mappings on all CPUs.
*/ int kvm_arch_init(void *opaque) { int err; + int ret, cpu; if (!is_hyp_mode_available()) { kvm_err("HYP mode not available\n"); return -ENODEV; } - if (kvm_target_cpu() < 0) { - kvm_err("Target CPU not supported!\n"); - return -ENODEV; + for_each_online_cpu(cpu) { + smp_call_function_single(cpu, check_kvm_target_cpu, &ret, 1); + if (ret < 0) { + kvm_err("Error, CPU %d not supported!\n", cpu); + return -ENODEV; + } } err = init_hyp_mode(); From 4cee4b72f1e2600e19779a14d4d9a4f4016ce49f Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 29 Apr 2013 10:54:08 -0600 Subject: [PATCH 196/204] kvm: KVM_CAP_IOMMU only available with device assignment Fix build with CONFIG_PCI unset by linking KVM_CAP_IOMMU to device assignment config option. It has no purpose otherwise. Signed-off-by: Alex Williamson Reported-by: Randy Dunlap Acked-by: Randy Dunlap Signed-off-by: Marcelo Tosatti --- arch/ia64/kvm/kvm-ia64.c | 2 ++ arch/x86/kvm/x86.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index dcc560729c20..5b2dc0d10c8f 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -204,9 +204,11 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_COALESCED_MMIO: r = KVM_COALESCED_MMIO_PAGE_OFFSET; break; +#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_IOMMU: r = iommu_present(&pci_bus_type); break; +#endif default: r = 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8747fef7fd59..96f914e828d4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2544,9 +2544,11 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PV_MMU: /* obsolete */ r = 0; break; +#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_IOMMU: r = iommu_present(&pci_bus_type); break; +#endif case KVM_CAP_MCE: r = KVM_MAX_MCE_BANKS; break; From 121ac4540f606026722ebb1bae68d9c43e518db9 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Mon, 29 Apr 2013 14:07:48 +0000 Subject: [PATCH 197/204] kvm/ppc/mpic: remove default routes from documentation The default routes were removed from the code during patchset respinning, but were not removed from the documentation. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf --- Documentation/virtual/kvm/devices/mpic.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/Documentation/virtual/kvm/devices/mpic.txt b/Documentation/virtual/kvm/devices/mpic.txt index ad0ac778f20a..8257397adc3c 100644 --- a/Documentation/virtual/kvm/devices/mpic.txt +++ b/Documentation/virtual/kvm/devices/mpic.txt @@ -50,7 +50,4 @@ IRQ Routing: regard to any subdivisions in chip documentation such as "internal" or "external" interrupts. - Default routes are established for these pins, with the GSI being equal - to the pin number. - Access to non-SRC interrupts is not implemented through IRQ routing mechanisms. From 398d87836e3074ff630805b77d024a88ac288606 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Tue, 30 Apr 2013 14:57:13 +0000 Subject: [PATCH 198/204] kvm/ppc/mpic: fix mmio region lists when multiple guests used Keeping a linked list of statically defined objects doesn't work very well when we have multiple guests. :-P Switch to an array of constant objects. This fixes a hang when multiple guests are used. 
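The bug class deserves a note: a list_head embedded in a statically defined object can only ever be on one list, so creating a second guest re-links the same nodes and corrupts the first instance's list. A minimal sketch of the fix's shape, with hypothetical names (demo_region, demo_dev); only the array-of-const-pointers idea comes from the patch below.

#define DEMO_MAX_REGIONS 10

struct demo_region {
	unsigned long start;
	unsigned long size;
};

/*
 * Shared, immutable descriptor: safe for any number of device
 * instances precisely because no per-instance state (such as a
 * list_head) lives inside it.
 */
static const struct demo_region demo_gbl = {
	.start	= 0x1000,
	.size	= 0x100,
};

struct demo_dev {
	const struct demo_region *regions[DEMO_MAX_REGIONS];
	int num_regions;
};

/* Per-instance registration only touches the instance itself. */
static void demo_add_region(struct demo_dev *d, const struct demo_region *r)
{
	if (d->num_regions >= DEMO_MAX_REGIONS)
		return;	/* a real driver would WARN() here */
	d->regions[d->num_regions++] = r;
}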
Signed-off-by: Scott Wood [agraf: remove struct list_head from mem_reg] Signed-off-by: Alexander Graf --- arch/powerpc/kvm/mpic.c | 53 +++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c index f3148f8cdc12..886ebf6f698d 100644 --- a/arch/powerpc/kvm/mpic.c +++ b/arch/powerpc/kvm/mpic.c @@ -184,11 +184,14 @@ struct irq_dest { uint32_t outputs_active[NUM_OUTPUTS]; }; +#define MAX_MMIO_REGIONS 10 + struct openpic { struct kvm *kvm; struct kvm_device *dev; struct kvm_io_device mmio; - struct list_head mmio_regions; + const struct mem_reg *mmio_regions[MAX_MMIO_REGIONS]; + int num_mmio_regions; atomic_t users; gpa_t reg_base; @@ -1238,62 +1241,71 @@ static int openpic_cpu_read(void *opaque, gpa_t addr, u32 *ptr) } struct mem_reg { - struct list_head list; int (*read)(void *opaque, gpa_t addr, u32 *ptr); int (*write)(void *opaque, gpa_t addr, u32 val); gpa_t start_addr; int size; }; -static struct mem_reg openpic_gbl_mmio = { +static const struct mem_reg openpic_gbl_mmio = { .write = openpic_gbl_write, .read = openpic_gbl_read, .start_addr = OPENPIC_GLB_REG_START, .size = OPENPIC_GLB_REG_SIZE, }; -static struct mem_reg openpic_tmr_mmio = { +static const struct mem_reg openpic_tmr_mmio = { .write = openpic_tmr_write, .read = openpic_tmr_read, .start_addr = OPENPIC_TMR_REG_START, .size = OPENPIC_TMR_REG_SIZE, }; -static struct mem_reg openpic_cpu_mmio = { +static const struct mem_reg openpic_cpu_mmio = { .write = openpic_cpu_write, .read = openpic_cpu_read, .start_addr = OPENPIC_CPU_REG_START, .size = OPENPIC_CPU_REG_SIZE, }; -static struct mem_reg openpic_src_mmio = { +static const struct mem_reg openpic_src_mmio = { .write = openpic_src_write, .read = openpic_src_read, .start_addr = OPENPIC_SRC_REG_START, .size = OPENPIC_SRC_REG_SIZE, }; -static struct mem_reg openpic_msi_mmio = { +static const struct mem_reg openpic_msi_mmio = { .read = openpic_msi_read, .write = openpic_msi_write, .start_addr = OPENPIC_MSI_REG_START, .size = OPENPIC_MSI_REG_SIZE, }; -static struct mem_reg openpic_summary_mmio = { +static const struct mem_reg openpic_summary_mmio = { .read = openpic_summary_read, .write = openpic_summary_write, .start_addr = OPENPIC_SUMMARY_REG_START, .size = OPENPIC_SUMMARY_REG_SIZE, }; +static void add_mmio_region(struct openpic *opp, const struct mem_reg *mr) +{ + if (opp->num_mmio_regions >= MAX_MMIO_REGIONS) { + WARN(1, "kvm mpic: too many mmio regions\n"); + return; + } + + opp->mmio_regions[opp->num_mmio_regions++] = mr; +} + static void fsl_common_init(struct openpic *opp) { int i; int virq = MAX_SRC; - list_add(&openpic_msi_mmio.list, &opp->mmio_regions); - list_add(&openpic_summary_mmio.list, &opp->mmio_regions); + add_mmio_region(opp, &openpic_msi_mmio); + add_mmio_region(opp, &openpic_summary_mmio); opp->vid = VID_REVISION_1_2; opp->vir = VIR_GENERIC; @@ -1330,10 +1342,10 @@ static void fsl_common_init(struct openpic *opp) static int kvm_mpic_read_internal(struct openpic *opp, gpa_t addr, u32 *ptr) { - struct list_head *node; + int i; - list_for_each(node, &opp->mmio_regions) { - struct mem_reg *mr = list_entry(node, struct mem_reg, list); + for (i = 0; i < opp->num_mmio_regions; i++) { + const struct mem_reg *mr = opp->mmio_regions[i]; if (mr->start_addr > addr || addr >= mr->start_addr + mr->size) continue; @@ -1346,10 +1358,10 @@ static int kvm_mpic_read_internal(struct openpic *opp, gpa_t addr, u32 *ptr) static int kvm_mpic_write_internal(struct openpic *opp, gpa_t addr, u32 
val) { - struct list_head *node; + int i; - list_for_each(node, &opp->mmio_regions) { - struct mem_reg *mr = list_entry(node, struct mem_reg, list); + for (i = 0; i < opp->num_mmio_regions; i++) { + const struct mem_reg *mr = opp->mmio_regions[i]; if (mr->start_addr > addr || addr >= mr->start_addr + mr->size) continue; @@ -1660,11 +1672,10 @@ static int mpic_create(struct kvm_device *dev, u32 type) opp->model = type; spin_lock_init(&opp->lock); - INIT_LIST_HEAD(&opp->mmio_regions); - list_add(&openpic_gbl_mmio.list, &opp->mmio_regions); - list_add(&openpic_tmr_mmio.list, &opp->mmio_regions); - list_add(&openpic_src_mmio.list, &opp->mmio_regions); - list_add(&openpic_cpu_mmio.list, &opp->mmio_regions); + add_mmio_region(opp, &openpic_gbl_mmio); + add_mmio_region(opp, &openpic_tmr_mmio); + add_mmio_region(opp, &openpic_src_mmio); + add_mmio_region(opp, &openpic_cpu_mmio); switch (opp->model) { case KVM_DEV_TYPE_FSL_MPIC_20: From 1d6f6b73396859fb13c1222c19d0ec1421777847 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Tue, 30 Apr 2013 14:57:14 +0000 Subject: [PATCH 199/204] kvm/ppc/mpic: remove users This is an unused (no pun intended) leftover from when this code did reference counting. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf --- arch/powerpc/kvm/mpic.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c index 886ebf6f698d..eaac187eb089 100644 --- a/arch/powerpc/kvm/mpic.c +++ b/arch/powerpc/kvm/mpic.c @@ -192,7 +192,6 @@ struct openpic { struct kvm_io_device mmio; const struct mem_reg *mmio_regions[MAX_MMIO_REGIONS]; int num_mmio_regions; - atomic_t users; gpa_t reg_base; spinlock_t lock; From ed840ee9c8f15a726a1e7b98c9f5cfc49c2a0ff2 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Fri, 26 Apr 2013 14:53:39 +0000 Subject: [PATCH 200/204] kvm/ppc: Hold srcu lock when calling kvm_io_bus_read/write These functions do an srcu_dereference without acquiring the srcu lock themselves. 
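The rule being enforced: any call path that ends in an srcu_dereference() must be bracketed by srcu_read_lock()/srcu_read_unlock(), and the index returned by the lock must be handed back to the unlock. A minimal sketch under that assumption; demo_bus_read() is a hypothetical stand-in for kvm_io_bus_read().

#include <linux/srcu.h>

static int demo_bus_read(void *buf);	/* does srcu_dereference() inside */

static int demo_read(struct srcu_struct *sp, void *buf)
{
	int idx, ret;

	idx = srcu_read_lock(sp);	/* enter SRCU read-side section */
	ret = demo_bus_read(buf);	/* dereference is now safe */
	srcu_read_unlock(sp, idx);	/* must pass back the same idx */

	return ret;
}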
Signed-off-by: Scott Wood Signed-off-by: Alexander Graf --- arch/powerpc/kvm/powerpc.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 31084c6335c9..270773f0d955 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -622,6 +622,8 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int rt, unsigned int bytes, int is_bigendian) { + int idx, ret; + if (bytes > sizeof(run->mmio.data)) { printk(KERN_ERR "%s: bad MMIO length: %d\n", __func__, run->mmio.len); @@ -637,8 +639,14 @@ int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu->mmio_is_write = 0; vcpu->arch.mmio_sign_extend = 0; - if (!kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr, - bytes, &run->mmio.data)) { + idx = srcu_read_lock(&vcpu->kvm->srcu); + + ret = kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr, + bytes, &run->mmio.data); + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (!ret) { kvmppc_complete_mmio_load(vcpu, run); vcpu->mmio_needed = 0; return EMULATE_DONE; @@ -663,6 +671,7 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, u64 val, unsigned int bytes, int is_bigendian) { void *data = run->mmio.data; + int idx, ret; if (bytes > sizeof(run->mmio.data)) { printk(KERN_ERR "%s: bad MMIO length: %d\n", __func__, @@ -692,8 +701,14 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, } } - if (!kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr, - bytes, &run->mmio.data)) { + idx = srcu_read_lock(&vcpu->kvm->srcu); + + ret = kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr, + bytes, &run->mmio.data); + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (!ret) { vcpu->mmio_needed = 0; return EMULATE_DONE; } From d133b40f2cdd527af01090ffd6a041485d1a29b4 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 1 May 2013 19:17:31 +0000 Subject: [PATCH 201/204] kvm/ppc/mpic: fix missing unlock in set_base_addr() Add the missing unlock before return from function set_base_addr() when it disables the mapping. Introduced by commit 5df554ad5b7522ea62b0ff9d5be35183494efc21 (kvm/ppc/mpic: in-kernel MPIC emulation) Signed-off-by: Wei Yongjun Signed-off-by: Alexander Graf --- arch/powerpc/kvm/mpic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c index eaac187eb089..2861ae9eaae6 100644 --- a/arch/powerpc/kvm/mpic.c +++ b/arch/powerpc/kvm/mpic.c @@ -1486,8 +1486,8 @@ static int set_base_addr(struct openpic *opp, struct kvm_device_attr *attr) map_mmio(opp); - mutex_unlock(&opp->kvm->slots_lock); out: + mutex_unlock(&opp->kvm->slots_lock); return 0; } From 5975a2e0950291a6bfe9fd5880e7952ff87764be Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 27 Apr 2013 00:28:37 +0000 Subject: [PATCH 202/204] KVM: PPC: Book3S: Add API for in-kernel XICS emulation This adds the API for userspace to instantiate an XICS device in a VM and connect VCPUs to it. The API consists of a new device type for the KVM_CREATE_DEVICE ioctl, a new capability KVM_CAP_IRQ_XICS, which functions similarly to KVM_CAP_IRQ_MPIC, and the KVM_IRQ_LINE ioctl, which is used to assert and deassert interrupt inputs of the XICS. The XICS device has one attribute group, KVM_DEV_XICS_GRP_SOURCES. Each attribute within this group corresponds to the state of one interrupt source.
The attribute number is the same as the interrupt source number. This does not support irq routing or irqfd yet. Signed-off-by: Paul Mackerras Acked-by: David Gibson Signed-off-by: Alexander Graf --- Documentation/virtual/kvm/api.txt | 8 + Documentation/virtual/kvm/devices/xics.txt | 66 +++++++ arch/powerpc/include/asm/kvm_ppc.h | 2 + arch/powerpc/include/uapi/asm/kvm.h | 12 ++ arch/powerpc/kvm/book3s_xics.c | 194 ++++++++++++++++++--- arch/powerpc/kvm/book3s_xics.h | 1 + arch/powerpc/kvm/irq.h | 3 + arch/powerpc/kvm/powerpc.c | 22 +++ include/linux/kvm_host.h | 1 + include/uapi/linux/kvm.h | 2 + virt/kvm/kvm_main.c | 5 + 11 files changed, 289 insertions(+), 27 deletions(-) create mode 100644 Documentation/virtual/kvm/devices/xics.txt diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index c09d1832e935..03492f95ed39 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2772,3 +2772,11 @@ Parameters: args[0] is the MPIC device fd args[1] is the MPIC CPU number for this vcpu This capability connects the vcpu to an in-kernel MPIC device. + +6.7 KVM_CAP_IRQ_XICS + +Architectures: ppc +Parameters: args[0] is the XICS device fd + args[1] is the XICS CPU number (server ID) for this vcpu + +This capability connects the vcpu to an in-kernel XICS device. diff --git a/Documentation/virtual/kvm/devices/xics.txt b/Documentation/virtual/kvm/devices/xics.txt new file mode 100644 index 000000000000..42864935ac5d --- /dev/null +++ b/Documentation/virtual/kvm/devices/xics.txt @@ -0,0 +1,66 @@ +XICS interrupt controller + +Device type supported: KVM_DEV_TYPE_XICS + +Groups: + KVM_DEV_XICS_SOURCES + Attributes: One per interrupt source, indexed by the source number. + +This device emulates the XICS (eXternal Interrupt Controller +Specification) defined in PAPR. The XICS has a set of interrupt +sources, each identified by a 20-bit source number, and a set of +Interrupt Control Presentation (ICP) entities, also called "servers", +each associated with a virtual CPU. + +The ICP entities are created by enabling the KVM_CAP_IRQ_ARCH +capability for each vcpu, specifying KVM_CAP_IRQ_XICS in args[0] and +the interrupt server number (i.e. the vcpu number from the XICS's +point of view) in args[1] of the kvm_enable_cap struct. Each ICP has +64 bits of state which can be read and written using the +KVM_GET_ONE_REG and KVM_SET_ONE_REG ioctls on the vcpu. The 64 bit +state word has the following bitfields, starting at the +least-significant end of the word: + +* Unused, 16 bits + +* Pending interrupt priority, 8 bits + Zero is the highest priority, 255 means no interrupt is pending. + +* Pending IPI (inter-processor interrupt) priority, 8 bits + Zero is the highest priority, 255 means no IPI is pending. + +* Pending interrupt source number, 24 bits + Zero means no interrupt pending, 2 means an IPI is pending + +* Current processor priority, 8 bits + Zero is the highest priority, meaning no interrupts can be + delivered, and 255 is the lowest priority. + +Each source has 64 bits of state that can be read and written using +the KVM_GET_DEVICE_ATTR and KVM_SET_DEVICE_ATTR ioctls, specifying the +KVM_DEV_XICS_SOURCES attribute group, with the attribute number being +the interrupt source number. 
The 64 bit state word has the following +bitfields, starting from the least-significant end of the word: + +* Destination (server number), 32 bits + This specifies where the interrupt should be sent, and is the + interrupt server number specified for the destination vcpu. + +* Priority, 8 bits + This is the priority specified for this interrupt source, where 0 is + the highest priority and 255 is the lowest. An interrupt with a + priority of 255 will never be delivered. + +* Level sensitive flag, 1 bit + This bit is 1 for a level-sensitive interrupt source, or 0 for + edge-sensitive (or MSI). + +* Masked flag, 1 bit + This bit is set to 1 if the interrupt is masked (cannot be delivered + regardless of its priority), for example by the ibm,int-off RTAS + call, or 0 if it is not masked. + +* Pending flag, 1 bit + This bit is 1 if the source has a pending interrupt, otherwise 0. + +Only one XICS instance may be created per VM. diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index d7339df19259..a5287fe03d77 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -315,6 +315,8 @@ extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args); extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd); extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu); extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval); +extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev, + struct kvm_vcpu *vcpu, u32 cpu); #else static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) { return 0; } diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 427b9aca2a0f..0fb1a6e9ff90 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -499,4 +499,16 @@ struct kvm_get_htab_header { #define KVM_REG_PPC_TLB3PS (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9a) #define KVM_REG_PPC_EPTCFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9b) +/* PPC64 eXternal Interrupt Controller Specification */ +#define KVM_DEV_XICS_GRP_SOURCES 1 /* 64-bit source attributes */ + +/* Layout of 64-bit source attribute values */ +#define KVM_XICS_DESTINATION_SHIFT 0 +#define KVM_XICS_DESTINATION_MASK 0xffffffffULL +#define KVM_XICS_PRIORITY_SHIFT 32 +#define KVM_XICS_PRIORITY_MASK 0xff +#define KVM_XICS_LEVEL_SENSITIVE (1ULL << 40) +#define KVM_XICS_MASKED (1ULL << 41) +#define KVM_XICS_PENDING (1ULL << 42) + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index ee841ed8a690..f7a103756618 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -55,8 +56,6 @@ * * - Make ICS lockless as well, or at least a per-interrupt lock or hashed * locks array to improve scalability - * - * - ioctl's to save/restore the entire state for snapshot & migration */ /* -- ICS routines -- */ @@ -64,7 +63,8 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, u32 new_irq); -static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level) +static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level, + bool report_status) { struct ics_irq_state *state; struct kvmppc_ics *ics; @@ -81,6 +81,9 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level) if (!state->exists) return -EINVAL; + if (report_status) + return state->asserted; + /* * We set state->asserted locklessly. 
This should be fine as * we are the only setter, thus concurrent access is undefined @@ -96,7 +99,7 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level) /* Attempt delivery */ icp_deliver_irq(xics, NULL, irq); - return 0; + return state->asserted; } static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics, @@ -891,8 +894,8 @@ static void xics_debugfs_init(struct kvmppc_xics *xics) kfree(name); } -struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm, - struct kvmppc_xics *xics, int irq) +static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm, + struct kvmppc_xics *xics, int irq) { struct kvmppc_ics *ics; int i, icsid; @@ -1044,34 +1047,138 @@ int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval) return 0; } -/* -- ioctls -- */ - -int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args) +static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr) { - struct kvmppc_xics *xics; - int r; + int ret; + struct kvmppc_ics *ics; + struct ics_irq_state *irqp; + u64 __user *ubufp = (u64 __user *) addr; + u16 idx; + u64 val, prio; - /* locking against multiple callers? */ + ics = kvmppc_xics_find_ics(xics, irq, &idx); + if (!ics) + return -ENOENT; - xics = kvm->arch.xics; - if (!xics) - return -ENODEV; - - switch (args->level) { - case KVM_INTERRUPT_SET: - case KVM_INTERRUPT_SET_LEVEL: - case KVM_INTERRUPT_UNSET: - r = ics_deliver_irq(xics, args->irq, args->level); - break; - default: - r = -EINVAL; + irqp = &ics->irq_state[idx]; + mutex_lock(&ics->lock); + ret = -ENOENT; + if (irqp->exists) { + val = irqp->server; + prio = irqp->priority; + if (prio == MASKED) { + val |= KVM_XICS_MASKED; + prio = irqp->saved_priority; + } + val |= prio << KVM_XICS_PRIORITY_SHIFT; + if (irqp->asserted) + val |= KVM_XICS_LEVEL_SENSITIVE | KVM_XICS_PENDING; + else if (irqp->masked_pending || irqp->resend) + val |= KVM_XICS_PENDING; + ret = 0; } + mutex_unlock(&ics->lock); - return r; + if (!ret && put_user(val, ubufp)) + ret = -EFAULT; + + return ret; } -void kvmppc_xics_free(struct kvmppc_xics *xics) +static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr) { + struct kvmppc_ics *ics; + struct ics_irq_state *irqp; + u64 __user *ubufp = (u64 __user *) addr; + u16 idx; + u64 val; + u8 prio; + u32 server; + + if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS) + return -ENOENT; + + ics = kvmppc_xics_find_ics(xics, irq, &idx); + if (!ics) { + ics = kvmppc_xics_create_ics(xics->kvm, xics, irq); + if (!ics) + return -ENOMEM; + } + irqp = &ics->irq_state[idx]; + if (get_user(val, ubufp)) + return -EFAULT; + + server = val & KVM_XICS_DESTINATION_MASK; + prio = val >> KVM_XICS_PRIORITY_SHIFT; + if (prio != MASKED && + kvmppc_xics_find_server(xics->kvm, server) == NULL) + return -EINVAL; + + mutex_lock(&ics->lock); + irqp->server = server; + irqp->saved_priority = prio; + if (val & KVM_XICS_MASKED) + prio = MASKED; + irqp->priority = prio; + irqp->resend = 0; + irqp->masked_pending = 0; + irqp->asserted = 0; + if ((val & KVM_XICS_PENDING) && (val & KVM_XICS_LEVEL_SENSITIVE)) + irqp->asserted = 1; + irqp->exists = 1; + mutex_unlock(&ics->lock); + + if (val & KVM_XICS_PENDING) + icp_deliver_irq(xics, NULL, irqp->number); + + return 0; +} + +int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, + bool line_status) +{ + struct kvmppc_xics *xics = kvm->arch.xics; + + return ics_deliver_irq(xics, irq, level, line_status); +} + +static int xics_set_attr(struct kvm_device *dev, struct kvm_device_attr 
*attr) +{ + struct kvmppc_xics *xics = dev->private; + + switch (attr->group) { + case KVM_DEV_XICS_GRP_SOURCES: + return xics_set_source(xics, attr->attr, attr->addr); + } + return -ENXIO; +} + +static int xics_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + struct kvmppc_xics *xics = dev->private; + + switch (attr->group) { + case KVM_DEV_XICS_GRP_SOURCES: + return xics_get_source(xics, attr->attr, attr->addr); + } + return -ENXIO; +} + +static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_XICS_GRP_SOURCES: + if (attr->attr >= KVMPPC_XICS_FIRST_IRQ && + attr->attr < KVMPPC_XICS_NR_IRQS) + return 0; + break; + } + return -ENXIO; +} + +static void kvmppc_xics_free(struct kvm_device *dev) +{ + struct kvmppc_xics *xics = dev->private; int i; struct kvm *kvm = xics->kvm; @@ -1083,17 +1190,21 @@ void kvmppc_xics_free(struct kvmppc_xics *xics) for (i = 0; i <= xics->max_icsid; i++) kfree(xics->ics[i]); kfree(xics); + kfree(dev); } -int kvm_xics_create(struct kvm *kvm, u32 type) +static int kvmppc_xics_create(struct kvm_device *dev, u32 type) { struct kvmppc_xics *xics; + struct kvm *kvm = dev->kvm; int ret = 0; xics = kzalloc(sizeof(*xics), GFP_KERNEL); if (!xics) return -ENOMEM; + dev->private = xics; + xics->dev = dev; xics->kvm = kvm; /* Already there ? */ @@ -1120,6 +1231,35 @@ int kvm_xics_create(struct kvm *kvm, u32 type) return 0; } +struct kvm_device_ops kvm_xics_ops = { + .name = "kvm-xics", + .create = kvmppc_xics_create, + .destroy = kvmppc_xics_free, + .set_attr = xics_set_attr, + .get_attr = xics_get_attr, + .has_attr = xics_has_attr, +}; + +int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, + u32 xcpu) +{ + struct kvmppc_xics *xics = dev->private; + int r = -EBUSY; + + if (dev->ops != &kvm_xics_ops) + return -EPERM; + if (xics->kvm != vcpu->kvm) + return -EPERM; + if (vcpu->arch.irq_type) + return -EBUSY; + + r = kvmppc_xics_create_icp(vcpu, xcpu); + if (!r) + vcpu->arch.irq_type = KVMPPC_IRQ_XICS; + + return r; +} + void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { if (!vcpu->arch.icp) diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h index e4fdec3dde77..dd9326c5c19b 100644 --- a/arch/powerpc/kvm/book3s_xics.h +++ b/arch/powerpc/kvm/book3s_xics.h @@ -88,6 +88,7 @@ struct kvmppc_ics { struct kvmppc_xics { struct kvm *kvm; + struct kvm_device *dev; struct dentry *dentry; u32 max_icsid; bool real_mode; diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h index f1e27fdc8c2e..5a9a10b90762 100644 --- a/arch/powerpc/kvm/irq.h +++ b/arch/powerpc/kvm/irq.h @@ -9,6 +9,9 @@ static inline int irqchip_in_kernel(struct kvm *kvm) #ifdef CONFIG_KVM_MPIC ret = ret || (kvm->arch.mpic != NULL); +#endif +#ifdef CONFIG_KVM_XICS + ret = ret || (kvm->arch.xics != NULL); #endif smp_rmb(); return ret; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 270773f0d955..6316ee336e88 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -342,6 +342,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_SPAPR_TCE: case KVM_CAP_PPC_ALLOC_HTAB: case KVM_CAP_PPC_RTAS: +#ifdef CONFIG_KVM_XICS + case KVM_CAP_IRQ_XICS: +#endif r = 1; break; #endif /* CONFIG_PPC_BOOK3S_64 */ @@ -837,6 +840,25 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, break; } #endif +#ifdef CONFIG_KVM_XICS + case KVM_CAP_IRQ_XICS: { + struct file *filp; + struct kvm_device *dev; + + r = -EBADF; + filp = fget(cap->args[0]); + if 
(!filp) break; + + r = -EPERM; + dev = kvm_device_from_filp(filp); + if (dev) + r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]); + + fput(filp); + break; + } +#endif /* CONFIG_KVM_XICS */ default: r = -EINVAL; break; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 996661eb8e99..7823b6369c5e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1086,6 +1086,7 @@ void kvm_device_put(struct kvm_device *dev); struct kvm_device *kvm_device_from_filp(struct file *filp); extern struct kvm_device_ops kvm_mpic_ops; +extern struct kvm_device_ops kvm_xics_ops; #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 965e5b52dee0..a5c86fc34a37 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -665,6 +665,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_DEVICE_CTRL 89 #define KVM_CAP_IRQ_MPIC 90 #define KVM_CAP_PPC_RTAS 91 +#define KVM_CAP_IRQ_XICS 92 #ifdef KVM_CAP_IRQ_ROUTING @@ -837,6 +838,7 @@ struct kvm_device_attr { #define KVM_DEV_TYPE_FSL_MPIC_20 1 #define KVM_DEV_TYPE_FSL_MPIC_42 2 +#define KVM_DEV_TYPE_XICS 3 /* * ioctls for VM fds diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5da9f02a2a67..e6e7abe16d05 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2246,6 +2246,11 @@ static int kvm_ioctl_create_device(struct kvm *kvm, case KVM_DEV_TYPE_FSL_MPIC_42: ops = &kvm_mpic_ops; break; +#endif +#ifdef CONFIG_KVM_XICS + case KVM_DEV_TYPE_XICS: + ops = &kvm_xics_ops; + break; #endif default: return -ENODEV; From 03b28f8133165dbe4cd922054d599e26b8119508 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 29 Apr 2013 16:46:42 +0200 Subject: [PATCH 203/204] KVM: x86: Account for failing enable_irq_window for NMI window request With VMX, enable_irq_window can now return -EBUSY, in which case an immediate exit shall be requested before entering the guest. Account for this also in enable_nmi_window, which uses enable_irq_window, e.g., in the absence of vnmi support. Reviewed-by: Paolo Bonzini Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 5 +++-- arch/x86/kvm/vmx.c | 16 +++++++--------- arch/x86/kvm/x86.c | 3 ++- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ec14b7245a4e..3741c653767c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -695,7 +695,7 @@ struct kvm_x86_ops { int (*nmi_allowed)(struct kvm_vcpu *vcpu); bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); - void (*enable_nmi_window)(struct kvm_vcpu *vcpu); + int (*enable_nmi_window)(struct kvm_vcpu *vcpu); int (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); int (*vm_has_apicv)(struct kvm *kvm); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7f896cbe717f..3421d5a92c06 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3649,13 +3649,13 @@ static int enable_irq_window(struct kvm_vcpu *vcpu) return 0; } -static void enable_nmi_window(struct kvm_vcpu *vcpu) +static int enable_nmi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK) - return; /* IRET will cause a vm exit */ + return 0; /* IRET will cause a vm exit */ /* * Something prevents NMI from been injected.
Single step over possible @@ -3664,6 +3664,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); update_db_bp_intercept(vcpu); + return 0; } static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e10217e99d2c..e53a5f7aa8a9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4417,22 +4417,20 @@ static int enable_irq_window(struct kvm_vcpu *vcpu) return 0; } -static void enable_nmi_window(struct kvm_vcpu *vcpu) +static int enable_nmi_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; - if (!cpu_has_virtual_nmis()) { - enable_irq_window(vcpu); - return; - } + if (!cpu_has_virtual_nmis()) + return enable_irq_window(vcpu); + + if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) + return enable_irq_window(vcpu); - if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { - enable_irq_window(vcpu); - return; - } cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + return 0; } static void vmx_inject_irq(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 96f914e828d4..94f35d22c5fb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5756,7 +5756,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* enable NMI/IRQ window open exits if needed */ if (vcpu->arch.nmi_pending) - kvm_x86_ops->enable_nmi_window(vcpu); + req_immediate_exit = + kvm_x86_ops->enable_nmi_window(vcpu) != 0; else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) req_immediate_exit = kvm_x86_ops->enable_irq_window(vcpu) != 0; From db6ae6158186a17165ef990bda2895ae7594b039 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Tue, 30 Apr 2013 20:00:45 -0500 Subject: [PATCH 204/204] kvm: Add compat_ioctl for device control API This API shouldn't have 32/64-bit issues, but VFS assumes it does unless told otherwise. Signed-off-by: Scott Wood Signed-off-by: Gleb Natapov --- virt/kvm/kvm_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e6e7abe16d05..8fd325a0a1b1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2221,6 +2221,9 @@ static int kvm_device_release(struct inode *inode, struct file *filp) static const struct file_operations kvm_device_fops = { .unlocked_ioctl = kvm_device_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = kvm_device_ioctl, +#endif .release = kvm_device_release, };
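The pattern in this last patch generalizes: when an ioctl ABI only uses fixed-size, alignment-stable types, the 64-bit handler can be reused verbatim for 32-bit callers, but only if .compat_ioctl is wired up; otherwise the VFS rejects compat calls outright. A minimal sketch with hypothetical names (demo_ioctl, demo_fops):

#include <linux/fs.h>

static long demo_ioctl(struct file *filp, unsigned int ioctl,
		       unsigned long arg)
{
	/*
	 * All arguments use fixed-size types (e.g. __u32/__u64), so
	 * the layout is identical for 32-bit and 64-bit userspace.
	 */
	return 0;
}

static const struct file_operations demo_fops = {
	.unlocked_ioctl = demo_ioctl,
#ifdef CONFIG_COMPAT
	/*
	 * Without this, 32-bit userspace on a 64-bit kernel gets
	 * -ENOTTY even though the ABI would have been compatible.
	 */
	.compat_ioctl	= demo_ioctl,
#endif
};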