2008-04-17 12:28:09 +08:00
|
|
|
/*
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License, version 2, as
|
|
|
|
* published by the Free Software Foundation.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program; if not, write to the Free Software
|
|
|
|
* Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
|
|
*
|
|
|
|
* Copyright IBM Corp. 2008
|
|
|
|
*
|
|
|
|
* Authors: Hollis Blanchard <hollisb@us.ibm.com>
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef __POWERPC_KVM_PPC_H__
|
|
|
|
#define __POWERPC_KVM_PPC_H__
|
|
|
|
|
|
|
|
/* This file exists just so we can dereference kvm_vcpu, avoiding nested header
|
|
|
|
* dependencies. */
|
|
|
|
|
|
|
|
#include <linux/mutex.h>
|
|
|
|
#include <linux/timer.h>
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/kvm_types.h>
|
|
|
|
#include <linux/kvm_host.h>
|
2012-09-26 04:31:56 +08:00
|
|
|
#include <linux/bug.h>
|
2010-01-15 21:49:12 +08:00
|
|
|
#ifdef CONFIG_PPC_BOOK3S
|
|
|
|
#include <asm/kvm_book3s.h>
|
2010-04-16 06:11:40 +08:00
|
|
|
#else
|
|
|
|
#include <asm/kvm_booke.h>
|
2010-01-15 21:49:12 +08:00
|
|
|
#endif
|
KVM: PPC: Allow book3s_hv guests to use SMT processor modes
This lifts the restriction that book3s_hv guests can only run one
hardware thread per core, and allows them to use up to 4 threads
per core on POWER7. The host still has to run single-threaded.
This capability is advertised to qemu through a new KVM_CAP_PPC_SMT
capability. The return value of the ioctl querying this capability
is the number of vcpus per virtual CPU core (vcore), currently 4.
To use this, the host kernel should be booted with all threads
active, and then all the secondary threads should be offlined.
This will put the secondary threads into nap mode. KVM will then
wake them from nap mode and use them for running guest code (while
they are still offline). To wake the secondary threads, we send
them an IPI using a new xics_wake_cpu() function, implemented in
arch/powerpc/sysdev/xics/icp-native.c. In other words, at this stage
we assume that the platform has a XICS interrupt controller and
we are using icp-native.c to drive it. Since the woken thread will
need to acknowledge and clear the IPI, we also export the base
physical address of the XICS registers using kvmppc_set_xics_phys()
for use in the low-level KVM book3s code.
When a vcpu is created, it is assigned to a virtual CPU core.
The vcore number is obtained by dividing the vcpu number by the
number of threads per core in the host. This number is exported
to userspace via the KVM_CAP_PPC_SMT capability. If qemu wishes
to run the guest in single-threaded mode, it should make all vcpu
numbers be multiples of the number of threads per core.
We distinguish three states of a vcpu: runnable (i.e., ready to execute
the guest), blocked (that is, idle), and busy in host. We currently
implement a policy that the vcore can run only when all its threads
are runnable or blocked. This way, if a vcpu needs to execute elsewhere
in the kernel or in qemu, it can do so without being starved of CPU
by the other vcpus.
When a vcore starts to run, it executes in the context of one of the
vcpu threads. The other vcpu threads all go to sleep and stay asleep
until something happens requiring the vcpu thread to return to qemu,
or to wake up to run the vcore (this can happen when another vcpu
thread goes from busy in host state to blocked).
It can happen that a vcpu goes from blocked to runnable state (e.g.
because of an interrupt), and the vcore it belongs to is already
running. In that case it can start to run immediately as long as
none of the vcpus in the vcore have started to exit the guest.
We send the next free thread in the vcore an IPI to get it to start
to execute the guest. It synchronizes with the other threads via
the vcore->entry_exit_count field to make sure that it doesn't go
into the guest if the other vcpus are exiting by the time that it
is ready to actually enter the guest.
Note that there is no fixed relationship between the hardware thread
number and the vcpu number. Hardware threads are assigned to vcpus
as they become runnable, so we will always use the lower-numbered
hardware threads in preference to higher-numbered threads if not all
the vcpus in the vcore are runnable, regardless of which vcpus are
runnable.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 08:23:08 +08:00
|
|
|
#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
|
|
|
|
#include <asm/paca.h>
|
|
|
|
#endif
|
2008-04-17 12:28:09 +08:00
|
|
|
|
|
|
|
/*
 * Result codes returned by the instruction/MMIO emulation paths.
 * Enumerators are implicitly numbered from 0 in declaration order.
 */
enum emulation_result {
	EMULATE_DONE,         /* no further processing */
	EMULATE_DO_MMIO,      /* kvm_run filled with MMIO request */
	EMULATE_DO_DCR,       /* kvm_run filled with DCR request */
	EMULATE_FAIL,         /* can't emulate this instruction */
	EMULATE_AGAIN,        /* something went wrong. go again */
	EMULATE_EXIT_USER,    /* emulation requires exit to user-space */
};
|
|
|
|
|
2011-06-29 08:19:50 +08:00
|
|
|
extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
|
2008-04-17 12:28:09 +08:00
|
|
|
extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
|
2009-10-30 13:47:07 +08:00
|
|
|
extern void kvmppc_handler_highmem(void);
|
2008-04-17 12:28:09 +08:00
|
|
|
|
|
|
|
extern void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu);
|
|
|
|
extern int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
|
|
|
|
unsigned int rt, unsigned int bytes,
|
|
|
|
int is_bigendian);
|
2010-02-19 18:00:30 +08:00
|
|
|
extern int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu,
|
|
|
|
unsigned int rt, unsigned int bytes,
|
|
|
|
int is_bigendian);
|
2008-04-17 12:28:09 +08:00
|
|
|
extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
|
2010-02-19 18:00:29 +08:00
|
|
|
u64 val, unsigned int bytes, int is_bigendian);
|
2008-04-17 12:28:09 +08:00
|
|
|
|
|
|
|
extern int kvmppc_emulate_instruction(struct kvm_run *run,
|
|
|
|
struct kvm_vcpu *vcpu);
|
2008-05-22 07:22:51 +08:00
|
|
|
extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
|
2008-11-05 23:36:16 +08:00
|
|
|
extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
|
2011-04-28 06:24:21 +08:00
|
|
|
extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
|
2011-11-17 20:39:59 +08:00
|
|
|
extern void kvmppc_decrementer_func(unsigned long data);
|
2011-08-10 19:57:08 +08:00
|
|
|
extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
|
2012-08-09 04:38:19 +08:00
|
|
|
extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu);
|
|
|
|
extern void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu);
|
2008-04-17 12:28:09 +08:00
|
|
|
|
2009-01-04 06:22:59 +08:00
|
|
|
/* Core-specific hooks */
|
|
|
|
|
2008-12-03 05:51:53 +08:00
|
|
|
extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr,
|
2008-12-03 05:51:55 +08:00
|
|
|
unsigned int gtlb_idx);
|
2008-04-17 12:28:09 +08:00
|
|
|
extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode);
|
2008-07-26 02:54:53 +08:00
|
|
|
extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid);
|
2009-01-04 06:22:59 +08:00
|
|
|
extern void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu);
|
2010-04-16 06:11:45 +08:00
|
|
|
extern int kvmppc_mmu_init(struct kvm_vcpu *vcpu);
|
2009-01-04 06:23:03 +08:00
|
|
|
extern int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr);
|
|
|
|
extern int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr);
|
2009-01-04 06:23:02 +08:00
|
|
|
extern gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int gtlb_index,
|
|
|
|
gva_t eaddr);
|
2009-01-04 06:23:11 +08:00
|
|
|
extern void kvmppc_mmu_dtlb_miss(struct kvm_vcpu *vcpu);
|
|
|
|
extern void kvmppc_mmu_itlb_miss(struct kvm_vcpu *vcpu);
|
2008-11-05 23:36:14 +08:00
|
|
|
|
2008-11-05 23:36:18 +08:00
|
|
|
extern struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm,
|
|
|
|
unsigned int id);
|
|
|
|
extern void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu);
|
2008-11-05 23:36:17 +08:00
|
|
|
extern int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu);
|
2008-11-05 23:36:14 +08:00
|
|
|
extern int kvmppc_core_check_processor_compat(void);
|
2008-11-05 23:36:17 +08:00
|
|
|
extern int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
|
|
|
|
struct kvm_translation *tr);
|
2008-11-05 23:36:14 +08:00
|
|
|
|
|
|
|
extern void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
|
|
|
|
extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu);
|
|
|
|
|
2012-02-16 22:07:37 +08:00
|
|
|
extern int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu);
|
2008-11-05 23:36:14 +08:00
|
|
|
extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu);
|
2010-01-08 09:58:07 +08:00
|
|
|
extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags);
|
2008-11-05 23:36:14 +08:00
|
|
|
extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu);
|
2009-12-22 03:21:24 +08:00
|
|
|
extern void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu);
|
2008-11-05 23:36:14 +08:00
|
|
|
extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
|
|
|
|
struct kvm_interrupt *irq);
|
2013-02-14 22:00:25 +08:00
|
|
|
extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu);
|
2012-07-31 06:19:50 +08:00
|
|
|
extern void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu);
|
2012-08-13 18:50:35 +08:00
|
|
|
extern int kvmppc_core_check_requests(struct kvm_vcpu *vcpu);
|
2008-11-05 23:36:16 +08:00
|
|
|
|
2008-11-05 23:36:18 +08:00
|
|
|
extern int kvmppc_booke_init(void);
|
|
|
|
extern void kvmppc_booke_exit(void);
|
|
|
|
|
2008-11-25 01:37:38 +08:00
|
|
|
extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
|
2010-07-29 20:47:48 +08:00
|
|
|
extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu);
|
2011-06-15 07:34:41 +08:00
|
|
|
extern void kvmppc_map_magic(struct kvm_vcpu *vcpu);
|
2008-11-25 01:37:38 +08:00
|
|
|
|
KVM: PPC: Book3S HV: Make the guest hash table size configurable
This adds a new ioctl to enable userspace to control the size of the guest
hashed page table (HPT) and to clear it out when resetting the guest.
The KVM_PPC_ALLOCATE_HTAB ioctl is a VM ioctl and takes as its parameter
a pointer to a u32 containing the desired order of the HPT (log base 2
of the size in bytes), which is updated on successful return to the
actual order of the HPT which was allocated.
There must be no vcpus running at the time of this ioctl. To enforce
this, we now keep a count of the number of vcpus running in
kvm->arch.vcpus_running.
If the ioctl is called when a HPT has already been allocated, we don't
reallocate the HPT but just clear it out. We first clear the
kvm->arch.rma_setup_done flag, which has two effects: (a) since we hold
the kvm->lock mutex, it will prevent any vcpus from starting to run until
we're done, and (b) it means that the first vcpu to run after we're done
will re-establish the VRMA if necessary.
If userspace doesn't call this ioctl before running the first vcpu, the
kernel will allocate a default-sized HPT at that point. We do it then
rather than when creating the VM, as the code did previously, so that
userspace has a chance to do the ioctl if it wants.
When allocating the HPT, we can allocate either from the kernel page
allocator, or from the preallocated pool. If userspace is asking for
a different size from the preallocated HPTs, we first try to allocate
using the kernel page allocator. Then we try to allocate from the
preallocated pool, and then if that fails, we try allocating decreasing
sizes from the kernel page allocator, down to the minimum size allowed
(256kB). Note that the kernel page allocator limits allocations to
1 << CONFIG_FORCE_MAX_ZONEORDER pages, which by default corresponds to
16MB (on 64-bit powerpc, at least).
Signed-off-by: Paul Mackerras <paulus@samba.org>
[agraf: fix module compilation]
Signed-off-by: Alexander Graf <agraf@suse.de>
2012-05-04 10:32:53 +08:00
|
|
|
extern long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp);
|
|
|
|
extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp);
|
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 08:21:34 +08:00
|
|
|
extern void kvmppc_free_hpt(struct kvm *kvm);
|
|
|
|
extern long kvmppc_prepare_vrma(struct kvm *kvm,
|
|
|
|
struct kvm_userspace_memory_region *mem);
|
KVM: PPC: Only get pages when actually needed, not in prepare_memory_region()
This removes the code from kvmppc_core_prepare_memory_region() that
looked up the VMA for the region being added and called hva_to_page
to get the pfns for the memory. We have no guarantee that there will
be anything mapped there at the time of the KVM_SET_USER_MEMORY_REGION
ioctl call; userspace can do that ioctl and then map memory into the
region later.
Instead we defer looking up the pfn for each memory page until it is
needed, which generally means when the guest does an H_ENTER hcall on
the page. Since we can't call get_user_pages in real mode, if we don't
already have the pfn for the page, kvmppc_h_enter() will return
H_TOO_HARD and we then call kvmppc_virtmode_h_enter() once we get back
to kernel context. That calls kvmppc_get_guest_page() to get the pfn
for the page, and then calls back to kvmppc_h_enter() to redo the HPTE
insertion.
When the first vcpu starts executing, we need to have the RMO or VRMA
region mapped so that the guest's real mode accesses will work. Thus
we now have a check in kvmppc_vcpu_run() to see if the RMO/VRMA is set
up and if not, call kvmppc_hv_setup_rma(). It checks if the memslot
starting at guest physical 0 now has RMO memory mapped there; if so it
sets it up for the guest, otherwise on POWER7 it sets up the VRMA.
The function that does that, kvmppc_map_vrma, is now a bit simpler,
as it calls kvmppc_virtmode_h_enter instead of creating the HPTE itself.
Since we are now potentially updating entries in the slot_phys[]
arrays from multiple vcpu threads, we now have a spinlock protecting
those updates to ensure that we don't lose track of any references
to pages.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>
2011-12-12 20:31:00 +08:00
|
|
|
extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
|
2011-12-12 20:31:41 +08:00
|
|
|
struct kvm_memory_slot *memslot, unsigned long porder);
|
2011-06-29 08:22:05 +08:00
|
|
|
extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
|
2013-04-18 04:30:26 +08:00
|
|
|
|
2011-06-29 08:22:41 +08:00
|
|
|
extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
|
|
|
|
struct kvm_create_spapr_tce *args);
|
2012-03-16 05:58:34 +08:00
|
|
|
extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
|
|
|
|
unsigned long ioba, unsigned long tce);
|
2013-07-02 13:45:17 +08:00
|
|
|
extern struct kvm_rma_info *kvm_alloc_rma(void);
|
|
|
|
extern void kvm_release_rma(struct kvm_rma_info *ri);
|
2013-07-02 13:45:16 +08:00
|
|
|
extern struct page *kvm_alloc_hpt(unsigned long nr_pages);
|
|
|
|
extern void kvm_release_hpt(struct page *page, unsigned long nr_pages);
|
2011-06-29 08:19:22 +08:00
|
|
|
extern int kvmppc_core_init_vm(struct kvm *kvm);
|
|
|
|
extern void kvmppc_core_destroy_vm(struct kvm *kvm);
|
2013-10-08 00:48:00 +08:00
|
|
|
extern void kvmppc_core_free_memslot(struct kvm *kvm,
|
|
|
|
struct kvm_memory_slot *free,
|
2012-09-11 21:27:46 +08:00
|
|
|
struct kvm_memory_slot *dont);
|
2013-10-08 00:48:00 +08:00
|
|
|
extern int kvmppc_core_create_memslot(struct kvm *kvm,
|
|
|
|
struct kvm_memory_slot *slot,
|
2012-09-11 21:27:46 +08:00
|
|
|
unsigned long npages);
|
2011-06-29 08:19:22 +08:00
|
|
|
extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
|
2012-09-11 21:27:46 +08:00
|
|
|
struct kvm_memory_slot *memslot,
|
2011-06-29 08:19:22 +08:00
|
|
|
struct kvm_userspace_memory_region *mem);
|
|
|
|
extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
|
2012-09-11 21:28:18 +08:00
|
|
|
struct kvm_userspace_memory_region *mem,
|
2013-02-27 18:45:25 +08:00
|
|
|
const struct kvm_memory_slot *old);
|
2012-04-27 03:43:42 +08:00
|
|
|
extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm,
|
|
|
|
struct kvm_ppc_smmu_info *info);
|
2012-09-11 21:28:18 +08:00
|
|
|
extern void kvmppc_core_flush_memslot(struct kvm *kvm,
|
|
|
|
struct kvm_memory_slot *memslot);
|
2011-06-29 08:19:22 +08:00
|
|
|
|
2011-12-20 23:34:43 +08:00
|
|
|
extern int kvmppc_bookehv_init(void);
|
|
|
|
extern void kvmppc_bookehv_exit(void);
|
|
|
|
|
2012-08-10 18:28:50 +08:00
|
|
|
extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu);
|
|
|
|
|
KVM: PPC: Book3S HV: Provide a method for userspace to read and write the HPT
A new ioctl, KVM_PPC_GET_HTAB_FD, returns a file descriptor. Reads on
this fd return the contents of the HPT (hashed page table), writes
create and/or remove entries in the HPT. There is a new capability,
KVM_CAP_PPC_HTAB_FD, to indicate the presence of the ioctl. The ioctl
takes an argument structure with the index of the first HPT entry to
read out and a set of flags. The flags indicate whether the user is
intending to read or write the HPT, and whether to return all entries
or only the "bolted" entries (those with the bolted bit, 0x10, set in
the first doubleword).
This is intended for use in implementing qemu's savevm/loadvm and for
live migration. Therefore, on reads, the first pass returns information
about all HPTEs (or all bolted HPTEs). When the first pass reaches the
end of the HPT, it returns from the read. Subsequent reads only return
information about HPTEs that have changed since they were last read.
A read that finds no changed HPTEs in the HPT following where the last
read finished will return 0 bytes.
The format of the data provides a simple run-length compression of the
invalid entries. Each block of data starts with a header that indicates
the index (position in the HPT, which is just an array), the number of
valid entries starting at that index (may be zero), and the number of
invalid entries following those valid entries. The valid entries, 16
bytes each, follow the header. The invalid entries are not explicitly
represented.
Signed-off-by: Paul Mackerras <paulus@samba.org>
[agraf: fix documentation]
Signed-off-by: Alexander Graf <agraf@suse.de>
2012-11-20 06:57:20 +08:00
|
|
|
extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *);
|
|
|
|
|
2013-04-12 22:08:46 +08:00
|
|
|
int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq);
|
|
|
|
|
2013-04-18 04:30:00 +08:00
|
|
|
extern int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp);
|
|
|
|
extern int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu);
|
|
|
|
extern void kvmppc_rtas_tokens_free(struct kvm *kvm);
|
2013-04-18 04:30:26 +08:00
|
|
|
extern int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server,
|
|
|
|
u32 priority);
|
|
|
|
extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server,
|
|
|
|
u32 *priority);
|
2013-04-18 04:32:04 +08:00
|
|
|
extern int kvmppc_xics_int_on(struct kvm *kvm, u32 irq);
|
|
|
|
extern int kvmppc_xics_int_off(struct kvm *kvm, u32 irq);
|
2013-04-18 04:30:00 +08:00
|
|
|
|
2013-10-08 00:47:53 +08:00
|
|
|
union kvmppc_one_reg {
|
|
|
|
u32 wval;
|
|
|
|
u64 dval;
|
|
|
|
vector128 vval;
|
|
|
|
u64 vsxval[2];
|
|
|
|
struct {
|
|
|
|
u64 addr;
|
|
|
|
u64 length;
|
|
|
|
} vpaval;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct kvmppc_ops {
|
2013-10-08 00:48:01 +08:00
|
|
|
struct module *owner;
|
2013-10-08 00:47:53 +08:00
|
|
|
int (*get_sregs)(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
|
|
|
|
int (*set_sregs)(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
|
|
|
|
int (*get_one_reg)(struct kvm_vcpu *vcpu, u64 id,
|
|
|
|
union kvmppc_one_reg *val);
|
|
|
|
int (*set_one_reg)(struct kvm_vcpu *vcpu, u64 id,
|
|
|
|
union kvmppc_one_reg *val);
|
|
|
|
void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
|
|
|
|
void (*vcpu_put)(struct kvm_vcpu *vcpu);
|
|
|
|
void (*set_msr)(struct kvm_vcpu *vcpu, u64 msr);
|
|
|
|
int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu);
|
|
|
|
struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned int id);
|
|
|
|
void (*vcpu_free)(struct kvm_vcpu *vcpu);
|
|
|
|
int (*check_requests)(struct kvm_vcpu *vcpu);
|
|
|
|
int (*get_dirty_log)(struct kvm *kvm, struct kvm_dirty_log *log);
|
|
|
|
void (*flush_memslot)(struct kvm *kvm, struct kvm_memory_slot *memslot);
|
|
|
|
int (*prepare_memory_region)(struct kvm *kvm,
|
|
|
|
struct kvm_memory_slot *memslot,
|
|
|
|
struct kvm_userspace_memory_region *mem);
|
|
|
|
void (*commit_memory_region)(struct kvm *kvm,
|
|
|
|
struct kvm_userspace_memory_region *mem,
|
|
|
|
const struct kvm_memory_slot *old);
|
|
|
|
int (*unmap_hva)(struct kvm *kvm, unsigned long hva);
|
|
|
|
int (*unmap_hva_range)(struct kvm *kvm, unsigned long start,
|
|
|
|
unsigned long end);
|
|
|
|
int (*age_hva)(struct kvm *kvm, unsigned long hva);
|
|
|
|
int (*test_age_hva)(struct kvm *kvm, unsigned long hva);
|
|
|
|
void (*set_spte_hva)(struct kvm *kvm, unsigned long hva, pte_t pte);
|
|
|
|
void (*mmu_destroy)(struct kvm_vcpu *vcpu);
|
|
|
|
void (*free_memslot)(struct kvm_memory_slot *free,
|
|
|
|
struct kvm_memory_slot *dont);
|
|
|
|
int (*create_memslot)(struct kvm_memory_slot *slot,
|
|
|
|
unsigned long npages);
|
|
|
|
int (*init_vm)(struct kvm *kvm);
|
|
|
|
void (*destroy_vm)(struct kvm *kvm);
|
|
|
|
int (*get_smmu_info)(struct kvm *kvm, struct kvm_ppc_smmu_info *info);
|
|
|
|
int (*emulate_op)(struct kvm_run *run, struct kvm_vcpu *vcpu,
|
|
|
|
unsigned int inst, int *advance);
|
|
|
|
int (*emulate_mtspr)(struct kvm_vcpu *vcpu, int sprn, ulong spr_val);
|
|
|
|
int (*emulate_mfspr)(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val);
|
|
|
|
void (*fast_vcpu_kick)(struct kvm_vcpu *vcpu);
|
|
|
|
long (*arch_vm_ioctl)(struct file *filp, unsigned int ioctl,
|
|
|
|
unsigned long arg);
|
|
|
|
|
|
|
|
};
|
|
|
|
|
2013-10-08 00:48:01 +08:00
|
|
|
extern struct kvmppc_ops *kvmppc_hv_ops;
|
|
|
|
extern struct kvmppc_ops *kvmppc_pr_ops;
|
2013-10-08 00:47:53 +08:00
|
|
|
|
2013-10-08 00:48:02 +08:00
|
|
|
static inline bool is_kvmppc_hv_enabled(struct kvm *kvm)
|
|
|
|
{
|
|
|
|
return kvm->arch.kvm_ops == kvmppc_hv_ops;
|
|
|
|
}
|
|
|
|
|
2010-02-19 18:00:42 +08:00
|
|
|
/*
|
|
|
|
* Cuts out inst bits with ordering according to spec.
|
|
|
|
* That means the leftmost bit is zero. All given bits are included.
|
|
|
|
*/
|
|
|
|
static inline u32 kvmppc_get_field(u64 inst, int msb, int lsb)
|
|
|
|
{
|
|
|
|
u32 r;
|
|
|
|
u32 mask;
|
|
|
|
|
|
|
|
BUG_ON(msb > lsb);
|
|
|
|
|
|
|
|
mask = (1 << (lsb - msb + 1)) - 1;
|
|
|
|
r = (inst >> (63 - lsb)) & mask;
|
|
|
|
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Replaces inst bits with ordering according to spec.
|
|
|
|
*/
|
|
|
|
static inline u32 kvmppc_set_field(u64 inst, int msb, int lsb, int value)
|
|
|
|
{
|
|
|
|
u32 r;
|
|
|
|
u32 mask;
|
|
|
|
|
|
|
|
BUG_ON(msb > lsb);
|
|
|
|
|
|
|
|
mask = ((1 << (lsb - msb + 1)) - 1) << (63 - lsb);
|
|
|
|
r = (inst & ~mask) | ((value << (63 - lsb)) & mask);
|
|
|
|
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
2012-09-26 04:31:56 +08:00
|
|
|
/*
 * Size in bytes encoded in a ONE_REG id: the size field (selected by
 * KVM_REG_SIZE_MASK / KVM_REG_SIZE_SHIFT) holds log2 of the byte count.
 */
#define one_reg_size(id)	\
	(1ul << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
|
|
|
|
|
|
|
|
/*
 * Wrap the register value @reg in a union kvmppc_one_reg, storing it in
 * wval for 4-byte ids and dval for 8-byte ids (size taken from @id via
 * one_reg_size()).  Any other size hits BUG().  Evaluates to the union.
 */
#define get_reg_val(id, reg)	({		\
	union kvmppc_one_reg __u;		\
	switch (one_reg_size(id)) {		\
	case 4: __u.wval = (reg); break;	\
	case 8: __u.dval = (reg); break;	\
	default: BUG();				\
	}					\
	__u;					\
})
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Extract a register value from the union kvmppc_one_reg @val as a u64,
 * reading wval for 4-byte ids and dval for 8-byte ids (size taken from @id
 * via one_reg_size()).  Any other size hits BUG().  Evaluates to the u64.
 */
#define set_reg_val(id, val)	({		\
	u64 __v;				\
	switch (one_reg_size(id)) {		\
	case 4: __v = (val).wval; break;	\
	case 8: __v = (val).dval; break;	\
	default: BUG();				\
	}					\
	__v;					\
})
|
|
|
|
|
2013-10-08 00:47:53 +08:00
|
|
|
int kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
|
2011-04-28 06:24:21 +08:00
|
|
|
int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
|
|
|
|
|
2013-10-08 00:47:53 +08:00
|
|
|
int kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
|
2011-04-28 06:24:21 +08:00
|
|
|
int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
|
|
|
|
|
2011-12-12 20:26:50 +08:00
|
|
|
int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg);
|
|
|
|
int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg);
|
2012-09-26 04:31:56 +08:00
|
|
|
int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *);
|
|
|
|
int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *);
|
2011-12-12 20:26:50 +08:00
|
|
|
|
2011-04-28 06:24:21 +08:00
|
|
|
void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
|
|
|
|
|
2013-04-12 22:08:46 +08:00
|
|
|
struct openpic;
|
|
|
|
|
2013-10-08 00:47:52 +08:00
|
|
|
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
|
2013-07-02 13:45:16 +08:00
|
|
|
extern void kvm_cma_reserve(void) __init;
|
KVM: PPC: Allow book3s_hv guests to use SMT processor modes
This lifts the restriction that book3s_hv guests can only run one
hardware thread per core, and allows them to use up to 4 threads
per core on POWER7. The host still has to run single-threaded.
This capability is advertised to qemu through a new KVM_CAP_PPC_SMT
capability. The return value of the ioctl querying this capability
is the number of vcpus per virtual CPU core (vcore), currently 4.
To use this, the host kernel should be booted with all threads
active, and then all the secondary threads should be offlined.
This will put the secondary threads into nap mode. KVM will then
wake them from nap mode and use them for running guest code (while
they are still offline). To wake the secondary threads, we send
them an IPI using a new xics_wake_cpu() function, implemented in
arch/powerpc/sysdev/xics/icp-native.c. In other words, at this stage
we assume that the platform has a XICS interrupt controller and
we are using icp-native.c to drive it. Since the woken thread will
need to acknowledge and clear the IPI, we also export the base
physical address of the XICS registers using kvmppc_set_xics_phys()
for use in the low-level KVM book3s code.
When a vcpu is created, it is assigned to a virtual CPU core.
The vcore number is obtained by dividing the vcpu number by the
number of threads per core in the host. This number is exported
to userspace via the KVM_CAP_PPC_SMT capability. If qemu wishes
to run the guest in single-threaded mode, it should make all vcpu
numbers be multiples of the number of threads per core.
We distinguish three states of a vcpu: runnable (i.e., ready to execute
the guest), blocked (that is, idle), and busy in host. We currently
implement a policy that the vcore can run only when all its threads
are runnable or blocked. This way, if a vcpu needs to execute elsewhere
in the kernel or in qemu, it can do so without being starved of CPU
by the other vcpus.
When a vcore starts to run, it executes in the context of one of the
vcpu threads. The other vcpu threads all go to sleep and stay asleep
until something happens requiring the vcpu thread to return to qemu,
or to wake up to run the vcore (this can happen when another vcpu
thread goes from busy in host state to blocked).
It can happen that a vcpu goes from blocked to runnable state (e.g.
because of an interrupt), and the vcore it belongs to is already
running. In that case it can start to run immediately as long as
none of the vcpus in the vcore have started to exit the guest.
We send the next free thread in the vcore an IPI to get it to start
to execute the guest. It synchronizes with the other threads via
the vcore->entry_exit_count field to make sure that it doesn't go
into the guest if the other vcpus are exiting by the time that it
is ready to actually enter the guest.
Note that there is no fixed relationship between the hardware thread
number and the vcpu number. Hardware threads are assigned to vcpus
as they become runnable, so we will always use the lower-numbered
hardware threads in preference to higher-numbered threads if not all
the vcpus in the vcore are runnable, regardless of which vcpus are
runnable.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 08:23:08 +08:00
|
|
|
static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
|
|
|
|
{
|
|
|
|
paca[cpu].kvm_hstate.xics_phys = addr;
|
|
|
|
}
|
KVM: PPC: Allocate RMAs (Real Mode Areas) at boot for use by guests
This adds infrastructure which will be needed to allow book3s_hv KVM to
run on older POWER processors, including PPC970, which don't support
the Virtual Real Mode Area (VRMA) facility, but only the Real Mode
Offset (RMO) facility. These processors require a physically
contiguous, aligned area of memory for each guest. When the guest does
an access in real mode (MMU off), the address is compared against a
limit value, and if it is lower, the address is ORed with an offset
value (from the Real Mode Offset Register (RMOR)) and the result becomes
the real address for the access. The size of the RMA has to be one of
a set of supported values, which usually includes 64MB, 128MB, 256MB
and some larger powers of 2.
Since we are unlikely to be able to allocate 64MB or more of physically
contiguous memory after the kernel has been running for a while, we
allocate a pool of RMAs at boot time using the bootmem allocator. The
size and number of the RMAs can be set using the kvm_rma_size=xx and
kvm_rma_count=xx kernel command line options.
KVM exports a new capability, KVM_CAP_PPC_RMA, to signal the availability
of the pool of preallocated RMAs. The capability value is 1 if the
processor can use an RMA but doesn't require one (because it supports
the VRMA facility), or 2 if the processor requires an RMA for each guest.
This adds a new ioctl, KVM_ALLOCATE_RMA, which allocates an RMA from the
pool and returns a file descriptor which can be used to map the RMA. It
also returns the size of the RMA in the argument structure.
Having an RMA means we will get multiple KVM_SET_USER_MEMORY_REGION
ioctl calls from userspace. To cope with this, we now preallocate the
kvm->arch.ram_pginfo array when the VM is created with a size sufficient
for up to 64GB of guest memory. Subsequently we will get rid of this
array and use memory associated with each memslot instead.
This moves most of the code that translates the user addresses into
host pfns (page frame numbers) out of kvmppc_prepare_vrma up one level
to kvmppc_core_prepare_memory_region. Also, instead of having to look
up the VMA for each page in order to check the page size, we now check
that the pages we get are compound pages of 16MB. However, if we are
adding memory that is mapped to an RMA, we don't bother with calling
get_user_pages_fast and instead just offset from the base pfn for the
RMA.
Typically the RMA gets added after vcpus are created, which makes it
inconvenient to have the LPCR (logical partition control register) value
in the vcpu->arch struct, since the LPCR controls whether the processor
uses RMA or VRMA for the guest. This moves the LPCR value into the
kvm->arch struct and arranges for the MER (mediated external request)
bit, which is the only bit that varies between vcpus, to be set in
assembly code when going into the guest if there is a pending external
interrupt request.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 08:25:44 +08:00
|
|
|
|
2013-04-18 04:30:50 +08:00
|
|
|
static inline u32 kvmppc_get_xics_latch(void)
|
|
|
|
{
|
2013-10-08 00:47:56 +08:00
|
|
|
u32 xirr;
|
2013-04-18 04:30:50 +08:00
|
|
|
|
2013-10-08 00:47:56 +08:00
|
|
|
xirr = get_paca()->kvm_hstate.saved_xirr;
|
2013-04-18 04:30:50 +08:00
|
|
|
get_paca()->kvm_hstate.saved_xirr = 0;
|
|
|
|
return xirr;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Store @host_ipi in the kvm_hstate.host_ipi field of @cpu's paca entry. */
static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
{
	paca[cpu].kvm_hstate.host_ipi = host_ipi;
}
|
|
|
|
|
2013-10-08 00:47:53 +08:00
|
|
|
static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu)
|
|
|
|
{
|
2013-10-08 00:48:01 +08:00
|
|
|
vcpu->kvm->arch.kvm_ops->fast_vcpu_kick(vcpu);
|
2013-10-08 00:47:53 +08:00
|
|
|
}
|
KVM: PPC: Allocate RMAs (Real Mode Areas) at boot for use by guests
This adds infrastructure which will be needed to allow book3s_hv KVM to
run on older POWER processors, including PPC970, which don't support
the Virtual Real Mode Area (VRMA) facility, but only the Real Mode
Offset (RMO) facility. These processors require a physically
contiguous, aligned area of memory for each guest. When the guest does
an access in real mode (MMU off), the address is compared against a
limit value, and if it is lower, the address is ORed with an offset
value (from the Real Mode Offset Register (RMOR)) and the result becomes
the real address for the access. The size of the RMA has to be one of
a set of supported values, which usually includes 64MB, 128MB, 256MB
and some larger powers of 2.
Since we are unlikely to be able to allocate 64MB or more of physically
contiguous memory after the kernel has been running for a while, we
allocate a pool of RMAs at boot time using the bootmem allocator. The
size and number of the RMAs can be set using the kvm_rma_size=xx and
kvm_rma_count=xx kernel command line options.
KVM exports a new capability, KVM_CAP_PPC_RMA, to signal the availability
of the pool of preallocated RMAs. The capability value is 1 if the
processor can use an RMA but doesn't require one (because it supports
the VRMA facility), or 2 if the processor requires an RMA for each guest.
This adds a new ioctl, KVM_ALLOCATE_RMA, which allocates an RMA from the
pool and returns a file descriptor which can be used to map the RMA. It
also returns the size of the RMA in the argument structure.
Having an RMA means we will get multiple KVM_SET_USER_MEMORY_REGION
ioctl calls from userspace. To cope with this, we now preallocate the
kvm->arch.ram_pginfo array when the VM is created with a size sufficient
for up to 64GB of guest memory. Subsequently we will get rid of this
array and use memory associated with each memslot instead.
This moves most of the code that translates the user addresses into
host pfns (page frame numbers) out of kvmppc_prepare_vrma up one level
to kvmppc_core_prepare_memory_region. Also, instead of having to look
up the VMA for each page in order to check the page size, we now check
that the pages we get are compound pages of 16MB. However, if we are
adding memory that is mapped to an RMA, we don't bother with calling
get_user_pages_fast and instead just offset from the base pfn for the
RMA.
Typically the RMA gets added after vcpus are created, which makes it
inconvenient to have the LPCR (logical partition control register) value
in the vcpu->arch struct, since the LPCR controls whether the processor
uses RMA or VRMA for the guest. This moves the LPCR value into the
kvm->arch struct and arranges for the MER (mediated external request)
bit, which is the only bit that varies between vcpus, to be set in
assembly code when going into the guest if there is a pending external
interrupt request.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 08:25:44 +08:00
|
|
|
|
KVM: PPC: Allow book3s_hv guests to use SMT processor modes
This lifts the restriction that book3s_hv guests can only run one
hardware thread per core, and allows them to use up to 4 threads
per core on POWER7. The host still has to run single-threaded.
This capability is advertised to qemu through a new KVM_CAP_PPC_SMT
capability. The return value of the ioctl querying this capability
is the number of vcpus per virtual CPU core (vcore), currently 4.
To use this, the host kernel should be booted with all threads
active, and then all the secondary threads should be offlined.
This will put the secondary threads into nap mode. KVM will then
wake them from nap mode and use them for running guest code (while
they are still offline). To wake the secondary threads, we send
them an IPI using a new xics_wake_cpu() function, implemented in
arch/powerpc/sysdev/xics/icp-native.c. In other words, at this stage
we assume that the platform has a XICS interrupt controller and
we are using icp-native.c to drive it. Since the woken thread will
need to acknowledge and clear the IPI, we also export the base
physical address of the XICS registers using kvmppc_set_xics_phys()
for use in the low-level KVM book3s code.
When a vcpu is created, it is assigned to a virtual CPU core.
The vcore number is obtained by dividing the vcpu number by the
number of threads per core in the host. This number is exported
to userspace via the KVM_CAP_PPC_SMT capability. If qemu wishes
to run the guest in single-threaded mode, it should make all vcpu
numbers be multiples of the number of threads per core.
We distinguish three states of a vcpu: runnable (i.e., ready to execute
the guest), blocked (that is, idle), and busy in host. We currently
implement a policy that the vcore can run only when all its threads
are runnable or blocked. This way, if a vcpu needs to execute elsewhere
in the kernel or in qemu, it can do so without being starved of CPU
by the other vcpus.
When a vcore starts to run, it executes in the context of one of the
vcpu threads. The other vcpu threads all go to sleep and stay asleep
until something happens requiring the vcpu thread to return to qemu,
or to wake up to run the vcore (this can happen when another vcpu
thread goes from busy in host state to blocked).
It can happen that a vcpu goes from blocked to runnable state (e.g.
because of an interrupt), and the vcore it belongs to is already
running. In that case it can start to run immediately as long as
none of the vcpus in the vcore have started to exit the guest.
We send the next free thread in the vcore an IPI to get it to start
to execute the guest. It synchronizes with the other threads via
the vcore->entry_exit_count field to make sure that it doesn't go
into the guest if the other vcpus are exiting by the time that it
is ready to actually enter the guest.
Note that there is no fixed relationship between the hardware thread
number and the vcpu number. Hardware threads are assigned to vcpus
as they become runnable, so we will always use the lower-numbered
hardware threads in preference to higher-numbered threads if not all
the vcpus in the vcore are runnable, regardless of which vcpus are
runnable.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 08:23:08 +08:00
|
|
|
#else
|
2013-07-02 13:45:16 +08:00
|
|
|
/* No-op variant used in the #else configuration. */
static inline void __init kvm_cma_reserve(void)
{
}
|
|
|
|
|
KVM: PPC: Allow book3s_hv guests to use SMT processor modes
This lifts the restriction that book3s_hv guests can only run one
hardware thread per core, and allows them to use up to 4 threads
per core on POWER7. The host still has to run single-threaded.
This capability is advertised to qemu through a new KVM_CAP_PPC_SMT
capability. The return value of the ioctl querying this capability
is the number of vcpus per virtual CPU core (vcore), currently 4.
To use this, the host kernel should be booted with all threads
active, and then all the secondary threads should be offlined.
This will put the secondary threads into nap mode. KVM will then
wake them from nap mode and use them for running guest code (while
they are still offline). To wake the secondary threads, we send
them an IPI using a new xics_wake_cpu() function, implemented in
arch/powerpc/sysdev/xics/icp-native.c. In other words, at this stage
we assume that the platform has a XICS interrupt controller and
we are using icp-native.c to drive it. Since the woken thread will
need to acknowledge and clear the IPI, we also export the base
physical address of the XICS registers using kvmppc_set_xics_phys()
for use in the low-level KVM book3s code.
When a vcpu is created, it is assigned to a virtual CPU core.
The vcore number is obtained by dividing the vcpu number by the
number of threads per core in the host. This number is exported
to userspace via the KVM_CAP_PPC_SMT capability. If qemu wishes
to run the guest in single-threaded mode, it should make all vcpu
numbers be multiples of the number of threads per core.
We distinguish three states of a vcpu: runnable (i.e., ready to execute
the guest), blocked (that is, idle), and busy in host. We currently
implement a policy that the vcore can run only when all its threads
are runnable or blocked. This way, if a vcpu needs to execute elsewhere
in the kernel or in qemu, it can do so without being starved of CPU
by the other vcpus.
When a vcore starts to run, it executes in the context of one of the
vcpu threads. The other vcpu threads all go to sleep and stay asleep
until something happens requiring the vcpu thread to return to qemu,
or to wake up to run the vcore (this can happen when another vcpu
thread goes from busy in host state to blocked).
It can happen that a vcpu goes from blocked to runnable state (e.g.
because of an interrupt), and the vcore it belongs to is already
running. In that case it can start to run immediately as long as
none of the vcpus in the vcore have started to exit the guest.
We send the next free thread in the vcore an IPI to get it to start
to execute the guest. It synchronizes with the other threads via
the vcore->entry_exit_count field to make sure that it doesn't go
into the guest if the other vcpus are exiting by the time that it
is ready to actually enter the guest.
Note that there is no fixed relationship between the hardware thread
number and the vcpu number. Hardware threads are assigned to vcpus
as they become runnable, so we will always use the lower-numbered
hardware threads in preference to higher-numbered threads if not all
the vcpus in the vcore are runnable, regardless of which vcpus are
runnable.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 08:23:08 +08:00
|
|
|
/* No-op variant used in the #else configuration; both arguments are ignored. */
static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
{
}
|
KVM: PPC: Allocate RMAs (Real Mode Areas) at boot for use by guests
This adds infrastructure which will be needed to allow book3s_hv KVM to
run on older POWER processors, including PPC970, which don't support
the Virtual Real Mode Area (VRMA) facility, but only the Real Mode
Offset (RMO) facility. These processors require a physically
contiguous, aligned area of memory for each guest. When the guest does
an access in real mode (MMU off), the address is compared against a
limit value, and if it is lower, the address is ORed with an offset
value (from the Real Mode Offset Register (RMOR)) and the result becomes
the real address for the access. The size of the RMA has to be one of
a set of supported values, which usually includes 64MB, 128MB, 256MB
and some larger powers of 2.
Since we are unlikely to be able to allocate 64MB or more of physically
contiguous memory after the kernel has been running for a while, we
allocate a pool of RMAs at boot time using the bootmem allocator. The
size and number of the RMAs can be set using the kvm_rma_size=xx and
kvm_rma_count=xx kernel command line options.
KVM exports a new capability, KVM_CAP_PPC_RMA, to signal the availability
of the pool of preallocated RMAs. The capability value is 1 if the
processor can use an RMA but doesn't require one (because it supports
the VRMA facility), or 2 if the processor requires an RMA for each guest.
This adds a new ioctl, KVM_ALLOCATE_RMA, which allocates an RMA from the
pool and returns a file descriptor which can be used to map the RMA. It
also returns the size of the RMA in the argument structure.
Having an RMA means we will get multiple KVM_SET_USER_MEMORY_REGION
ioctl calls from userspace. To cope with this, we now preallocate the
kvm->arch.ram_pginfo array when the VM is created with a size sufficient
for up to 64GB of guest memory. Subsequently we will get rid of this
array and use memory associated with each memslot instead.
This moves most of the code that translates the user addresses into
host pfns (page frame numbers) out of kvmppc_prepare_vrma up one level
to kvmppc_core_prepare_memory_region. Also, instead of having to look
up the VMA for each page in order to check the page size, we now check
that the pages we get are compound pages of 16MB. However, if we are
adding memory that is mapped to an RMA, we don't bother with calling
get_user_pages_fast and instead just offset from the base pfn for the
RMA.
Typically the RMA gets added after vcpus are created, which makes it
inconvenient to have the LPCR (logical partition control register) value
in the vcpu->arch struct, since the LPCR controls whether the processor
uses RMA or VRMA for the guest. This moves the LPCR value into the
kvm->arch struct and arranges for the MER (mediated external request)
bit, which is the only bit that varies between vcpus, to be set in
assembly code when going into the guest if there is a pending external
interrupt request.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 08:25:44 +08:00
|
|
|
|
2013-04-18 04:30:50 +08:00
|
|
|
/* Variant used in the #else configuration: always reports 0. */
static inline u32 kvmppc_get_xics_latch(void)
{
	return 0;
}
|
|
|
|
|
|
|
|
/* No-op variant used in the #else configuration; both arguments are ignored. */
static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
{
}
|
|
|
|
|
|
|
|
/* Variant used in the #else configuration: defers to kvm_vcpu_kick(). */
static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu)
{
	kvm_vcpu_kick(vcpu);
}
|
2013-04-18 04:30:26 +08:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef CONFIG_KVM_XICS
|
|
|
|
static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
|
|
|
|
{
|
|
|
|
return vcpu->arch.irq_type == KVMPPC_IRQ_XICS;
|
|
|
|
}
|
|
|
|
extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
|
|
|
|
extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
|
|
|
|
extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
|
|
|
|
extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd);
|
2013-04-18 04:32:26 +08:00
|
|
|
extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu);
|
|
|
|
extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
|
2013-04-27 08:28:37 +08:00
|
|
|
extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
|
|
|
|
struct kvm_vcpu *vcpu, u32 cpu);
|
2013-04-18 04:30:26 +08:00
|
|
|
#else
|
|
|
|
static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
|
|
|
|
{ return 0; }
|
|
|
|
static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
|
|
|
|
static inline int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu,
|
|
|
|
unsigned long server)
|
|
|
|
{ return -EINVAL; }
|
|
|
|
static inline int kvm_vm_ioctl_xics_irq(struct kvm *kvm,
|
|
|
|
struct kvm_irq_level *args)
|
|
|
|
{ return -ENOTTY; }
|
|
|
|
static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
|
|
|
|
{ return 0; }
|
KVM: PPC: Allow book3s_hv guests to use SMT processor modes
This lifts the restriction that book3s_hv guests can only run one
hardware thread per core, and allows them to use up to 4 threads
per core on POWER7. The host still has to run single-threaded.
This capability is advertised to qemu through a new KVM_CAP_PPC_SMT
capability. The return value of the ioctl querying this capability
is the number of vcpus per virtual CPU core (vcore), currently 4.
To use this, the host kernel should be booted with all threads
active, and then all the secondary threads should be offlined.
This will put the secondary threads into nap mode. KVM will then
wake them from nap mode and use them for running guest code (while
they are still offline). To wake the secondary threads, we send
them an IPI using a new xics_wake_cpu() function, implemented in
arch/powerpc/sysdev/xics/icp-native.c. In other words, at this stage
we assume that the platform has a XICS interrupt controller and
we are using icp-native.c to drive it. Since the woken thread will
need to acknowledge and clear the IPI, we also export the base
physical address of the XICS registers using kvmppc_set_xics_phys()
for use in the low-level KVM book3s code.
When a vcpu is created, it is assigned to a virtual CPU core.
The vcore number is obtained by dividing the vcpu number by the
number of threads per core in the host. This number is exported
to userspace via the KVM_CAP_PPC_SMT capability. If qemu wishes
to run the guest in single-threaded mode, it should make all vcpu
numbers be multiples of the number of threads per core.
We distinguish three states of a vcpu: runnable (i.e., ready to execute
the guest), blocked (that is, idle), and busy in host. We currently
implement a policy that the vcore can run only when all its threads
are runnable or blocked. This way, if a vcpu needs to execute elsewhere
in the kernel or in qemu, it can do so without being starved of CPU
by the other vcpus.
When a vcore starts to run, it executes in the context of one of the
vcpu threads. The other vcpu threads all go to sleep and stay asleep
until something happens requiring the vcpu thread to return to qemu,
or to wake up to run the vcore (this can happen when another vcpu
thread goes from busy in host state to blocked).
It can happen that a vcpu goes from blocked to runnable state (e.g.
because of an interrupt), and the vcore it belongs to is already
running. In that case it can start to run immediately as long as
none of the vcpus in the vcore have started to exit the guest.
We send the next free thread in the vcore an IPI to get it to start
to execute the guest. It synchronizes with the other threads via
the vcore->entry_exit_count field to make sure that it doesn't go
into the guest if the other vcpus are exiting by the time that it
is ready to actually enter the guest.
Note that there is no fixed relationship between the hardware thread
number and the vcpu number. Hardware threads are assigned to vcpus
as they become runnable, so we will always use the lower-numbered
hardware threads in preference to higher-numbered threads if not all
the vcpus in the vcore are runnable, regardless of which vcpus are
runnable.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 08:23:08 +08:00
|
|
|
#endif
|
|
|
|
|
2013-01-05 01:12:48 +08:00
|
|
|
/*
 * Set the EPR value for @vcpu.  With CONFIG_KVM_BOOKE_HV it is written to
 * the GEPR SPR; on other CONFIG_BOOKE builds it is stored in vcpu->arch.epr;
 * in any other configuration this does nothing.
 */
static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr)
{
#if defined(CONFIG_KVM_BOOKE_HV)
	mtspr(SPRN_GEPR, epr);
#elif defined(CONFIG_BOOKE)
	vcpu->arch.epr = epr;
#endif
}
|
|
|
|
|
2013-04-12 22:08:46 +08:00
|
|
|
#ifdef CONFIG_KVM_MPIC
|
|
|
|
|
|
|
|
void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu);
|
2013-04-12 22:08:47 +08:00
|
|
|
int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
|
|
|
|
u32 cpu);
|
|
|
|
void kvmppc_mpic_disconnect_vcpu(struct openpic *opp, struct kvm_vcpu *vcpu);
|
2013-04-12 22:08:46 +08:00
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
static inline void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2013-04-12 22:08:47 +08:00
|
|
|
static inline int kvmppc_mpic_connect_vcpu(struct kvm_device *dev,
|
|
|
|
struct kvm_vcpu *vcpu, u32 cpu)
|
|
|
|
{
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void kvmppc_mpic_disconnect_vcpu(struct openpic *opp,
|
|
|
|
struct kvm_vcpu *vcpu)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2013-04-12 22:08:46 +08:00
|
|
|
#endif /* CONFIG_KVM_MPIC */
|
|
|
|
|
2011-08-19 04:25:21 +08:00
|
|
|
/* Per-vcpu TLB ioctl handlers. */
int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
			      struct kvm_config_tlb *cfg);
int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
			     struct kvm_dirty_tlb *cfg);

/* LPID management helpers. */
long kvmppc_alloc_lpid(void);
void kvmppc_claim_lpid(long lpid);
void kvmppc_free_lpid(long lpid);
void kvmppc_init_lpid(unsigned long nr_lpids);
|
|
|
|
|
2012-08-03 19:56:33 +08:00
|
|
|
/*
 * Flush the d-cache/i-cache for the page backing @pfn unless PG_arch_1 is
 * already set in its flags; PG_arch_1 is set once the flush has been done.
 * Does nothing when @pfn is not a valid pfn.
 */
static inline void kvmppc_mmu_flush_icache(pfn_t pfn)
{
	struct page *page;

	/*
	 * Only pages that the kernel maps as memory can be accessed,
	 * so bail out for unmapped ones.
	 */
	if (!pfn_valid(pfn))
		return;

	page = pfn_to_page(pfn);

	/* PG_arch_1 already set: no flush needed. */
	if (test_bit(PG_arch_1, &page->flags))
		return;

	/* Clear i-cache for new pages */
	flush_dcache_icache_page(page);
	set_bit(PG_arch_1, &page->flags);
}
|
|
|
|
|
2013-07-11 06:47:39 +08:00
|
|
|
/*
 * Call this after prepare_to_enter.  It returns the lazy EE and
 * irq-disabled tracking state to normal mode without actually enabling
 * interrupts.
 */
static inline void kvmppc_fix_ee_before_entry(void)
{
	trace_hardirqs_on();

#if defined(CONFIG_PPC64)
	/* From here on, IRQs only need to be hard-enabled to be on. */
	local_paca->irq_happened = 0;
	local_paca->soft_enabled = 1;
#endif
}
|
2012-08-03 19:56:33 +08:00
|
|
|
|
2012-10-11 14:13:22 +08:00
|
|
|
static inline ulong kvmppc_get_ea_indexed(struct kvm_vcpu *vcpu, int ra, int rb)
|
|
|
|
{
|
|
|
|
ulong ea;
|
2012-10-11 14:13:23 +08:00
|
|
|
ulong msr_64bit = 0;
|
2012-10-11 14:13:22 +08:00
|
|
|
|
|
|
|
ea = kvmppc_get_gpr(vcpu, rb);
|
|
|
|
if (ra)
|
|
|
|
ea += kvmppc_get_gpr(vcpu, ra);
|
|
|
|
|
2012-10-11 14:13:23 +08:00
|
|
|
#if defined(CONFIG_PPC_BOOK3E_64)
|
|
|
|
msr_64bit = MSR_CM;
|
|
|
|
#elif defined(CONFIG_PPC_BOOK3S_64)
|
|
|
|
msr_64bit = MSR_SF;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
if (!(vcpu->arch.shared->msr & msr_64bit))
|
|
|
|
ea = (uint32_t)ea;
|
|
|
|
|
2012-10-11 14:13:22 +08:00
|
|
|
return ea;
|
|
|
|
}
|
|
|
|
|
2013-04-18 04:30:50 +08:00
|
|
|
/* Defined outside this header; takes the number of the CPU to wake. */
extern void xics_wake_cpu(int cpu);
|
|
|
|
|
2008-04-17 12:28:09 +08:00
|
|
|
#endif /* __POWERPC_KVM_PPC_H__ */
|