Merge remote-tracking branch 'kvm/next' into kvm-next-5.20
KVM/s390, KVM/x86 and common infrastructure changes for 5.20

x86:

* Permit guests to ignore single-bit ECC errors
* Fix races in gfn->pfn cache refresh; do not pin pages tracked by the cache
* Intel IPI virtualization
* Allow getting/setting pending triple fault with KVM_GET/SET_VCPU_EVENTS
* PEBS virtualization
* Simplify PMU emulation by just using PERF_TYPE_RAW events
* More accurate event reinjection on SVM (avoid retrying instructions)
* Allow getting/setting the state of the speaker port data bit
* Refuse starting the kvm-intel module if VM-Entry/VM-Exit controls are inconsistent
* "Notify" VM exit (detect microarchitectural hangs) for Intel
* Cleanups for MCE MSR emulation

s390:

* add an interface to provide a hypervisor dump for secure guests
* improve selftests to use TAP interface
* enable interpretive execution of zPCI instructions (for PCI passthrough)
* First part of deferred teardown
* CPU Topology
* PV attestation
* Minor fixes

Generic:

* new selftests API using struct kvm_vcpu instead of a (vm, id) tuple

x86:

* Use try_cmpxchg64 instead of cmpxchg64
* Bugfixes
* Ignore benign host accesses to PMU MSRs when PMU is disabled
* Allow disabling KVM's "MONITOR/MWAIT are NOPs!" behavior
* x86/MMU: Allow NX huge pages to be disabled on a per-vm basis
* Port eager page splitting to shadow MMU as well
* Enable CMCI capability by default and handle injected UCNA errors
* Expose pid of vcpu threads in debugfs
* x2AVIC support for AMD
* cleanup PIO emulation
* Fixes for LLDT/LTR emulation
* Don't require refcounted "struct page" to create huge SPTEs

x86 cleanups:

* Use separate namespaces for guest PTEs and shadow PTEs bitmasks
* PIO emulation
* Reorganize rmap API, mostly around rmap destruction
* Do not workaround very old KVM bugs for L0 that runs with nesting enabled
* new selftests API for CPUID
commit 63f4b21041
@ -2418,8 +2418,7 @@
|
|||
the KVM_CLEAR_DIRTY ioctl, and only for the pages being
|
||||
cleared.
|
||||
|
||||
Eager page splitting currently only supports splitting
|
||||
huge pages mapped by the TDP MMU.
|
||||
Eager page splitting is only supported when kvm.tdp_mmu=Y.
|
||||
|
||||
Default is Y (on).
|
||||
|
||||
|
|
|
@ -1150,6 +1150,10 @@ The following bits are defined in the flags field:
|
|||
fields contain a valid state. This bit will be set whenever
|
||||
KVM_CAP_EXCEPTION_PAYLOAD is enabled.
|
||||
|
||||
- KVM_VCPUEVENT_VALID_TRIPLE_FAULT may be set to signal that the
|
||||
triple_fault_pending field contains a valid state. This bit will
|
||||
be set whenever KVM_CAP_X86_TRIPLE_FAULT_EVENT is enabled.
|
||||
|
||||
ARM64:
|
||||
^^^^^^
|
||||
|
||||
|
@ -1245,6 +1249,10 @@ can be set in the flags field to signal that the
|
|||
exception_has_payload, exception_payload, and exception.pending fields
|
||||
contain a valid state and shall be written into the VCPU.
|
||||
|
||||
If KVM_CAP_X86_TRIPLE_FAULT_EVENT is enabled, KVM_VCPUEVENT_VALID_TRIPLE_FAULT
|
||||
can be set in the flags field to signal that the triple_fault field contains
|
||||
a valid state and shall be written into the VCPU.
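As an illustrative sketch only (not part of this change), userspace could save
and re-inject a pending triple fault roughly as follows, assuming `vcpu_fd` is
an open vCPU file descriptor, KVM_CAP_X86_TRIPLE_FAULT_EVENT has already been
enabled on the VM, and the `triple_fault.pending` field name matches the 5.20
uapi header::

  #include <string.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Sketch only: error handling omitted. */
  static void save_restore_triple_fault(int vcpu_fd)
  {
          struct kvm_vcpu_events events;

          memset(&events, 0, sizeof(events));
          ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events);

          if ((events.flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) &&
              events.triple_fault.pending) {
                  /* remember that a triple fault was pending */
          }

          /* Later, when restoring the vCPU state: */
          events.flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
          events.triple_fault.pending = 1;
          ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &events);
  }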
|
||||
|
||||
ARM64:
|
||||
^^^^^^
|
||||
|
||||
|
@ -2998,7 +3006,9 @@ KVM_CREATE_PIT2. The state is returned in the following structure::
|
|||
Valid flags are::
|
||||
|
||||
/* disable PIT in HPET legacy mode */
|
||||
#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001
|
||||
#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001
|
||||
/* speaker port data bit enabled */
|
||||
#define KVM_PIT_FLAGS_SPEAKER_DATA_ON 0x00000002
|
||||
|
||||
This IOCTL replaces the obsolete KVM_GET_PIT.
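For illustration only (not part of this change), userspace might toggle the
speaker port data bit as below, assuming `vm_fd` is an open VM file descriptor
whose in-kernel PIT was created with KVM_CREATE_PIT2; error handling is
omitted::

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Sketch only: read-modify-write the PIT flags. */
  static void set_speaker_data_bit(int vm_fd, int on)
  {
          struct kvm_pit_state2 state;

          ioctl(vm_fd, KVM_GET_PIT2, &state);
          if (on)
                  state.flags |= KVM_PIT_FLAGS_SPEAKER_DATA_ON;
          else
                  state.flags &= ~KVM_PIT_FLAGS_SPEAKER_DATA_ON;
          ioctl(vm_fd, KVM_SET_PIT2, &state);
  }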
|
||||
|
||||
|
@ -5127,7 +5137,15 @@ into ESA mode. This reset is a superset of the initial reset.
|
|||
__u32 reserved[3];
|
||||
};
|
||||
|
||||
cmd values:
|
||||
**Ultravisor return codes**
|
||||
The Ultravisor return (reason) codes are provided by the kernel if an
|
||||
Ultravisor call has been executed to achieve the results expected by
|
||||
the command. Therefore they are independent of the IOCTL return
|
||||
code. If KVM changes `rc`, its value will always be greater than 0
|
||||
hence setting it to 0 before issuing a PV command is advised to be
|
||||
able to detect a change of `rc`.
|
||||
|
||||
**cmd values:**
|
||||
|
||||
KVM_PV_ENABLE
|
||||
Allocate memory and register the VM with the Ultravisor, thereby
|
||||
|
@ -5143,7 +5161,6 @@ KVM_PV_ENABLE
|
|||
===== =============================
|
||||
|
||||
KVM_PV_DISABLE
|
||||
|
||||
Deregister the VM from the Ultravisor and reclaim the memory that
|
||||
had been donated to the Ultravisor, making it usable by the kernel
|
||||
again. All registered VCPUs are converted back to non-protected
|
||||
|
@ -5160,6 +5177,117 @@ KVM_PV_VM_VERIFY
|
|||
Verify the integrity of the unpacked image. Only if this succeeds,
|
||||
KVM is allowed to start protected VCPUs.
|
||||
|
||||
KVM_PV_INFO
|
||||
:Capability: KVM_CAP_S390_PROTECTED_DUMP
|
||||
|
||||
Presents an API that provides Ultravisor related data to userspace
|
||||
via subcommands. len_max is the size of the user space buffer,
|
||||
len_written is KVM's indication of how many bytes of that buffer
|
||||
were actually written to. len_written can be used to determine the
|
||||
valid fields if more response fields are added in the future.
|
||||
|
||||
::
|
||||
|
||||
enum pv_cmd_info_id {
|
||||
KVM_PV_INFO_VM,
|
||||
KVM_PV_INFO_DUMP,
|
||||
};
|
||||
|
||||
struct kvm_s390_pv_info_header {
|
||||
__u32 id;
|
||||
__u32 len_max;
|
||||
__u32 len_written;
|
||||
__u32 reserved;
|
||||
};
|
||||
|
||||
struct kvm_s390_pv_info {
|
||||
struct kvm_s390_pv_info_header header;
|
||||
struct kvm_s390_pv_info_dump dump;
|
||||
struct kvm_s390_pv_info_vm vm;
|
||||
};
|
||||
|
||||
**subcommands:**
|
||||
|
||||
KVM_PV_INFO_VM
|
||||
This subcommand provides basic Ultravisor information for PV
|
||||
hosts. These values are likely also exported as files in the sysfs
|
||||
firmware UV query interface but they are more easily available to
|
||||
programs in this API.
|
||||
|
||||
The installed calls and feature_indication members provide the
|
||||
installed UV calls and the UV's other feature indications.
|
||||
|
||||
The max_* members provide information about the maximum number of PV
|
||||
vcpus, PV guests and PV guest memory size.
|
||||
|
||||
::
|
||||
|
||||
struct kvm_s390_pv_info_vm {
|
||||
__u64 inst_calls_list[4];
|
||||
__u64 max_cpus;
|
||||
__u64 max_guests;
|
||||
__u64 max_guest_addr;
|
||||
__u64 feature_indication;
|
||||
};
|
||||
|
||||
|
||||
KVM_PV_INFO_DUMP
|
||||
This subcommand provides information related to dumping PV guests.
|
||||
|
||||
::
|
||||
|
||||
struct kvm_s390_pv_info_dump {
|
||||
__u64 dump_cpu_buffer_len;
|
||||
__u64 dump_config_mem_buffer_per_1m;
|
||||
__u64 dump_config_finalize_len;
|
||||
};
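As a hypothetical usage sketch (not part of this change), userspace could query
the basic Ultravisor information through the KVM_S390_PV_COMMAND ioctl, with
`vm_fd` assumed to be an open VM file descriptor and error handling omitted::

  #include <string.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Sketch only: query KVM_PV_INFO_VM data. */
  static void query_pv_info_vm(int vm_fd)
  {
          struct kvm_s390_pv_info info;
          struct kvm_pv_cmd cmd;

          memset(&info, 0, sizeof(info));
          info.header.id = KVM_PV_INFO_VM;
          info.header.len_max = sizeof(info);

          memset(&cmd, 0, sizeof(cmd));
          cmd.cmd = KVM_PV_INFO;
          cmd.data = (unsigned long)&info;

          ioctl(vm_fd, KVM_S390_PV_COMMAND, &cmd);
          /* info.header.len_written tells how many bytes KVM filled in. */
  }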
|
||||
|
||||
KVM_PV_DUMP
|
||||
:Capability: KVM_CAP_S390_PROTECTED_DUMP
|
||||
|
||||
Presents an API that provides calls which facilitate dumping a
|
||||
protected VM.
|
||||
|
||||
::
|
||||
|
||||
struct kvm_s390_pv_dmp {
|
||||
__u64 subcmd;
|
||||
__u64 buff_addr;
|
||||
__u64 buff_len;
|
||||
__u64 gaddr; /* For dump storage state */
|
||||
};
|
||||
|
||||
**subcommands:**
|
||||
|
||||
KVM_PV_DUMP_INIT
|
||||
Initializes the dump process of a protected VM. If this call does
|
||||
not succeed, all other subcommands will fail with -EINVAL. This
|
||||
subcommand will return -EINVAL if a dump process has not yet been
|
||||
completed.
|
||||
|
||||
Not all PV VMs can be dumped; the owner needs to set the `dump
allowed` PCF bit 34 in the SE header to allow dumping.
|
||||
|
||||
KVM_PV_DUMP_CONFIG_STOR_STATE
|
||||
Stores `buff_len` bytes of tweak component values starting with
|
||||
the 1MB block specified by the absolute guest address
|
||||
(`gaddr`). `buff_len` needs to be a multiple of `conf_dump_storage_state_len`
and at least as large as the `conf_dump_storage_state_len` value
provided by the dump uv_info data. The userspace buffer pointed to by
`buff_addr` might be written to even if an error rc is returned, for
instance if we encounter a fault after writing the first page of data.
|
||||
|
||||
KVM_PV_DUMP_COMPLETE
|
||||
If the subcommand succeeds, it completes the dump process and lets
|
||||
KVM_PV_DUMP_INIT be called again.
|
||||
|
||||
On success `conf_dump_finalize_len` bytes of completion data will be
|
||||
stored to the `buff_addr`. The completion data contains a key
|
||||
derivation seed, IV, tweak nonce and encryption keys as well as an
|
||||
authentication tag all of which are needed to decrypt the dump at a
|
||||
later time.
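To make the flow concrete, a hypothetical dump sequence (not part of this
change) could issue the three subcommands through a small helper like the one
below; `vm_fd` is assumed to be the VM file descriptor, buffer sizes come from
the uv_info data, and error handling is omitted::

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Sketch only: issue one KVM_PV_DUMP subcommand via KVM_S390_PV_COMMAND. */
  static int pv_dump(int vm_fd, __u64 subcmd, void *buff, __u64 len, __u64 gaddr)
  {
          struct kvm_s390_pv_dmp dmp = {
                  .subcmd = subcmd,
                  .buff_addr = (unsigned long)buff,
                  .buff_len = len,
                  .gaddr = gaddr,
          };
          struct kvm_pv_cmd cmd = {
                  .cmd = KVM_PV_DUMP,
                  .data = (unsigned long)&dmp,
          };

          return ioctl(vm_fd, KVM_S390_PV_COMMAND, &cmd);
  }

  /*
   * 1. KVM_PV_DUMP_INIT
   * 2. KVM_PV_DUMP_CONFIG_STOR_STATE for each 1MB block of guest memory
   * 3. KVM_PV_DUMP_COMPLETE to retrieve the completion data
   */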
|
||||
|
||||
|
||||
4.126 KVM_X86_SET_MSR_FILTER
|
||||
----------------------------
|
||||
|
||||
|
@ -5811,6 +5939,78 @@ of CPUID leaf 0xD on the host.
|
|||
|
||||
This ioctl injects an event channel interrupt directly to the guest vCPU.
|
||||
|
||||
4.136 KVM_S390_PV_CPU_COMMAND
|
||||
-----------------------------
|
||||
|
||||
:Capability: KVM_CAP_S390_PROTECTED_DUMP
|
||||
:Architectures: s390
|
||||
:Type: vcpu ioctl
|
||||
:Parameters: none
|
||||
:Returns: 0 on success, < 0 on error
|
||||
|
||||
This ioctl closely mirrors `KVM_S390_PV_COMMAND` but handles requests
|
||||
for vcpus. It re-uses the kvm_s390_pv_dmp struct and hence also shares
|
||||
the command ids.
|
||||
|
||||
**command:**
|
||||
|
||||
KVM_PV_DUMP
|
||||
Presents an API that provides calls which facilitate dumping a vcpu
|
||||
of a protected VM.
|
||||
|
||||
**subcommand:**
|
||||
|
||||
KVM_PV_DUMP_CPU
|
||||
Provides encrypted dump data like register values.
|
||||
The length of the returned data is provided by uv_info.guest_cpu_stor_len.
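A hypothetical sketch of dumping a single vcpu (not part of this change), with
`vcpu_fd` an open vCPU file descriptor and `len` the value read from the
uv_query_dump_cpu_len sysfs file; error handling is omitted::

  #include <stdlib.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Sketch only: returns a buffer with the encrypted vcpu dump data. */
  static void *dump_one_vcpu(int vcpu_fd, unsigned long len)
  {
          void *buff = malloc(len);
          struct kvm_s390_pv_dmp dmp = {
                  .subcmd = KVM_PV_DUMP_CPU,
                  .buff_addr = (unsigned long)buff,
                  .buff_len = len,
          };
          struct kvm_pv_cmd cmd = {
                  .cmd = KVM_PV_DUMP,
                  .data = (unsigned long)&dmp,
          };

          ioctl(vcpu_fd, KVM_S390_PV_CPU_COMMAND, &cmd);
          return buff;
  }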
|
||||
|
||||
4.137 KVM_S390_ZPCI_OP
|
||||
----------------------
|
||||
|
||||
:Capability: KVM_CAP_S390_ZPCI_OP
|
||||
:Architectures: s390
|
||||
:Type: vm ioctl
|
||||
:Parameters: struct kvm_s390_zpci_op (in)
|
||||
:Returns: 0 on success, <0 on error
|
||||
|
||||
Used to manage hardware-assisted virtualization features for zPCI devices.
|
||||
|
||||
Parameters are specified via the following structure::
|
||||
|
||||
struct kvm_s390_zpci_op {
|
||||
/* in */
|
||||
__u32 fh; /* target device */
|
||||
__u8 op; /* operation to perform */
|
||||
__u8 pad[3];
|
||||
union {
|
||||
/* for KVM_S390_ZPCIOP_REG_AEN */
|
||||
struct {
|
||||
__u64 ibv; /* Guest addr of interrupt bit vector */
|
||||
__u64 sb; /* Guest addr of summary bit */
|
||||
__u32 flags;
|
||||
__u32 noi; /* Number of interrupts */
|
||||
__u8 isc; /* Guest interrupt subclass */
|
||||
__u8 sbo; /* Offset of guest summary bit vector */
|
||||
__u16 pad;
|
||||
} reg_aen;
|
||||
__u64 reserved[8];
|
||||
} u;
|
||||
};
|
||||
|
||||
The type of operation is specified in the "op" field.
|
||||
KVM_S390_ZPCIOP_REG_AEN is used to register the VM for adapter event
|
||||
notification interpretation, which will allow firmware delivery of adapter
|
||||
events directly to the vm, with KVM providing a backup delivery mechanism;
|
||||
KVM_S390_ZPCIOP_DEREG_AEN is used to subsequently disable interpretation of
|
||||
adapter event notifications.
|
||||
|
||||
The target zPCI function must also be specified via the "fh" field. For the
|
||||
KVM_S390_ZPCIOP_REG_AEN operation, additional information to establish firmware
|
||||
delivery must be provided via the "reg_aen" struct.
|
||||
|
||||
The "pad" and "reserved" fields may be used for future extensions and should be
|
||||
set to 0s by userspace.
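As an illustrative sketch (not part of this change), registering a VM for
adapter event notification forwarding could look roughly like this; `vm_fd` is
the VM file descriptor, the guest addresses, ISC and counts are placeholders,
and error handling is omitted::

  #include <string.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Sketch only: register for adapter event notification interpretation. */
  static void zpci_reg_aen(int vm_fd, __u32 fh, __u64 ibv_gaddr, __u64 sb_gaddr,
                           __u32 noi, __u8 isc, __u8 sbo)
  {
          struct kvm_s390_zpci_op op;

          memset(&op, 0, sizeof(op));
          op.fh = fh;
          op.op = KVM_S390_ZPCIOP_REG_AEN;
          op.u.reg_aen.ibv = ibv_gaddr;
          op.u.reg_aen.sb = sb_gaddr;
          op.u.reg_aen.noi = noi;
          op.u.reg_aen.isc = isc;
          op.u.reg_aen.sbo = sbo;

          ioctl(vm_fd, KVM_S390_ZPCI_OP, &op);
  }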
|
||||
|
||||
5. The kvm_run structure
|
||||
========================
|
||||
|
||||
|
@ -6414,6 +6614,26 @@ array field represents return values. The userspace should update the return
|
|||
values of SBI call before resuming the VCPU. For more details on RISC-V SBI
|
||||
spec refer, https://github.com/riscv/riscv-sbi-doc.
|
||||
|
||||
::
|
||||
|
||||
/* KVM_EXIT_NOTIFY */
|
||||
struct {
|
||||
#define KVM_NOTIFY_CONTEXT_INVALID (1 << 0)
|
||||
__u32 flags;
|
||||
} notify;
|
||||
|
||||
Used on x86 systems. When the VM capability KVM_CAP_X86_NOTIFY_VMEXIT is
enabled, a VM exit is generated if no event window occurs in VM non-root mode
for a specified amount of time. If KVM_X86_NOTIFY_VMEXIT_USER was set when
enabling the capability, KVM exits to userspace with the exit reason
KVM_EXIT_NOTIFY for further handling. The "flags" field contains more
detailed info.
|
||||
|
||||
The valid value for 'flags' is:
|
||||
|
||||
- KVM_NOTIFY_CONTEXT_INVALID -- the VM context is corrupted and not valid
  in the VMCS. Resuming the target VM may produce unpredictable results.
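As a non-authoritative sketch of how a VMM run loop might react to this exit
(assuming `run` points to the mmap'ed kvm_run structure)::

  #include <stdio.h>
  #include <linux/kvm.h>

  /* Sketch only: returns -1 if the guest must not be resumed. */
  static int handle_notify_exit(struct kvm_run *run)
  {
          if (run->exit_reason != KVM_EXIT_NOTIFY)
                  return 0;

          if (run->notify.flags & KVM_NOTIFY_CONTEXT_INVALID) {
                  /* The VM context is corrupted; do not resume the guest. */
                  fprintf(stderr, "notify exit: invalid context, stopping VM\n");
                  return -1;
          }

          /* Context is still valid: log the event and resume the vCPU. */
          return 0;
  }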
|
||||
|
||||
::
|
||||
|
||||
/* Fix the size of the union. */
|
||||
|
@ -7357,8 +7577,71 @@ The valid bits in cap.args[0] are:
|
|||
hypercall instructions. Executing the
|
||||
incorrect hypercall instruction will
|
||||
generate a #UD within the guest.
|
||||
|
||||
KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS By default, KVM emulates MONITOR/MWAIT (if
|
||||
they are intercepted) as NOPs regardless of
|
||||
whether or not MONITOR/MWAIT are supported
|
||||
according to guest CPUID. When this quirk
|
||||
is disabled and KVM_X86_DISABLE_EXITS_MWAIT
|
||||
is not set (MONITOR/MWAIT are intercepted),
|
||||
KVM will inject a #UD on MONITOR/MWAIT if
|
||||
they're unsupported per guest CPUID. Note,
|
||||
KVM will modify MONITOR/MWAIT support in
|
||||
guest CPUID on writes to MISC_ENABLE if
|
||||
KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT is
|
||||
disabled.
|
||||
=================================== ============================================
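For illustration only (not part of this change), a quirk bit from the table
above is disabled by setting it in args[0] when enabling KVM_CAP_DISABLE_QUIRKS2,
the capability this table belongs to (the capability name is not visible in
this hunk); `vm_fd` is assumed to be an open VM file descriptor::

  #include <string.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Sketch only: disable the MWAIT-as-NOP quirk for this VM. */
  static void disable_mwait_quirk(int vm_fd)
  {
          struct kvm_enable_cap cap;

          memset(&cap, 0, sizeof(cap));
          cap.cap = KVM_CAP_DISABLE_QUIRKS2;
          cap.args[0] = KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS;

          ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
  }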
|
||||
|
||||
7.32 KVM_CAP_MAX_VCPU_ID
|
||||
------------------------
|
||||
|
||||
:Architectures: x86
|
||||
:Target: VM
|
||||
:Parameters: args[0] - maximum APIC ID value set for current VM
|
||||
:Returns: 0 on success, -EINVAL if args[0] is beyond KVM_MAX_VCPU_IDS
|
||||
supported in KVM or if it has been set.
|
||||
|
||||
This capability allows userspace to specify the maximum possible APIC ID
assigned for the current VM session prior to the creation of vCPUs, saving
memory for data structures indexed by the APIC ID. Userspace is able
to calculate the limit on APIC ID values from the designated
CPU topology.
|
||||
|
||||
The value can be changed only until KVM_ENABLE_CAP is set to a nonzero
|
||||
value or until a vCPU is created. Upon creation of the first vCPU,
|
||||
if the value was set to zero or KVM_ENABLE_CAP was not invoked, KVM
|
||||
uses the return value of KVM_CHECK_EXTENSION(KVM_CAP_MAX_VCPU_ID) as
|
||||
the maximum APIC ID.
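A minimal sketch of setting the limit (not part of this change); it must run
before the first KVM_CREATE_VCPU, `vm_fd` is an open VM file descriptor and
`max_apic_id` is derived from the intended CPU topology::

  #include <string.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Sketch only: error handling omitted. */
  static void set_max_vcpu_id(int vm_fd, unsigned int max_apic_id)
  {
          struct kvm_enable_cap cap;

          memset(&cap, 0, sizeof(cap));
          cap.cap = KVM_CAP_MAX_VCPU_ID;
          cap.args[0] = max_apic_id;

          ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
  }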
|
||||
|
||||
7.33 KVM_CAP_X86_NOTIFY_VMEXIT
|
||||
------------------------------
|
||||
|
||||
:Architectures: x86
|
||||
:Target: VM
|
||||
:Parameters: args[0] is the value of notify window as well as some flags
|
||||
:Returns: 0 on success, -EINVAL if args[0] contains invalid flags or notify
|
||||
VM exit is unsupported.
|
||||
|
||||
Bits 63:32 of args[0] are used for notify window.
|
||||
Bits 31:0 of args[0] are for some flags. Valid bits are::
|
||||
|
||||
#define KVM_X86_NOTIFY_VMEXIT_ENABLED (1 << 0)
|
||||
#define KVM_X86_NOTIFY_VMEXIT_USER (1 << 1)
|
||||
|
||||
This capability allows userspace to configure the notify VM exit on/off
|
||||
in per-VM scope during VM creation. Notify VM exit is disabled by default.
|
||||
When userspace sets the KVM_X86_NOTIFY_VMEXIT_ENABLED bit in args[0], KVM will
enable this feature with the notify window provided, which will generate
a VM exit if no event window occurs in VM non-root mode for the specified
amount of time (the notify window).

If KVM_X86_NOTIFY_VMEXIT_USER is also set in args[0], KVM exits to userspace
for handling when a notify VM exit occurs.
|
||||
|
||||
This capability is aimed at mitigating the threat that a malicious VM can
get a CPU stuck (because event windows never open up), making the CPU
unavailable to the host or to other VMs.
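As a sketch (not part of this change), enabling the notify VM exit with a
window and userspace exits could look like this; `vm_fd` is an open VM file
descriptor and the window value is a placeholder::

  #include <string.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Sketch only: bits 63:32 carry the notify window, bits 31:0 the flags. */
  static void enable_notify_vmexit(int vm_fd, unsigned int window)
  {
          struct kvm_enable_cap cap;

          memset(&cap, 0, sizeof(cap));
          cap.cap = KVM_CAP_X86_NOTIFY_VMEXIT;
          cap.args[0] = ((__u64)window << 32) |
                        KVM_X86_NOTIFY_VMEXIT_ENABLED |
                        KVM_X86_NOTIFY_VMEXIT_USER;

          ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
  }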
|
||||
|
||||
8. Other capabilities.
|
||||
======================
|
||||
|
||||
|
@ -7965,6 +8248,61 @@ should adjust CPUID leaf 0xA to reflect that the PMU is disabled.
|
|||
When enabled, KVM will exit to userspace with KVM_EXIT_SYSTEM_EVENT of
|
||||
type KVM_SYSTEM_EVENT_SUSPEND to process the guest suspend request.
|
||||
|
||||
8.37 KVM_CAP_S390_PROTECTED_DUMP
|
||||
--------------------------------
|
||||
|
||||
:Capability: KVM_CAP_S390_PROTECTED_DUMP
|
||||
:Architectures: s390
|
||||
:Type: vm
|
||||
|
||||
This capability indicates that KVM and the Ultravisor support dumping
|
||||
PV guests. The `KVM_PV_DUMP` command is available for the
|
||||
`KVM_S390_PV_COMMAND` ioctl and the `KVM_PV_INFO` command provides
|
||||
dump related UV data. Also the vcpu ioctl `KVM_S390_PV_CPU_COMMAND` is
|
||||
available and supports the `KVM_PV_DUMP_CPU` subcommand.
|
||||
|
||||
8.38 KVM_CAP_VM_DISABLE_NX_HUGE_PAGES
|
||||
-------------------------------------
|
||||
|
||||
:Capability: KVM_CAP_VM_DISABLE_NX_HUGE_PAGES
:Architectures: x86
:Type: vm
:Parameters: args[0] must be 0.
:Returns: 0 on success, -EPERM if the userspace process does not
|
||||
have CAP_SYS_BOOT, -EINVAL if args[0] is not 0 or any vCPUs have been
|
||||
created.
|
||||
|
||||
This capability disables the NX huge pages mitigation for iTLB MULTIHIT.
|
||||
|
||||
The capability has no effect if the nx_huge_pages module parameter is not set.
|
||||
|
||||
This capability may only be set before any vCPUs are created.
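A minimal sketch (not part of this change); the calling process needs
CAP_SYS_BOOT, no vCPUs may exist yet, and `vm_fd` is an open VM file
descriptor::

  #include <string.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Sketch only: disable the NX huge page mitigation for this trusted VM. */
  static void disable_nx_huge_pages(int vm_fd)
  {
          struct kvm_enable_cap cap;

          memset(&cap, 0, sizeof(cap));
          cap.cap = KVM_CAP_VM_DISABLE_NX_HUGE_PAGES;
          cap.args[0] = 0;

          ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
  }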
|
||||
|
||||
8.39 KVM_CAP_S390_CPU_TOPOLOGY
|
||||
------------------------------
|
||||
|
||||
:Capability: KVM_CAP_S390_CPU_TOPOLOGY
|
||||
:Architectures: s390
|
||||
:Type: vm
|
||||
|
||||
This capability indicates that KVM will provide the S390 CPU Topology
|
||||
facility which consists of the interpretation of the PTF instruction for
|
||||
the function code 2 along with interception and forwarding of both the
|
||||
PTF instruction with function codes 0 or 1 and the STSI(15,1,x)
|
||||
instruction to the userland hypervisor.
|
||||
|
||||
The stfle facility 11, CPU Topology facility, should not be indicated
|
||||
to the guest without this capability.
|
||||
|
||||
When this capability is present, KVM provides a new attribute group
on the vm fd, KVM_S390_VM_CPU_TOPOLOGY.
This new attribute allows getting, setting or clearing the Modified Change
Topology Report (MTCR) bit of the SCA through the kvm_device_attr
structure.
|
||||
|
||||
When getting the Modified Change Topology Report value, the attr->addr
|
||||
must point to a byte where the value will be stored or retrieved from.
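A hedged sketch of reading the MTCR bit through the new attribute group (not
part of this change), assuming `vm_fd` is an open VM file descriptor and that
the attr field can be left at zero for this group; error handling is omitted::

  #include <string.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Sketch only: fetch the current MTCR value into a byte. */
  static __u8 get_mtcr(int vm_fd)
  {
          __u8 mtcr = 0;
          struct kvm_device_attr attr;

          memset(&attr, 0, sizeof(attr));
          attr.group = KVM_S390_VM_CPU_TOPOLOGY;
          attr.addr = (unsigned long)&mtcr;

          ioctl(vm_fd, KVM_GET_DEVICE_ATTR, &attr);
          return mtcr;
  }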
|
||||
|
||||
9. Known KVM API problems
|
||||
=========================
|
||||
|
||||
|
|
|
@ -10,3 +10,4 @@ KVM for s390 systems
|
|||
s390-diag
|
||||
s390-pv
|
||||
s390-pv-boot
|
||||
s390-pv-dump
|
||||
|
|
|
@ -0,0 +1,64 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
===========================================
|
||||
s390 (IBM Z) Protected Virtualization dumps
|
||||
===========================================
|
||||
|
||||
Summary
|
||||
-------
|
||||
|
||||
Dumping a VM is an essential tool for debugging problems inside
|
||||
it. This is especially true when a protected VM runs into trouble as
|
||||
there's no way to access its memory and registers from the outside
|
||||
while it's running.
|
||||
|
||||
However, when dumping a protected VM we need to maintain its
|
||||
confidentiality until the dump is in the hands of the VM owner who
|
||||
should be the only one capable of analysing it.
|
||||
|
||||
The confidentiality of the VM dump is ensured by the Ultravisor who
|
||||
provides an interface to KVM over which encrypted CPU and memory data
|
||||
can be requested. The encryption is based on the Customer
|
||||
Communication Key which is the key that's used to encrypt VM data in a
|
||||
way that the customer is able to decrypt.
|
||||
|
||||
|
||||
Dump process
|
||||
------------
|
||||
|
||||
A dump is done in 3 steps:
|
||||
|
||||
**Initiation**
|
||||
|
||||
This step initializes the dump process, generates cryptographic seeds
|
||||
and extracts dump keys with which the VM dump data will be encrypted.
|
||||
|
||||
**Data gathering**
|
||||
|
||||
Currently there are two types of data that can be gathered from a VM:
|
||||
the memory and the vcpu state.
|
||||
|
||||
The vcpu state contains all the important registers (general, floating
point, vector, control and tod/timers) of a vcpu. The vcpu dump can
contain incomplete data if a vcpu is dumped while an instruction is
emulated with the help of the hypervisor. This is indicated by a flag bit
in the dump data. For the same reason it is very important to not only
|
||||
write out the encrypted vcpu state, but also the unencrypted state
|
||||
from the hypervisor.
|
||||
|
||||
The memory state is further divided into the encrypted memory and its
|
||||
metadata comprised of the encryption tweaks and status flags. The
|
||||
encrypted memory can simply be read once it has been exported. The
|
||||
time of the export does not matter as no re-encryption is
|
||||
needed. Memory that has been swapped out and hence was exported can be
|
||||
read from the swap and written to the dump target without need for any
|
||||
special actions.
|
||||
|
||||
The tweaks / status flags for the exported pages need to be requested
|
||||
from the Ultravisor.
|
||||
|
||||
**Finalization**
|
||||
|
||||
The finalization step will provide the data needed to be able to
|
||||
decrypt the vcpu and memory data and end the dump process. When this
|
||||
step completes successfully a new dump initiation can be started.
|
|
@ -17594,6 +17594,7 @@ M: Eric Farman <farman@linux.ibm.com>
|
|||
L: linux-s390@vger.kernel.org
|
||||
L: kvm@vger.kernel.org
|
||||
S: Supported
|
||||
F: arch/s390/kvm/pci*
|
||||
F: drivers/vfio/pci/vfio_pci_zdev.c
|
||||
F: include/uapi/linux/vfio_zdev.h
|
||||
|
||||
|
|
|
@ -786,7 +786,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
|
|||
{
|
||||
phys_addr_t addr;
|
||||
int ret = 0;
|
||||
struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
|
||||
struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
|
||||
struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
|
||||
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
|
||||
KVM_PGTABLE_PROT_R |
|
||||
|
|
|
@ -351,11 +351,10 @@ int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa,
|
|||
int ret = 0;
|
||||
unsigned long pfn;
|
||||
phys_addr_t addr, end;
|
||||
struct kvm_mmu_memory_cache pcache;
|
||||
|
||||
memset(&pcache, 0, sizeof(pcache));
|
||||
pcache.gfp_custom = (in_atomic) ? GFP_ATOMIC | __GFP_ACCOUNT : 0;
|
||||
pcache.gfp_zero = __GFP_ZERO;
|
||||
struct kvm_mmu_memory_cache pcache = {
|
||||
.gfp_custom = (in_atomic) ? GFP_ATOMIC | __GFP_ACCOUNT : 0,
|
||||
.gfp_zero = __GFP_ZERO,
|
||||
};
|
||||
|
||||
end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK;
|
||||
pfn = __phys_to_pfn(hpa);
|
||||
|
|
|
@ -41,6 +41,12 @@ void uv_query_info(void)
|
|||
uv_info.max_num_sec_conf = uvcb.max_num_sec_conf;
|
||||
uv_info.max_guest_cpu_id = uvcb.max_guest_cpu_id;
|
||||
uv_info.uv_feature_indications = uvcb.uv_feature_indications;
|
||||
uv_info.supp_se_hdr_ver = uvcb.supp_se_hdr_versions;
|
||||
uv_info.supp_se_hdr_pcf = uvcb.supp_se_hdr_pcf;
|
||||
uv_info.conf_dump_storage_state_len = uvcb.conf_dump_storage_state_len;
|
||||
uv_info.conf_dump_finalize_len = uvcb.conf_dump_finalize_len;
|
||||
uv_info.supp_att_req_hdr_ver = uvcb.supp_att_req_hdr_ver;
|
||||
uv_info.supp_att_pflags = uvcb.supp_att_pflags;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
|
||||
|
|
|
@ -12,10 +12,11 @@
|
|||
|
||||
#include <linux/bit_spinlock.h>
|
||||
#include <linux/dma-mapping.h>
|
||||
#include <asm/tpi.h>
|
||||
|
||||
struct airq_struct {
|
||||
struct hlist_node list; /* Handler queueing. */
|
||||
void (*handler)(struct airq_struct *airq, bool floating);
|
||||
void (*handler)(struct airq_struct *airq, struct tpi_info *tpi_info);
|
||||
u8 *lsi_ptr; /* Local-Summary-Indicator pointer */
|
||||
u8 lsi_mask; /* Local-Summary-Indicator mask */
|
||||
u8 isc; /* Interrupt-subclass */
|
||||
|
@ -46,8 +47,10 @@ struct airq_iv {
|
|||
#define AIRQ_IV_PTR 4 /* Allocate the ptr array */
|
||||
#define AIRQ_IV_DATA 8 /* Allocate the data array */
|
||||
#define AIRQ_IV_CACHELINE 16 /* Cacheline alignment for the vector */
|
||||
#define AIRQ_IV_GUESTVEC 32 /* Vector is a pinned guest page */
|
||||
|
||||
struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags);
|
||||
struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags,
|
||||
unsigned long *vec);
|
||||
void airq_iv_release(struct airq_iv *iv);
|
||||
unsigned long airq_iv_alloc(struct airq_iv *iv, unsigned long num);
|
||||
void airq_iv_free(struct airq_iv *iv, unsigned long bit, unsigned long num);
|
||||
|
|
|
@ -147,5 +147,42 @@ int gmap_mprotect_notify(struct gmap *, unsigned long start,
|
|||
void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4],
|
||||
unsigned long gaddr, unsigned long vmaddr);
|
||||
int gmap_mark_unmergeable(void);
|
||||
void s390_reset_acc(struct mm_struct *mm);
|
||||
void s390_unlist_old_asce(struct gmap *gmap);
|
||||
int s390_replace_asce(struct gmap *gmap);
|
||||
void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns);
|
||||
int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
|
||||
unsigned long end, bool interruptible);
|
||||
|
||||
/**
|
||||
* s390_uv_destroy_range - Destroy a range of pages in the given mm.
|
||||
* @mm: the mm on which to operate on
|
||||
* @start: the start of the range
|
||||
* @end: the end of the range
|
||||
*
|
||||
* This function will call cond_sched, so it should not generate stalls, but
|
||||
* it will otherwise only return when it completed.
|
||||
*/
|
||||
static inline void s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
|
||||
unsigned long end)
|
||||
{
|
||||
(void)__s390_uv_destroy_range(mm, start, end, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* s390_uv_destroy_range_interruptible - Destroy a range of pages in the
|
||||
* given mm, but stop when a fatal signal is received.
|
||||
* @mm: the mm on which to operate on
|
||||
* @start: the start of the range
|
||||
* @end: the end of the range
|
||||
*
|
||||
* This function will call cond_sched, so it should not generate stalls. If
|
||||
* a fatal signal is received, it will return with -EINTR immediately,
|
||||
* without finishing destroying the whole range. Upon successful
|
||||
* completion, 0 is returned.
|
||||
*/
|
||||
static inline int s390_uv_destroy_range_interruptible(struct mm_struct *mm, unsigned long start,
|
||||
unsigned long end)
|
||||
{
|
||||
return __s390_uv_destroy_range(mm, start, end, true);
|
||||
}
|
||||
#endif /* _ASM_S390_GMAP_H */
|
||||
|
|
|
@ -19,6 +19,8 @@
|
|||
#include <linux/kvm.h>
|
||||
#include <linux/seqlock.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/pci.h>
|
||||
#include <linux/mmu_notifier.h>
|
||||
#include <asm/debug.h>
|
||||
#include <asm/cpu.h>
|
||||
#include <asm/fpu/api.h>
|
||||
|
@ -93,19 +95,30 @@ union ipte_control {
|
|||
};
|
||||
};
|
||||
|
||||
union sca_utility {
|
||||
__u16 val;
|
||||
struct {
|
||||
__u16 mtcr : 1;
|
||||
__u16 reserved : 15;
|
||||
};
|
||||
};
|
||||
|
||||
struct bsca_block {
|
||||
union ipte_control ipte_control;
|
||||
__u64 reserved[5];
|
||||
__u64 mcn;
|
||||
__u64 reserved2;
|
||||
union sca_utility utility;
|
||||
__u8 reserved2[6];
|
||||
struct bsca_entry cpu[KVM_S390_BSCA_CPU_SLOTS];
|
||||
};
|
||||
|
||||
struct esca_block {
|
||||
union ipte_control ipte_control;
|
||||
__u64 reserved1[7];
|
||||
__u64 reserved1[6];
|
||||
union sca_utility utility;
|
||||
__u8 reserved2[6];
|
||||
__u64 mcn[4];
|
||||
__u64 reserved2[20];
|
||||
__u64 reserved3[20];
|
||||
struct esca_entry cpu[KVM_S390_ESCA_CPU_SLOTS];
|
||||
};
|
||||
|
||||
|
@ -249,12 +262,16 @@ struct kvm_s390_sie_block {
|
|||
#define ECB_SPECI 0x08
|
||||
#define ECB_SRSI 0x04
|
||||
#define ECB_HOSTPROTINT 0x02
|
||||
#define ECB_PTF 0x01
|
||||
__u8 ecb; /* 0x0061 */
|
||||
#define ECB2_CMMA 0x80
|
||||
#define ECB2_IEP 0x20
|
||||
#define ECB2_PFMFI 0x08
|
||||
#define ECB2_ESCA 0x04
|
||||
#define ECB2_ZPCI_LSI 0x02
|
||||
__u8 ecb2; /* 0x0062 */
|
||||
#define ECB3_AISI 0x20
|
||||
#define ECB3_AISII 0x10
|
||||
#define ECB3_DEA 0x08
|
||||
#define ECB3_AES 0x04
|
||||
#define ECB3_RI 0x01
|
||||
|
@ -759,6 +776,7 @@ struct kvm_vm_stat {
|
|||
u64 inject_pfault_done;
|
||||
u64 inject_service_signal;
|
||||
u64 inject_virtio;
|
||||
u64 aen_forward;
|
||||
};
|
||||
|
||||
struct kvm_arch_memory_slot {
|
||||
|
@ -923,6 +941,8 @@ struct kvm_s390_pv {
|
|||
u64 guest_len;
|
||||
unsigned long stor_base;
|
||||
void *stor_var;
|
||||
bool dumping;
|
||||
struct mmu_notifier mmu_notifier;
|
||||
};
|
||||
|
||||
struct kvm_arch{
|
||||
|
@ -939,6 +959,7 @@ struct kvm_arch{
|
|||
int use_cmma;
|
||||
int use_pfmfi;
|
||||
int use_skf;
|
||||
int use_zpci_interp;
|
||||
int user_cpu_state_ctrl;
|
||||
int user_sigp;
|
||||
int user_stsi;
|
||||
|
@ -962,6 +983,8 @@ struct kvm_arch{
|
|||
DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS);
|
||||
struct kvm_s390_gisa_interrupt gisa_int;
|
||||
struct kvm_s390_pv pv;
|
||||
struct list_head kzdev_list;
|
||||
spinlock_t kzdev_list_lock;
|
||||
};
|
||||
|
||||
#define KVM_HVA_ERR_BAD (-1UL)
|
||||
|
@ -1012,4 +1035,19 @@ static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
|
|||
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
|
||||
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
|
||||
|
||||
#define __KVM_HAVE_ARCH_VM_FREE
|
||||
void kvm_arch_free_vm(struct kvm *kvm);
|
||||
|
||||
#ifdef CONFIG_VFIO_PCI_ZDEV_KVM
|
||||
int kvm_s390_pci_register_kvm(struct zpci_dev *zdev, struct kvm *kvm);
|
||||
void kvm_s390_pci_unregister_kvm(struct zpci_dev *zdev);
|
||||
#else
|
||||
static inline int kvm_s390_pci_register_kvm(struct zpci_dev *dev,
|
||||
struct kvm *kvm)
|
||||
{
|
||||
return -EPERM;
|
||||
}
|
||||
static inline void kvm_s390_pci_unregister_kvm(struct zpci_dev *dev) {}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
|
@ -18,7 +18,7 @@ typedef struct {
|
|||
unsigned long asce_limit;
|
||||
unsigned long vdso_base;
|
||||
/* The mmu context belongs to a secure guest. */
|
||||
atomic_t is_protected;
|
||||
atomic_t protected_count;
|
||||
/*
|
||||
* The following bitfields need a down_write on the mm
|
||||
* semaphore when they are written to. As they are only
|
||||
|
|
|
@ -26,7 +26,7 @@ static inline int init_new_context(struct task_struct *tsk,
|
|||
INIT_LIST_HEAD(&mm->context.gmap_list);
|
||||
cpumask_clear(&mm->context.cpu_attach_mask);
|
||||
atomic_set(&mm->context.flush_count, 0);
|
||||
atomic_set(&mm->context.is_protected, 0);
|
||||
atomic_set(&mm->context.protected_count, 0);
|
||||
mm->context.gmap_asce = 0;
|
||||
mm->context.flush_mm = 0;
|
||||
#ifdef CONFIG_PGSTE
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
#include <asm-generic/pci.h>
|
||||
#include <asm/pci_clp.h>
|
||||
#include <asm/pci_debug.h>
|
||||
#include <asm/pci_insn.h>
|
||||
#include <asm/sclp.h>
|
||||
|
||||
#define PCIBIOS_MIN_IO 0x1000
|
||||
|
@ -97,6 +98,7 @@ struct zpci_bar_struct {
|
|||
};
|
||||
|
||||
struct s390_domain;
|
||||
struct kvm_zdev;
|
||||
|
||||
#define ZPCI_FUNCTIONS_PER_BUS 256
|
||||
struct zpci_bus {
|
||||
|
@ -123,11 +125,14 @@ struct zpci_dev {
|
|||
enum zpci_state state;
|
||||
u32 fid; /* function ID, used by sclp */
|
||||
u32 fh; /* function handle, used by insn's */
|
||||
u32 gisa; /* GISA designation for passthrough */
|
||||
u16 vfn; /* virtual function number */
|
||||
u16 pchid; /* physical channel ID */
|
||||
u16 maxstbl; /* Maximum store block size */
|
||||
u8 pfgid; /* function group ID */
|
||||
u8 pft; /* pci function type */
|
||||
u8 port;
|
||||
u8 dtsm; /* Supported DT mask */
|
||||
u8 rid_available : 1;
|
||||
u8 has_hp_slot : 1;
|
||||
u8 has_resources : 1;
|
||||
|
@ -186,7 +191,10 @@ struct zpci_dev {
|
|||
|
||||
struct dentry *debugfs_dev;
|
||||
|
||||
/* IOMMU and passthrough */
|
||||
struct s390_domain *s390_domain; /* s390 IOMMU domain data */
|
||||
struct kvm_zdev *kzdev;
|
||||
struct mutex kzdev_lock;
|
||||
};
|
||||
|
||||
static inline bool zdev_enabled(struct zpci_dev *zdev)
|
||||
|
@ -198,6 +206,9 @@ extern const struct attribute_group *zpci_attr_groups[];
|
|||
extern unsigned int s390_pci_force_floating __initdata;
|
||||
extern unsigned int s390_pci_no_rid;
|
||||
|
||||
extern union zpci_sic_iib *zpci_aipb;
|
||||
extern struct airq_iv *zpci_aif_sbv;
|
||||
|
||||
/* -----------------------------------------------------------------------------
|
||||
Prototypes
|
||||
----------------------------------------------------------------------------- */
|
||||
|
|
|
@ -153,9 +153,11 @@ struct clp_rsp_query_pci_grp {
|
|||
u8 : 6;
|
||||
u8 frame : 1;
|
||||
u8 refresh : 1; /* TLB refresh mode */
|
||||
u16 reserved2;
|
||||
u16 : 3;
|
||||
u16 maxstbl : 13; /* Maximum store block size */
|
||||
u16 mui;
|
||||
u16 : 16;
|
||||
u8 dtsm; /* Supported DT mask */
|
||||
u8 reserved3;
|
||||
u16 maxfaal;
|
||||
u16 : 4;
|
||||
u16 dnoi : 12;
|
||||
|
@ -173,7 +175,8 @@ struct clp_req_set_pci {
|
|||
u16 reserved2;
|
||||
u8 oc; /* operation controls */
|
||||
u8 ndas; /* number of dma spaces */
|
||||
u64 reserved3;
|
||||
u32 reserved3;
|
||||
u32 gisa; /* GISA designation */
|
||||
} __packed;
|
||||
|
||||
/* Set PCI function response */
|
||||
|
|
|
@ -98,6 +98,15 @@ struct zpci_fib {
|
|||
u32 gd;
|
||||
} __packed __aligned(8);
|
||||
|
||||
/* Set Interruption Controls Operation Controls */
|
||||
#define SIC_IRQ_MODE_ALL 0
|
||||
#define SIC_IRQ_MODE_SINGLE 1
|
||||
#define SIC_SET_AENI_CONTROLS 2
|
||||
#define SIC_IRQ_MODE_DIRECT 4
|
||||
#define SIC_IRQ_MODE_D_ALL 16
|
||||
#define SIC_IRQ_MODE_D_SINGLE 17
|
||||
#define SIC_IRQ_MODE_SET_CPU 18
|
||||
|
||||
/* directed interruption information block */
|
||||
struct zpci_diib {
|
||||
u32 : 1;
|
||||
|
@ -119,9 +128,20 @@ struct zpci_cdiib {
|
|||
u64 : 64;
|
||||
} __packed __aligned(8);
|
||||
|
||||
/* adapter interruption parameters block */
|
||||
struct zpci_aipb {
|
||||
u64 faisb;
|
||||
u64 gait;
|
||||
u16 : 13;
|
||||
u16 afi : 3;
|
||||
u32 : 32;
|
||||
u16 faal;
|
||||
} __packed __aligned(8);
|
||||
|
||||
union zpci_sic_iib {
|
||||
struct zpci_diib diib;
|
||||
struct zpci_cdiib cdiib;
|
||||
struct zpci_aipb aipb;
|
||||
};
|
||||
|
||||
DECLARE_STATIC_KEY_FALSE(have_mio);
|
||||
|
@ -134,13 +154,6 @@ int __zpci_store(u64 data, u64 req, u64 offset);
|
|||
int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len);
|
||||
int __zpci_store_block(const u64 *data, u64 req, u64 offset);
|
||||
void zpci_barrier(void);
|
||||
int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib);
|
||||
|
||||
static inline int zpci_set_irq_ctrl(u16 ctl, u8 isc)
|
||||
{
|
||||
union zpci_sic_iib iib = {{0}};
|
||||
|
||||
return __zpci_set_irq_ctrl(ctl, isc, &iib);
|
||||
}
|
||||
int zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -525,7 +525,7 @@ static inline int mm_has_pgste(struct mm_struct *mm)
|
|||
static inline int mm_is_protected(struct mm_struct *mm)
|
||||
{
|
||||
#ifdef CONFIG_PGSTE
|
||||
if (unlikely(atomic_read(&mm->context.is_protected)))
|
||||
if (unlikely(atomic_read(&mm->context.protected_count)))
|
||||
return 1;
|
||||
#endif
|
||||
return 0;
|
||||
|
@ -1182,9 +1182,22 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
|
|||
} else {
|
||||
res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
|
||||
}
|
||||
/* At this point the reference through the mapping is still present */
|
||||
if (mm_is_protected(mm) && pte_present(res))
|
||||
uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK);
|
||||
/* Nothing to do */
|
||||
if (!mm_is_protected(mm) || !pte_present(res))
|
||||
return res;
|
||||
/*
|
||||
* At this point the reference through the mapping is still present.
|
||||
* The notifier should have destroyed all protected vCPUs at this
|
||||
* point, so the destroy should be successful.
|
||||
*/
|
||||
if (full && !uv_destroy_owned_page(pte_val(res) & PAGE_MASK))
|
||||
return res;
|
||||
/*
|
||||
* If something went wrong and the page could not be destroyed, or
|
||||
* if this is not a mm teardown, the slower export is used as
|
||||
* fallback instead.
|
||||
*/
|
||||
uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK);
|
||||
return res;
|
||||
}
|
||||
|
||||
|
|
|
@ -88,6 +88,10 @@ struct sclp_info {
|
|||
unsigned char has_sipl : 1;
|
||||
unsigned char has_dirq : 1;
|
||||
unsigned char has_iplcc : 1;
|
||||
unsigned char has_zpci_lsi : 1;
|
||||
unsigned char has_aisii : 1;
|
||||
unsigned char has_aeni : 1;
|
||||
unsigned char has_aisi : 1;
|
||||
unsigned int ibc;
|
||||
unsigned int mtid;
|
||||
unsigned int mtid_cp;
|
||||
|
|
|
@ -19,6 +19,19 @@ struct tpi_info {
|
|||
u32 :12;
|
||||
} __packed __aligned(4);
|
||||
|
||||
/* I/O-Interruption Code as stored by TPI for an Adapter I/O */
|
||||
struct tpi_adapter_info {
|
||||
u32 aism:8;
|
||||
u32 :22;
|
||||
u32 error:1;
|
||||
u32 forward:1;
|
||||
u32 reserved;
|
||||
u32 adapter_IO:1;
|
||||
u32 directed_irq:1;
|
||||
u32 isc:3;
|
||||
u32 :27;
|
||||
} __packed __aligned(4);
|
||||
|
||||
#endif /* __ASSEMBLY__ */
|
||||
|
||||
#endif /* _ASM_S390_TPI_H */
|
||||
|
|
|
@ -50,6 +50,10 @@
|
|||
#define UVC_CMD_SET_UNSHARE_ALL 0x0340
|
||||
#define UVC_CMD_PIN_PAGE_SHARED 0x0341
|
||||
#define UVC_CMD_UNPIN_PAGE_SHARED 0x0342
|
||||
#define UVC_CMD_DUMP_INIT 0x0400
|
||||
#define UVC_CMD_DUMP_CONF_STOR_STATE 0x0401
|
||||
#define UVC_CMD_DUMP_CPU 0x0402
|
||||
#define UVC_CMD_DUMP_COMPLETE 0x0403
|
||||
#define UVC_CMD_SET_SHARED_ACCESS 0x1000
|
||||
#define UVC_CMD_REMOVE_SHARED_ACCESS 0x1001
|
||||
#define UVC_CMD_RETR_ATTEST 0x1020
|
||||
|
@ -77,6 +81,10 @@ enum uv_cmds_inst {
|
|||
BIT_UVC_CMD_UNSHARE_ALL = 20,
|
||||
BIT_UVC_CMD_PIN_PAGE_SHARED = 21,
|
||||
BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22,
|
||||
BIT_UVC_CMD_DUMP_INIT = 24,
|
||||
BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE = 25,
|
||||
BIT_UVC_CMD_DUMP_CPU = 26,
|
||||
BIT_UVC_CMD_DUMP_COMPLETE = 27,
|
||||
BIT_UVC_CMD_RETR_ATTEST = 28,
|
||||
};
|
||||
|
||||
|
@ -110,7 +118,16 @@ struct uv_cb_qui {
|
|||
u8 reserved88[158 - 136]; /* 0x0088 */
|
||||
u16 max_guest_cpu_id; /* 0x009e */
|
||||
u64 uv_feature_indications; /* 0x00a0 */
|
||||
u8 reserveda8[200 - 168]; /* 0x00a8 */
|
||||
u64 reserveda8; /* 0x00a8 */
|
||||
u64 supp_se_hdr_versions; /* 0x00b0 */
|
||||
u64 supp_se_hdr_pcf; /* 0x00b8 */
|
||||
u64 reservedc0; /* 0x00c0 */
|
||||
u64 conf_dump_storage_state_len; /* 0x00c8 */
|
||||
u64 conf_dump_finalize_len; /* 0x00d0 */
|
||||
u64 reservedd8; /* 0x00d8 */
|
||||
u64 supp_att_req_hdr_ver; /* 0x00e0 */
|
||||
u64 supp_att_pflags; /* 0x00e8 */
|
||||
u8 reservedf0[256 - 240]; /* 0x00f0 */
|
||||
} __packed __aligned(8);
|
||||
|
||||
/* Initialize Ultravisor */
|
||||
|
@ -240,6 +257,31 @@ struct uv_cb_attest {
|
|||
u64 reserved168[4]; /* 0x0168 */
|
||||
} __packed __aligned(8);
|
||||
|
||||
struct uv_cb_dump_cpu {
|
||||
struct uv_cb_header header;
|
||||
u64 reserved08[2];
|
||||
u64 cpu_handle;
|
||||
u64 dump_area_origin;
|
||||
u64 reserved28[5];
|
||||
} __packed __aligned(8);
|
||||
|
||||
struct uv_cb_dump_stor_state {
|
||||
struct uv_cb_header header;
|
||||
u64 reserved08[2];
|
||||
u64 config_handle;
|
||||
u64 dump_area_origin;
|
||||
u64 gaddr;
|
||||
u64 reserved28[4];
|
||||
} __packed __aligned(8);
|
||||
|
||||
struct uv_cb_dump_complete {
|
||||
struct uv_cb_header header;
|
||||
u64 reserved08[2];
|
||||
u64 config_handle;
|
||||
u64 dump_area_origin;
|
||||
u64 reserved30[5];
|
||||
} __packed __aligned(8);
|
||||
|
||||
static inline int __uv_call(unsigned long r1, unsigned long r2)
|
||||
{
|
||||
int cc;
|
||||
|
@ -307,6 +349,12 @@ struct uv_info {
|
|||
unsigned int max_num_sec_conf;
|
||||
unsigned short max_guest_cpu_id;
|
||||
unsigned long uv_feature_indications;
|
||||
unsigned long supp_se_hdr_ver;
|
||||
unsigned long supp_se_hdr_pcf;
|
||||
unsigned long conf_dump_storage_state_len;
|
||||
unsigned long conf_dump_finalize_len;
|
||||
unsigned long supp_att_req_hdr_ver;
|
||||
unsigned long supp_att_pflags;
|
||||
};
|
||||
|
||||
extern struct uv_info uv_info;
|
||||
|
@ -378,6 +426,7 @@ static inline int is_prot_virt_host(void)
|
|||
}
|
||||
|
||||
int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb);
|
||||
int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr);
|
||||
int uv_destroy_owned_page(unsigned long paddr);
|
||||
int uv_convert_from_secure(unsigned long paddr);
|
||||
int uv_convert_owned_from_secure(unsigned long paddr);
|
||||
|
|
|
@ -74,6 +74,7 @@ struct kvm_s390_io_adapter_req {
|
|||
#define KVM_S390_VM_CRYPTO 2
|
||||
#define KVM_S390_VM_CPU_MODEL 3
|
||||
#define KVM_S390_VM_MIGRATION 4
|
||||
#define KVM_S390_VM_CPU_TOPOLOGY 5
|
||||
|
||||
/* kvm attributes for mem_ctrl */
|
||||
#define KVM_S390_VM_MEM_ENABLE_CMMA 0
|
||||
|
|
|
@ -234,6 +234,32 @@ static int make_secure_pte(pte_t *ptep, unsigned long addr,
|
|||
return uvcb->rc == 0x10a ? -ENXIO : -EINVAL;
|
||||
}
|
||||
|
||||
/**
|
||||
* should_export_before_import - Determine whether an export is needed
|
||||
* before an import-like operation
|
||||
* @uvcb: the Ultravisor control block of the UVC to be performed
|
||||
* @mm: the mm of the process
|
||||
*
|
||||
* Returns whether an export is needed before every import-like operation.
|
||||
* This is needed for shared pages, which don't trigger a secure storage
|
||||
* exception when accessed from a different guest.
|
||||
*
|
||||
* Although considered as one, the Unpin Page UVC is not an actual import,
|
||||
* so it is not affected.
|
||||
*
|
||||
* No export is needed also when there is only one protected VM, because the
|
||||
* page cannot belong to the wrong VM in that case (there is no "other VM"
|
||||
* it can belong to).
|
||||
*
|
||||
* Return: true if an export is needed before every import, otherwise false.
|
||||
*/
|
||||
static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm)
|
||||
{
|
||||
if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED)
|
||||
return false;
|
||||
return atomic_read(&mm->context.protected_count) > 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Requests the Ultravisor to make a page accessible to a guest.
|
||||
* If it's brought in the first time, it will be cleared. If
|
||||
|
@ -277,6 +303,8 @@ again:
|
|||
|
||||
lock_page(page);
|
||||
ptep = get_locked_pte(gmap->mm, uaddr, &ptelock);
|
||||
if (should_export_before_import(uvcb, gmap->mm))
|
||||
uv_convert_from_secure(page_to_phys(page));
|
||||
rc = make_secure_pte(ptep, uaddr, page, uvcb);
|
||||
pte_unmap_unlock(ptep, ptelock);
|
||||
unlock_page(page);
|
||||
|
@ -334,6 +362,61 @@ int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(gmap_convert_to_secure);
|
||||
|
||||
/**
|
||||
* gmap_destroy_page - Destroy a guest page.
|
||||
* @gmap: the gmap of the guest
|
||||
* @gaddr: the guest address to destroy
|
||||
*
|
||||
* An attempt will be made to destroy the given guest page. If the attempt
|
||||
* fails, an attempt is made to export the page. If both attempts fail, an
|
||||
* appropriate error is returned.
|
||||
*/
|
||||
int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr)
|
||||
{
|
||||
struct vm_area_struct *vma;
|
||||
unsigned long uaddr;
|
||||
struct page *page;
|
||||
int rc;
|
||||
|
||||
rc = -EFAULT;
|
||||
mmap_read_lock(gmap->mm);
|
||||
|
||||
uaddr = __gmap_translate(gmap, gaddr);
|
||||
if (IS_ERR_VALUE(uaddr))
|
||||
goto out;
|
||||
vma = vma_lookup(gmap->mm, uaddr);
|
||||
if (!vma)
|
||||
goto out;
|
||||
/*
|
||||
* Huge pages should not be able to become secure
|
||||
*/
|
||||
if (is_vm_hugetlb_page(vma))
|
||||
goto out;
|
||||
|
||||
rc = 0;
|
||||
/* we take an extra reference here */
|
||||
page = follow_page(vma, uaddr, FOLL_WRITE | FOLL_GET);
|
||||
if (IS_ERR_OR_NULL(page))
|
||||
goto out;
|
||||
rc = uv_destroy_owned_page(page_to_phys(page));
|
||||
/*
|
||||
* Fault handlers can race; it is possible that two CPUs will fault
|
||||
* on the same secure page. One CPU can destroy the page, reboot,
|
||||
* re-enter secure mode and import it, while the second CPU was
|
||||
* stuck at the beginning of the handler. At some point the second
|
||||
* CPU will be able to progress, and it will not be able to destroy
|
||||
* the page. In that case we do not want to terminate the process,
|
||||
* we instead try to export the page.
|
||||
*/
|
||||
if (rc)
|
||||
rc = uv_convert_owned_from_secure(page_to_phys(page));
|
||||
put_page(page);
|
||||
out:
|
||||
mmap_read_unlock(gmap->mm);
|
||||
return rc;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(gmap_destroy_page);
|
||||
|
||||
/*
|
||||
* To be called with the page locked or with an extra reference! This will
|
||||
* prevent gmap_make_secure from touching the page concurrently. Having 2
|
||||
|
@ -392,6 +475,54 @@ static ssize_t uv_query_facilities(struct kobject *kobj,
|
|||
static struct kobj_attribute uv_query_facilities_attr =
|
||||
__ATTR(facilities, 0444, uv_query_facilities, NULL);
|
||||
|
||||
static ssize_t uv_query_supp_se_hdr_ver(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
return sysfs_emit(buf, "%lx\n", uv_info.supp_se_hdr_ver);
|
||||
}
|
||||
|
||||
static struct kobj_attribute uv_query_supp_se_hdr_ver_attr =
|
||||
__ATTR(supp_se_hdr_ver, 0444, uv_query_supp_se_hdr_ver, NULL);
|
||||
|
||||
static ssize_t uv_query_supp_se_hdr_pcf(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
return sysfs_emit(buf, "%lx\n", uv_info.supp_se_hdr_pcf);
|
||||
}
|
||||
|
||||
static struct kobj_attribute uv_query_supp_se_hdr_pcf_attr =
|
||||
__ATTR(supp_se_hdr_pcf, 0444, uv_query_supp_se_hdr_pcf, NULL);
|
||||
|
||||
static ssize_t uv_query_dump_cpu_len(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *page)
|
||||
{
|
||||
return scnprintf(page, PAGE_SIZE, "%lx\n",
|
||||
uv_info.guest_cpu_stor_len);
|
||||
}
|
||||
|
||||
static struct kobj_attribute uv_query_dump_cpu_len_attr =
|
||||
__ATTR(uv_query_dump_cpu_len, 0444, uv_query_dump_cpu_len, NULL);
|
||||
|
||||
static ssize_t uv_query_dump_storage_state_len(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *page)
|
||||
{
|
||||
return scnprintf(page, PAGE_SIZE, "%lx\n",
|
||||
uv_info.conf_dump_storage_state_len);
|
||||
}
|
||||
|
||||
static struct kobj_attribute uv_query_dump_storage_state_len_attr =
|
||||
__ATTR(dump_storage_state_len, 0444, uv_query_dump_storage_state_len, NULL);
|
||||
|
||||
static ssize_t uv_query_dump_finalize_len(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *page)
|
||||
{
|
||||
return scnprintf(page, PAGE_SIZE, "%lx\n",
|
||||
uv_info.conf_dump_finalize_len);
|
||||
}
|
||||
|
||||
static struct kobj_attribute uv_query_dump_finalize_len_attr =
|
||||
__ATTR(dump_finalize_len, 0444, uv_query_dump_finalize_len, NULL);
|
||||
|
||||
static ssize_t uv_query_feature_indications(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
|
@ -431,12 +562,37 @@ static ssize_t uv_query_max_guest_addr(struct kobject *kobj,
|
|||
static struct kobj_attribute uv_query_max_guest_addr_attr =
|
||||
__ATTR(max_address, 0444, uv_query_max_guest_addr, NULL);
|
||||
|
||||
static ssize_t uv_query_supp_att_req_hdr_ver(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *page)
|
||||
{
|
||||
return scnprintf(page, PAGE_SIZE, "%lx\n", uv_info.supp_att_req_hdr_ver);
|
||||
}
|
||||
|
||||
static struct kobj_attribute uv_query_supp_att_req_hdr_ver_attr =
|
||||
__ATTR(supp_att_req_hdr_ver, 0444, uv_query_supp_att_req_hdr_ver, NULL);
|
||||
|
||||
static ssize_t uv_query_supp_att_pflags(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *page)
|
||||
{
|
||||
return scnprintf(page, PAGE_SIZE, "%lx\n", uv_info.supp_att_pflags);
|
||||
}
|
||||
|
||||
static struct kobj_attribute uv_query_supp_att_pflags_attr =
|
||||
__ATTR(supp_att_pflags, 0444, uv_query_supp_att_pflags, NULL);
|
||||
|
||||
static struct attribute *uv_query_attrs[] = {
|
||||
&uv_query_facilities_attr.attr,
|
||||
&uv_query_feature_indications_attr.attr,
|
||||
&uv_query_max_guest_cpus_attr.attr,
|
||||
&uv_query_max_guest_vms_attr.attr,
|
||||
&uv_query_max_guest_addr_attr.attr,
|
||||
&uv_query_supp_se_hdr_ver_attr.attr,
|
||||
&uv_query_supp_se_hdr_pcf_attr.attr,
|
||||
&uv_query_dump_storage_state_len_attr.attr,
|
||||
&uv_query_dump_finalize_len_attr.attr,
|
||||
&uv_query_dump_cpu_len_attr.attr,
|
||||
&uv_query_supp_att_req_hdr_ver_attr.attr,
|
||||
&uv_query_supp_att_pflags_attr.attr,
|
||||
NULL,
|
||||
};
|
||||
|
||||
|
|
|
@ -34,6 +34,7 @@ config KVM
|
|||
select SRCU
|
||||
select KVM_VFIO
|
||||
select INTERVAL_TREE
|
||||
select MMU_NOTIFIER
|
||||
help
|
||||
Support hosting paravirtualized guest machines using the SIE
|
||||
virtualization capability on the mainframe. This should work
|
||||
|
|
|
@ -10,4 +10,5 @@ ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
|
|||
kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o
|
||||
kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o
|
||||
|
||||
kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o
|
||||
obj-$(CONFIG_KVM) += kvm.o
|
||||
|
|
|
@ -262,77 +262,77 @@ struct aste {
|
|||
/* .. more fields there */
|
||||
};
|
||||
|
||||
int ipte_lock_held(struct kvm_vcpu *vcpu)
|
||||
int ipte_lock_held(struct kvm *kvm)
|
||||
{
|
||||
if (vcpu->arch.sie_block->eca & ECA_SII) {
|
||||
if (sclp.has_siif) {
|
||||
int rc;
|
||||
|
||||
read_lock(&vcpu->kvm->arch.sca_lock);
|
||||
rc = kvm_s390_get_ipte_control(vcpu->kvm)->kh != 0;
|
||||
read_unlock(&vcpu->kvm->arch.sca_lock);
|
||||
read_lock(&kvm->arch.sca_lock);
|
||||
rc = kvm_s390_get_ipte_control(kvm)->kh != 0;
|
||||
read_unlock(&kvm->arch.sca_lock);
|
||||
return rc;
|
||||
}
|
||||
return vcpu->kvm->arch.ipte_lock_count != 0;
|
||||
return kvm->arch.ipte_lock_count != 0;
|
||||
}
|
||||
|
||||
static void ipte_lock_simple(struct kvm_vcpu *vcpu)
|
||||
static void ipte_lock_simple(struct kvm *kvm)
|
||||
{
|
||||
union ipte_control old, new, *ic;
|
||||
|
||||
mutex_lock(&vcpu->kvm->arch.ipte_mutex);
|
||||
vcpu->kvm->arch.ipte_lock_count++;
|
||||
if (vcpu->kvm->arch.ipte_lock_count > 1)
|
||||
mutex_lock(&kvm->arch.ipte_mutex);
|
||||
kvm->arch.ipte_lock_count++;
|
||||
if (kvm->arch.ipte_lock_count > 1)
|
||||
goto out;
|
||||
retry:
|
||||
read_lock(&vcpu->kvm->arch.sca_lock);
|
||||
ic = kvm_s390_get_ipte_control(vcpu->kvm);
|
||||
read_lock(&kvm->arch.sca_lock);
|
||||
ic = kvm_s390_get_ipte_control(kvm);
|
||||
do {
|
||||
old = READ_ONCE(*ic);
|
||||
if (old.k) {
|
||||
read_unlock(&vcpu->kvm->arch.sca_lock);
|
||||
read_unlock(&kvm->arch.sca_lock);
|
||||
cond_resched();
|
||||
goto retry;
|
||||
}
|
||||
new = old;
|
||||
new.k = 1;
|
||||
} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
|
||||
read_unlock(&vcpu->kvm->arch.sca_lock);
|
||||
read_unlock(&kvm->arch.sca_lock);
|
||||
out:
|
||||
mutex_unlock(&vcpu->kvm->arch.ipte_mutex);
|
||||
mutex_unlock(&kvm->arch.ipte_mutex);
|
||||
}
|
||||
|
||||
static void ipte_unlock_simple(struct kvm_vcpu *vcpu)
|
||||
static void ipte_unlock_simple(struct kvm *kvm)
|
||||
{
|
||||
union ipte_control old, new, *ic;
|
||||
|
||||
mutex_lock(&vcpu->kvm->arch.ipte_mutex);
|
||||
vcpu->kvm->arch.ipte_lock_count--;
|
||||
if (vcpu->kvm->arch.ipte_lock_count)
|
||||
mutex_lock(&kvm->arch.ipte_mutex);
|
||||
kvm->arch.ipte_lock_count--;
|
||||
if (kvm->arch.ipte_lock_count)
|
||||
goto out;
|
||||
read_lock(&vcpu->kvm->arch.sca_lock);
|
||||
ic = kvm_s390_get_ipte_control(vcpu->kvm);
|
||||
read_lock(&kvm->arch.sca_lock);
|
||||
ic = kvm_s390_get_ipte_control(kvm);
|
||||
do {
|
||||
old = READ_ONCE(*ic);
|
||||
new = old;
|
||||
new.k = 0;
|
||||
} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
|
||||
read_unlock(&vcpu->kvm->arch.sca_lock);
|
||||
wake_up(&vcpu->kvm->arch.ipte_wq);
|
||||
read_unlock(&kvm->arch.sca_lock);
|
||||
wake_up(&kvm->arch.ipte_wq);
|
||||
out:
|
||||
mutex_unlock(&vcpu->kvm->arch.ipte_mutex);
|
||||
mutex_unlock(&kvm->arch.ipte_mutex);
|
||||
}
|
||||
|
||||
static void ipte_lock_siif(struct kvm_vcpu *vcpu)
|
||||
static void ipte_lock_siif(struct kvm *kvm)
|
||||
{
|
||||
union ipte_control old, new, *ic;
|
||||
|
||||
retry:
|
||||
read_lock(&vcpu->kvm->arch.sca_lock);
|
||||
ic = kvm_s390_get_ipte_control(vcpu->kvm);
|
||||
read_lock(&kvm->arch.sca_lock);
|
||||
ic = kvm_s390_get_ipte_control(kvm);
|
||||
do {
|
||||
old = READ_ONCE(*ic);
|
||||
if (old.kg) {
|
||||
read_unlock(&vcpu->kvm->arch.sca_lock);
|
||||
read_unlock(&kvm->arch.sca_lock);
|
||||
cond_resched();
|
||||
goto retry;
|
||||
}
|
||||
|
@ -340,15 +340,15 @@ retry:
|
|||
new.k = 1;
|
||||
new.kh++;
|
||||
} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
|
||||
read_unlock(&vcpu->kvm->arch.sca_lock);
|
||||
read_unlock(&kvm->arch.sca_lock);
|
||||
}
|
||||
|
||||
static void ipte_unlock_siif(struct kvm_vcpu *vcpu)
|
||||
static void ipte_unlock_siif(struct kvm *kvm)
|
||||
{
|
||||
union ipte_control old, new, *ic;
|
||||
|
||||
read_lock(&vcpu->kvm->arch.sca_lock);
|
||||
ic = kvm_s390_get_ipte_control(vcpu->kvm);
|
||||
read_lock(&kvm->arch.sca_lock);
|
||||
ic = kvm_s390_get_ipte_control(kvm);
|
||||
do {
|
||||
old = READ_ONCE(*ic);
|
||||
new = old;
|
||||
|
@ -356,25 +356,25 @@ static void ipte_unlock_siif(struct kvm_vcpu *vcpu)
|
|||
if (!new.kh)
|
||||
new.k = 0;
|
||||
} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
|
||||
read_unlock(&vcpu->kvm->arch.sca_lock);
|
||||
read_unlock(&kvm->arch.sca_lock);
|
||||
if (!new.kh)
|
||||
wake_up(&vcpu->kvm->arch.ipte_wq);
|
||||
wake_up(&kvm->arch.ipte_wq);
|
||||
}
|
||||
|
||||
void ipte_lock(struct kvm_vcpu *vcpu)
|
||||
void ipte_lock(struct kvm *kvm)
|
||||
{
|
||||
if (vcpu->arch.sie_block->eca & ECA_SII)
|
||||
ipte_lock_siif(vcpu);
|
||||
if (sclp.has_siif)
|
||||
ipte_lock_siif(kvm);
|
||||
else
|
||||
ipte_lock_simple(vcpu);
|
||||
ipte_lock_simple(kvm);
|
||||
}
|
||||
|
||||
void ipte_unlock(struct kvm_vcpu *vcpu)
|
||||
void ipte_unlock(struct kvm *kvm)
|
||||
{
|
||||
if (vcpu->arch.sie_block->eca & ECA_SII)
|
||||
ipte_unlock_siif(vcpu);
|
||||
if (sclp.has_siif)
|
||||
ipte_unlock_siif(kvm);
|
||||
else
|
||||
ipte_unlock_simple(vcpu);
|
||||
ipte_unlock_simple(kvm);
|
||||
}
|
||||
|
||||
static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar,
|
||||
|
@ -1086,7 +1086,7 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
|
|||
try_storage_prot_override = storage_prot_override_applicable(vcpu);
|
||||
need_ipte_lock = psw_bits(*psw).dat && !asce.r;
|
||||
if (need_ipte_lock)
|
||||
ipte_lock(vcpu);
|
||||
ipte_lock(vcpu->kvm);
|
||||
/*
|
||||
* Since we do the access further down ultimately via a move instruction
|
||||
* that does key checking and returns an error in case of a protection
|
||||
|
@ -1127,7 +1127,7 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
|
|||
}
|
||||
out_unlock:
|
||||
if (need_ipte_lock)
|
||||
ipte_unlock(vcpu);
|
||||
ipte_unlock(vcpu->kvm);
|
||||
if (nr_pages > ARRAY_SIZE(gpa_array))
|
||||
vfree(gpas);
|
||||
return rc;
|
||||
|
@ -1199,10 +1199,10 @@ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
|
|||
rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode);
|
||||
if (rc)
|
||||
return rc;
|
||||
ipte_lock(vcpu);
|
||||
ipte_lock(vcpu->kvm);
|
||||
rc = guest_range_to_gpas(vcpu, gva, ar, NULL, length, asce, mode,
|
||||
access_key);
|
||||
ipte_unlock(vcpu);
|
||||
ipte_unlock(vcpu->kvm);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
@ -1465,7 +1465,7 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
|
|||
* tables/pointers we read stay valid - unshadowing is however
|
||||
* always possible - only guest_table_lock protects us.
|
||||
*/
|
||||
ipte_lock(vcpu);
|
||||
ipte_lock(vcpu->kvm);
|
||||
|
||||
rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
|
||||
if (rc)
|
||||
|
@ -1499,7 +1499,7 @@ shadow_page:
|
|||
pte.p |= dat_protection;
|
||||
if (!rc)
|
||||
rc = gmap_shadow_page(sg, saddr, __pte(pte.val));
|
||||
ipte_unlock(vcpu);
|
||||
ipte_unlock(vcpu->kvm);
|
||||
mmap_read_unlock(sg->mm);
|
||||
return rc;
|
||||
}
|
||||
|
|
|
@@ -440,9 +440,9 @@ int read_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
 	return access_guest_real(vcpu, gra, data, len, 0);
 }
 
-void ipte_lock(struct kvm_vcpu *vcpu);
-void ipte_unlock(struct kvm_vcpu *vcpu);
-int ipte_lock_held(struct kvm_vcpu *vcpu);
+void ipte_lock(struct kvm *kvm);
+void ipte_unlock(struct kvm *kvm);
+int ipte_lock_held(struct kvm *kvm);
 int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
 
 /* MVPG PEI indication bits */
@@ -528,12 +528,27 @@ static int handle_pv_uvc(struct kvm_vcpu *vcpu)
 
 static int handle_pv_notification(struct kvm_vcpu *vcpu)
 {
+	int ret;
+
 	if (vcpu->arch.sie_block->ipa == 0xb210)
 		return handle_pv_spx(vcpu);
 	if (vcpu->arch.sie_block->ipa == 0xb220)
 		return handle_pv_sclp(vcpu);
 	if (vcpu->arch.sie_block->ipa == 0xb9a4)
 		return handle_pv_uvc(vcpu);
+	if (vcpu->arch.sie_block->ipa >> 8 == 0xae) {
+		/*
+		 * Besides external call, other SIGP orders also cause a
+		 * 108 (pv notify) intercept. In contrast to external call,
+		 * these orders need to be emulated and hence the appropriate
+		 * place to handle them is in handle_instruction().
+		 * So first try kvm_s390_handle_sigp_pei() and if that isn't
+		 * successful, go on with handle_instruction().
+		 */
+		ret = kvm_s390_handle_sigp_pei(vcpu);
+		if (!ret)
+			return ret;
+	}
 
 	return handle_instruction(vcpu);
 }
@@ -28,9 +28,11 @@
 #include <asm/switch_to.h>
 #include <asm/nmi.h>
 #include <asm/airq.h>
+#include <asm/tpi.h>
 #include "kvm-s390.h"
 #include "gaccess.h"
 #include "trace-s390.h"
+#include "pci.h"
 
 #define PFAULT_INIT 0x0600
 #define PFAULT_DONE 0x0680
@@ -702,7 +704,7 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu)
 	/*
 	 * We indicate floating repressible conditions along with
 	 * other pending conditions. Channel Report Pending and Channel
-	 * Subsystem damage are the only two and and are indicated by
+	 * Subsystem damage are the only two and are indicated by
 	 * bits in mcic and masked in cr14.
 	 */
 	if (test_and_clear_bit(IRQ_PEND_MCHK_REP, &fi->pending_irqs)) {
@@ -3311,10 +3313,87 @@ out:
 }
 EXPORT_SYMBOL_GPL(kvm_s390_gisc_unregister);
 
-static void gib_alert_irq_handler(struct airq_struct *airq, bool floating)
+static void aen_host_forward(unsigned long si)
 {
+	struct kvm_s390_gisa_interrupt *gi;
+	struct zpci_gaite *gaite;
+	struct kvm *kvm;
+
+	gaite = (struct zpci_gaite *)aift->gait +
+		(si * sizeof(struct zpci_gaite));
+	if (gaite->count == 0)
+		return;
+	if (gaite->aisb != 0)
+		set_bit_inv(gaite->aisbo, (unsigned long *)gaite->aisb);
+
+	kvm = kvm_s390_pci_si_to_kvm(aift, si);
+	if (!kvm)
+		return;
+	gi = &kvm->arch.gisa_int;
+
+	if (!(gi->origin->g1.simm & AIS_MODE_MASK(gaite->gisc)) ||
+	    !(gi->origin->g1.nimm & AIS_MODE_MASK(gaite->gisc))) {
+		gisa_set_ipm_gisc(gi->origin, gaite->gisc);
+		if (hrtimer_active(&gi->timer))
+			hrtimer_cancel(&gi->timer);
+		hrtimer_start(&gi->timer, 0, HRTIMER_MODE_REL);
+		kvm->stat.aen_forward++;
+	}
+}
+
+static void aen_process_gait(u8 isc)
+{
+	bool found = false, first = true;
+	union zpci_sic_iib iib = {{0}};
+	unsigned long si, flags;
+
+	spin_lock_irqsave(&aift->gait_lock, flags);
+
+	if (!aift->gait) {
+		spin_unlock_irqrestore(&aift->gait_lock, flags);
+		return;
+	}
+
+	for (si = 0;;) {
+		/* Scan adapter summary indicator bit vector */
+		si = airq_iv_scan(aift->sbv, si, airq_iv_end(aift->sbv));
+		if (si == -1UL) {
+			if (first || found) {
+				/* Re-enable interrupts. */
+				zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, isc,
+						  &iib);
+				first = found = false;
+			} else {
+				/* Interrupts on and all bits processed */
+				break;
+			}
+			found = false;
+			si = 0;
+			/* Scan again after re-enabling interrupts */
+			continue;
+		}
+		found = true;
+		aen_host_forward(si);
+	}
+
+	spin_unlock_irqrestore(&aift->gait_lock, flags);
+}
+
+static void gib_alert_irq_handler(struct airq_struct *airq,
+				  struct tpi_info *tpi_info)
+{
+	struct tpi_adapter_info *info = (struct tpi_adapter_info *)tpi_info;
+
 	inc_irq_stat(IRQIO_GAL);
-	process_gib_alert_list();
+
+	if ((info->forward || info->error) &&
+	    IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) {
+		aen_process_gait(info->isc);
+		if (info->aism != 0)
+			process_gib_alert_list();
+	} else {
+		process_gib_alert_list();
+	}
 }
 
 static struct airq_struct gib_alert_irq = {
@@ -3326,6 +3405,11 @@ void kvm_s390_gib_destroy(void)
 {
 	if (!gib)
 		return;
+	if (kvm_s390_pci_interp_allowed() && aift) {
+		mutex_lock(&aift->aift_lock);
+		kvm_s390_pci_aen_exit();
+		mutex_unlock(&aift->aift_lock);
+	}
 	chsc_sgib(0);
 	unregister_adapter_interrupt(&gib_alert_irq);
 	free_page((unsigned long)gib);
@@ -3363,6 +3447,14 @@ int kvm_s390_gib_init(u8 nisc)
 		goto out_unreg_gal;
 	}
 
+	if (kvm_s390_pci_interp_allowed()) {
+		if (kvm_s390_pci_aen_init(nisc)) {
+			pr_err("Initializing AEN for PCI failed\n");
+			rc = -EIO;
+			goto out_unreg_gal;
+		}
+	}
+
 	KVM_EVENT(3, "gib 0x%pK (nisc=%d) initialized", gib, gib->nisc);
 	goto out;
 
@@ -31,6 +31,7 @@
 #include <linux/sched/signal.h>
 #include <linux/string.h>
 #include <linux/pgtable.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/asm-offsets.h>
 #include <asm/lowcore.h>
@@ -47,6 +48,7 @@
 #include <asm/fpu/api.h>
 #include "kvm-s390.h"
 #include "gaccess.h"
+#include "pci.h"
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
@@ -63,7 +65,8 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
 	STATS_DESC_COUNTER(VM, inject_float_mchk),
 	STATS_DESC_COUNTER(VM, inject_pfault_done),
 	STATS_DESC_COUNTER(VM, inject_service_signal),
-	STATS_DESC_COUNTER(VM, inject_virtio)
+	STATS_DESC_COUNTER(VM, inject_virtio),
+	STATS_DESC_COUNTER(VM, aen_forward)
 };
 
 const struct kvm_stats_header kvm_vm_stats_header = {
@@ -502,6 +505,14 @@ int kvm_arch_init(void *opaque)
 		goto out;
 	}
 
+	if (kvm_s390_pci_interp_allowed()) {
+		rc = kvm_s390_pci_init();
+		if (rc) {
+			pr_err("Unable to allocate AIFT for PCI\n");
+			goto out;
+		}
+	}
+
 	rc = kvm_s390_gib_init(GAL_ISC);
 	if (rc)
 		goto out;
@@ -516,6 +527,8 @@ out:
 void kvm_arch_exit(void)
 {
 	kvm_s390_gib_destroy();
+	if (kvm_s390_pci_interp_allowed())
+		kvm_s390_pci_exit();
 	debug_unregister(kvm_s390_dbf);
 	debug_unregister(kvm_s390_dbf_uv);
 }
@@ -606,6 +619,32 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_S390_PROTECTED:
 		r = is_prot_virt_host();
 		break;
+	case KVM_CAP_S390_PROTECTED_DUMP: {
+		u64 pv_cmds_dump[] = {
+			BIT_UVC_CMD_DUMP_INIT,
+			BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE,
+			BIT_UVC_CMD_DUMP_CPU,
+			BIT_UVC_CMD_DUMP_COMPLETE,
+		};
+		int i;
+
+		r = is_prot_virt_host();
+
+		for (i = 0; i < ARRAY_SIZE(pv_cmds_dump); i++) {
+			if (!test_bit_inv(pv_cmds_dump[i],
+					  (unsigned long *)&uv_info.inst_calls_list)) {
+				r = 0;
+				break;
+			}
+		}
+		break;
+	}
+	case KVM_CAP_S390_ZPCI_OP:
+		r = kvm_s390_pci_interp_allowed();
+		break;
+	case KVM_CAP_S390_CPU_TOPOLOGY:
+		r = test_facility(11);
+		break;
 	default:
 		r = 0;
 	}
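As an aside, a minimal userspace sketch (not part of this patch) of how a monitor could probe the new capability before attempting a protected dump; the VM file descriptor and the 5.20 uapi headers that define KVM_CAP_S390_PROTECTED_DUMP are assumed:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Hypothetical helper: non-zero when the host UV supports all dump UVCs. */
static int can_dump_pv_guest(int vm_fd)
{
	/* KVM_CHECK_EXTENSION returns 0 if the capability is not present. */
	return ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_S390_PROTECTED_DUMP) > 0;
}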
@@ -817,6 +856,20 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 		icpt_operexc_on_all_vcpus(kvm);
 		r = 0;
 		break;
+	case KVM_CAP_S390_CPU_TOPOLOGY:
+		r = -EINVAL;
+		mutex_lock(&kvm->lock);
+		if (kvm->created_vcpus) {
+			r = -EBUSY;
+		} else if (test_facility(11)) {
+			set_kvm_facility(kvm->arch.model.fac_mask, 11);
+			set_kvm_facility(kvm->arch.model.fac_list, 11);
+			r = 0;
+		}
+		mutex_unlock(&kvm->lock);
+		VM_EVENT(kvm, 3, "ENABLE: CAP_S390_CPU_TOPOLOGY %s",
+			 r ? "(not available)" : "(success)");
+		break;
 	default:
 		r = -EINVAL;
 		break;
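For illustration only, a sketch of how userspace might enable this capability; it has to run before the first vCPU is created, mirroring the kvm->created_vcpus check above (vm_fd is an assumed, already-created VM descriptor):

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/* Hypothetical helper: turn on facility 11 (PTF/CPU topology) for the guest. */
static int enable_cpu_topology(int vm_fd)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_S390_CPU_TOPOLOGY;
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap); /* 0 on success */
}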
@ -1019,6 +1072,42 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static void kvm_s390_vcpu_pci_setup(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
/* Only set the ECB bits after guest requests zPCI interpretation */
|
||||
if (!vcpu->kvm->arch.use_zpci_interp)
|
||||
return;
|
||||
|
||||
vcpu->arch.sie_block->ecb2 |= ECB2_ZPCI_LSI;
|
||||
vcpu->arch.sie_block->ecb3 |= ECB3_AISII + ECB3_AISI;
|
||||
}
|
||||
|
||||
void kvm_s390_vcpu_pci_enable_interp(struct kvm *kvm)
|
||||
{
|
||||
struct kvm_vcpu *vcpu;
|
||||
unsigned long i;
|
||||
|
||||
lockdep_assert_held(&kvm->lock);
|
||||
|
||||
if (!kvm_s390_pci_interp_allowed())
|
||||
return;
|
||||
|
||||
/*
|
||||
* If host is configured for PCI and the necessary facilities are
|
||||
* available, turn on interpretation for the life of this guest
|
||||
*/
|
||||
kvm->arch.use_zpci_interp = 1;
|
||||
|
||||
kvm_s390_vcpu_block_all(kvm);
|
||||
|
||||
kvm_for_each_vcpu(i, vcpu, kvm) {
|
||||
kvm_s390_vcpu_pci_setup(vcpu);
|
||||
kvm_s390_sync_request(KVM_REQ_VSIE_RESTART, vcpu);
|
||||
}
|
||||
|
||||
kvm_s390_vcpu_unblock_all(kvm);
|
||||
}
|
||||
|
||||
static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req)
|
||||
{
|
||||
unsigned long cx;
|
||||
|
@@ -1691,6 +1780,57 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
 	return ret;
 }
 
+/**
+ * kvm_s390_update_topology_change_report - update CPU topology change report
+ * @kvm: guest KVM description
+ * @val: set or clear the MTCR bit
+ *
+ * Updates the Multiprocessor Topology-Change-Report bit to signal
+ * the guest with a topology change.
+ * This is only relevant if the topology facility is present.
+ *
+ * The SCA version, bsca or esca, doesn't matter as offset is the same.
+ */
+static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val)
+{
+	union sca_utility new, old;
+	struct bsca_block *sca;
+
+	read_lock(&kvm->arch.sca_lock);
+	sca = kvm->arch.sca;
+	do {
+		old = READ_ONCE(sca->utility);
+		new = old;
+		new.mtcr = val;
+	} while (cmpxchg(&sca->utility.val, old.val, new.val) != old.val);
+	read_unlock(&kvm->arch.sca_lock);
+}
+
+static int kvm_s390_set_topo_change_indication(struct kvm *kvm,
+					       struct kvm_device_attr *attr)
+{
+	if (!test_kvm_facility(kvm, 11))
+		return -ENXIO;
+
+	kvm_s390_update_topology_change_report(kvm, !!attr->attr);
+	return 0;
+}
+
+static int kvm_s390_get_topo_change_indication(struct kvm *kvm,
+					       struct kvm_device_attr *attr)
+{
+	u8 topo;
+
+	if (!test_kvm_facility(kvm, 11))
+		return -ENXIO;
+
+	read_lock(&kvm->arch.sca_lock);
+	topo = ((struct bsca_block *)kvm->arch.sca)->utility.mtcr;
+	read_unlock(&kvm->arch.sca_lock);
+
+	return put_user(topo, (u8 __user *)attr->addr);
+}
+
 static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
 {
 	int ret;
@@ -1711,6 +1851,9 @@ static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
 	case KVM_S390_VM_MIGRATION:
 		ret = kvm_s390_vm_set_migration(kvm, attr);
 		break;
+	case KVM_S390_VM_CPU_TOPOLOGY:
+		ret = kvm_s390_set_topo_change_indication(kvm, attr);
+		break;
 	default:
 		ret = -ENXIO;
 		break;
@@ -1736,6 +1879,9 @@ static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr)
 	case KVM_S390_VM_MIGRATION:
 		ret = kvm_s390_vm_get_migration(kvm, attr);
 		break;
+	case KVM_S390_VM_CPU_TOPOLOGY:
+		ret = kvm_s390_get_topo_change_indication(kvm, attr);
+		break;
 	default:
 		ret = -ENXIO;
 		break;
@@ -1809,6 +1955,9 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
 	case KVM_S390_VM_MIGRATION:
 		ret = 0;
 		break;
+	case KVM_S390_VM_CPU_TOPOLOGY:
+		ret = test_kvm_facility(kvm, 11) ? 0 : -ENXIO;
+		break;
 	default:
 		ret = -ENXIO;
 		break;
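A hedged userspace sketch of the attribute interface wired up above; the group and ioctl names come from the patch, everything else (vm_fd, helper names) is illustrative:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Hypothetical helpers around the new KVM_S390_VM_CPU_TOPOLOGY VM attribute. */
static int set_topology_changed(int vm_fd, __u64 changed)
{
	struct kvm_device_attr attr = {
		.group = KVM_S390_VM_CPU_TOPOLOGY,
		.attr = changed,	/* becomes the MTCR value (0 or 1) */
	};

	return ioctl(vm_fd, KVM_SET_DEVICE_ATTR, &attr);
}

static int get_topology_changed(int vm_fd, __u8 *mtcr)
{
	struct kvm_device_attr attr = {
		.group = KVM_S390_VM_CPU_TOPOLOGY,
		.addr = (__u64)(unsigned long)mtcr,	/* kernel put_user()s here */
	};

	return ioctl(vm_fd, KVM_GET_DEVICE_ATTR, &attr);
}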
@ -2166,12 +2315,25 @@ out:
|
|||
return r;
|
||||
}
|
||||
|
||||
static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp)
|
||||
/**
|
||||
* kvm_s390_cpus_from_pv - Convert all protected vCPUs in a protected VM to
|
||||
* non protected.
|
||||
* @kvm: the VM whose protected vCPUs are to be converted
|
||||
* @rc: return value for the RC field of the UVC (in case of error)
|
||||
* @rrc: return value for the RRC field of the UVC (in case of error)
|
||||
*
|
||||
* Does not stop in case of error, tries to convert as many
|
||||
* CPUs as possible. In case of error, the RC and RRC of the last error are
|
||||
* returned.
|
||||
*
|
||||
* Return: 0 in case of success, otherwise -EIO
|
||||
*/
|
||||
int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc)
|
||||
{
|
||||
struct kvm_vcpu *vcpu;
|
||||
u16 rc, rrc;
|
||||
int ret = 0;
|
||||
unsigned long i;
|
||||
u16 _rc, _rrc;
|
||||
int ret = 0;
|
||||
|
||||
/*
|
||||
* We ignore failures and try to destroy as many CPUs as possible.
|
||||
|
@ -2183,9 +2345,9 @@ static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp)
|
|||
*/
|
||||
kvm_for_each_vcpu(i, vcpu, kvm) {
|
||||
mutex_lock(&vcpu->mutex);
|
||||
if (kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc) && !ret) {
|
||||
*rcp = rc;
|
||||
*rrcp = rrc;
|
||||
if (kvm_s390_pv_destroy_cpu(vcpu, &_rc, &_rrc) && !ret) {
|
||||
*rc = _rc;
|
||||
*rrc = _rrc;
|
||||
ret = -EIO;
|
||||
}
|
||||
mutex_unlock(&vcpu->mutex);
|
||||
|
@ -2196,6 +2358,17 @@ static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp)
|
|||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* kvm_s390_cpus_to_pv - Convert all non-protected vCPUs in a protected VM
|
||||
* to protected.
|
||||
* @kvm: the VM whose protected vCPUs are to be converted
|
||||
* @rc: return value for the RC field of the UVC (in case of error)
|
||||
* @rrc: return value for the RRC field of the UVC (in case of error)
|
||||
*
|
||||
* Tries to undo the conversion in case of error.
|
||||
*
|
||||
* Return: 0 in case of success, otherwise -EIO
|
||||
*/
|
||||
static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc)
|
||||
{
|
||||
unsigned long i;
|
||||
|
@ -2220,6 +2393,115 @@ static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc)
|
|||
return r;
|
||||
}
|
||||
|
||||
/*
|
||||
* Here we provide user space with a direct interface to query UV
|
||||
* related data like UV maxima and available features as well as
|
||||
* feature specific data.
|
||||
*
|
||||
* To facilitate future extension of the data structures we'll try to
|
||||
* write data up to the maximum requested length.
|
||||
*/
|
||||
static ssize_t kvm_s390_handle_pv_info(struct kvm_s390_pv_info *info)
|
||||
{
|
||||
ssize_t len_min;
|
||||
|
||||
switch (info->header.id) {
|
||||
case KVM_PV_INFO_VM: {
|
||||
len_min = sizeof(info->header) + sizeof(info->vm);
|
||||
|
||||
if (info->header.len_max < len_min)
|
||||
return -EINVAL;
|
||||
|
||||
memcpy(info->vm.inst_calls_list,
|
||||
uv_info.inst_calls_list,
|
||||
sizeof(uv_info.inst_calls_list));
|
||||
|
||||
/* It's max cpuid not max cpus, so it's off by one */
|
||||
info->vm.max_cpus = uv_info.max_guest_cpu_id + 1;
|
||||
info->vm.max_guests = uv_info.max_num_sec_conf;
|
||||
info->vm.max_guest_addr = uv_info.max_sec_stor_addr;
|
||||
info->vm.feature_indication = uv_info.uv_feature_indications;
|
||||
|
||||
return len_min;
|
||||
}
|
||||
case KVM_PV_INFO_DUMP: {
|
||||
len_min = sizeof(info->header) + sizeof(info->dump);
|
||||
|
||||
if (info->header.len_max < len_min)
|
||||
return -EINVAL;
|
||||
|
||||
info->dump.dump_cpu_buffer_len = uv_info.guest_cpu_stor_len;
|
||||
info->dump.dump_config_mem_buffer_per_1m = uv_info.conf_dump_storage_state_len;
|
||||
info->dump.dump_config_finalize_len = uv_info.conf_dump_finalize_len;
|
||||
return len_min;
|
||||
}
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
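To make the interface concrete, a sketch of a userspace query for the dump sizes exposed above; only KVM_PV_INFO/KVM_PV_INFO_DUMP and the struct layout come from this series, the helper itself is hypothetical:

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/* Hypothetical helper: ask the Ultravisor for the dump buffer lengths. */
static int query_pv_dump_info(int vm_fd, struct kvm_s390_pv_info *info)
{
	struct kvm_pv_cmd pv = {
		.cmd = KVM_PV_INFO,
		.data = (__u64)(unsigned long)info,
	};

	memset(info, 0, sizeof(*info));
	info->header.id = KVM_PV_INFO_DUMP;
	/* len_max caps what the kernel may write; len_written reports back. */
	info->header.len_max = sizeof(info->header) + sizeof(info->dump);

	return ioctl(vm_fd, KVM_S390_PV_COMMAND, &pv);
}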
|
||||
static int kvm_s390_pv_dmp(struct kvm *kvm, struct kvm_pv_cmd *cmd,
|
||||
struct kvm_s390_pv_dmp dmp)
|
||||
{
|
||||
int r = -EINVAL;
|
||||
void __user *result_buff = (void __user *)dmp.buff_addr;
|
||||
|
||||
switch (dmp.subcmd) {
|
||||
case KVM_PV_DUMP_INIT: {
|
||||
if (kvm->arch.pv.dumping)
|
||||
break;
|
||||
|
||||
/*
|
||||
* Block SIE entry as concurrent dump UVCs could lead
|
||||
* to validities.
|
||||
*/
|
||||
kvm_s390_vcpu_block_all(kvm);
|
||||
|
||||
r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
|
||||
UVC_CMD_DUMP_INIT, &cmd->rc, &cmd->rrc);
|
||||
KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP INIT: rc %x rrc %x",
|
||||
cmd->rc, cmd->rrc);
|
||||
if (!r) {
|
||||
kvm->arch.pv.dumping = true;
|
||||
} else {
|
||||
kvm_s390_vcpu_unblock_all(kvm);
|
||||
r = -EINVAL;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case KVM_PV_DUMP_CONFIG_STOR_STATE: {
|
||||
if (!kvm->arch.pv.dumping)
|
||||
break;
|
||||
|
||||
/*
|
||||
* gaddr is an output parameter since we might stop
|
||||
* early. As dmp will be copied back in our caller, we
|
||||
* don't need to do it ourselves.
|
||||
*/
|
||||
r = kvm_s390_pv_dump_stor_state(kvm, result_buff, &dmp.gaddr, dmp.buff_len,
|
||||
&cmd->rc, &cmd->rrc);
|
||||
break;
|
||||
}
|
||||
case KVM_PV_DUMP_COMPLETE: {
|
||||
if (!kvm->arch.pv.dumping)
|
||||
break;
|
||||
|
||||
r = -EINVAL;
|
||||
if (dmp.buff_len < uv_info.conf_dump_finalize_len)
|
||||
break;
|
||||
|
||||
r = kvm_s390_pv_dump_complete(kvm, result_buff,
|
||||
&cmd->rc, &cmd->rrc);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
r = -ENOTTY;
|
||||
break;
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
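Putting the three subcommands together, a rough outline of how userspace might drive a configuration dump; this is a sketch under the assumption that buffer sizing follows the KVM_PV_INFO values, with all error handling and persistence of the buffer omitted:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Hypothetical outline: KVM_PV_DUMP_INIT, then repeated
 * KVM_PV_DUMP_CONFIG_STOR_STATE calls, then KVM_PV_DUMP_COMPLETE. */
static int dump_pv_config(int vm_fd, void *buf, __u64 buf_len, __u64 guest_size)
{
	struct kvm_s390_pv_dmp dmp = { .subcmd = KVM_PV_DUMP_INIT };
	struct kvm_pv_cmd pv = {
		.cmd = KVM_PV_DUMP,
		.data = (__u64)(unsigned long)&dmp,
	};

	ioctl(vm_fd, KVM_S390_PV_COMMAND, &pv);		/* blocks SIE entry */

	dmp.subcmd = KVM_PV_DUMP_CONFIG_STOR_STATE;
	dmp.buff_addr = (__u64)(unsigned long)buf;
	dmp.buff_len = buf_len;		/* multiple of the per-1M state length */
	for (dmp.gaddr = 0; dmp.gaddr < guest_size;) {
		ioctl(vm_fd, KVM_S390_PV_COMMAND, &pv);
		/* dmp.gaddr comes back advanced; write out buf before looping */
	}

	dmp.subcmd = KVM_PV_DUMP_COMPLETE;	/* buff_len >= conf_dump_finalize_len */
	return ioctl(vm_fd, KVM_S390_PV_COMMAND, &pv);
}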
|
||||
static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
|
||||
{
|
||||
int r = 0;
|
||||
|
@ -2356,6 +2638,68 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
|
|||
cmd->rc, cmd->rrc);
|
||||
break;
|
||||
}
|
||||
case KVM_PV_INFO: {
|
||||
struct kvm_s390_pv_info info = {};
|
||||
ssize_t data_len;
|
||||
|
||||
/*
|
||||
* No need to check the VM protection here.
|
||||
*
|
||||
* Maybe user space wants to query some of the data
|
||||
* when the VM is still unprotected. If we see the
|
||||
* need to fence a new data command we can still
|
||||
* return an error in the info handler.
|
||||
*/
|
||||
|
||||
r = -EFAULT;
|
||||
if (copy_from_user(&info, argp, sizeof(info.header)))
|
||||
break;
|
||||
|
||||
r = -EINVAL;
|
||||
if (info.header.len_max < sizeof(info.header))
|
||||
break;
|
||||
|
||||
data_len = kvm_s390_handle_pv_info(&info);
|
||||
if (data_len < 0) {
|
||||
r = data_len;
|
||||
break;
|
||||
}
|
||||
/*
|
||||
* If a data command struct is extended (multiple
|
||||
* times) this can be used to determine how much of it
|
||||
* is valid.
|
||||
*/
|
||||
info.header.len_written = data_len;
|
||||
|
||||
r = -EFAULT;
|
||||
if (copy_to_user(argp, &info, data_len))
|
||||
break;
|
||||
|
||||
r = 0;
|
||||
break;
|
||||
}
|
||||
case KVM_PV_DUMP: {
|
||||
struct kvm_s390_pv_dmp dmp;
|
||||
|
||||
r = -EINVAL;
|
||||
if (!kvm_s390_pv_is_protected(kvm))
|
||||
break;
|
||||
|
||||
r = -EFAULT;
|
||||
if (copy_from_user(&dmp, argp, sizeof(dmp)))
|
||||
break;
|
||||
|
||||
r = kvm_s390_pv_dmp(kvm, cmd, dmp);
|
||||
if (r)
|
||||
break;
|
||||
|
||||
if (copy_to_user(argp, &dmp, sizeof(dmp))) {
|
||||
r = -EFAULT;
|
||||
break;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
default:
|
||||
r = -ENOTTY;
|
||||
}
|
||||
|
@ -2581,6 +2925,19 @@ long kvm_arch_vm_ioctl(struct file *filp,
|
|||
r = -EFAULT;
|
||||
break;
|
||||
}
|
||||
case KVM_S390_ZPCI_OP: {
|
||||
struct kvm_s390_zpci_op args;
|
||||
|
||||
r = -EINVAL;
|
||||
if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM))
|
||||
break;
|
||||
if (copy_from_user(&args, argp, sizeof(args))) {
|
||||
r = -EFAULT;
|
||||
break;
|
||||
}
|
||||
r = kvm_s390_pci_zpci_op(kvm, &args);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
r = -ENOTTY;
|
||||
}
|
||||
|
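For context, a hedged sketch of how a userspace VMM might call the new VM ioctl to forward guest adapter interrupts for a passthrough zPCI function; the field names come from the uapi added by this series, the values are placeholders:

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/* Hypothetical helper: register guest AEN forwarding for function handle fh. */
static int reg_guest_aen(int vm_fd, __u32 fh, __u64 ibv, __u64 sb,
			 __u32 noi, __u8 isc, __u32 sbo)
{
	struct kvm_s390_zpci_op op;

	memset(&op, 0, sizeof(op));
	op.fh = fh;
	op.op = KVM_S390_ZPCIOP_REG_AEN;
	op.u.reg_aen.ibv = ibv;		/* guest interrupt bit vector */
	op.u.reg_aen.sb = sb;		/* guest summary bit address, 0 if unused */
	op.u.reg_aen.noi = noi;
	op.u.reg_aen.isc = isc;
	op.u.reg_aen.sbo = sbo;
	op.u.reg_aen.flags = 0;		/* or KVM_S390_ZPCIOP_REGAEN_HOST */

	return ioctl(vm_fd, KVM_S390_ZPCI_OP, &op);
}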
@ -2742,6 +3099,14 @@ static void sca_dispose(struct kvm *kvm)
|
|||
kvm->arch.sca = NULL;
|
||||
}
|
||||
|
||||
void kvm_arch_free_vm(struct kvm *kvm)
|
||||
{
|
||||
if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM))
|
||||
kvm_s390_pci_clear_list(kvm);
|
||||
|
||||
__kvm_arch_free_vm(kvm);
|
||||
}
|
||||
|
||||
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
|
||||
{
|
||||
gfp_t alloc_flags = GFP_KERNEL_ACCOUNT;
|
||||
|
@ -2824,6 +3189,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
|
|||
|
||||
kvm_s390_crypto_init(kvm);
|
||||
|
||||
if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) {
|
||||
mutex_lock(&kvm->lock);
|
||||
kvm_s390_pci_init_list(kvm);
|
||||
kvm_s390_vcpu_pci_enable_interp(kvm);
|
||||
mutex_unlock(&kvm->lock);
|
||||
}
|
||||
|
||||
mutex_init(&kvm->arch.float_int.ais_lock);
|
||||
spin_lock_init(&kvm->arch.float_int.lock);
|
||||
for (i = 0; i < FIRQ_LIST_COUNT; i++)
|
||||
|
@ -2877,6 +3249,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
|
|||
kvm_clear_async_pf_completion_queue(vcpu);
|
||||
if (!kvm_is_ucontrol(vcpu->kvm))
|
||||
sca_del_vcpu(vcpu);
|
||||
kvm_s390_update_topology_change_report(vcpu->kvm, 1);
|
||||
|
||||
if (kvm_is_ucontrol(vcpu->kvm))
|
||||
gmap_remove(vcpu->arch.gmap);
|
||||
|
@ -2904,6 +3277,15 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
|
|||
*/
|
||||
if (kvm_s390_pv_get_handle(kvm))
|
||||
kvm_s390_pv_deinit_vm(kvm, &rc, &rrc);
|
||||
/*
|
||||
* Remove the mmu notifier only when the whole KVM VM is torn down,
|
||||
* and only if one was registered to begin with. If the VM is
|
||||
* currently not protected, but has been previously been protected,
|
||||
* then it's possible that the notifier is still registered.
|
||||
*/
|
||||
if (kvm->arch.pv.mmu_notifier.ops)
|
||||
mmu_notifier_unregister(&kvm->arch.pv.mmu_notifier, kvm->mm);
|
||||
|
||||
debug_unregister(kvm->arch.dbf);
|
||||
free_page((unsigned long)kvm->arch.sie_page2);
|
||||
if (!kvm_is_ucontrol(kvm))
|
||||
|
@ -3047,9 +3429,7 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id)
|
|||
if (!sclp.has_esca || !sclp.has_64bscao)
|
||||
return false;
|
||||
|
||||
mutex_lock(&kvm->lock);
|
||||
rc = kvm->arch.use_esca ? 0 : sca_switch_to_extended(kvm);
|
||||
mutex_unlock(&kvm->lock);
|
||||
|
||||
return rc == 0 && id < KVM_S390_ESCA_CPU_SLOTS;
|
||||
}
|
||||
|
@ -3272,6 +3652,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
|
|||
vcpu->arch.sie_block->ecb |= ECB_HOSTPROTINT;
|
||||
if (test_kvm_facility(vcpu->kvm, 9))
|
||||
vcpu->arch.sie_block->ecb |= ECB_SRSI;
|
||||
if (test_kvm_facility(vcpu->kvm, 11))
|
||||
vcpu->arch.sie_block->ecb |= ECB_PTF;
|
||||
if (test_kvm_facility(vcpu->kvm, 73))
|
||||
vcpu->arch.sie_block->ecb |= ECB_TE;
|
||||
if (!kvm_is_ucontrol(vcpu->kvm))
|
||||
|
@ -3324,6 +3706,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
|
|||
|
||||
kvm_s390_vcpu_crypto_setup(vcpu);
|
||||
|
||||
kvm_s390_vcpu_pci_setup(vcpu);
|
||||
|
||||
mutex_lock(&vcpu->kvm->lock);
|
||||
if (kvm_s390_pv_is_protected(vcpu->kvm)) {
|
||||
rc = kvm_s390_pv_create_cpu(vcpu, &uvrc, &uvrrc);
|
||||
|
@ -3403,6 +3787,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
|
|||
rc = kvm_s390_vcpu_setup(vcpu);
|
||||
if (rc)
|
||||
goto out_ucontrol_uninit;
|
||||
|
||||
kvm_s390_update_topology_change_report(vcpu->kvm, 1);
|
||||
return 0;
|
||||
|
||||
out_ucontrol_uninit:
|
||||
|
@ -4473,6 +4859,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
|
|||
struct kvm_run *kvm_run = vcpu->run;
|
||||
int rc;
|
||||
|
||||
/*
|
||||
* Running a VM while dumping always has the potential to
|
||||
* produce inconsistent dump data. But for PV vcpus a SIE
|
||||
* entry while dumping could also lead to a fatal validity
|
||||
* intercept which we absolutely want to avoid.
|
||||
*/
|
||||
if (vcpu->kvm->arch.pv.dumping)
|
||||
return -EINVAL;
|
||||
|
||||
if (kvm_run->immediate_exit)
|
||||
return -EINTR;
|
||||
|
||||
|
@ -4912,6 +5307,48 @@ long kvm_arch_vcpu_async_ioctl(struct file *filp,
|
|||
return -ENOIOCTLCMD;
|
||||
}
|
||||
|
||||
static int kvm_s390_handle_pv_vcpu_dump(struct kvm_vcpu *vcpu,
|
||||
struct kvm_pv_cmd *cmd)
|
||||
{
|
||||
struct kvm_s390_pv_dmp dmp;
|
||||
void *data;
|
||||
int ret;
|
||||
|
||||
/* Dump initialization is a prerequisite */
|
||||
if (!vcpu->kvm->arch.pv.dumping)
|
||||
return -EINVAL;
|
||||
|
||||
if (copy_from_user(&dmp, (__u8 __user *)cmd->data, sizeof(dmp)))
|
||||
return -EFAULT;
|
||||
|
||||
/* We only handle this subcmd right now */
|
||||
if (dmp.subcmd != KVM_PV_DUMP_CPU)
|
||||
return -EINVAL;
|
||||
|
||||
/* CPU dump length is the same as create cpu storage donation. */
|
||||
if (dmp.buff_len != uv_info.guest_cpu_stor_len)
|
||||
return -EINVAL;
|
||||
|
||||
data = kvzalloc(uv_info.guest_cpu_stor_len, GFP_KERNEL);
|
||||
if (!data)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = kvm_s390_pv_dump_cpu(vcpu, data, &cmd->rc, &cmd->rrc);
|
||||
|
||||
VCPU_EVENT(vcpu, 3, "PROTVIRT DUMP CPU %d rc %x rrc %x",
|
||||
vcpu->vcpu_id, cmd->rc, cmd->rrc);
|
||||
|
||||
if (ret)
|
||||
ret = -EINVAL;
|
||||
|
||||
/* On success copy over the dump data */
|
||||
if (!ret && copy_to_user((__u8 __user *)dmp.buff_addr, data, uv_info.guest_cpu_stor_len))
|
||||
ret = -EFAULT;
|
||||
|
||||
kvfree(data);
|
||||
return ret;
|
||||
}
|
||||
|
||||
long kvm_arch_vcpu_ioctl(struct file *filp,
|
||||
unsigned int ioctl, unsigned long arg)
|
||||
{
|
||||
|
@ -5076,6 +5513,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
|
|||
irq_state.len);
|
||||
break;
|
||||
}
|
||||
case KVM_S390_PV_CPU_COMMAND: {
|
||||
struct kvm_pv_cmd cmd;
|
||||
|
||||
r = -EINVAL;
|
||||
if (!is_prot_virt_host())
|
||||
break;
|
||||
|
||||
r = -EFAULT;
|
||||
if (copy_from_user(&cmd, argp, sizeof(cmd)))
|
||||
break;
|
||||
|
||||
r = -EINVAL;
|
||||
if (cmd.flags)
|
||||
break;
|
||||
|
||||
/* We only handle this cmd right now */
|
||||
if (cmd.cmd != KVM_PV_DUMP)
|
||||
break;
|
||||
|
||||
r = kvm_s390_handle_pv_vcpu_dump(vcpu, &cmd);
|
||||
|
||||
/* Always copy over UV rc / rrc data */
|
||||
if (copy_to_user((__u8 __user *)argp, &cmd.rc,
|
||||
sizeof(cmd.rc) + sizeof(cmd.rrc)))
|
||||
r = -EFAULT;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
r = -ENOTTY;
|
||||
}
|
||||
|
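And a matching sketch for the per-vCPU path added above; the buffer must be exactly the dump_cpu_buffer_len reported by KVM_PV_INFO, as enforced by the length check in kvm_s390_handle_pv_vcpu_dump(). The helper name and vcpu_fd are assumptions:

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/* Hypothetical helper: dump one vCPU's state during a PV dump. */
static int dump_pv_vcpu(int vcpu_fd, void *buf, __u64 buf_len)
{
	struct kvm_s390_pv_dmp dmp;
	struct kvm_pv_cmd cmd;

	memset(&dmp, 0, sizeof(dmp));
	dmp.subcmd = KVM_PV_DUMP_CPU;
	dmp.buff_addr = (__u64)(unsigned long)buf;
	dmp.buff_len = buf_len;

	memset(&cmd, 0, sizeof(cmd));
	cmd.cmd = KVM_PV_DUMP;
	cmd.data = (__u64)(unsigned long)&dmp;

	return ioctl(vcpu_fd, KVM_S390_PV_CPU_COMMAND, &cmd);
}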
|
|
@ -250,6 +250,11 @@ int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc,
|
|||
int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size,
|
||||
unsigned long tweak, u16 *rc, u16 *rrc);
|
||||
int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state);
|
||||
int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc);
|
||||
int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user,
|
||||
u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc);
|
||||
int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user,
|
||||
u16 *rc, u16 *rrc);
|
||||
|
||||
static inline u64 kvm_s390_pv_get_handle(struct kvm *kvm)
|
||||
{
|
||||
|
@ -374,6 +379,7 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu);
|
|||
void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu);
|
||||
void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm);
|
||||
__u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu);
|
||||
int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc);
|
||||
|
||||
/* implemented in diag.c */
|
||||
int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
|
||||
|
@ -507,6 +513,16 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu,
|
|||
*/
|
||||
void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm);
|
||||
|
||||
/**
|
||||
* kvm_s390_vcpu_pci_enable_interp
|
||||
*
|
||||
* Set the associated PCI attributes for each vcpu to allow for zPCI Load/Store
|
||||
* interpretation as well as adapter interruption forwarding.
|
||||
*
|
||||
* @kvm: the KVM guest
|
||||
*/
|
||||
void kvm_s390_vcpu_pci_enable_interp(struct kvm *kvm);
|
||||
|
||||
/**
|
||||
* diag9c_forwarding_hz
|
||||
*
|
||||
|
|
|
@ -0,0 +1,690 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* s390 kvm PCI passthrough support
|
||||
*
|
||||
* Copyright IBM Corp. 2022
|
||||
*
|
||||
* Author(s): Matthew Rosato <mjrosato@linux.ibm.com>
|
||||
*/
|
||||
|
||||
#include <linux/kvm_host.h>
|
||||
#include <linux/pci.h>
|
||||
#include <asm/pci.h>
|
||||
#include <asm/pci_insn.h>
|
||||
#include <asm/pci_io.h>
|
||||
#include <asm/sclp.h>
|
||||
#include "pci.h"
|
||||
#include "kvm-s390.h"
|
||||
|
||||
struct zpci_aift *aift;
|
||||
|
||||
static inline int __set_irq_noiib(u16 ctl, u8 isc)
|
||||
{
|
||||
union zpci_sic_iib iib = {{0}};
|
||||
|
||||
return zpci_set_irq_ctrl(ctl, isc, &iib);
|
||||
}
|
||||
|
||||
void kvm_s390_pci_aen_exit(void)
|
||||
{
|
||||
unsigned long flags;
|
||||
struct kvm_zdev **gait_kzdev;
|
||||
|
||||
lockdep_assert_held(&aift->aift_lock);
|
||||
|
||||
/*
|
||||
* Contents of the aipb remain registered for the life of the host
|
||||
* kernel, the information preserved in zpci_aipb and zpci_aif_sbv
|
||||
* in case we insert the KVM module again later. Clear the AIFT
|
||||
* information and free anything not registered with underlying
|
||||
* firmware.
|
||||
*/
|
||||
spin_lock_irqsave(&aift->gait_lock, flags);
|
||||
gait_kzdev = aift->kzdev;
|
||||
aift->gait = NULL;
|
||||
aift->sbv = NULL;
|
||||
aift->kzdev = NULL;
|
||||
spin_unlock_irqrestore(&aift->gait_lock, flags);
|
||||
|
||||
kfree(gait_kzdev);
|
||||
}
|
||||
|
||||
static int zpci_setup_aipb(u8 nisc)
|
||||
{
|
||||
struct page *page;
|
||||
int size, rc;
|
||||
|
||||
zpci_aipb = kzalloc(sizeof(union zpci_sic_iib), GFP_KERNEL);
|
||||
if (!zpci_aipb)
|
||||
return -ENOMEM;
|
||||
|
||||
aift->sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC, 0);
|
||||
if (!aift->sbv) {
|
||||
rc = -ENOMEM;
|
||||
goto free_aipb;
|
||||
}
|
||||
zpci_aif_sbv = aift->sbv;
|
||||
size = get_order(PAGE_ALIGN(ZPCI_NR_DEVICES *
|
||||
sizeof(struct zpci_gaite)));
|
||||
page = alloc_pages(GFP_KERNEL | __GFP_ZERO, size);
|
||||
if (!page) {
|
||||
rc = -ENOMEM;
|
||||
goto free_sbv;
|
||||
}
|
||||
aift->gait = (struct zpci_gaite *)page_to_phys(page);
|
||||
|
||||
zpci_aipb->aipb.faisb = virt_to_phys(aift->sbv->vector);
|
||||
zpci_aipb->aipb.gait = virt_to_phys(aift->gait);
|
||||
zpci_aipb->aipb.afi = nisc;
|
||||
zpci_aipb->aipb.faal = ZPCI_NR_DEVICES;
|
||||
|
||||
/* Setup Adapter Event Notification Interpretation */
|
||||
if (zpci_set_irq_ctrl(SIC_SET_AENI_CONTROLS, 0, zpci_aipb)) {
|
||||
rc = -EIO;
|
||||
goto free_gait;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
free_gait:
|
||||
free_pages((unsigned long)aift->gait, size);
|
||||
free_sbv:
|
||||
airq_iv_release(aift->sbv);
|
||||
zpci_aif_sbv = NULL;
|
||||
free_aipb:
|
||||
kfree(zpci_aipb);
|
||||
zpci_aipb = NULL;
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int zpci_reset_aipb(u8 nisc)
|
||||
{
|
||||
/*
|
||||
* AEN registration can only happen once per system boot. If
|
||||
* an aipb already exists then AEN was already registered and
|
||||
* we can re-use the aipb contents. This can only happen if
|
||||
* the KVM module was removed and re-inserted. However, we must
|
||||
* ensure that the same forwarding ISC is used as this is assigned
|
||||
* during KVM module load.
|
||||
*/
|
||||
if (zpci_aipb->aipb.afi != nisc)
|
||||
return -EINVAL;
|
||||
|
||||
aift->sbv = zpci_aif_sbv;
|
||||
aift->gait = (struct zpci_gaite *)zpci_aipb->aipb.gait;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int kvm_s390_pci_aen_init(u8 nisc)
|
||||
{
|
||||
int rc = 0;
|
||||
|
||||
/* If already enabled for AEN, bail out now */
|
||||
if (aift->gait || aift->sbv)
|
||||
return -EPERM;
|
||||
|
||||
mutex_lock(&aift->aift_lock);
|
||||
aift->kzdev = kcalloc(ZPCI_NR_DEVICES, sizeof(struct kvm_zdev),
|
||||
GFP_KERNEL);
|
||||
if (!aift->kzdev) {
|
||||
rc = -ENOMEM;
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
if (!zpci_aipb)
|
||||
rc = zpci_setup_aipb(nisc);
|
||||
else
|
||||
rc = zpci_reset_aipb(nisc);
|
||||
if (rc)
|
||||
goto free_zdev;
|
||||
|
||||
/* Enable floating IRQs */
|
||||
if (__set_irq_noiib(SIC_IRQ_MODE_SINGLE, nisc)) {
|
||||
rc = -EIO;
|
||||
kvm_s390_pci_aen_exit();
|
||||
}
|
||||
|
||||
goto unlock;
|
||||
|
||||
free_zdev:
|
||||
kfree(aift->kzdev);
|
||||
unlock:
|
||||
mutex_unlock(&aift->aift_lock);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* Modify PCI: Register floating adapter interruption forwarding */
|
||||
static int kvm_zpci_set_airq(struct zpci_dev *zdev)
|
||||
{
|
||||
u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT);
|
||||
struct zpci_fib fib = {};
|
||||
u8 status;
|
||||
|
||||
fib.fmt0.isc = zdev->kzdev->fib.fmt0.isc;
|
||||
fib.fmt0.sum = 1; /* enable summary notifications */
|
||||
fib.fmt0.noi = airq_iv_end(zdev->aibv);
|
||||
fib.fmt0.aibv = virt_to_phys(zdev->aibv->vector);
|
||||
fib.fmt0.aibvo = 0;
|
||||
fib.fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8);
|
||||
fib.fmt0.aisbo = zdev->aisb & 63;
|
||||
fib.gd = zdev->gisa;
|
||||
|
||||
return zpci_mod_fc(req, &fib, &status) ? -EIO : 0;
|
||||
}
|
||||
|
||||
/* Modify PCI: Unregister floating adapter interruption forwarding */
|
||||
static int kvm_zpci_clear_airq(struct zpci_dev *zdev)
|
||||
{
|
||||
u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT);
|
||||
struct zpci_fib fib = {};
|
||||
u8 cc, status;
|
||||
|
||||
fib.gd = zdev->gisa;
|
||||
|
||||
cc = zpci_mod_fc(req, &fib, &status);
|
||||
if (cc == 3 || (cc == 1 && status == 24))
|
||||
/* Function already gone or IRQs already deregistered. */
|
||||
cc = 0;
|
||||
|
||||
return cc ? -EIO : 0;
|
||||
}
|
||||
|
||||
static inline void unaccount_mem(unsigned long nr_pages)
|
||||
{
|
||||
struct user_struct *user = get_uid(current_user());
|
||||
|
||||
if (user)
|
||||
atomic_long_sub(nr_pages, &user->locked_vm);
|
||||
if (current->mm)
|
||||
atomic64_sub(nr_pages, ¤t->mm->pinned_vm);
|
||||
}
|
||||
|
||||
static inline int account_mem(unsigned long nr_pages)
|
||||
{
|
||||
struct user_struct *user = get_uid(current_user());
|
||||
unsigned long page_limit, cur_pages, new_pages;
|
||||
|
||||
page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
|
||||
|
||||
do {
|
||||
cur_pages = atomic_long_read(&user->locked_vm);
|
||||
new_pages = cur_pages + nr_pages;
|
||||
if (new_pages > page_limit)
|
||||
return -ENOMEM;
|
||||
} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
|
||||
new_pages) != cur_pages);
|
||||
|
||||
atomic64_add(nr_pages, ¤t->mm->pinned_vm);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int kvm_s390_pci_aif_enable(struct zpci_dev *zdev, struct zpci_fib *fib,
|
||||
bool assist)
|
||||
{
|
||||
struct page *pages[1], *aibv_page, *aisb_page = NULL;
|
||||
unsigned int msi_vecs, idx;
|
||||
struct zpci_gaite *gaite;
|
||||
unsigned long hva, bit;
|
||||
struct kvm *kvm;
|
||||
phys_addr_t gaddr;
|
||||
int rc = 0, gisc, npages, pcount = 0;
|
||||
|
||||
/*
|
||||
* Interrupt forwarding is only applicable if the device is already
|
||||
* enabled for interpretation
|
||||
*/
|
||||
if (zdev->gisa == 0)
|
||||
return -EINVAL;
|
||||
|
||||
kvm = zdev->kzdev->kvm;
|
||||
msi_vecs = min_t(unsigned int, fib->fmt0.noi, zdev->max_msi);
|
||||
|
||||
/* Get the associated forwarding ISC - if invalid, return the error */
|
||||
gisc = kvm_s390_gisc_register(kvm, fib->fmt0.isc);
|
||||
if (gisc < 0)
|
||||
return gisc;
|
||||
|
||||
/* Replace AIBV address */
|
||||
idx = srcu_read_lock(&kvm->srcu);
|
||||
hva = gfn_to_hva(kvm, gpa_to_gfn((gpa_t)fib->fmt0.aibv));
|
||||
npages = pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM, pages);
|
||||
srcu_read_unlock(&kvm->srcu, idx);
|
||||
if (npages < 1) {
|
||||
rc = -EIO;
|
||||
goto out;
|
||||
}
|
||||
aibv_page = pages[0];
|
||||
pcount++;
|
||||
gaddr = page_to_phys(aibv_page) + (fib->fmt0.aibv & ~PAGE_MASK);
|
||||
fib->fmt0.aibv = gaddr;
|
||||
|
||||
/* Pin the guest AISB if one was specified */
|
||||
if (fib->fmt0.sum == 1) {
|
||||
idx = srcu_read_lock(&kvm->srcu);
|
||||
hva = gfn_to_hva(kvm, gpa_to_gfn((gpa_t)fib->fmt0.aisb));
|
||||
npages = pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM,
|
||||
pages);
|
||||
srcu_read_unlock(&kvm->srcu, idx);
|
||||
if (npages < 1) {
|
||||
rc = -EIO;
|
||||
goto unpin1;
|
||||
}
|
||||
aisb_page = pages[0];
|
||||
pcount++;
|
||||
}
|
||||
|
||||
/* Account for pinned pages, roll back on failure */
|
||||
if (account_mem(pcount))
|
||||
goto unpin2;
|
||||
|
||||
/* AISB must be allocated before we can fill in GAITE */
|
||||
mutex_lock(&aift->aift_lock);
|
||||
bit = airq_iv_alloc_bit(aift->sbv);
|
||||
if (bit == -1UL)
|
||||
goto unlock;
|
||||
zdev->aisb = bit; /* store the summary bit number */
|
||||
zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA |
|
||||
AIRQ_IV_BITLOCK |
|
||||
AIRQ_IV_GUESTVEC,
|
||||
phys_to_virt(fib->fmt0.aibv));
|
||||
|
||||
spin_lock_irq(&aift->gait_lock);
|
||||
gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb *
|
||||
sizeof(struct zpci_gaite));
|
||||
|
||||
/* If assist not requested, host will get all alerts */
|
||||
if (assist)
|
||||
gaite->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa);
|
||||
else
|
||||
gaite->gisa = 0;
|
||||
|
||||
gaite->gisc = fib->fmt0.isc;
|
||||
gaite->count++;
|
||||
gaite->aisbo = fib->fmt0.aisbo;
|
||||
gaite->aisb = virt_to_phys(page_address(aisb_page) + (fib->fmt0.aisb &
|
||||
~PAGE_MASK));
|
||||
aift->kzdev[zdev->aisb] = zdev->kzdev;
|
||||
spin_unlock_irq(&aift->gait_lock);
|
||||
|
||||
/* Update guest FIB for re-issue */
|
||||
fib->fmt0.aisbo = zdev->aisb & 63;
|
||||
fib->fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8);
|
||||
fib->fmt0.isc = gisc;
|
||||
|
||||
/* Save some guest fib values in the host for later use */
|
||||
zdev->kzdev->fib.fmt0.isc = fib->fmt0.isc;
|
||||
zdev->kzdev->fib.fmt0.aibv = fib->fmt0.aibv;
|
||||
mutex_unlock(&aift->aift_lock);
|
||||
|
||||
/* Issue the clp to setup the irq now */
|
||||
rc = kvm_zpci_set_airq(zdev);
|
||||
return rc;
|
||||
|
||||
unlock:
|
||||
mutex_unlock(&aift->aift_lock);
|
||||
unpin2:
|
||||
if (fib->fmt0.sum == 1)
|
||||
unpin_user_page(aisb_page);
|
||||
unpin1:
|
||||
unpin_user_page(aibv_page);
|
||||
out:
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int kvm_s390_pci_aif_disable(struct zpci_dev *zdev, bool force)
|
||||
{
|
||||
struct kvm_zdev *kzdev = zdev->kzdev;
|
||||
struct zpci_gaite *gaite;
|
||||
struct page *vpage = NULL, *spage = NULL;
|
||||
int rc, pcount = 0;
|
||||
u8 isc;
|
||||
|
||||
if (zdev->gisa == 0)
|
||||
return -EINVAL;
|
||||
|
||||
mutex_lock(&aift->aift_lock);
|
||||
|
||||
/*
|
||||
* If the clear fails due to an error, leave now unless we know this
|
||||
* device is about to go away (force) -- In that case clear the GAITE
|
||||
* regardless.
|
||||
*/
|
||||
rc = kvm_zpci_clear_airq(zdev);
|
||||
if (rc && !force)
|
||||
goto out;
|
||||
|
||||
if (zdev->kzdev->fib.fmt0.aibv == 0)
|
||||
goto out;
|
||||
spin_lock_irq(&aift->gait_lock);
|
||||
gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb *
|
||||
sizeof(struct zpci_gaite));
|
||||
isc = gaite->gisc;
|
||||
gaite->count--;
|
||||
if (gaite->count == 0) {
|
||||
/* Release guest AIBV and AISB */
|
||||
vpage = phys_to_page(kzdev->fib.fmt0.aibv);
|
||||
if (gaite->aisb != 0)
|
||||
spage = phys_to_page(gaite->aisb);
|
||||
/* Clear the GAIT entry */
|
||||
gaite->aisb = 0;
|
||||
gaite->gisc = 0;
|
||||
gaite->aisbo = 0;
|
||||
gaite->gisa = 0;
|
||||
aift->kzdev[zdev->aisb] = 0;
|
||||
/* Clear zdev info */
|
||||
airq_iv_free_bit(aift->sbv, zdev->aisb);
|
||||
airq_iv_release(zdev->aibv);
|
||||
zdev->aisb = 0;
|
||||
zdev->aibv = NULL;
|
||||
}
|
||||
spin_unlock_irq(&aift->gait_lock);
|
||||
kvm_s390_gisc_unregister(kzdev->kvm, isc);
|
||||
kzdev->fib.fmt0.isc = 0;
|
||||
kzdev->fib.fmt0.aibv = 0;
|
||||
|
||||
if (vpage) {
|
||||
unpin_user_page(vpage);
|
||||
pcount++;
|
||||
}
|
||||
if (spage) {
|
||||
unpin_user_page(spage);
|
||||
pcount++;
|
||||
}
|
||||
if (pcount > 0)
|
||||
unaccount_mem(pcount);
|
||||
out:
|
||||
mutex_unlock(&aift->aift_lock);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int kvm_s390_pci_dev_open(struct zpci_dev *zdev)
|
||||
{
|
||||
struct kvm_zdev *kzdev;
|
||||
|
||||
kzdev = kzalloc(sizeof(struct kvm_zdev), GFP_KERNEL);
|
||||
if (!kzdev)
|
||||
return -ENOMEM;
|
||||
|
||||
kzdev->zdev = zdev;
|
||||
zdev->kzdev = kzdev;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void kvm_s390_pci_dev_release(struct zpci_dev *zdev)
|
||||
{
|
||||
struct kvm_zdev *kzdev;
|
||||
|
||||
kzdev = zdev->kzdev;
|
||||
WARN_ON(kzdev->zdev != zdev);
|
||||
zdev->kzdev = NULL;
|
||||
kfree(kzdev);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Register device with the specified KVM. If interpetation facilities are
|
||||
* available, enable them and let userspace indicate whether or not they will
|
||||
* be used (specify SHM bit to disable).
|
||||
*/
|
||||
int kvm_s390_pci_register_kvm(struct zpci_dev *zdev, struct kvm *kvm)
|
||||
{
|
||||
int rc;
|
||||
|
||||
if (!zdev)
|
||||
return -EINVAL;
|
||||
|
||||
mutex_lock(&zdev->kzdev_lock);
|
||||
|
||||
if (zdev->kzdev || zdev->gisa != 0 || !kvm) {
|
||||
mutex_unlock(&zdev->kzdev_lock);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
kvm_get_kvm(kvm);
|
||||
|
||||
mutex_lock(&kvm->lock);
|
||||
|
||||
rc = kvm_s390_pci_dev_open(zdev);
|
||||
if (rc)
|
||||
goto err;
|
||||
|
||||
/*
|
||||
* If interpretation facilities aren't available, add the device to
|
||||
* the kzdev list but don't enable for interpretation.
|
||||
*/
|
||||
if (!kvm_s390_pci_interp_allowed())
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* If this is the first request to use an interpreted device, make the
|
||||
* necessary vcpu changes
|
||||
*/
|
||||
if (!kvm->arch.use_zpci_interp)
|
||||
kvm_s390_vcpu_pci_enable_interp(kvm);
|
||||
|
||||
if (zdev_enabled(zdev)) {
|
||||
rc = zpci_disable_device(zdev);
|
||||
if (rc)
|
||||
goto err;
|
||||
}
|
||||
|
||||
/*
|
||||
* Store information about the identity of the kvm guest allowed to
|
||||
* access this device via interpretation to be used by host CLP
|
||||
*/
|
||||
zdev->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa);
|
||||
|
||||
rc = zpci_enable_device(zdev);
|
||||
if (rc)
|
||||
goto clear_gisa;
|
||||
|
||||
/* Re-register the IOMMU that was already created */
|
||||
rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
|
||||
virt_to_phys(zdev->dma_table));
|
||||
if (rc)
|
||||
goto clear_gisa;
|
||||
|
||||
out:
|
||||
zdev->kzdev->kvm = kvm;
|
||||
|
||||
spin_lock(&kvm->arch.kzdev_list_lock);
|
||||
list_add_tail(&zdev->kzdev->entry, &kvm->arch.kzdev_list);
|
||||
spin_unlock(&kvm->arch.kzdev_list_lock);
|
||||
|
||||
mutex_unlock(&kvm->lock);
|
||||
mutex_unlock(&zdev->kzdev_lock);
|
||||
return 0;
|
||||
|
||||
clear_gisa:
|
||||
zdev->gisa = 0;
|
||||
err:
|
||||
if (zdev->kzdev)
|
||||
kvm_s390_pci_dev_release(zdev);
|
||||
mutex_unlock(&kvm->lock);
|
||||
mutex_unlock(&zdev->kzdev_lock);
|
||||
kvm_put_kvm(kvm);
|
||||
return rc;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_s390_pci_register_kvm);
|
||||
|
||||
void kvm_s390_pci_unregister_kvm(struct zpci_dev *zdev)
|
||||
{
|
||||
struct kvm *kvm;
|
||||
|
||||
if (!zdev)
|
||||
return;
|
||||
|
||||
mutex_lock(&zdev->kzdev_lock);
|
||||
|
||||
if (WARN_ON(!zdev->kzdev)) {
|
||||
mutex_unlock(&zdev->kzdev_lock);
|
||||
return;
|
||||
}
|
||||
|
||||
kvm = zdev->kzdev->kvm;
|
||||
mutex_lock(&kvm->lock);
|
||||
|
||||
/*
|
||||
* A 0 gisa means interpretation was never enabled, just remove the
|
||||
* device from the list.
|
||||
*/
|
||||
if (zdev->gisa == 0)
|
||||
goto out;
|
||||
|
||||
/* Forwarding must be turned off before interpretation */
|
||||
if (zdev->kzdev->fib.fmt0.aibv != 0)
|
||||
kvm_s390_pci_aif_disable(zdev, true);
|
||||
|
||||
/* Remove the host CLP guest designation */
|
||||
zdev->gisa = 0;
|
||||
|
||||
if (zdev_enabled(zdev)) {
|
||||
if (zpci_disable_device(zdev))
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (zpci_enable_device(zdev))
|
||||
goto out;
|
||||
|
||||
/* Re-register the IOMMU that was already created */
|
||||
zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
|
||||
virt_to_phys(zdev->dma_table));
|
||||
|
||||
out:
|
||||
spin_lock(&kvm->arch.kzdev_list_lock);
|
||||
list_del(&zdev->kzdev->entry);
|
||||
spin_unlock(&kvm->arch.kzdev_list_lock);
|
||||
kvm_s390_pci_dev_release(zdev);
|
||||
|
||||
mutex_unlock(&kvm->lock);
|
||||
mutex_unlock(&zdev->kzdev_lock);
|
||||
|
||||
kvm_put_kvm(kvm);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_s390_pci_unregister_kvm);
|
||||
|
||||
void kvm_s390_pci_init_list(struct kvm *kvm)
|
||||
{
|
||||
spin_lock_init(&kvm->arch.kzdev_list_lock);
|
||||
INIT_LIST_HEAD(&kvm->arch.kzdev_list);
|
||||
}
|
||||
|
||||
void kvm_s390_pci_clear_list(struct kvm *kvm)
|
||||
{
|
||||
/*
|
||||
* This list should already be empty, either via vfio device closures
|
||||
* or kvm fd cleanup.
|
||||
*/
|
||||
spin_lock(&kvm->arch.kzdev_list_lock);
|
||||
WARN_ON_ONCE(!list_empty(&kvm->arch.kzdev_list));
|
||||
spin_unlock(&kvm->arch.kzdev_list_lock);
|
||||
}
|
||||
|
||||
static struct zpci_dev *get_zdev_from_kvm_by_fh(struct kvm *kvm, u32 fh)
|
||||
{
|
||||
struct zpci_dev *zdev = NULL;
|
||||
struct kvm_zdev *kzdev;
|
||||
|
||||
spin_lock(&kvm->arch.kzdev_list_lock);
|
||||
list_for_each_entry(kzdev, &kvm->arch.kzdev_list, entry) {
|
||||
if (kzdev->zdev->fh == fh) {
|
||||
zdev = kzdev->zdev;
|
||||
break;
|
||||
}
|
||||
}
|
||||
spin_unlock(&kvm->arch.kzdev_list_lock);
|
||||
|
||||
return zdev;
|
||||
}
|
||||
|
||||
static int kvm_s390_pci_zpci_reg_aen(struct zpci_dev *zdev,
|
||||
struct kvm_s390_zpci_op *args)
|
||||
{
|
||||
struct zpci_fib fib = {};
|
||||
bool hostflag;
|
||||
|
||||
fib.fmt0.aibv = args->u.reg_aen.ibv;
|
||||
fib.fmt0.isc = args->u.reg_aen.isc;
|
||||
fib.fmt0.noi = args->u.reg_aen.noi;
|
||||
if (args->u.reg_aen.sb != 0) {
|
||||
fib.fmt0.aisb = args->u.reg_aen.sb;
|
||||
fib.fmt0.aisbo = args->u.reg_aen.sbo;
|
||||
fib.fmt0.sum = 1;
|
||||
} else {
|
||||
fib.fmt0.aisb = 0;
|
||||
fib.fmt0.aisbo = 0;
|
||||
fib.fmt0.sum = 0;
|
||||
}
|
||||
|
||||
hostflag = !(args->u.reg_aen.flags & KVM_S390_ZPCIOP_REGAEN_HOST);
|
||||
return kvm_s390_pci_aif_enable(zdev, &fib, hostflag);
|
||||
}
|
||||
|
||||
int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args)
|
||||
{
|
||||
struct kvm_zdev *kzdev;
|
||||
struct zpci_dev *zdev;
|
||||
int r;
|
||||
|
||||
zdev = get_zdev_from_kvm_by_fh(kvm, args->fh);
|
||||
if (!zdev)
|
||||
return -ENODEV;
|
||||
|
||||
mutex_lock(&zdev->kzdev_lock);
|
||||
mutex_lock(&kvm->lock);
|
||||
|
||||
kzdev = zdev->kzdev;
|
||||
if (!kzdev) {
|
||||
r = -ENODEV;
|
||||
goto out;
|
||||
}
|
||||
if (kzdev->kvm != kvm) {
|
||||
r = -EPERM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
switch (args->op) {
|
||||
case KVM_S390_ZPCIOP_REG_AEN:
|
||||
/* Fail on unknown flags */
|
||||
if (args->u.reg_aen.flags & ~KVM_S390_ZPCIOP_REGAEN_HOST) {
|
||||
r = -EINVAL;
|
||||
break;
|
||||
}
|
||||
r = kvm_s390_pci_zpci_reg_aen(zdev, args);
|
||||
break;
|
||||
case KVM_S390_ZPCIOP_DEREG_AEN:
|
||||
r = kvm_s390_pci_aif_disable(zdev, false);
|
||||
break;
|
||||
default:
|
||||
r = -EINVAL;
|
||||
}
|
||||
|
||||
out:
|
||||
mutex_unlock(&kvm->lock);
|
||||
mutex_unlock(&zdev->kzdev_lock);
|
||||
return r;
|
||||
}
|
||||
|
||||
int kvm_s390_pci_init(void)
|
||||
{
|
||||
aift = kzalloc(sizeof(struct zpci_aift), GFP_KERNEL);
|
||||
if (!aift)
|
||||
return -ENOMEM;
|
||||
|
||||
spin_lock_init(&aift->gait_lock);
|
||||
mutex_init(&aift->aift_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void kvm_s390_pci_exit(void)
|
||||
{
|
||||
mutex_destroy(&aift->aift_lock);
|
||||
|
||||
kfree(aift);
|
||||
}
|
|
@ -0,0 +1,87 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* s390 kvm PCI passthrough support
|
||||
*
|
||||
* Copyright IBM Corp. 2022
|
||||
*
|
||||
* Author(s): Matthew Rosato <mjrosato@linux.ibm.com>
|
||||
*/
|
||||
|
||||
#ifndef __KVM_S390_PCI_H
|
||||
#define __KVM_S390_PCI_H
|
||||
|
||||
#include <linux/kvm.h>
|
||||
#include <linux/kvm_host.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/pci.h>
|
||||
#include <asm/airq.h>
|
||||
#include <asm/cpu.h>
|
||||
|
||||
struct kvm_zdev {
|
||||
struct zpci_dev *zdev;
|
||||
struct kvm *kvm;
|
||||
struct zpci_fib fib;
|
||||
struct list_head entry;
|
||||
};
|
||||
|
||||
struct zpci_gaite {
|
||||
u32 gisa;
|
||||
u8 gisc;
|
||||
u8 count;
|
||||
u8 reserved;
|
||||
u8 aisbo;
|
||||
u64 aisb;
|
||||
};
|
||||
|
||||
struct zpci_aift {
|
||||
struct zpci_gaite *gait;
|
||||
struct airq_iv *sbv;
|
||||
struct kvm_zdev **kzdev;
|
||||
spinlock_t gait_lock; /* Protects the gait, used during AEN forward */
|
||||
struct mutex aift_lock; /* Protects the other structures in aift */
|
||||
};
|
||||
|
||||
extern struct zpci_aift *aift;
|
||||
|
||||
static inline struct kvm *kvm_s390_pci_si_to_kvm(struct zpci_aift *aift,
|
||||
unsigned long si)
|
||||
{
|
||||
if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM) || aift->kzdev == 0 ||
|
||||
aift->kzdev[si] == 0)
|
||||
return 0;
|
||||
return aift->kzdev[si]->kvm;
|
||||
};
|
||||
|
||||
int kvm_s390_pci_aen_init(u8 nisc);
|
||||
void kvm_s390_pci_aen_exit(void);
|
||||
|
||||
void kvm_s390_pci_init_list(struct kvm *kvm);
|
||||
void kvm_s390_pci_clear_list(struct kvm *kvm);
|
||||
|
||||
int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args);
|
||||
|
||||
int kvm_s390_pci_init(void);
|
||||
void kvm_s390_pci_exit(void);
|
||||
|
||||
static inline bool kvm_s390_pci_interp_allowed(void)
|
||||
{
|
||||
struct cpuid cpu_id;
|
||||
|
||||
get_cpu_id(&cpu_id);
|
||||
switch (cpu_id.machine) {
|
||||
case 0x2817:
|
||||
case 0x2818:
|
||||
case 0x2827:
|
||||
case 0x2828:
|
||||
case 0x2964:
|
||||
case 0x2965:
|
||||
/* No SHM on certain machines */
|
||||
return false;
|
||||
default:
|
||||
return (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM) &&
|
||||
sclp.has_zpci_lsi && sclp.has_aeni && sclp.has_aisi &&
|
||||
sclp.has_aisii);
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* __KVM_S390_PCI_H */
|
|
@ -442,7 +442,7 @@ static int handle_ipte_interlock(struct kvm_vcpu *vcpu)
|
|||
vcpu->stat.instruction_ipte_interlock++;
|
||||
if (psw_bits(vcpu->arch.sie_block->gpsw).pstate)
|
||||
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
|
||||
wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu));
|
||||
wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu->kvm));
|
||||
kvm_s390_retry_instr(vcpu);
|
||||
VCPU_EVENT(vcpu, 4, "%s", "retrying ipte interlock operation");
|
||||
return 0;
|
||||
|
@ -873,10 +873,18 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
|
|||
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
|
||||
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
|
||||
|
||||
if (fc > 3) {
|
||||
kvm_s390_set_psw_cc(vcpu, 3);
|
||||
return 0;
|
||||
}
|
||||
/* Bailout forbidden function codes */
|
||||
if (fc > 3 && fc != 15)
|
||||
goto out_no_data;
|
||||
|
||||
/*
|
||||
* fc 15 is provided only with
|
||||
* - PTF/CPU topology support through facility 15
|
||||
* - KVM_CAP_S390_USER_STSI
|
||||
*/
|
||||
if (fc == 15 && (!test_kvm_facility(vcpu->kvm, 11) ||
|
||||
!vcpu->kvm->arch.user_stsi))
|
||||
goto out_no_data;
|
||||
|
||||
if (vcpu->run->s.regs.gprs[0] & 0x0fffff00
|
||||
|| vcpu->run->s.regs.gprs[1] & 0xffff0000)
|
||||
|
@ -910,6 +918,10 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
|
|||
goto out_no_data;
|
||||
handle_stsi_3_2_2(vcpu, (void *) mem);
|
||||
break;
|
||||
case 15: /* fc 15 is fully handled in userspace */
|
||||
insert_stsi_usr_data(vcpu, operand2, ar, fc, sel1, sel2);
|
||||
trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2);
|
||||
return -EREMOTE;
|
||||
}
|
||||
if (kvm_s390_pv_cpu_is_protected(vcpu)) {
|
||||
memcpy((void *)sida_origin(vcpu->arch.sie_block), (void *)mem,
|
||||
|
@ -1471,7 +1483,7 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
|
|||
access_key = (operand2 & 0xf0) >> 4;
|
||||
|
||||
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT)
|
||||
ipte_lock(vcpu);
|
||||
ipte_lock(vcpu->kvm);
|
||||
|
||||
ret = guest_translate_address_with_key(vcpu, address, ar, &gpa,
|
||||
GACC_STORE, access_key);
|
||||
|
@ -1508,7 +1520,7 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
|
|||
}
|
||||
|
||||
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT)
|
||||
ipte_unlock(vcpu);
|
||||
ipte_unlock(vcpu->kvm);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
|
|
@ -7,13 +7,25 @@
|
|||
*/
|
||||
#include <linux/kvm.h>
|
||||
#include <linux/kvm_host.h>
|
||||
#include <linux/minmax.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/sched/signal.h>
|
||||
#include <asm/gmap.h>
|
||||
#include <asm/uv.h>
|
||||
#include <asm/mman.h>
|
||||
#include <linux/pagewalk.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/mmu_notifier.h>
|
||||
#include "kvm-s390.h"
|
||||
|
||||
static void kvm_s390_clear_pv_state(struct kvm *kvm)
|
||||
{
|
||||
kvm->arch.pv.handle = 0;
|
||||
kvm->arch.pv.guest_len = 0;
|
||||
kvm->arch.pv.stor_base = 0;
|
||||
kvm->arch.pv.stor_var = NULL;
|
||||
}
|
||||
|
||||
int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
|
||||
{
|
||||
int cc;
|
||||
|
@ -108,7 +120,7 @@ static void kvm_s390_pv_dealloc_vm(struct kvm *kvm)
|
|||
vfree(kvm->arch.pv.stor_var);
|
||||
free_pages(kvm->arch.pv.stor_base,
|
||||
get_order(uv_info.guest_base_stor_len));
|
||||
memset(&kvm->arch.pv, 0, sizeof(kvm->arch.pv));
|
||||
kvm_s390_clear_pv_state(kvm);
|
||||
}
|
||||
|
||||
static int kvm_s390_pv_alloc_vm(struct kvm *kvm)
|
||||
|
@ -152,21 +164,51 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
|
|||
{
|
||||
int cc;
|
||||
|
||||
/* make all pages accessible before destroying the guest */
|
||||
s390_reset_acc(kvm->mm);
|
||||
|
||||
cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
|
||||
UVC_CMD_DESTROY_SEC_CONF, rc, rrc);
|
||||
WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
|
||||
atomic_set(&kvm->mm->context.is_protected, 0);
|
||||
/*
|
||||
* if the mm still has a mapping, make all its pages accessible
|
||||
* before destroying the guest
|
||||
*/
|
||||
if (mmget_not_zero(kvm->mm)) {
|
||||
s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE);
|
||||
mmput(kvm->mm);
|
||||
}
|
||||
|
||||
if (!cc) {
|
||||
atomic_dec(&kvm->mm->context.protected_count);
|
||||
kvm_s390_pv_dealloc_vm(kvm);
|
||||
} else {
|
||||
/* Intended memory leak on "impossible" error */
|
||||
s390_replace_asce(kvm->arch.gmap);
|
||||
}
|
||||
KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc);
|
||||
WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc);
|
||||
/* Inteded memory leak on "impossible" error */
|
||||
if (!cc)
|
||||
kvm_s390_pv_dealloc_vm(kvm);
|
||||
|
||||
return cc ? -EIO : 0;
|
||||
}
|
||||
|
||||
static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription,
|
||||
struct mm_struct *mm)
|
||||
{
|
||||
struct kvm *kvm = container_of(subscription, struct kvm, arch.pv.mmu_notifier);
|
||||
u16 dummy;
|
||||
|
||||
/*
|
||||
* No locking is needed since this is the last thread of the last user of this
|
||||
* struct mm.
|
||||
* When the struct kvm gets deinitialized, this notifier is also
|
||||
* unregistered. This means that if this notifier runs, then the
|
||||
* struct kvm is still valid.
|
||||
*/
|
||||
kvm_s390_cpus_from_pv(kvm, &dummy, &dummy);
|
||||
}
|
||||
|
||||
static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = {
|
||||
.release = kvm_s390_pv_mmu_notifier_release,
|
||||
};
|
||||
|
||||
int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
|
||||
{
|
||||
struct uv_cb_cgc uvcb = {
|
||||
|
@ -197,14 +239,22 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
|
|||
/* Outputs */
|
||||
kvm->arch.pv.handle = uvcb.guest_handle;
|
||||
|
||||
atomic_inc(&kvm->mm->context.protected_count);
|
||||
if (cc) {
|
||||
if (uvcb.header.rc & UVC_RC_NEED_DESTROY)
|
||||
if (uvcb.header.rc & UVC_RC_NEED_DESTROY) {
|
||||
kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
|
||||
else
|
||||
} else {
|
||||
atomic_dec(&kvm->mm->context.protected_count);
|
||||
kvm_s390_pv_dealloc_vm(kvm);
|
||||
}
|
||||
return -EIO;
|
||||
}
|
||||
kvm->arch.gmap->guest_handle = uvcb.guest_handle;
|
||||
/* Add the notifier only once. No races because we hold kvm->lock */
|
||||
if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) {
|
||||
kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops;
|
||||
mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -224,8 +274,6 @@ int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc,
|
|||
*rrc = uvcb.header.rrc;
|
||||
KVM_UV_EVENT(kvm, 3, "PROTVIRT VM SET PARMS: rc %x rrc %x",
|
||||
*rc, *rrc);
|
||||
if (!cc)
|
||||
atomic_set(&kvm->mm->context.is_protected, 1);
|
||||
return cc ? -EINVAL : 0;
|
||||
}
|
||||
|
||||
|
@ -298,3 +346,200 @@ int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state)
|
|||
return -EINVAL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc)
|
||||
{
|
||||
struct uv_cb_dump_cpu uvcb = {
|
||||
.header.cmd = UVC_CMD_DUMP_CPU,
|
||||
.header.len = sizeof(uvcb),
|
||||
.cpu_handle = vcpu->arch.pv.handle,
|
||||
.dump_area_origin = (u64)buff,
|
||||
};
|
||||
int cc;
|
||||
|
||||
cc = uv_call_sched(0, (u64)&uvcb);
|
||||
*rc = uvcb.header.rc;
|
||||
*rrc = uvcb.header.rrc;
|
||||
return cc;
|
||||
}
|
||||
|
||||
/* Size of the cache for the storage state dump data. 1MB for now */
#define DUMP_BUFF_LEN HPAGE_SIZE

/**
 * kvm_s390_pv_dump_stor_state
 *
 * @kvm: pointer to the guest's KVM struct
 * @buff_user: Userspace pointer where we will write the results to
 * @gaddr: Starting absolute guest address for which the storage state
 *	   is requested.
 * @buff_user_len: Length of the buff_user buffer
 * @rc: Pointer to where the uvcb return code is stored
 * @rrc: Pointer to where the uvcb return reason code is stored
 *
 * Stores buff_user_len bytes of tweak component values to buff_user
 * starting with the 1MB block specified by the absolute guest address
 * (gaddr). The gaddr pointer will be updated with the last address
 * for which data was written when returning to userspace. buff_user
 * might be written to even if an error rc is returned. For instance
 * if we encounter a fault after writing the first page of data.
 *
 * Context: kvm->lock needs to be held
 *
 * Return:
 *  0 on success
 *  -ENOMEM if allocating the cache fails
 *  -EINVAL if gaddr is not aligned to 1MB
 *  -EINVAL if buff_user_len is not aligned to uv_info.conf_dump_storage_state_len
 *  -EINVAL if the UV call fails, rc and rrc will be set in this case
 *  -EFAULT if copying the result to buff_user failed
 */
int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user,
				u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc)
{
	struct uv_cb_dump_stor_state uvcb = {
		.header.cmd = UVC_CMD_DUMP_CONF_STOR_STATE,
		.header.len = sizeof(uvcb),
		.config_handle = kvm->arch.pv.handle,
		.gaddr = *gaddr,
		.dump_area_origin = 0,
	};
	const u64 increment_len = uv_info.conf_dump_storage_state_len;
	size_t buff_kvm_size;
	size_t size_done = 0;
	u8 *buff_kvm = NULL;
	int cc, ret;

	ret = -EINVAL;
	/* UV call processes 1MB guest storage chunks at a time */
	if (!IS_ALIGNED(*gaddr, HPAGE_SIZE))
		goto out;

	/*
	 * We provide the storage state for 1MB chunks of guest
	 * storage. The buffer will need to be aligned to
	 * conf_dump_storage_state_len so we don't end on a partial
	 * chunk.
	 */
	if (!buff_user_len ||
	    !IS_ALIGNED(buff_user_len, increment_len))
		goto out;

	/*
	 * Allocate a buffer from which we will later copy to the user
	 * process. We don't want userspace to dictate our buffer size
	 * so we limit it to DUMP_BUFF_LEN.
	 */
	ret = -ENOMEM;
	buff_kvm_size = min_t(u64, buff_user_len, DUMP_BUFF_LEN);
	buff_kvm = vzalloc(buff_kvm_size);
	if (!buff_kvm)
		goto out;

	ret = 0;
	uvcb.dump_area_origin = (u64)buff_kvm;
	/* We will loop until the user buffer is filled or an error occurs */
	do {
		/* Get 1MB worth of guest storage state data */
		cc = uv_call_sched(0, (u64)&uvcb);

		/* All or nothing */
		if (cc) {
			ret = -EINVAL;
			break;
		}

		size_done += increment_len;
		uvcb.dump_area_origin += increment_len;
		buff_user_len -= increment_len;
		uvcb.gaddr += HPAGE_SIZE;

		/* KVM buffer full, time to copy to the process */
		if (!buff_user_len || size_done == DUMP_BUFF_LEN) {
			if (copy_to_user(buff_user, buff_kvm, size_done)) {
				ret = -EFAULT;
				break;
			}

			buff_user += size_done;
			size_done = 0;
			uvcb.dump_area_origin = (u64)buff_kvm;
		}
	} while (buff_user_len);

	/* Report back where we ended dumping */
	*gaddr = uvcb.gaddr;

	/* Let's only log errors, we don't want to spam */
out:
	if (ret)
		KVM_UV_EVENT(kvm, 3,
			     "PROTVIRT DUMP STORAGE STATE: addr %llx ret %d, uvcb rc %x rrc %x",
			     uvcb.gaddr, ret, uvcb.header.rc, uvcb.header.rrc);
	*rc = uvcb.header.rc;
	*rrc = uvcb.header.rrc;
	vfree(buff_kvm);

	return ret;
}

/**
 * kvm_s390_pv_dump_complete
 *
 * @kvm: pointer to the guest's KVM struct
 * @buff_user: Userspace pointer where we will write the results to
 * @rc: Pointer to where the uvcb return code is stored
 * @rrc: Pointer to where the uvcb return reason code is stored
 *
 * Completes the dumping operation and writes the completion data to
 * user space.
 *
 * Context: kvm->lock needs to be held
 *
 * Return:
 *  0 on success
 *  -ENOMEM if allocating the completion buffer fails
 *  -EINVAL if the UV call fails, rc and rrc will be set in this case
 *  -EFAULT if copying the result to buff_user failed
 */
int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user,
			      u16 *rc, u16 *rrc)
{
	struct uv_cb_dump_complete complete = {
		.header.len = sizeof(complete),
		.header.cmd = UVC_CMD_DUMP_COMPLETE,
		.config_handle = kvm_s390_pv_get_handle(kvm),
	};
	u64 *compl_data;
	int ret;

	/* Allocate dump area */
	compl_data = vzalloc(uv_info.conf_dump_finalize_len);
	if (!compl_data)
		return -ENOMEM;
	complete.dump_area_origin = (u64)compl_data;

	ret = uv_call_sched(0, (u64)&complete);
	*rc = complete.header.rc;
	*rrc = complete.header.rrc;
	KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP COMPLETE: rc %x rrc %x",
		     complete.header.rc, complete.header.rrc);

	if (!ret) {
		/*
		 * kvm_s390_pv_dealloc_vm() will also (mem)set
		 * this to false on a reboot or other destroy
		 * operation for this vm.
		 */
		kvm->arch.pv.dumping = false;
		kvm_s390_vcpu_unblock_all(kvm);
		ret = copy_to_user(buff_user, compl_data, uv_info.conf_dump_finalize_len);
		if (ret)
			ret = -EFAULT;
	}
	vfree(compl_data);
	/* If the UVC returned an error, translate it to -EINVAL */
	if (ret > 0)
		ret = -EINVAL;
	return ret;
}
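To make the new dump API easier to follow, here is a minimal sketch of how a caller could drive it, assuming kvm->lock is held, kvm->arch.pv.dumping is set, and the user buffer is large enough for both the storage-state chunks and the conf_dump_finalize_len completion data. The real entry points are the KVM_PV_DUMP ioctl subcommands; the helper name below is made up for illustration.

/* Illustration only: one storage-state pass followed by dump completion. */
static int example_pv_dump(struct kvm *kvm, void __user *buff, u64 buff_len)
{
	u64 gaddr = 0;	/* start dumping at guest absolute address 0 */
	u16 rc, rrc;
	int ret;

	/* Fills buff with up to buff_len bytes of tweak data, advances gaddr */
	ret = kvm_s390_pv_dump_stor_state(kvm, buff, &gaddr, buff_len, &rc, &rrc);
	if (ret)
		return ret;
	/* Further passes would continue from the updated gaddr; elided here */

	/* Finalize the dump and copy the completion data to userspace */
	return kvm_s390_pv_dump_complete(kvm, buff, &rc, &rrc);
}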
@ -480,9 +480,9 @@ int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu)
|
|||
struct kvm_vcpu *dest_vcpu;
|
||||
u8 order_code = kvm_s390_get_base_disp_rs(vcpu, NULL);
|
||||
|
||||
trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr);
|
||||
|
||||
if (order_code == SIGP_EXTERNAL_CALL) {
|
||||
trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr);
|
||||
|
||||
dest_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, cpu_addr);
|
||||
BUG_ON(dest_vcpu == NULL);
|
||||
|
||||
|
|
|
@ -503,6 +503,14 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
|
|||
/* Host-protection-interruption introduced with ESOP */
|
||||
if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP))
|
||||
scb_s->ecb |= scb_o->ecb & ECB_HOSTPROTINT;
|
||||
/*
|
||||
* CPU Topology
|
||||
* This facility only uses the utility field of the SCA and none of
|
||||
* the cpu entries that are problematic with the other interpretation
|
||||
* facilities so we can pass it through
|
||||
*/
|
||||
if (test_kvm_facility(vcpu->kvm, 11))
|
||||
scb_s->ecb |= scb_o->ecb & ECB_PTF;
|
||||
/* transactional execution */
|
||||
if (test_kvm_facility(vcpu->kvm, 73) && wants_tx) {
|
||||
/* remap the prefix if tx is toggled on */
|
||||
|
|
|
@ -754,6 +754,7 @@ void do_secure_storage_access(struct pt_regs *regs)
|
|||
struct vm_area_struct *vma;
|
||||
struct mm_struct *mm;
|
||||
struct page *page;
|
||||
struct gmap *gmap;
|
||||
int rc;
|
||||
|
||||
/*
|
||||
|
@ -783,6 +784,17 @@ void do_secure_storage_access(struct pt_regs *regs)
|
|||
}
|
||||
|
||||
switch (get_fault_type(regs)) {
|
||||
case GMAP_FAULT:
|
||||
mm = current->mm;
|
||||
gmap = (struct gmap *)S390_lowcore.gmap;
|
||||
mmap_read_lock(mm);
|
||||
addr = __gmap_translate(gmap, addr);
|
||||
mmap_read_unlock(mm);
|
||||
if (IS_ERR_VALUE(addr)) {
|
||||
do_fault_error(regs, VM_ACCESS_FLAGS, VM_FAULT_BADMAP);
|
||||
break;
|
||||
}
|
||||
fallthrough;
|
||||
case USER_FAULT:
|
||||
mm = current->mm;
|
||||
mmap_read_lock(mm);
|
||||
|
@ -811,7 +823,6 @@ void do_secure_storage_access(struct pt_regs *regs)
|
|||
if (rc)
|
||||
BUG();
|
||||
break;
|
||||
case GMAP_FAULT:
|
||||
default:
|
||||
do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
|
||||
WARN_ON_ONCE(1);
|
||||
|
@ -837,6 +848,16 @@ NOKPROBE_SYMBOL(do_non_secure_storage_access);
|
|||
|
||||
void do_secure_storage_violation(struct pt_regs *regs)
|
||||
{
|
||||
unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK;
|
||||
struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;
|
||||
|
||||
/*
|
||||
* If the VM has been rebooted, its address space might still contain
|
||||
* secure pages from the previous boot.
|
||||
* Clear the page so it can be reused.
|
||||
*/
|
||||
if (!gmap_destroy_page(gmap, gaddr))
|
||||
return;
|
||||
/*
|
||||
* Either KVM messed up the secure guest mapping or the same
|
||||
* page is mapped into multiple secure guests.
|
||||
|
|
|
@ -2697,41 +2697,168 @@ void s390_reset_cmma(struct mm_struct *mm)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(s390_reset_cmma);
|
||||
|
||||
/*
|
||||
* make inaccessible pages accessible again
|
||||
*/
|
||||
static int __s390_reset_acc(pte_t *ptep, unsigned long addr,
|
||||
unsigned long next, struct mm_walk *walk)
|
||||
#define GATHER_GET_PAGES 32
|
||||
|
||||
struct reset_walk_state {
|
||||
unsigned long next;
|
||||
unsigned long count;
|
||||
unsigned long pfns[GATHER_GET_PAGES];
|
||||
};
|
||||
|
||||
static int s390_gather_pages(pte_t *ptep, unsigned long addr,
|
||||
unsigned long next, struct mm_walk *walk)
|
||||
{
|
||||
struct reset_walk_state *p = walk->private;
|
||||
pte_t pte = READ_ONCE(*ptep);
|
||||
|
||||
/* There is a reference through the mapping */
|
||||
if (pte_present(pte))
|
||||
WARN_ON_ONCE(uv_destroy_owned_page(pte_val(pte) & PAGE_MASK));
|
||||
if (pte_present(pte)) {
|
||||
/* we have a reference from the mapping, take an extra one */
|
||||
get_page(phys_to_page(pte_val(pte)));
|
||||
p->pfns[p->count] = phys_to_pfn(pte_val(pte));
|
||||
p->next = next;
|
||||
p->count++;
|
||||
}
|
||||
return p->count >= GATHER_GET_PAGES;
|
||||
}
|
||||
|
||||
static const struct mm_walk_ops gather_pages_ops = {
|
||||
.pte_entry = s390_gather_pages,
|
||||
};
|
||||
|
||||
/*
|
||||
* Call the Destroy secure page UVC on each page in the given array of PFNs.
|
||||
* Each page needs to have an extra reference, which will be released here.
|
||||
*/
|
||||
void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns)
|
||||
{
|
||||
unsigned long i;
|
||||
|
||||
for (i = 0; i < count; i++) {
|
||||
/* we always have an extra reference */
|
||||
uv_destroy_owned_page(pfn_to_phys(pfns[i]));
|
||||
/* get rid of the extra reference */
|
||||
put_page(pfn_to_page(pfns[i]));
|
||||
cond_resched();
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns);
|
||||
|
||||
/**
 * __s390_uv_destroy_range - Call the destroy secure page UVC on each page
 * in the given range of the given address space.
 * @mm: the mm to operate on
 * @start: the start of the range
 * @end: the end of the range
 * @interruptible: if not 0, stop when a fatal signal is received
 *
 * Walk the given range of the given address space and call the destroy
 * secure page UVC on each page. Optionally exit early if a fatal signal is
 * pending.
 *
 * Return: 0 on success, -EINTR if the function stopped before completing
 */
int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
			    unsigned long end, bool interruptible)
{
	struct reset_walk_state state = { .next = start };
	int r = 1;

	while (r > 0) {
		state.count = 0;
		mmap_read_lock(mm);
		r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state);
		mmap_read_unlock(mm);
		cond_resched();
		s390_uv_destroy_pfns(state.count, state.pfns);
		if (interruptible && fatal_signal_pending(current))
			return -EINTR;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(__s390_uv_destroy_range);
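The s390_uv_destroy_range() call used by kvm_s390_pv_deinit_vm() earlier in this series is presumably a thin wrapper around this helper. A minimal sketch of what such wrappers could look like, assuming they sit in a header next to the __s390_uv_destroy_range() prototype (the exact prototypes and location are assumptions, not confirmed by this excerpt):

/* Sketch only: convenience wrappers around __s390_uv_destroy_range(). */
static inline int s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
					unsigned long end)
{
	/* Non-interruptible variant: always walks the whole range */
	return __s390_uv_destroy_range(mm, start, end, false);
}

static inline int s390_uv_destroy_range_interruptible(struct mm_struct *mm,
						      unsigned long start,
						      unsigned long end)
{
	/* Interruptible variant: may return -EINTR on a fatal signal */
	return __s390_uv_destroy_range(mm, start, end, true);
}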
|
||||
|
||||
/**
|
||||
* s390_unlist_old_asce - Remove the topmost level of page tables from the
|
||||
* list of page tables of the gmap.
|
||||
* @gmap: the gmap whose table is to be removed
|
||||
*
|
||||
* On s390x, KVM keeps a list of all pages containing the page tables of the
|
||||
* gmap (the CRST list). This list is used at tear down time to free all
|
||||
* pages that are now not needed anymore.
|
||||
*
|
||||
* This function removes the topmost page of the tree (the one pointed to by
|
||||
* the ASCE) from the CRST list.
|
||||
*
|
||||
* This means that it will not be freed when the VM is torn down, and needs
|
||||
* to be handled separately by the caller, unless a leak is actually
|
||||
* intended. Notice that this function will only remove the page from the
|
||||
* list, the page will still be used as a top level page table (and ASCE).
|
||||
*/
|
||||
void s390_unlist_old_asce(struct gmap *gmap)
|
||||
{
|
||||
struct page *old;
|
||||
|
||||
old = virt_to_page(gmap->table);
|
||||
spin_lock(&gmap->guest_table_lock);
|
||||
list_del(&old->lru);
|
||||
/*
|
||||
* Sometimes the topmost page might need to be "removed" multiple
|
||||
* times, for example if the VM is rebooted into secure mode several
|
||||
* times concurrently, or if s390_replace_asce fails after calling
|
||||
* s390_remove_old_asce and is attempted again later. In that case
|
||||
* the old asce has been removed from the list, and therefore it
|
||||
* will not be freed when the VM terminates, but the ASCE is still
|
||||
* in use and still pointed to.
|
||||
* A subsequent call to replace_asce will follow the pointer and try
|
||||
* to remove the same page from the list again.
|
||||
* Therefore it's necessary that the page of the ASCE has valid
|
||||
* pointers, so list_del can work (and do nothing) without
|
||||
* dereferencing stale or invalid pointers.
|
||||
*/
|
||||
INIT_LIST_HEAD(&old->lru);
|
||||
spin_unlock(&gmap->guest_table_lock);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(s390_unlist_old_asce);
|
||||
|
||||
/**
|
||||
* s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
|
||||
* @gmap: the gmap whose ASCE needs to be replaced
|
||||
*
|
||||
* If the allocation of the new top level page table fails, the ASCE is not
|
||||
* replaced.
|
||||
* In any case, the old ASCE is always removed from the gmap CRST list.
|
||||
* Therefore the caller has to make sure to save a pointer to it
|
||||
* beforehand, unless a leak is actually intended.
|
||||
*/
|
||||
int s390_replace_asce(struct gmap *gmap)
|
||||
{
|
||||
unsigned long asce;
|
||||
struct page *page;
|
||||
void *table;
|
||||
|
||||
s390_unlist_old_asce(gmap);
|
||||
|
||||
page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
|
||||
if (!page)
|
||||
return -ENOMEM;
|
||||
table = page_to_virt(page);
|
||||
memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));
|
||||
|
||||
/*
|
||||
* The caller has to deal with the old ASCE, but here we make sure
|
||||
* the new one is properly added to the CRST list, so that
|
||||
* it will be freed when the VM is torn down.
|
||||
*/
|
||||
spin_lock(&gmap->guest_table_lock);
|
||||
list_add(&page->lru, &gmap->crst_list);
|
||||
spin_unlock(&gmap->guest_table_lock);
|
||||
|
||||
/* Set new table origin while preserving existing ASCE control bits */
|
||||
asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table);
|
||||
WRITE_ONCE(gmap->asce, asce);
|
||||
WRITE_ONCE(gmap->mm->context.gmap_asce, asce);
|
||||
WRITE_ONCE(gmap->table, table);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct mm_walk_ops reset_acc_walk_ops = {
|
||||
.pte_entry = __s390_reset_acc,
|
||||
};
|
||||
|
||||
#include <linux/sched/mm.h>
|
||||
void s390_reset_acc(struct mm_struct *mm)
|
||||
{
|
||||
if (!mm_is_protected(mm))
|
||||
return;
|
||||
/*
|
||||
* we might be called during
|
||||
* reset: we walk the pages and clear
|
||||
* close of all kvm file descriptors: we walk the pages and clear
|
||||
* exit of process on fd closure: vma already gone, do nothing
|
||||
*/
|
||||
if (!mmget_not_zero(mm))
|
||||
return;
|
||||
mmap_read_lock(mm);
|
||||
walk_page_range(mm, 0, TASK_SIZE, &reset_acc_walk_ops, NULL);
|
||||
mmap_read_unlock(mm);
|
||||
mmput(mm);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(s390_reset_acc);
|
||||
EXPORT_SYMBOL_GPL(s390_replace_asce);
@ -61,6 +61,12 @@ DEFINE_STATIC_KEY_FALSE(have_mio);
|
|||
|
||||
static struct kmem_cache *zdev_fmb_cache;
|
||||
|
||||
/* AEN structures that must be preserved over KVM module re-insertion */
|
||||
union zpci_sic_iib *zpci_aipb;
|
||||
EXPORT_SYMBOL_GPL(zpci_aipb);
|
||||
struct airq_iv *zpci_aif_sbv;
|
||||
EXPORT_SYMBOL_GPL(zpci_aif_sbv);
|
||||
|
||||
struct zpci_dev *get_zdev_by_fid(u32 fid)
|
||||
{
|
||||
struct zpci_dev *tmp, *zdev = NULL;
|
||||
|
@ -120,11 +126,13 @@ int zpci_register_ioat(struct zpci_dev *zdev, u8 dmaas,
|
|||
fib.pba = base;
|
||||
fib.pal = limit;
|
||||
fib.iota = iota | ZPCI_IOTA_RTTO_FLAG;
|
||||
fib.gd = zdev->gisa;
|
||||
cc = zpci_mod_fc(req, &fib, &status);
|
||||
if (cc)
|
||||
zpci_dbg(3, "reg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, status);
|
||||
return cc;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(zpci_register_ioat);
|
||||
|
||||
/* Modify PCI: Unregister I/O address translation parameters */
|
||||
int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas)
|
||||
|
@ -133,6 +141,8 @@ int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas)
|
|||
struct zpci_fib fib = {0};
|
||||
u8 cc, status;
|
||||
|
||||
fib.gd = zdev->gisa;
|
||||
|
||||
cc = zpci_mod_fc(req, &fib, &status);
|
||||
if (cc)
|
||||
zpci_dbg(3, "unreg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, status);
|
||||
|
@ -160,6 +170,7 @@ int zpci_fmb_enable_device(struct zpci_dev *zdev)
|
|||
atomic64_set(&zdev->unmapped_pages, 0);
|
||||
|
||||
fib.fmb_addr = virt_to_phys(zdev->fmb);
|
||||
fib.gd = zdev->gisa;
|
||||
cc = zpci_mod_fc(req, &fib, &status);
|
||||
if (cc) {
|
||||
kmem_cache_free(zdev_fmb_cache, zdev->fmb);
|
||||
|
@ -178,6 +189,8 @@ int zpci_fmb_disable_device(struct zpci_dev *zdev)
|
|||
if (!zdev->fmb)
|
||||
return -EINVAL;
|
||||
|
||||
fib.gd = zdev->gisa;
|
||||
|
||||
/* Function measurement is disabled if fmb address is zero */
|
||||
cc = zpci_mod_fc(req, &fib, &status);
|
||||
if (cc == 3) /* Function already gone. */
|
||||
|
@ -700,6 +713,7 @@ int zpci_enable_device(struct zpci_dev *zdev)
|
|||
zpci_update_fh(zdev, fh);
|
||||
return rc;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(zpci_enable_device);
|
||||
|
||||
int zpci_disable_device(struct zpci_dev *zdev)
|
||||
{
|
||||
|
@ -723,6 +737,7 @@ int zpci_disable_device(struct zpci_dev *zdev)
|
|||
}
|
||||
return rc;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(zpci_disable_device);
|
||||
|
||||
/**
|
||||
* zpci_hot_reset_device - perform a reset of the given zPCI function
|
||||
|
@ -816,6 +831,7 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state)
|
|||
|
||||
kref_init(&zdev->kref);
|
||||
mutex_init(&zdev->lock);
|
||||
mutex_init(&zdev->kzdev_lock);
|
||||
|
||||
rc = zpci_init_iommu(zdev);
|
||||
if (rc)
|
||||
|
|
|
@ -106,6 +106,8 @@ static void clp_store_query_pci_fngrp(struct zpci_dev *zdev,
|
|||
zdev->max_msi = response->noi;
|
||||
zdev->fmb_update = response->mui;
|
||||
zdev->version = response->version;
|
||||
zdev->maxstbl = response->maxstbl;
|
||||
zdev->dtsm = response->dtsm;
|
||||
|
||||
switch (response->version) {
|
||||
case 1:
|
||||
|
@ -229,12 +231,16 @@ static int clp_set_pci_fn(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as, u8 comma
|
|||
{
|
||||
struct clp_req_rsp_set_pci *rrb;
|
||||
int rc, retries = 100;
|
||||
u32 gisa = 0;
|
||||
|
||||
*fh = 0;
|
||||
rrb = clp_alloc_block(GFP_KERNEL);
|
||||
if (!rrb)
|
||||
return -ENOMEM;
|
||||
|
||||
if (command != CLP_SET_DISABLE_PCI_FN)
|
||||
gisa = zdev->gisa;
|
||||
|
||||
do {
|
||||
memset(rrb, 0, sizeof(*rrb));
|
||||
rrb->request.hdr.len = sizeof(rrb->request);
|
||||
|
@ -243,6 +249,7 @@ static int clp_set_pci_fn(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as, u8 comma
|
|||
rrb->request.fh = zdev->fh;
|
||||
rrb->request.oc = command;
|
||||
rrb->request.ndas = nr_dma_as;
|
||||
rrb->request.gisa = gisa;
|
||||
|
||||
rc = clp_req(rrb, CLP_LPS_PCI);
|
||||
if (rrb->response.hdr.rsp == CLP_RC_SETPCIFN_BUSY) {
|
||||
|
|
|
@ -92,6 +92,7 @@ u8 zpci_mod_fc(u64 req, struct zpci_fib *fib, u8 *status)
|
|||
|
||||
return cc;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(zpci_mod_fc);
|
||||
|
||||
/* Refresh PCI Translations */
|
||||
static inline u8 __rpcit(u64 fn, u64 addr, u64 range, u8 *status)
|
||||
|
@ -138,7 +139,7 @@ int zpci_refresh_trans(u64 fn, u64 addr, u64 range)
|
|||
}
|
||||
|
||||
/* Set Interruption Controls */
|
||||
int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib)
|
||||
int zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib)
|
||||
{
|
||||
if (!test_facility(72))
|
||||
return -EIO;
|
||||
|
@ -149,6 +150,7 @@ int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib)
|
|||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(zpci_set_irq_ctrl);
|
||||
|
||||
/* PCI Load */
|
||||
static inline int ____pcilg(u64 *data, u64 req, u64 offset, u8 *status)
|
||||
|
|
|
@ -11,16 +11,10 @@
|
|||
|
||||
#include <asm/isc.h>
|
||||
#include <asm/airq.h>
|
||||
#include <asm/tpi.h>
|
||||
|
||||
static enum {FLOATING, DIRECTED} irq_delivery;
|
||||
|
||||
#define SIC_IRQ_MODE_ALL 0
|
||||
#define SIC_IRQ_MODE_SINGLE 1
|
||||
#define SIC_IRQ_MODE_DIRECT 4
|
||||
#define SIC_IRQ_MODE_D_ALL 16
|
||||
#define SIC_IRQ_MODE_D_SINGLE 17
|
||||
#define SIC_IRQ_MODE_SET_CPU 18
|
||||
|
||||
/*
|
||||
* summary bit vector
|
||||
* FLOATING - summary bit per function
|
||||
|
@ -49,6 +43,7 @@ static int zpci_set_airq(struct zpci_dev *zdev)
|
|||
fib.fmt0.aibvo = 0; /* each zdev has its own interrupt vector */
|
||||
fib.fmt0.aisb = virt_to_phys(zpci_sbv->vector) + (zdev->aisb / 64) * 8;
|
||||
fib.fmt0.aisbo = zdev->aisb & 63;
|
||||
fib.gd = zdev->gisa;
|
||||
|
||||
return zpci_mod_fc(req, &fib, &status) ? -EIO : 0;
|
||||
}
|
||||
|
@ -60,6 +55,8 @@ static int zpci_clear_airq(struct zpci_dev *zdev)
|
|||
struct zpci_fib fib = {0};
|
||||
u8 cc, status;
|
||||
|
||||
fib.gd = zdev->gisa;
|
||||
|
||||
cc = zpci_mod_fc(req, &fib, &status);
|
||||
if (cc == 3 || (cc == 1 && status == 24))
|
||||
/* Function already gone or IRQs already deregistered. */
|
||||
|
@ -78,6 +75,7 @@ static int zpci_set_directed_irq(struct zpci_dev *zdev)
|
|||
fib.fmt = 1;
|
||||
fib.fmt1.noi = zdev->msi_nr_irqs;
|
||||
fib.fmt1.dibvo = zdev->msi_first_bit;
|
||||
fib.gd = zdev->gisa;
|
||||
|
||||
return zpci_mod_fc(req, &fib, &status) ? -EIO : 0;
|
||||
}
|
||||
|
@ -90,6 +88,7 @@ static int zpci_clear_directed_irq(struct zpci_dev *zdev)
|
|||
u8 cc, status;
|
||||
|
||||
fib.fmt = 1;
|
||||
fib.gd = zdev->gisa;
|
||||
cc = zpci_mod_fc(req, &fib, &status);
|
||||
if (cc == 3 || (cc == 1 && status == 24))
|
||||
/* Function already gone or IRQs already deregistered. */
|
||||
|
@ -153,6 +152,7 @@ static struct irq_chip zpci_irq_chip = {
|
|||
static void zpci_handle_cpu_local_irq(bool rescan)
|
||||
{
|
||||
struct airq_iv *dibv = zpci_ibv[smp_processor_id()];
|
||||
union zpci_sic_iib iib = {{0}};
|
||||
unsigned long bit;
|
||||
int irqs_on = 0;
|
||||
|
||||
|
@ -164,7 +164,7 @@ static void zpci_handle_cpu_local_irq(bool rescan)
|
|||
/* End of second scan with interrupts on. */
|
||||
break;
|
||||
/* First scan complete, reenable interrupts. */
|
||||
if (zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC))
|
||||
if (zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC, &iib))
|
||||
break;
|
||||
bit = 0;
|
||||
continue;
|
||||
|
@ -192,6 +192,7 @@ static void zpci_handle_remote_irq(void *data)
|
|||
static void zpci_handle_fallback_irq(void)
|
||||
{
|
||||
struct cpu_irq_data *cpu_data;
|
||||
union zpci_sic_iib iib = {{0}};
|
||||
unsigned long cpu;
|
||||
int irqs_on = 0;
|
||||
|
||||
|
@ -202,7 +203,7 @@ static void zpci_handle_fallback_irq(void)
|
|||
/* End of second scan with interrupts on. */
|
||||
break;
|
||||
/* First scan complete, reenable interrupts. */
|
||||
if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC))
|
||||
if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib))
|
||||
break;
|
||||
cpu = 0;
|
||||
continue;
|
||||
|
@ -216,8 +217,11 @@ static void zpci_handle_fallback_irq(void)
|
|||
}
|
||||
}
|
||||
|
||||
static void zpci_directed_irq_handler(struct airq_struct *airq, bool floating)
|
||||
static void zpci_directed_irq_handler(struct airq_struct *airq,
|
||||
struct tpi_info *tpi_info)
|
||||
{
|
||||
bool floating = !tpi_info->directed_irq;
|
||||
|
||||
if (floating) {
|
||||
inc_irq_stat(IRQIO_PCF);
|
||||
zpci_handle_fallback_irq();
|
||||
|
@ -227,8 +231,10 @@ static void zpci_directed_irq_handler(struct airq_struct *airq, bool floating)
|
|||
}
|
||||
}
|
||||
|
||||
static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating)
|
||||
static void zpci_floating_irq_handler(struct airq_struct *airq,
|
||||
struct tpi_info *tpi_info)
|
||||
{
|
||||
union zpci_sic_iib iib = {{0}};
|
||||
unsigned long si, ai;
|
||||
struct airq_iv *aibv;
|
||||
int irqs_on = 0;
|
||||
|
@ -242,7 +248,7 @@ static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating)
|
|||
/* End of second scan with interrupts on. */
|
||||
break;
|
||||
/* First scan complete, reenable interrupts. */
|
||||
if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC))
|
||||
if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib))
|
||||
break;
|
||||
si = 0;
|
||||
continue;
|
||||
|
@ -291,7 +297,7 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
|
|||
zdev->aisb = bit;
|
||||
|
||||
/* Create adapter interrupt vector */
|
||||
zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK);
|
||||
zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK, NULL);
|
||||
if (!zdev->aibv)
|
||||
return -ENOMEM;
|
||||
|
||||
|
@ -402,11 +408,12 @@ static struct airq_struct zpci_airq = {
|
|||
static void __init cpu_enable_directed_irq(void *unused)
|
||||
{
|
||||
union zpci_sic_iib iib = {{0}};
|
||||
union zpci_sic_iib ziib = {{0}};
|
||||
|
||||
iib.cdiib.dibv_addr = (u64) zpci_ibv[smp_processor_id()]->vector;
|
||||
|
||||
__zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib);
|
||||
zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC);
|
||||
zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib);
|
||||
zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC, &ziib);
|
||||
}
|
||||
|
||||
static int __init zpci_directed_irq_init(void)
|
||||
|
@ -414,14 +421,14 @@ static int __init zpci_directed_irq_init(void)
|
|||
union zpci_sic_iib iib = {{0}};
|
||||
unsigned int cpu;
|
||||
|
||||
zpci_sbv = airq_iv_create(num_possible_cpus(), 0);
|
||||
zpci_sbv = airq_iv_create(num_possible_cpus(), 0, NULL);
|
||||
if (!zpci_sbv)
|
||||
return -ENOMEM;
|
||||
|
||||
iib.diib.isc = PCI_ISC;
|
||||
iib.diib.nr_cpus = num_possible_cpus();
|
||||
iib.diib.disb_addr = virt_to_phys(zpci_sbv->vector);
|
||||
__zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib);
|
||||
zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib);
|
||||
|
||||
zpci_ibv = kcalloc(num_possible_cpus(), sizeof(*zpci_ibv),
|
||||
GFP_KERNEL);
|
||||
|
@ -436,7 +443,7 @@ static int __init zpci_directed_irq_init(void)
|
|||
zpci_ibv[cpu] = airq_iv_create(cache_line_size() * BITS_PER_BYTE,
|
||||
AIRQ_IV_DATA |
|
||||
AIRQ_IV_CACHELINE |
|
||||
(!cpu ? AIRQ_IV_ALLOC : 0));
|
||||
(!cpu ? AIRQ_IV_ALLOC : 0), NULL);
|
||||
if (!zpci_ibv[cpu])
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
@ -453,7 +460,7 @@ static int __init zpci_floating_irq_init(void)
|
|||
if (!zpci_ibv)
|
||||
return -ENOMEM;
|
||||
|
||||
zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC);
|
||||
zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC, NULL);
|
||||
if (!zpci_sbv)
|
||||
goto out_free;
|
||||
|
||||
|
@ -466,6 +473,7 @@ out_free:
|
|||
|
||||
int __init zpci_irq_init(void)
|
||||
{
|
||||
union zpci_sic_iib iib = {{0}};
|
||||
int rc;
|
||||
|
||||
irq_delivery = sclp.has_dirq ? DIRECTED : FLOATING;
|
||||
|
@ -497,7 +505,7 @@ int __init zpci_irq_init(void)
|
|||
* Enable floating IRQs (with suppression after one IRQ). When using
|
||||
* directed IRQs this enables the fallback path.
|
||||
*/
|
||||
zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC);
|
||||
zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib);
|
||||
|
||||
return 0;
|
||||
out_airq:
|
||||
|
|
|
@ -111,6 +111,7 @@ static struct facility_def facility_defs[] = {
|
|||
193, /* bear enhancement facility */
|
||||
194, /* rdp enhancement facility */
|
||||
196, /* processor activity instrumentation facility */
|
||||
197, /* processor activity instrumentation extension 1 */
|
||||
-1 /* END */
|
||||
}
|
||||
},
|
||||
|
|
|
@ -693,9 +693,9 @@ void x86_pmu_disable_all(void)
|
|||
}
|
||||
}
|
||||
|
||||
struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
|
||||
struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data)
|
||||
{
|
||||
return static_call(x86_pmu_guest_get_msrs)(nr);
|
||||
return static_call(x86_pmu_guest_get_msrs)(nr, data);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
|
||||
|
||||
|
@ -2103,14 +2103,15 @@ static int __init init_hw_perf_events(void)
|
|||
}
|
||||
if (err != 0) {
|
||||
pr_cont("no PMU driver, software events only.\n");
|
||||
return 0;
|
||||
err = 0;
|
||||
goto out_bad_pmu;
|
||||
}
|
||||
|
||||
pmu_check_apic();
|
||||
|
||||
/* sanity check that the hardware exists or is emulated */
|
||||
if (!check_hw_exists(&pmu, x86_pmu.num_counters, x86_pmu.num_counters_fixed))
|
||||
return 0;
|
||||
goto out_bad_pmu;
|
||||
|
||||
pr_cont("%s PMU driver.\n", x86_pmu.name);
|
||||
|
||||
|
@ -2219,6 +2220,8 @@ out1:
|
|||
cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING);
|
||||
out:
|
||||
cpuhp_remove_state(CPUHP_PERF_X86_PREPARE);
|
||||
out_bad_pmu:
|
||||
memset(&x86_pmu, 0, sizeof(x86_pmu));
|
||||
return err;
|
||||
}
|
||||
early_initcall(init_hw_perf_events);
|
||||
|
@ -2990,6 +2993,11 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
|
|||
|
||||
void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
|
||||
{
|
||||
if (!x86_pmu_initialized()) {
|
||||
memset(cap, 0, sizeof(*cap));
|
||||
return;
|
||||
}
|
||||
|
||||
cap->version = x86_pmu.version;
|
||||
/*
|
||||
* KVM doesn't support the hybrid PMU yet.
|
||||
|
@ -3002,5 +3010,17 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
|
|||
cap->bit_width_fixed = x86_pmu.cntval_bits;
|
||||
cap->events_mask = (unsigned int)x86_pmu.events_maskl;
|
||||
cap->events_mask_len = x86_pmu.events_mask_len;
|
||||
cap->pebs_ept = x86_pmu.pebs_ept;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
|
||||
|
||||
u64 perf_get_hw_event_config(int hw_event)
|
||||
{
|
||||
int max = x86_pmu.max_events;
|
||||
|
||||
if (hw_event < max)
|
||||
return x86_pmu.event_map(array_index_nospec(hw_event, max));
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(perf_get_hw_event_config);
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
#include <linux/slab.h>
|
||||
#include <linux/export.h>
|
||||
#include <linux/nmi.h>
|
||||
#include <linux/kvm_host.h>
|
||||
|
||||
#include <asm/cpufeature.h>
|
||||
#include <asm/hardirq.h>
|
||||
|
@ -2852,6 +2853,47 @@ static void intel_pmu_reset(void)
|
|||
local_irq_restore(flags);
|
||||
}
|
||||
|
||||
/*
 * We may be running with guest PEBS events created by KVM, and the
 * PEBS records are logged into the guest's DS and invisible to the host.
 *
 * In the case of guest PEBS overflow, we only trigger a fake event
 * to emulate the PEBS overflow PMI for guest PEBS counters in KVM.
 * On the next vm-entry the guest will check the guest DS area and
 * read the guest PEBS records.
 *
 * The contents and other behavior of the guest event do not matter.
 */
|
||||
static void x86_pmu_handle_guest_pebs(struct pt_regs *regs,
|
||||
struct perf_sample_data *data)
|
||||
{
|
||||
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
||||
u64 guest_pebs_idxs = cpuc->pebs_enabled & ~cpuc->intel_ctrl_host_mask;
|
||||
struct perf_event *event = NULL;
|
||||
int bit;
|
||||
|
||||
if (!unlikely(perf_guest_state()))
|
||||
return;
|
||||
|
||||
if (!x86_pmu.pebs_ept || !x86_pmu.pebs_active ||
|
||||
!guest_pebs_idxs)
|
||||
return;
|
||||
|
||||
for_each_set_bit(bit, (unsigned long *)&guest_pebs_idxs,
|
||||
INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed) {
|
||||
event = cpuc->events[bit];
|
||||
if (!event->attr.precise_ip)
|
||||
continue;
|
||||
|
||||
perf_sample_data_init(data, 0, event->hw.last_period);
|
||||
if (perf_event_overflow(event, data, regs))
|
||||
x86_pmu_stop(event, 0);
|
||||
|
||||
/* Injecting one fake event is enough. */
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static int handle_pmi_common(struct pt_regs *regs, u64 status)
|
||||
{
|
||||
struct perf_sample_data data;
|
||||
|
@ -2891,10 +2933,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
|
|||
* counters from the GLOBAL_STATUS mask and we always process PEBS
|
||||
* events via drain_pebs().
|
||||
*/
|
||||
if (x86_pmu.flags & PMU_FL_PEBS_ALL)
|
||||
status &= ~cpuc->pebs_enabled;
|
||||
else
|
||||
status &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
|
||||
status &= ~(cpuc->pebs_enabled & x86_pmu.pebs_capable);
|
||||
|
||||
/*
|
||||
* PEBS overflow sets bit 62 in the global status register
|
||||
|
@ -2903,6 +2942,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
|
|||
u64 pebs_enabled = cpuc->pebs_enabled;
|
||||
|
||||
handled++;
|
||||
x86_pmu_handle_guest_pebs(regs, &data);
|
||||
x86_pmu.drain_pebs(regs, &data);
|
||||
status &= intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI;
|
||||
|
||||
|
@ -3930,40 +3970,98 @@ static int intel_pmu_hw_config(struct perf_event *event)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
/*
 * Currently, the only caller of this function is atomic_switch_perf_msrs().
 * The host perf context helps to prepare the values of the real hardware for
 * a set of MSRs that need to be switched atomically in a VMX transaction.
 *
 * For example, the pseudocode needed to add a new MSR should look like:
 *
 * arr[(*nr)++] = (struct perf_guest_switch_msr){
 *	.msr = the hardware msr address,
 *	.host = the value the hardware has when it doesn't run a guest,
 *	.guest = the value the hardware has when it runs a guest,
 * };
 *
 * These values have nothing to do with the emulated values the guest sees
 * when it uses {RD,WR}MSR, which should be handled by the KVM context,
 * specifically in intel_pmu_{get,set}_msr().
 */
static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data)
|
||||
{
|
||||
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
||||
struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
|
||||
struct kvm_pmu *kvm_pmu = (struct kvm_pmu *)data;
|
||||
u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl);
|
||||
u64 pebs_mask = cpuc->pebs_enabled & x86_pmu.pebs_capable;
|
||||
int global_ctrl, pebs_enable;
|
||||
|
||||
arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
|
||||
arr[0].host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
|
||||
arr[0].guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask;
|
||||
if (x86_pmu.flags & PMU_FL_PEBS_ALL)
|
||||
arr[0].guest &= ~cpuc->pebs_enabled;
|
||||
else
|
||||
arr[0].guest &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
|
||||
*nr = 1;
|
||||
*nr = 0;
|
||||
global_ctrl = (*nr)++;
|
||||
arr[global_ctrl] = (struct perf_guest_switch_msr){
|
||||
.msr = MSR_CORE_PERF_GLOBAL_CTRL,
|
||||
.host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask,
|
||||
.guest = intel_ctrl & (~cpuc->intel_ctrl_host_mask | ~pebs_mask),
|
||||
};
|
||||
|
||||
if (x86_pmu.pebs && x86_pmu.pebs_no_isolation) {
|
||||
/*
|
||||
* If PMU counter has PEBS enabled it is not enough to
|
||||
* disable counter on a guest entry since PEBS memory
|
||||
* write can overshoot guest entry and corrupt guest
|
||||
* memory. Disabling PEBS solves the problem.
|
||||
*
|
||||
* Don't do this if the CPU already enforces it.
|
||||
*/
|
||||
arr[1].msr = MSR_IA32_PEBS_ENABLE;
|
||||
arr[1].host = cpuc->pebs_enabled;
|
||||
arr[1].guest = 0;
|
||||
*nr = 2;
|
||||
if (!x86_pmu.pebs)
|
||||
return arr;
|
||||
|
||||
/*
|
||||
* If PMU counter has PEBS enabled it is not enough to
|
||||
* disable counter on a guest entry since PEBS memory
|
||||
* write can overshoot guest entry and corrupt guest
|
||||
* memory. Disabling PEBS solves the problem.
|
||||
*
|
||||
* Don't do this if the CPU already enforces it.
|
||||
*/
|
||||
if (x86_pmu.pebs_no_isolation) {
|
||||
arr[(*nr)++] = (struct perf_guest_switch_msr){
|
||||
.msr = MSR_IA32_PEBS_ENABLE,
|
||||
.host = cpuc->pebs_enabled,
|
||||
.guest = 0,
|
||||
};
|
||||
return arr;
|
||||
}
|
||||
|
||||
if (!kvm_pmu || !x86_pmu.pebs_ept)
|
||||
return arr;
|
||||
|
||||
arr[(*nr)++] = (struct perf_guest_switch_msr){
|
||||
.msr = MSR_IA32_DS_AREA,
|
||||
.host = (unsigned long)cpuc->ds,
|
||||
.guest = kvm_pmu->ds_area,
|
||||
};
|
||||
|
||||
if (x86_pmu.intel_cap.pebs_baseline) {
|
||||
arr[(*nr)++] = (struct perf_guest_switch_msr){
|
||||
.msr = MSR_PEBS_DATA_CFG,
|
||||
.host = cpuc->pebs_data_cfg,
|
||||
.guest = kvm_pmu->pebs_data_cfg,
|
||||
};
|
||||
}
|
||||
|
||||
pebs_enable = (*nr)++;
|
||||
arr[pebs_enable] = (struct perf_guest_switch_msr){
|
||||
.msr = MSR_IA32_PEBS_ENABLE,
|
||||
.host = cpuc->pebs_enabled & ~cpuc->intel_ctrl_guest_mask,
|
||||
.guest = pebs_mask & ~cpuc->intel_ctrl_host_mask,
|
||||
};
|
||||
|
||||
if (arr[pebs_enable].host) {
|
||||
/* Disable guest PEBS if host PEBS is enabled. */
|
||||
arr[pebs_enable].guest = 0;
|
||||
} else {
|
||||
/* Disable guest PEBS for cross-mapped PEBS counters. */
|
||||
arr[pebs_enable].guest &= ~kvm_pmu->host_cross_mapped_mask;
|
||||
/* Set hw GLOBAL_CTRL bits for PEBS counter when it runs for guest */
|
||||
arr[global_ctrl].guest |= arr[pebs_enable].guest;
|
||||
}
|
||||
|
||||
return arr;
|
||||
}
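As a concrete illustration of the pattern described in the comment above intel_guest_get_msrs(), a new atomically-switched MSR would be appended exactly the way the DS_AREA and PEBS_ENABLE entries are built. The MSR name and values below are placeholders for illustration, not part of this change:

	/* Sketch only: appending one more guest/host switched MSR entry. */
	arr[(*nr)++] = (struct perf_guest_switch_msr){
		.msr = MSR_EXAMPLE_NEW,		/* placeholder MSR address */
		.host = example_host_value,	/* hardware value while the host runs */
		.guest = example_guest_value,	/* hardware value while the guest runs */
	};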
|
||||
|
||||
static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr)
|
||||
static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr, void *data)
|
||||
{
|
||||
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
||||
struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
|
||||
|
@ -5650,6 +5748,7 @@ __init int intel_pmu_init(void)
|
|||
x86_pmu.events_mask_len = eax.split.mask_length;
|
||||
|
||||
x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
|
||||
x86_pmu.pebs_capable = PEBS_COUNTER_MASK;
|
||||
|
||||
/*
|
||||
* Quirk: v2 perfmon does not report fixed-purpose events, so
|
||||
|
@ -5834,6 +5933,7 @@ __init int intel_pmu_init(void)
|
|||
x86_pmu.pebs_aliases = NULL;
|
||||
x86_pmu.pebs_prec_dist = true;
|
||||
x86_pmu.lbr_pt_coexist = true;
|
||||
x86_pmu.pebs_capable = ~0ULL;
|
||||
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
||||
x86_pmu.flags |= PMU_FL_PEBS_ALL;
|
||||
x86_pmu.get_event_constraints = glp_get_event_constraints;
|
||||
|
@ -6138,6 +6238,7 @@ __init int intel_pmu_init(void)
|
|||
|
||||
case INTEL_FAM6_ICELAKE_X:
|
||||
case INTEL_FAM6_ICELAKE_D:
|
||||
x86_pmu.pebs_ept = 1;
|
||||
pmem = true;
|
||||
fallthrough;
|
||||
case INTEL_FAM6_ICELAKE_L:
|
||||
|
@ -6190,6 +6291,7 @@ __init int intel_pmu_init(void)
|
|||
x86_pmu.pebs_aliases = NULL;
|
||||
x86_pmu.pebs_prec_dist = true;
|
||||
x86_pmu.pebs_block = true;
|
||||
x86_pmu.pebs_capable = ~0ULL;
|
||||
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
||||
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
|
||||
x86_pmu.flags |= PMU_FL_PEBS_ALL;
|
||||
|
@ -6235,6 +6337,7 @@ __init int intel_pmu_init(void)
|
|||
x86_pmu.pebs_aliases = NULL;
|
||||
x86_pmu.pebs_prec_dist = true;
|
||||
x86_pmu.pebs_block = true;
|
||||
x86_pmu.pebs_capable = ~0ULL;
|
||||
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
|
||||
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
|
||||
x86_pmu.flags |= PMU_FL_PEBS_ALL;
|
||||
|
@ -6398,8 +6501,7 @@ __init int intel_pmu_init(void)
|
|||
x86_pmu.intel_ctrl);
|
||||
/*
|
||||
* Access LBR MSR may cause #GP under certain circumstances.
|
||||
* E.g. KVM doesn't support LBR MSR
|
||||
* Check all LBT MSR here.
|
||||
* Check all LBR MSR here.
|
||||
* Disable LBR access if any LBR MSRs can not be accessed.
|
||||
*/
|
||||
if (x86_pmu.lbr_tos && !check_msr(x86_pmu.lbr_tos, 0x3UL))
|
||||
|
|
|
@ -818,7 +818,8 @@ struct x86_pmu {
|
|||
pebs_prec_dist :1,
|
||||
pebs_no_tlb :1,
|
||||
pebs_no_isolation :1,
|
||||
pebs_block :1;
|
||||
pebs_block :1,
|
||||
pebs_ept :1;
|
||||
int pebs_record_size;
|
||||
int pebs_buffer_size;
|
||||
int max_pebs_events;
|
||||
|
@ -827,6 +828,7 @@ struct x86_pmu {
|
|||
void (*pebs_aliases)(struct perf_event *event);
|
||||
unsigned long large_pebs_flags;
|
||||
u64 rtm_abort_event;
|
||||
u64 pebs_capable;
|
||||
|
||||
/*
|
||||
* Intel LBR
|
||||
|
@ -902,7 +904,7 @@ struct x86_pmu {
|
|||
/*
|
||||
* Intel host/guest support (KVM)
|
||||
*/
|
||||
struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
|
||||
struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr, void *data);
|
||||
|
||||
/*
|
||||
* Check period value for PERF_EVENT_IOC_PERIOD ioctl.
|
||||
|
|
|
@ -46,7 +46,7 @@ static void hv_apic_icr_write(u32 low, u32 id)
|
|||
{
|
||||
u64 reg_val;
|
||||
|
||||
reg_val = SET_APIC_DEST_FIELD(id);
|
||||
reg_val = SET_XAPIC_DEST_FIELD(id);
|
||||
reg_val = reg_val << 32;
|
||||
reg_val |= low;
|
||||
|
||||
|
|
|
@ -89,8 +89,8 @@
|
|||
#define APIC_DM_EXTINT 0x00700
|
||||
#define APIC_VECTOR_MASK 0x000FF
|
||||
#define APIC_ICR2 0x310
|
||||
#define GET_APIC_DEST_FIELD(x) (((x) >> 24) & 0xFF)
|
||||
#define SET_APIC_DEST_FIELD(x) ((x) << 24)
|
||||
#define GET_XAPIC_DEST_FIELD(x) (((x) >> 24) & 0xFF)
|
||||
#define SET_XAPIC_DEST_FIELD(x) ((x) << 24)
|
||||
#define APIC_LVTT 0x320
|
||||
#define APIC_LVTTHMR 0x330
|
||||
#define APIC_LVTPC 0x340
|
||||
|
|
|
@ -353,6 +353,7 @@
|
|||
#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
|
||||
#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
|
||||
#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
|
||||
#define X86_FEATURE_X2AVIC (15*32+18) /* Virtual x2apic */
|
||||
#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */
|
||||
#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */
|
||||
|
||||
|
|
|
@ -21,6 +21,7 @@ KVM_X86_OP(has_emulated_msr)
|
|||
KVM_X86_OP(vcpu_after_set_cpuid)
|
||||
KVM_X86_OP(vm_init)
|
||||
KVM_X86_OP_OPTIONAL(vm_destroy)
|
||||
KVM_X86_OP_OPTIONAL_RET0(vcpu_precreate)
|
||||
KVM_X86_OP(vcpu_create)
|
||||
KVM_X86_OP(vcpu_free)
|
||||
KVM_X86_OP(vcpu_reset)
|
||||
|
@ -87,7 +88,7 @@ KVM_X86_OP(deliver_interrupt)
|
|||
KVM_X86_OP_OPTIONAL(sync_pir_to_irr)
|
||||
KVM_X86_OP_OPTIONAL_RET0(set_tss_addr)
|
||||
KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr)
|
||||
KVM_X86_OP(get_mt_mask)
|
||||
KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
|
||||
KVM_X86_OP(load_mmu_pgd)
|
||||
KVM_X86_OP(has_wbinvd_exit)
|
||||
KVM_X86_OP(get_l2_tsc_offset)
|
||||
|
|
|
@ -12,7 +12,7 @@ BUILD_BUG_ON(1)
|
|||
* a NULL definition, for example if "static_call_cond()" will be used
|
||||
* at the call sites.
|
||||
*/
|
||||
KVM_X86_PMU_OP(pmc_perf_hw_id)
|
||||
KVM_X86_PMU_OP(hw_event_available)
|
||||
KVM_X86_PMU_OP(pmc_is_enabled)
|
||||
KVM_X86_PMU_OP(pmc_idx_to_pmc)
|
||||
KVM_X86_PMU_OP(rdpmc_ecx_to_pmc)
|
||||
|
|
|
@ -65,6 +65,9 @@
|
|||
#define KVM_BUS_LOCK_DETECTION_VALID_MODE (KVM_BUS_LOCK_DETECTION_OFF | \
|
||||
KVM_BUS_LOCK_DETECTION_EXIT)
|
||||
|
||||
#define KVM_X86_NOTIFY_VMEXIT_VALID_BITS (KVM_X86_NOTIFY_VMEXIT_ENABLED | \
|
||||
KVM_X86_NOTIFY_VMEXIT_USER)
|
||||
|
||||
/* x86-specific vcpu->requests bit members */
|
||||
#define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0)
|
||||
#define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1)
|
||||
|
@ -126,7 +129,6 @@
|
|||
#define INVALID_PAGE (~(hpa_t)0)
|
||||
#define VALID_PAGE(x) ((x) != INVALID_PAGE)
|
||||
|
||||
#define UNMAPPED_GVA (~(gpa_t)0)
|
||||
#define INVALID_GPA (~(gpa_t)0)
|
||||
|
||||
/* KVM Hugepage definitions for x86 */
|
||||
|
@ -505,6 +507,7 @@ struct kvm_pmu {
|
|||
unsigned nr_arch_fixed_counters;
|
||||
unsigned available_event_types;
|
||||
u64 fixed_ctr_ctrl;
|
||||
u64 fixed_ctr_ctrl_mask;
|
||||
u64 global_ctrl;
|
||||
u64 global_status;
|
||||
u64 counter_bitmask[2];
|
||||
|
@ -520,6 +523,21 @@ struct kvm_pmu {
|
|||
DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX);
|
||||
DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX);
|
||||
|
||||
u64 ds_area;
|
||||
u64 pebs_enable;
|
||||
u64 pebs_enable_mask;
|
||||
u64 pebs_data_cfg;
|
||||
u64 pebs_data_cfg_mask;
|
||||
|
||||
/*
|
||||
* If a guest counter is cross-mapped to host counter with different
|
||||
* index, its PEBS capability will be temporarily disabled.
|
||||
*
|
||||
* The user should make sure that this mask is updated
|
||||
* after disabling interrupts and before perf_guest_get_msrs();
|
||||
*/
|
||||
u64 host_cross_mapped_mask;
|
||||
|
||||
/*
|
||||
* The gate to release perf_events not marked in
|
||||
* pmc_in_use only once in a vcpu time slice.
|
||||
|
@ -644,7 +662,6 @@ struct kvm_vcpu_arch {
|
|||
u64 efer;
|
||||
u64 apic_base;
|
||||
struct kvm_lapic *apic; /* kernel irqchip context */
|
||||
bool apicv_active;
|
||||
bool load_eoi_exitmap_pending;
|
||||
DECLARE_BITMAP(ioapic_handled_vectors, 256);
|
||||
unsigned long apic_attention;
|
||||
|
@ -695,7 +712,7 @@ struct kvm_vcpu_arch {
|
|||
|
||||
struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
|
||||
struct kvm_mmu_memory_cache mmu_shadow_page_cache;
|
||||
struct kvm_mmu_memory_cache mmu_gfn_array_cache;
|
||||
struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
|
||||
struct kvm_mmu_memory_cache mmu_page_header_cache;
|
||||
|
||||
/*
|
||||
|
@ -808,6 +825,7 @@ struct kvm_vcpu_arch {
|
|||
u64 mcg_ctl;
|
||||
u64 mcg_ext_ctl;
|
||||
u64 *mce_banks;
|
||||
u64 *mci_ctl2_banks;
|
||||
|
||||
/* Cache MMIO info */
|
||||
u64 mmio_gva;
|
||||
|
@ -1110,11 +1128,6 @@ enum kvm_apicv_inhibit {
|
|||
*/
|
||||
APICV_INHIBIT_REASON_PIT_REINJ,
|
||||
|
||||
/*
|
||||
* AVIC is inhibited because the guest has x2apic in its CPUID.
|
||||
*/
|
||||
APICV_INHIBIT_REASON_X2APIC,
|
||||
|
||||
/*
|
||||
* AVIC is disabled because SEV doesn't support it.
|
||||
*/
|
||||
|
@ -1222,8 +1235,13 @@ struct kvm_arch {
|
|||
bool guest_can_read_msr_platform_info;
|
||||
bool exception_payload_enabled;
|
||||
|
||||
bool triple_fault_event;
|
||||
|
||||
bool bus_lock_detection_enabled;
|
||||
bool enable_pmu;
|
||||
|
||||
u32 notify_window;
|
||||
u32 notify_vmexit_flags;
|
||||
/*
|
||||
* If exit_on_emulation_error is set, and the in-kernel instruction
|
||||
* emulator fails to emulate an instruction, allow userspace
|
||||
|
@ -1307,6 +1325,36 @@ struct kvm_arch {
|
|||
hpa_t hv_root_tdp;
|
||||
spinlock_t hv_root_tdp_lock;
|
||||
#endif
|
||||
/*
|
||||
* VM-scope maximum vCPU ID. Used to determine the size of structures
|
||||
* that increase along with the maximum vCPU ID, in which case, using
|
||||
* the global KVM_MAX_VCPU_IDS may lead to significant memory waste.
|
||||
*/
|
||||
u32 max_vcpu_ids;
|
||||
|
||||
bool disable_nx_huge_pages;
|
||||
|
||||
/*
|
||||
* Memory caches used to allocate shadow pages when performing eager
|
||||
* page splitting. No need for a shadowed_info_cache since eager page
|
||||
* splitting only allocates direct shadow pages.
|
||||
*
|
||||
* Protected by kvm->slots_lock.
|
||||
*/
|
||||
struct kvm_mmu_memory_cache split_shadow_page_cache;
|
||||
struct kvm_mmu_memory_cache split_page_header_cache;
|
||||
|
||||
/*
|
||||
* Memory cache used to allocate pte_list_desc structs while splitting
|
||||
* huge pages. In the worst case, to split one huge page, 512
|
||||
* pte_list_desc structs are needed to add each lower level leaf sptep
|
||||
* to the rmap plus 1 to extend the parent_ptes rmap of the lower level
|
||||
* page table.
|
||||
*
|
||||
* Protected by kvm->slots_lock.
|
||||
*/
|
||||
#define SPLIT_DESC_CACHE_MIN_NR_OBJECTS (SPTE_ENT_PER_PAGE + 1)
|
||||
struct kvm_mmu_memory_cache split_desc_cache;
|
||||
};
|
||||
|
||||
struct kvm_vm_stat {
|
||||
|
@ -1367,6 +1415,7 @@ struct kvm_vcpu_stat {
|
|||
u64 preemption_reported;
|
||||
u64 preemption_other;
|
||||
u64 guest_mode;
|
||||
u64 notify_window_exits;
|
||||
};
|
||||
|
||||
struct x86_instruction_info;
|
||||
|
@ -1407,6 +1456,7 @@ struct kvm_x86_ops {
|
|||
void (*vm_destroy)(struct kvm *kvm);
|
||||
|
||||
/* Create, but do not attach this VCPU */
|
||||
int (*vcpu_precreate)(struct kvm *kvm);
|
||||
int (*vcpu_create)(struct kvm_vcpu *vcpu);
|
||||
void (*vcpu_free)(struct kvm_vcpu *vcpu);
|
||||
void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
|
||||
|
@ -1471,7 +1521,7 @@ struct kvm_x86_ops {
|
|||
u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu);
|
||||
void (*patch_hypercall)(struct kvm_vcpu *vcpu,
|
||||
unsigned char *hypercall_addr);
|
||||
void (*inject_irq)(struct kvm_vcpu *vcpu);
|
||||
void (*inject_irq)(struct kvm_vcpu *vcpu, bool reinjected);
|
||||
void (*inject_nmi)(struct kvm_vcpu *vcpu);
|
||||
void (*queue_exception)(struct kvm_vcpu *vcpu);
|
||||
void (*cancel_injection)(struct kvm_vcpu *vcpu);
|
||||
|
@ -1485,7 +1535,7 @@ struct kvm_x86_ops {
|
|||
bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason);
|
||||
void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
|
||||
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
|
||||
void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
|
||||
void (*hwapic_isr_update)(int isr);
|
||||
bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu);
|
||||
void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
|
||||
void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
|
||||
|
@ -1495,7 +1545,7 @@ struct kvm_x86_ops {
|
|||
int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
|
||||
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
|
||||
int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
|
||||
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
|
||||
u8 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
|
||||
|
||||
void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
|
||||
int root_level);
|
||||
|
@ -1705,21 +1755,6 @@ extern bool tdp_enabled;
|
|||
|
||||
u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
|
||||
|
||||
/* control of guest tsc rate supported? */
|
||||
extern bool kvm_has_tsc_control;
|
||||
/* maximum supported tsc_khz for guests */
|
||||
extern u32 kvm_max_guest_tsc_khz;
|
||||
/* number of bits of the fractional part of the TSC scaling ratio */
|
||||
extern u8 kvm_tsc_scaling_ratio_frac_bits;
|
||||
/* maximum allowed value of TSC scaling ratio */
|
||||
extern u64 kvm_max_tsc_scaling_ratio;
|
||||
/* 1ull << kvm_tsc_scaling_ratio_frac_bits */
|
||||
extern u64 kvm_default_tsc_scaling_ratio;
|
||||
/* bus lock detection supported? */
|
||||
extern bool kvm_has_bus_lock_exit;
|
||||
|
||||
extern u64 kvm_mce_cap_supported;
|
||||
|
||||
/*
|
||||
* EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing
|
||||
* userspace I/O) to indicate that the emulation context
|
||||
|
@ -2060,6 +2095,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
|
|||
KVM_X86_QUIRK_LAPIC_MMIO_HOLE | \
|
||||
KVM_X86_QUIRK_OUT_7E_INC_RIP | \
|
||||
KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \
|
||||
KVM_X86_QUIRK_FIX_HYPERCALL_INSN)
|
||||
KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \
|
||||
KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)
|
||||
|
||||
#endif /* _ASM_X86_KVM_HOST_H */
|
||||
|
|
|
@ -231,6 +231,12 @@
|
|||
#define PERF_CAP_PT_IDX 16
|
||||
|
||||
#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
|
||||
#define PERF_CAP_PEBS_TRAP BIT_ULL(6)
|
||||
#define PERF_CAP_ARCH_REG BIT_ULL(7)
|
||||
#define PERF_CAP_PEBS_FORMAT 0xf00
|
||||
#define PERF_CAP_PEBS_BASELINE BIT_ULL(14)
|
||||
#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
|
||||
PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE)
|
||||
|
||||
#define MSR_IA32_RTIT_CTL 0x00000570
|
||||
#define RTIT_CTL_TRACEEN BIT(0)
|
||||
|
@ -1018,6 +1024,7 @@
|
|||
#define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f
|
||||
#define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490
|
||||
#define MSR_IA32_VMX_VMFUNC 0x00000491
|
||||
#define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492
|
||||
|
||||
/* VMX_BASIC bits and bitmasks */
|
||||
#define VMX_BASIC_VMCS_SIZE_SHIFT 32
|
||||
|
|
|
@ -206,6 +206,7 @@ struct x86_pmu_capability {
|
|||
int bit_width_fixed;
|
||||
unsigned int events_mask;
|
||||
int events_mask_len;
|
||||
unsigned int pebs_ept :1;
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -504,6 +505,7 @@ struct x86_pmu_lbr {
|
|||
};
|
||||
|
||||
extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap);
|
||||
extern u64 perf_get_hw_event_config(int hw_event);
|
||||
extern void perf_check_microcode(void);
|
||||
extern void perf_clear_dirty_counters(void);
|
||||
extern int x86_perf_rdpmc_index(struct perf_event *event);
|
||||
|
@ -513,15 +515,20 @@ static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
|
|||
memset(cap, 0, sizeof(*cap));
|
||||
}
|
||||
|
||||
static inline u64 perf_get_hw_event_config(int hw_event)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void perf_events_lapic_init(void) { }
|
||||
static inline void perf_check_microcode(void) { }
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
|
||||
extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
|
||||
extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data);
|
||||
extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr);
|
||||
#else
|
||||
struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
|
||||
struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data);
|
||||
static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
|
||||
{
|
||||
return -1;
|
||||
|
|
|
@@ -195,6 +195,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define AVIC_ENABLE_SHIFT 31
#define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT)

#define X2APIC_MODE_SHIFT 30
#define X2APIC_MODE_MASK (1 << X2APIC_MODE_SHIFT)

#define LBR_CTL_ENABLE_MASK BIT_ULL(0)
#define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1)

@@ -253,12 +256,19 @@ enum avic_ipi_failure_cause {
AVIC_IPI_FAILURE_INVALID_BACKING_PAGE,
};

#define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(9, 0)

/*
* 0xff is broadcast, so the max index allowed for physical APIC ID
* table is 0xfe. APIC IDs above 0xff are reserved.
* For AVIC, the max index allowed for physical APIC ID
* table is 0xff (255).
*/
#define AVIC_MAX_PHYSICAL_ID_COUNT 0xff
#define AVIC_MAX_PHYSICAL_ID 0XFEULL

/*
* For x2AVIC, the max index allowed for physical APIC ID
* table is 0x1ff (511).
*/
#define X2AVIC_MAX_PHYSICAL_ID 0x1FFUL

#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
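The two limits above differ only in how many physical APIC ID table entries can be addressed (0xfe for AVIC, 0x1ff for x2AVIC). An illustrative standalone sketch, not taken from the patch (avic_id_in_range() is a made-up helper), of a bounds check driven by those limits:

/* Sketch only: shows how the AVIC vs. x2AVIC limits bound the physical
 * APIC ID table index; avic_id_in_range() is hypothetical, not KVM code. */
#include <stdbool.h>
#include <stdint.h>

#define AVIC_MAX_PHYSICAL_ID	0xFEULL
#define X2AVIC_MAX_PHYSICAL_ID	0x1FFULL

static bool avic_id_in_range(uint32_t apic_id, bool x2avic_enabled)
{
	uint64_t max_id = x2avic_enabled ? X2AVIC_MAX_PHYSICAL_ID
					 : AVIC_MAX_PHYSICAL_ID;

	return apic_id <= max_id;
}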
@ -31,6 +31,7 @@
|
|||
#define CPU_BASED_RDTSC_EXITING VMCS_CONTROL_BIT(RDTSC_EXITING)
|
||||
#define CPU_BASED_CR3_LOAD_EXITING VMCS_CONTROL_BIT(CR3_LOAD_EXITING)
|
||||
#define CPU_BASED_CR3_STORE_EXITING VMCS_CONTROL_BIT(CR3_STORE_EXITING)
|
||||
#define CPU_BASED_ACTIVATE_TERTIARY_CONTROLS VMCS_CONTROL_BIT(TERTIARY_CONTROLS)
|
||||
#define CPU_BASED_CR8_LOAD_EXITING VMCS_CONTROL_BIT(CR8_LOAD_EXITING)
|
||||
#define CPU_BASED_CR8_STORE_EXITING VMCS_CONTROL_BIT(CR8_STORE_EXITING)
|
||||
#define CPU_BASED_TPR_SHADOW VMCS_CONTROL_BIT(VIRTUAL_TPR)
|
||||
|
@ -74,6 +75,12 @@
|
|||
#define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING)
|
||||
#define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE VMCS_CONTROL_BIT(USR_WAIT_PAUSE)
|
||||
#define SECONDARY_EXEC_BUS_LOCK_DETECTION VMCS_CONTROL_BIT(BUS_LOCK_DETECTION)
|
||||
#define SECONDARY_EXEC_NOTIFY_VM_EXITING VMCS_CONTROL_BIT(NOTIFY_VM_EXITING)
|
||||
|
||||
/*
|
||||
* Definitions of Tertiary Processor-Based VM-Execution Controls.
|
||||
*/
|
||||
#define TERTIARY_EXEC_IPI_VIRT VMCS_CONTROL_BIT(IPI_VIRT)
|
||||
|
||||
#define PIN_BASED_EXT_INTR_MASK VMCS_CONTROL_BIT(INTR_EXITING)
|
||||
#define PIN_BASED_NMI_EXITING VMCS_CONTROL_BIT(NMI_EXITING)
|
||||
|
@ -158,6 +165,7 @@ static inline int vmx_misc_mseg_revid(u64 vmx_misc)
|
|||
enum vmcs_field {
|
||||
VIRTUAL_PROCESSOR_ID = 0x00000000,
|
||||
POSTED_INTR_NV = 0x00000002,
|
||||
LAST_PID_POINTER_INDEX = 0x00000008,
|
||||
GUEST_ES_SELECTOR = 0x00000800,
|
||||
GUEST_CS_SELECTOR = 0x00000802,
|
||||
GUEST_SS_SELECTOR = 0x00000804,
|
||||
|
@ -221,6 +229,10 @@ enum vmcs_field {
|
|||
ENCLS_EXITING_BITMAP_HIGH = 0x0000202F,
|
||||
TSC_MULTIPLIER = 0x00002032,
|
||||
TSC_MULTIPLIER_HIGH = 0x00002033,
|
||||
TERTIARY_VM_EXEC_CONTROL = 0x00002034,
|
||||
TERTIARY_VM_EXEC_CONTROL_HIGH = 0x00002035,
|
||||
PID_POINTER_TABLE = 0x00002042,
|
||||
PID_POINTER_TABLE_HIGH = 0x00002043,
|
||||
GUEST_PHYSICAL_ADDRESS = 0x00002400,
|
||||
GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
|
||||
VMCS_LINK_POINTER = 0x00002800,
|
||||
|
@ -269,6 +281,7 @@ enum vmcs_field {
|
|||
SECONDARY_VM_EXEC_CONTROL = 0x0000401e,
|
||||
PLE_GAP = 0x00004020,
|
||||
PLE_WINDOW = 0x00004022,
|
||||
NOTIFY_WINDOW = 0x00004024,
|
||||
VM_INSTRUCTION_ERROR = 0x00004400,
|
||||
VM_EXIT_REASON = 0x00004402,
|
||||
VM_EXIT_INTR_INFO = 0x00004404,
|
||||
|
@ -553,6 +566,11 @@ enum vm_entry_failure_code {
|
|||
#define EPT_VIOLATION_GVA_IS_VALID (1 << EPT_VIOLATION_GVA_IS_VALID_BIT)
|
||||
#define EPT_VIOLATION_GVA_TRANSLATED (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT)
|
||||
|
||||
/*
|
||||
* Exit Qualifications for NOTIFY VM EXIT
|
||||
*/
|
||||
#define NOTIFY_VM_CONTEXT_INVALID BIT(0)
|
||||
|
||||
/*
|
||||
* VM-instruction error numbers
|
||||
*/
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
/*
|
||||
* Defines VMX CPU feature bits
|
||||
*/
|
||||
#define NVMXINTS 3 /* N 32-bit words worth of info */
|
||||
#define NVMXINTS 5 /* N 32-bit words worth of info */
|
||||
|
||||
/*
|
||||
* Note: If the comment begins with a quoted string, that string is used
|
||||
|
@ -43,6 +43,7 @@
|
|||
#define VMX_FEATURE_RDTSC_EXITING ( 1*32+ 12) /* "" VM-Exit on RDTSC */
|
||||
#define VMX_FEATURE_CR3_LOAD_EXITING ( 1*32+ 15) /* "" VM-Exit on writes to CR3 */
|
||||
#define VMX_FEATURE_CR3_STORE_EXITING ( 1*32+ 16) /* "" VM-Exit on reads from CR3 */
|
||||
#define VMX_FEATURE_TERTIARY_CONTROLS ( 1*32+ 17) /* "" Enable Tertiary VM-Execution Controls */
|
||||
#define VMX_FEATURE_CR8_LOAD_EXITING ( 1*32+ 19) /* "" VM-Exit on writes to CR8 */
|
||||
#define VMX_FEATURE_CR8_STORE_EXITING ( 1*32+ 20) /* "" VM-Exit on reads from CR8 */
|
||||
#define VMX_FEATURE_VIRTUAL_TPR ( 1*32+ 21) /* "vtpr" TPR virtualization, a.k.a. TPR shadow */
|
||||
|
@ -84,5 +85,8 @@
|
|||
#define VMX_FEATURE_USR_WAIT_PAUSE ( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */
|
||||
#define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */
|
||||
#define VMX_FEATURE_BUS_LOCK_DETECTION ( 2*32+ 30) /* "" VM-Exit when bus lock caused */
|
||||
#define VMX_FEATURE_NOTIFY_VM_EXITING ( 2*32+ 31) /* VM-Exit when no event windows after notify window */
|
||||
|
||||
/* Tertiary Processor-Based VM-Execution Controls, word 3 */
|
||||
#define VMX_FEATURE_IPI_VIRT ( 3*32+ 4) /* Enable IPI virtualization */
|
||||
#endif /* _ASM_X86_VMXFEATURES_H */
|
||||
|
|
|
@@ -306,7 +306,8 @@ struct kvm_pit_state {
struct kvm_pit_channel_state channels[3];
};

#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001
#define KVM_PIT_FLAGS_SPEAKER_DATA_ON 0x00000002

struct kvm_pit_state2 {
struct kvm_pit_channel_state channels[3];

@@ -325,6 +326,7 @@ struct kvm_reinject_control {
#define KVM_VCPUEVENT_VALID_SHADOW 0x00000004
#define KVM_VCPUEVENT_VALID_SMM 0x00000008
#define KVM_VCPUEVENT_VALID_PAYLOAD 0x00000010
#define KVM_VCPUEVENT_VALID_TRIPLE_FAULT 0x00000020

/* Interrupt shadow states */
#define KVM_X86_SHADOW_INT_MOV_SS 0x01

@@ -359,7 +361,10 @@ struct kvm_vcpu_events {
__u8 smm_inside_nmi;
__u8 latched_init;
} smi;
__u8 reserved[27];
struct {
__u8 pending;
} triple_fault;
__u8 reserved[26];
__u8 exception_has_payload;
__u64 exception_payload;
};

@@ -434,6 +439,7 @@ struct kvm_sync_regs {
#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3)
#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)
#define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5)
#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6)

#define KVM_STATE_NESTED_FORMAT_VMX 0
#define KVM_STATE_NESTED_FORMAT_SVM 1
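Given the new triple_fault member and KVM_VCPUEVENT_VALID_TRIPLE_FAULT flag above, a minimal userspace sketch (not part of the patch; assumes KVM_CAP_X86_TRIPLE_FAULT_EVENT has already been enabled on the VM and vcpu_fd is an open vCPU descriptor, error handling omitted) of injecting a pending triple fault:

/* Hedged userspace sketch of the new KVM_SET_VCPU_EVENTS usage. */
#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

static int set_pending_triple_fault(int vcpu_fd)
{
	struct kvm_vcpu_events events;

	memset(&events, 0, sizeof(events));
	if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events) < 0)
		return -1;

	events.flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
	events.triple_fault.pending = 1;

	return ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &events);
}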
@@ -91,6 +91,7 @@
#define EXIT_REASON_UMWAIT 67
#define EXIT_REASON_TPAUSE 68
#define EXIT_REASON_BUS_LOCK 74
#define EXIT_REASON_NOTIFY 75

#define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \

@@ -153,7 +154,8 @@
{ EXIT_REASON_XRSTORS, "XRSTORS" }, \
{ EXIT_REASON_UMWAIT, "UMWAIT" }, \
{ EXIT_REASON_TPAUSE, "TPAUSE" }, \
{ EXIT_REASON_BUS_LOCK, "BUS_LOCK" }
{ EXIT_REASON_BUS_LOCK, "BUS_LOCK" }, \
{ EXIT_REASON_NOTIFY, "NOTIFY" }

#define VMX_EXIT_REASON_FLAGS \
{ VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" }
@ -275,7 +275,7 @@ void native_apic_icr_write(u32 low, u32 id)
|
|||
unsigned long flags;
|
||||
|
||||
local_irq_save(flags);
|
||||
apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
|
||||
apic_write(APIC_ICR2, SET_XAPIC_DEST_FIELD(id));
|
||||
apic_write(APIC_ICR, low);
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
|
|
|
@ -99,7 +99,7 @@ sendmask:
|
|||
|
||||
static inline int __prepare_ICR2(unsigned int mask)
|
||||
{
|
||||
return SET_APIC_DEST_FIELD(mask);
|
||||
return SET_XAPIC_DEST_FIELD(mask);
|
||||
}
|
||||
|
||||
static inline void __xapic_wait_icr_idle(void)
|
||||
|
|
|
@ -15,6 +15,8 @@ enum vmx_feature_leafs {
|
|||
MISC_FEATURES = 0,
|
||||
PRIMARY_CTLS,
|
||||
SECONDARY_CTLS,
|
||||
TERTIARY_CTLS_LOW,
|
||||
TERTIARY_CTLS_HIGH,
|
||||
NR_VMX_FEATURE_WORDS,
|
||||
};
|
||||
|
||||
|
@ -22,7 +24,7 @@ enum vmx_feature_leafs {
|
|||
|
||||
static void init_vmx_capabilities(struct cpuinfo_x86 *c)
|
||||
{
|
||||
u32 supported, funcs, ept, vpid, ign;
|
||||
u32 supported, funcs, ept, vpid, ign, low, high;
|
||||
|
||||
BUILD_BUG_ON(NVMXINTS != NR_VMX_FEATURE_WORDS);
|
||||
|
||||
|
@ -42,6 +44,11 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c)
|
|||
rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS2, &ign, &supported);
|
||||
c->vmx_capability[SECONDARY_CTLS] = supported;
|
||||
|
||||
/* All 64 bits of tertiary controls MSR are allowed-1 settings. */
|
||||
rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS3, &low, &high);
|
||||
c->vmx_capability[TERTIARY_CTLS_LOW] = low;
|
||||
c->vmx_capability[TERTIARY_CTLS_HIGH] = high;
|
||||
|
||||
rdmsr(MSR_IA32_VMX_PINBASED_CTLS, ign, supported);
|
||||
rdmsr_safe(MSR_IA32_VMX_VMFUNC, &ign, &funcs);
|
||||
|
||||
|
|
|
@ -236,8 +236,7 @@ again:
|
|||
raw_spin_unlock(&b->lock);
|
||||
|
||||
/* A dummy token might be allocated and ultimately not used. */
|
||||
if (dummy)
|
||||
kfree(dummy);
|
||||
kfree(dummy);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
|
||||
|
||||
|
|
|
@@ -67,9 +67,17 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted)
#define F feature_bit
#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0)

/*
* Magic value used by KVM when querying userspace-provided CPUID entries and
* doesn't care about the CPIUD index because the index of the function in
* question is not significant. Note, this magic value must have at least one
* bit set in bits[63:32] and must be consumed as a u64 by cpuid_entry2_find()
* to avoid false positives when processing guest CPUID input.
*/
#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull

static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index)
struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index)
{
struct kvm_cpuid_entry2 *e;
int i;

@@ -77,9 +85,31 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
for (i = 0; i < nent; i++) {
e = &entries[i];

if (e->function == function &&
(!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index))
if (e->function != function)
continue;

/*
* If the index isn't significant, use the first entry with a
* matching function. It's userspace's responsibilty to not
* provide "duplicate" entries in all cases.
*/
if (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index)
return e;

/*
* Similarly, use the first matching entry if KVM is doing a
* lookup (as opposed to emulating CPUID) for a function that's
* architecturally defined as not having a significant index.
*/
if (index == KVM_CPUID_INDEX_NOT_SIGNIFICANT) {
/*
* Direct lookups from KVM should not diverge from what
* KVM defines internally (the architectural behavior).
*/
WARN_ON_ONCE(cpuid_function_is_indexed(function));
return e;
}
}

return NULL;
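Stripped to its essentials, the lookup rule the reworked cpuid_entry2_find() implements is: match on function alone when the entry's index is not flagged significant or when the caller passes the 64-bit "index not significant" sentinel, otherwise require an exact index match. A simplified standalone sketch (plain C, not KVM's types) of that rule:

/* Simplified sketch of the lookup rule above; not KVM code. */
#include <stddef.h>
#include <stdint.h>

#define INDEX_NOT_SIGNIFICANT	(~0ull)
#define FLAG_SIGNIFICANT_INDEX	(1u << 0)

struct cpuid_entry {
	uint32_t function;
	uint32_t index;
	uint32_t flags;
};

static struct cpuid_entry *find_entry(struct cpuid_entry *entries, int nent,
				      uint32_t function, uint64_t index)
{
	for (int i = 0; i < nent; i++) {
		struct cpuid_entry *e = &entries[i];

		if (e->function != function)
			continue;
		/* Index not significant for this entry, or exact match. */
		if (!(e->flags & FLAG_SIGNIFICANT_INDEX) || e->index == index)
			return e;
		/* Caller asked for "first entry for this function". */
		if (index == INDEX_NOT_SIGNIFICANT)
			return e;
	}
	return NULL;
}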
@ -96,7 +126,8 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu,
|
|||
* The existing code assumes virtual address is 48-bit or 57-bit in the
|
||||
* canonical address checks; exit if it is ever changed.
|
||||
*/
|
||||
best = cpuid_entry2_find(entries, nent, 0x80000008, 0);
|
||||
best = cpuid_entry2_find(entries, nent, 0x80000008,
|
||||
KVM_CPUID_INDEX_NOT_SIGNIFICANT);
|
||||
if (best) {
|
||||
int vaddr_bits = (best->eax & 0xff00) >> 8;
|
||||
|
||||
|
@ -151,7 +182,7 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
|
|||
vcpu->arch.kvm_cpuid_base = 0;
|
||||
|
||||
for_each_possible_hypervisor_cpuid_base(function) {
|
||||
entry = kvm_find_cpuid_entry(vcpu, function, 0);
|
||||
entry = kvm_find_cpuid_entry(vcpu, function);
|
||||
|
||||
if (entry) {
|
||||
u32 signature[3];
|
||||
|
@ -177,7 +208,8 @@ static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *v
|
|||
if (!base)
|
||||
return NULL;
|
||||
|
||||
return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, 0);
|
||||
return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES,
|
||||
KVM_CPUID_INDEX_NOT_SIGNIFICANT);
|
||||
}
|
||||
|
||||
static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
|
||||
|
@ -200,7 +232,7 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
|
|||
|
||||
/*
|
||||
* Calculate guest's supported XCR0 taking into account guest CPUID data and
|
||||
* supported_xcr0 (comprised of host configuration and KVM_SUPPORTED_XCR0).
|
||||
* KVM's supported XCR0 (comprised of host's XCR0 and KVM_SUPPORTED_XCR0).
|
||||
*/
|
||||
static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent)
|
||||
{
|
||||
|
@ -210,7 +242,7 @@ static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent)
|
|||
if (!best)
|
||||
return 0;
|
||||
|
||||
return (best->eax | ((u64)best->edx << 32)) & supported_xcr0;
|
||||
return (best->eax | ((u64)best->edx << 32)) & kvm_caps.supported_xcr0;
|
||||
}
|
||||
|
||||
static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries,
|
||||
|
@ -219,7 +251,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e
|
|||
struct kvm_cpuid_entry2 *best;
|
||||
u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent);
|
||||
|
||||
best = cpuid_entry2_find(entries, nent, 1, 0);
|
||||
best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
|
||||
if (best) {
|
||||
/* Update OSXSAVE bit */
|
||||
if (boot_cpu_has(X86_FEATURE_XSAVE))
|
||||
|
@ -250,7 +282,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e
|
|||
best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
|
||||
|
||||
if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
|
||||
best = cpuid_entry2_find(entries, nent, 0x1, 0);
|
||||
best = cpuid_entry2_find(entries, nent, 0x1, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
|
||||
if (best)
|
||||
cpuid_entry_change(best, X86_FEATURE_MWAIT,
|
||||
vcpu->arch.ia32_misc_enable_msr &
|
||||
|
@ -285,7 +317,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
|
|||
struct kvm_cpuid_entry2 *best;
|
||||
u64 guest_supported_xcr0;
|
||||
|
||||
best = kvm_find_cpuid_entry(vcpu, 1, 0);
|
||||
best = kvm_find_cpuid_entry(vcpu, 1);
|
||||
if (best && apic) {
|
||||
if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER))
|
||||
apic->lapic_timer.timer_mode_mask = 3 << 17;
|
||||
|
@ -325,10 +357,10 @@ int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
|
|||
{
|
||||
struct kvm_cpuid_entry2 *best;
|
||||
|
||||
best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
|
||||
best = kvm_find_cpuid_entry(vcpu, 0x80000000);
|
||||
if (!best || best->eax < 0x80000008)
|
||||
goto not_found;
|
||||
best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
|
||||
best = kvm_find_cpuid_entry(vcpu, 0x80000008);
|
||||
if (best)
|
||||
return best->eax & 0xff;
|
||||
not_found:
|
||||
|
@ -868,7 +900,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
|
|||
case 9:
|
||||
break;
|
||||
case 0xa: { /* Architectural Performance Monitoring */
|
||||
struct x86_pmu_capability cap;
|
||||
union cpuid10_eax eax;
|
||||
union cpuid10_edx edx;
|
||||
|
||||
|
@ -877,30 +908,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
|
|||
break;
|
||||
}
|
||||
|
||||
perf_get_x86_pmu_capability(&cap);
|
||||
eax.split.version_id = kvm_pmu_cap.version;
|
||||
eax.split.num_counters = kvm_pmu_cap.num_counters_gp;
|
||||
eax.split.bit_width = kvm_pmu_cap.bit_width_gp;
|
||||
eax.split.mask_length = kvm_pmu_cap.events_mask_len;
|
||||
edx.split.num_counters_fixed = kvm_pmu_cap.num_counters_fixed;
|
||||
edx.split.bit_width_fixed = kvm_pmu_cap.bit_width_fixed;
|
||||
|
||||
/*
|
||||
* The guest architecture pmu is only supported if the architecture
|
||||
* pmu exists on the host and the module parameters allow it.
|
||||
*/
|
||||
if (!cap.version || !enable_pmu)
|
||||
memset(&cap, 0, sizeof(cap));
|
||||
|
||||
eax.split.version_id = min(cap.version, 2);
|
||||
eax.split.num_counters = cap.num_counters_gp;
|
||||
eax.split.bit_width = cap.bit_width_gp;
|
||||
eax.split.mask_length = cap.events_mask_len;
|
||||
|
||||
edx.split.num_counters_fixed =
|
||||
min(cap.num_counters_fixed, KVM_PMC_MAX_FIXED);
|
||||
edx.split.bit_width_fixed = cap.bit_width_fixed;
|
||||
if (cap.version)
|
||||
if (kvm_pmu_cap.version)
|
||||
edx.split.anythread_deprecated = 1;
|
||||
edx.split.reserved1 = 0;
|
||||
edx.split.reserved2 = 0;
|
||||
|
||||
entry->eax = eax.full;
|
||||
entry->ebx = cap.events_mask;
|
||||
entry->ebx = kvm_pmu_cap.events_mask;
|
||||
entry->ecx = 0;
|
||||
entry->edx = edx.full;
|
||||
break;
|
||||
|
@ -923,8 +944,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
|
|||
}
|
||||
break;
|
||||
case 0xd: {
|
||||
u64 permitted_xcr0 = supported_xcr0 & xstate_get_guest_group_perm();
|
||||
u64 permitted_xss = supported_xss;
|
||||
u64 permitted_xcr0 = kvm_caps.supported_xcr0 & xstate_get_guest_group_perm();
|
||||
u64 permitted_xss = kvm_caps.supported_xss;
|
||||
|
||||
entry->eax &= permitted_xcr0;
|
||||
entry->ebx = xstate_required_size(permitted_xcr0, false);
|
||||
|
@@ -1313,12 +1334,20 @@ out_free:
return r;
}

struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
u32 function, u32 index)
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
u32 function, u32 index)
{
return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
function, index);
}
EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry_index);

struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
u32 function)
{
return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
function, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
}
EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
/*
|
||||
|
@ -1355,7 +1384,7 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index)
|
|||
struct kvm_cpuid_entry2 *basic, *class;
|
||||
u32 function = *fn_ptr;
|
||||
|
||||
basic = kvm_find_cpuid_entry(vcpu, 0, 0);
|
||||
basic = kvm_find_cpuid_entry(vcpu, 0);
|
||||
if (!basic)
|
||||
return NULL;
|
||||
|
||||
|
@ -1364,11 +1393,11 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index)
|
|||
return NULL;
|
||||
|
||||
if (function >= 0x40000000 && function <= 0x4fffffff)
|
||||
class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00, 0);
|
||||
class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00);
|
||||
else if (function >= 0xc0000000)
|
||||
class = kvm_find_cpuid_entry(vcpu, 0xc0000000, 0);
|
||||
class = kvm_find_cpuid_entry(vcpu, 0xc0000000);
|
||||
else
|
||||
class = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
|
||||
class = kvm_find_cpuid_entry(vcpu, function & 0x80000000);
|
||||
|
||||
if (class && function <= class->eax)
|
||||
return NULL;
|
||||
|
@ -1386,7 +1415,7 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index)
|
|||
* the effective CPUID entry is the max basic leaf. Note, the index of
|
||||
* the original requested leaf is observed!
|
||||
*/
|
||||
return kvm_find_cpuid_entry(vcpu, basic->eax, index);
|
||||
return kvm_find_cpuid_entry_index(vcpu, basic->eax, index);
|
||||
}
|
||||
|
||||
bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
|
||||
|
@ -1396,7 +1425,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
|
|||
struct kvm_cpuid_entry2 *entry;
|
||||
bool exact, used_max_basic = false;
|
||||
|
||||
entry = kvm_find_cpuid_entry(vcpu, function, index);
|
||||
entry = kvm_find_cpuid_entry_index(vcpu, function, index);
|
||||
exact = !!entry;
|
||||
|
||||
if (!entry && !exact_only) {
|
||||
|
@ -1425,7 +1454,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
|
|||
* exists. EDX can be copied from any existing index.
|
||||
*/
|
||||
if (function == 0xb || function == 0x1f) {
|
||||
entry = kvm_find_cpuid_entry(vcpu, function, 1);
|
||||
entry = kvm_find_cpuid_entry_index(vcpu, function, 1);
|
||||
if (entry) {
|
||||
*ecx = index & 0xff;
|
||||
*edx = entry->edx;
|
||||
|
|
|
@ -13,8 +13,10 @@ void kvm_set_cpu_caps(void);
|
|||
|
||||
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
|
||||
void kvm_update_pv_runtime(struct kvm_vcpu *vcpu);
|
||||
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
|
||||
u32 function, u32 index);
|
||||
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
|
||||
u32 function, u32 index);
|
||||
u32 function);
|
||||
int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
|
||||
struct kvm_cpuid_entry2 __user *entries,
|
||||
unsigned int type);
|
||||
|
@ -76,7 +78,7 @@ static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu,
|
|||
const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature);
|
||||
struct kvm_cpuid_entry2 *entry;
|
||||
|
||||
entry = kvm_find_cpuid_entry(vcpu, cpuid.function, cpuid.index);
|
||||
entry = kvm_find_cpuid_entry_index(vcpu, cpuid.function, cpuid.index);
|
||||
if (!entry)
|
||||
return NULL;
|
||||
|
||||
|
@ -109,7 +111,7 @@ static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu)
|
|||
{
|
||||
struct kvm_cpuid_entry2 *best;
|
||||
|
||||
best = kvm_find_cpuid_entry(vcpu, 0, 0);
|
||||
best = kvm_find_cpuid_entry(vcpu, 0);
|
||||
return best &&
|
||||
(is_guest_vendor_amd(best->ebx, best->ecx, best->edx) ||
|
||||
is_guest_vendor_hygon(best->ebx, best->ecx, best->edx));
|
||||
|
@ -119,7 +121,7 @@ static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu)
|
|||
{
|
||||
struct kvm_cpuid_entry2 *best;
|
||||
|
||||
best = kvm_find_cpuid_entry(vcpu, 0, 0);
|
||||
best = kvm_find_cpuid_entry(vcpu, 0);
|
||||
return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx);
|
||||
}
|
||||
|
||||
|
@ -127,7 +129,7 @@ static inline int guest_cpuid_family(struct kvm_vcpu *vcpu)
|
|||
{
|
||||
struct kvm_cpuid_entry2 *best;
|
||||
|
||||
best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
|
||||
best = kvm_find_cpuid_entry(vcpu, 0x1);
|
||||
if (!best)
|
||||
return -1;
|
||||
|
||||
|
@ -138,18 +140,23 @@ static inline int guest_cpuid_model(struct kvm_vcpu *vcpu)
|
|||
{
|
||||
struct kvm_cpuid_entry2 *best;
|
||||
|
||||
best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
|
||||
best = kvm_find_cpuid_entry(vcpu, 0x1);
|
||||
if (!best)
|
||||
return -1;
|
||||
|
||||
return x86_model(best->eax);
|
||||
}
|
||||
|
||||
static inline bool cpuid_model_is_consistent(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
return boot_cpu_data.x86_model == guest_cpuid_model(vcpu);
|
||||
}
|
||||
|
||||
static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct kvm_cpuid_entry2 *best;
|
||||
|
||||
best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
|
||||
best = kvm_find_cpuid_entry(vcpu, 0x1);
|
||||
if (!best)
|
||||
return -1;
|
||||
|
||||
|
|
|
@ -48,7 +48,7 @@ DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL,
|
|||
|
||||
static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val)
|
||||
{
|
||||
*val = kvm_tsc_scaling_ratio_frac_bits;
|
||||
*val = kvm_caps.tsc_scaling_ratio_frac_bits;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -66,7 +66,7 @@ void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_
|
|||
debugfs_dentry, vcpu,
|
||||
&vcpu_timer_advance_ns_fops);
|
||||
|
||||
if (kvm_has_tsc_control) {
|
||||
if (kvm_caps.has_tsc_control) {
|
||||
debugfs_create_file("tsc-scaling-ratio", 0444,
|
||||
debugfs_dentry, vcpu,
|
||||
&vcpu_tsc_scaling_fops);
|
||||
|
|
|
@ -244,6 +244,9 @@ enum x86_transfer_type {
|
|||
|
||||
static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
|
||||
{
|
||||
if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
|
||||
nr &= NR_EMULATOR_GPRS - 1;
|
||||
|
||||
if (!(ctxt->regs_valid & (1 << nr))) {
|
||||
ctxt->regs_valid |= 1 << nr;
|
||||
ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr);
|
||||
|
@ -253,6 +256,12 @@ static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
|
|||
|
||||
static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr)
|
||||
{
|
||||
if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
|
||||
nr &= NR_EMULATOR_GPRS - 1;
|
||||
|
||||
BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
|
||||
BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
|
||||
|
||||
ctxt->regs_valid |= 1 << nr;
|
||||
ctxt->regs_dirty |= 1 << nr;
|
||||
return &ctxt->_regs[nr];
|
||||
|
@ -266,9 +275,10 @@ static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr)
|
|||
|
||||
static void writeback_registers(struct x86_emulate_ctxt *ctxt)
|
||||
{
|
||||
unsigned long dirty = ctxt->regs_dirty;
|
||||
unsigned reg;
|
||||
|
||||
for_each_set_bit(reg, (ulong *)&ctxt->regs_dirty, 16)
|
||||
for_each_set_bit(reg, &dirty, NR_EMULATOR_GPRS)
|
||||
ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]);
|
||||
}
|
||||
|
||||
|
@ -615,7 +625,9 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
|
|||
static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
|
||||
u32 error, bool valid)
|
||||
{
|
||||
WARN_ON(vec > 0x1f);
|
||||
if (KVM_EMULATOR_BUG_ON(vec > 0x1f, ctxt))
|
||||
return X86EMUL_UNHANDLEABLE;
|
||||
|
||||
ctxt->exception.vector = vec;
|
||||
ctxt->exception.error_code = error;
|
||||
ctxt->exception.error_code_valid = valid;
|
||||
|
@ -1362,7 +1374,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
|
|||
if (mc->pos < mc->end)
|
||||
goto read_cached;
|
||||
|
||||
WARN_ON((mc->end + size) >= sizeof(mc->data));
|
||||
if (KVM_EMULATOR_BUG_ON((mc->end + size) >= sizeof(mc->data), ctxt))
|
||||
return X86EMUL_UNHANDLEABLE;
|
||||
|
||||
rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size,
|
||||
&ctxt->exception);
|
||||
|
@ -1687,16 +1700,6 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
|
|||
case VCPU_SREG_TR:
|
||||
if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
|
||||
goto exception;
|
||||
if (!seg_desc.p) {
|
||||
err_vec = NP_VECTOR;
|
||||
goto exception;
|
||||
}
|
||||
old_desc = seg_desc;
|
||||
seg_desc.type |= 2; /* busy */
|
||||
ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
|
||||
sizeof(seg_desc), &ctxt->exception);
|
||||
if (ret != X86EMUL_CONTINUE)
|
||||
return ret;
|
||||
break;
|
||||
case VCPU_SREG_LDTR:
|
||||
if (seg_desc.s || seg_desc.type != 2)
|
||||
|
@ -1734,8 +1737,17 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
|
|||
if (ret != X86EMUL_CONTINUE)
|
||||
return ret;
|
||||
if (emul_is_noncanonical_address(get_desc_base(&seg_desc) |
|
||||
((u64)base3 << 32), ctxt))
|
||||
return emulate_gp(ctxt, 0);
|
||||
((u64)base3 << 32), ctxt))
|
||||
return emulate_gp(ctxt, err_code);
|
||||
}
|
||||
|
||||
if (seg == VCPU_SREG_TR) {
|
||||
old_desc = seg_desc;
|
||||
seg_desc.type |= 2; /* busy */
|
||||
ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
|
||||
sizeof(seg_desc), &ctxt->exception);
|
||||
if (ret != X86EMUL_CONTINUE)
|
||||
return ret;
|
||||
}
|
||||
load:
|
||||
ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg);
|
||||
|
@ -2432,7 +2444,7 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
|
|||
ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED;
|
||||
ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0);
|
||||
|
||||
for (i = 0; i < 8; i++)
|
||||
for (i = 0; i < NR_EMULATOR_GPRS; i++)
|
||||
*reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4);
|
||||
|
||||
val = GET_SMSTATE(u32, smstate, 0x7fcc);
|
||||
|
@ -2489,7 +2501,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
|
|||
u16 selector;
|
||||
int i, r;
|
||||
|
||||
for (i = 0; i < 16; i++)
|
||||
for (i = 0; i < NR_EMULATOR_GPRS; i++)
|
||||
*reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8);
|
||||
|
||||
ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78);
|
||||
|
@ -5719,7 +5731,8 @@ writeback:
|
|||
|
||||
done:
|
||||
if (rc == X86EMUL_PROPAGATE_FAULT) {
|
||||
WARN_ON(ctxt->exception.vector > 0x1f);
|
||||
if (KVM_EMULATOR_BUG_ON(ctxt->exception.vector > 0x1f, ctxt))
|
||||
return EMULATION_FAILED;
|
||||
ctxt->have_exception = true;
|
||||
}
|
||||
if (rc == X86EMUL_INTERCEPTED)
|
||||
|
|
|
@ -1992,7 +1992,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu)
|
|||
struct kvm_cpuid_entry2 *entry;
|
||||
struct kvm_vcpu_hv *hv_vcpu;
|
||||
|
||||
entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0);
|
||||
entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE);
|
||||
if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) {
|
||||
vcpu->arch.hyperv_enabled = true;
|
||||
} else {
|
||||
|
@ -2005,7 +2005,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu)
|
|||
|
||||
hv_vcpu = to_hv_vcpu(vcpu);
|
||||
|
||||
entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES, 0);
|
||||
entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES);
|
||||
if (entry) {
|
||||
hv_vcpu->cpuid_cache.features_eax = entry->eax;
|
||||
hv_vcpu->cpuid_cache.features_ebx = entry->ebx;
|
||||
|
@ -2016,7 +2016,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu)
|
|||
hv_vcpu->cpuid_cache.features_edx = 0;
|
||||
}
|
||||
|
||||
entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO, 0);
|
||||
entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO);
|
||||
if (entry) {
|
||||
hv_vcpu->cpuid_cache.enlightenments_eax = entry->eax;
|
||||
hv_vcpu->cpuid_cache.enlightenments_ebx = entry->ebx;
|
||||
|
@ -2025,7 +2025,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu)
|
|||
hv_vcpu->cpuid_cache.enlightenments_ebx = 0;
|
||||
}
|
||||
|
||||
entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, 0);
|
||||
entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES);
|
||||
if (entry)
|
||||
hv_vcpu->cpuid_cache.syndbg_cap_eax = entry->eax;
|
||||
else
|
||||
|
|
|
@@ -591,7 +591,10 @@ static int speaker_ioport_write(struct kvm_vcpu *vcpu,
return -EOPNOTSUPP;

mutex_lock(&pit_state->lock);
pit_state->speaker_data_on = (val >> 1) & 1;
if (val & (1 << 1))
pit_state->flags |= KVM_PIT_FLAGS_SPEAKER_DATA_ON;
else
pit_state->flags &= ~KVM_PIT_FLAGS_SPEAKER_DATA_ON;
pit_set_gate(pit, 2, val & 1);
mutex_unlock(&pit_state->lock);
return 0;

@@ -612,8 +615,9 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu,
refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1;

mutex_lock(&pit_state->lock);
ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(pit, 2) |
(pit_get_out(pit, 2) << 5) | (refresh_clock << 4));
ret = (!!(pit_state->flags & KVM_PIT_FLAGS_SPEAKER_DATA_ON) << 1) |
pit_get_gate(pit, 2) | (pit_get_out(pit, 2) << 5) |
(refresh_clock << 4);
if (len > sizeof(ret))
len = sizeof(ret);
memcpy(data, (char *)&ret, len);

@@ -29,7 +29,6 @@ struct kvm_kpit_state {
bool is_periodic;
s64 period; /* unit: ns */
struct hrtimer timer;
u32 speaker_data_on;

struct mutex lock;
atomic_t reinject;
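The read/write paths above encode the classic PC speaker port (0x61) layout, with the speaker data enable now mirrored into the state flags instead of a dedicated field. A standalone sketch (not KVM code) of the bit layout those paths assemble:

/* Sketch of the port 0x61 bits handled above: bit 0 is the timer-2 gate,
 * bit 1 the speaker data enable (KVM_PIT_FLAGS_SPEAKER_DATA_ON),
 * bit 4 the refresh clock and bit 5 the timer-2 output. */
#include <stdint.h>

#define SPEAKER_GATE_BIT	(1u << 0)
#define SPEAKER_DATA_BIT	(1u << 1)
#define REFRESH_CLOCK_BIT	(1u << 4)
#define TIMER2_OUT_BIT		(1u << 5)

static uint8_t speaker_port_encode(int gate, int data_on, int refresh, int out)
{
	return (gate ? SPEAKER_GATE_BIT : 0) |
	       (data_on ? SPEAKER_DATA_BIT : 0) |
	       (refresh ? REFRESH_CLOCK_BIT : 0) |
	       (out ? TIMER2_OUT_BIT : 0);
}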
@ -89,6 +89,7 @@ struct x86_instruction_info {
|
|||
#define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */
|
||||
|
||||
struct x86_emulate_ops {
|
||||
void (*vm_bugged)(struct x86_emulate_ctxt *ctxt);
|
||||
/*
|
||||
* read_gpr: read a general purpose register (rax - r15)
|
||||
*
|
||||
|
@@ -301,6 +302,18 @@ struct fastop;
typedef void (*fastop_t)(struct fastop *);

/*
* The emulator's _regs array tracks only the GPRs, i.e. excludes RIP. RIP is
* tracked/accessed via _eip, and except for RIP relative addressing, which
* also uses _eip, RIP cannot be a register operand nor can it be an operand in
* a ModRM or SIB byte.
*/
#ifdef CONFIG_X86_64
#define NR_EMULATOR_GPRS 16
#else
#define NR_EMULATOR_GPRS 8
#endif

struct x86_emulate_ctxt {
void *vcpu;
const struct x86_emulate_ops *ops;

@@ -345,9 +358,9 @@ struct x86_emulate_ctxt {
u8 lock_prefix;
u8 rep_prefix;
/* bitmaps of registers in _regs[] that can be read */
u32 regs_valid;
u16 regs_valid;
/* bitmaps of registers in _regs[] that have been written */
u32 regs_dirty;
u16 regs_dirty;
/* modrm */
u8 modrm;
u8 modrm_mod;

@@ -363,7 +376,7 @@ struct x86_emulate_ctxt {
struct operand src2;
struct operand dst;
struct operand memop;
unsigned long _regs[NR_VCPU_REGS];
unsigned long _regs[NR_EMULATOR_GPRS];
struct operand *memopp;
struct fetch_cache fetch;
struct read_cache io_read;

@@ -371,6 +384,15 @@ struct x86_emulate_ctxt {
bool is_branch;
};

#define KVM_EMULATOR_BUG_ON(cond, ctxt) \
({ \
int __ret = (cond); \
\
if (WARN_ON_ONCE(__ret)) \
ctxt->ops->vm_bugged(ctxt); \
unlikely(__ret); \
})

/* Repeat String Operation Prefix */
#define REPE_PREFIX 0xf3
#define REPNE_PREFIX 0xf2
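KVM_EMULATOR_BUG_ON() above replaces bare WARN_ON() in the emulator: it warns once and then escalates by marking the VM as bugged through the new vm_bugged callback, while still yielding the condition as a value. An illustrative userspace sketch of the same "warn, then notify the owner" pattern, using the same GNU statement-expression style; emulator_ctx and its vm_bugged callback here are hypothetical stand-ins, not the kernel types:

/* Sketch only: the escalation pattern, not the kernel macro itself. */
#include <stdbool.h>
#include <stdio.h>

struct emulator_ctx {
	void (*vm_bugged)(struct emulator_ctx *ctx);
};

#define EMULATOR_BUG_ON(cond, ctxt)					\
({									\
	bool __ret = (cond);						\
	if (__ret) {							\
		fprintf(stderr, "emulator bug: %s\n", #cond);		\
		(ctxt)->vm_bugged(ctxt);				\
	}								\
	__ret;								\
})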
@ -27,6 +27,7 @@
|
|||
#include <linux/math64.h>
|
||||
#include <linux/slab.h>
|
||||
#include <asm/processor.h>
|
||||
#include <asm/mce.h>
|
||||
#include <asm/msr.h>
|
||||
#include <asm/page.h>
|
||||
#include <asm/current.h>
|
||||
|
@ -54,7 +55,7 @@
|
|||
#define PRIo64 "o"
|
||||
|
||||
/* 14 is the version for Xeon and Pentium 8.4.8*/
|
||||
#define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16))
|
||||
#define APIC_VERSION 0x14UL
|
||||
#define LAPIC_MMIO_LENGTH (1 << 12)
|
||||
/* followed define is not in apicdef.h */
|
||||
#define MAX_APIC_VECTOR 256
|
||||
|
@ -67,6 +68,8 @@ static bool lapic_timer_advance_dynamic __read_mostly;
|
|||
#define LAPIC_TIMER_ADVANCE_NS_MAX 5000
|
||||
/* step-by-step approximation to mitigate fluctuation */
|
||||
#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
|
||||
static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data);
|
||||
static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data);
|
||||
|
||||
static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val)
|
||||
{
|
||||
|
@ -398,14 +401,26 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val)
|
|||
return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
|
||||
}
|
||||
|
||||
static inline bool kvm_lapic_lvt_supported(struct kvm_lapic *apic, int lvt_index)
|
||||
{
|
||||
return apic->nr_lvt_entries > lvt_index;
|
||||
}
|
||||
|
||||
static inline int kvm_apic_calc_nr_lvt_entries(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
return KVM_APIC_MAX_NR_LVT_ENTRIES - !(vcpu->arch.mcg_cap & MCG_CMCI_P);
|
||||
}
|
||||
|
||||
void kvm_apic_set_version(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct kvm_lapic *apic = vcpu->arch.apic;
|
||||
u32 v = APIC_VERSION;
|
||||
u32 v = 0;
|
||||
|
||||
if (!lapic_in_kernel(vcpu))
|
||||
return;
|
||||
|
||||
v = APIC_VERSION | ((apic->nr_lvt_entries - 1) << 16);
|
||||
|
||||
/*
|
||||
* KVM emulates 82093AA datasheet (with in-kernel IOAPIC implementation)
|
||||
* which doesn't have EOI register; Some buggy OSes (e.g. Windows with
|
||||
|
@ -419,12 +434,33 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
|
|||
kvm_lapic_set_reg(apic, APIC_LVR, v);
|
||||
}
|
||||
|
||||
static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = {
|
||||
LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */
|
||||
LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
|
||||
LVT_MASK | APIC_MODE_MASK, /* LVTPC */
|
||||
LINT_MASK, LINT_MASK, /* LVT0-1 */
|
||||
LVT_MASK /* LVTERR */
|
||||
void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
int nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu);
|
||||
struct kvm_lapic *apic = vcpu->arch.apic;
|
||||
int i;
|
||||
|
||||
if (!lapic_in_kernel(vcpu) || nr_lvt_entries == apic->nr_lvt_entries)
|
||||
return;
|
||||
|
||||
/* Initialize/mask any "new" LVT entries. */
|
||||
for (i = apic->nr_lvt_entries; i < nr_lvt_entries; i++)
|
||||
kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED);
|
||||
|
||||
apic->nr_lvt_entries = nr_lvt_entries;
|
||||
|
||||
/* The number of LVT entries is reflected in the version register. */
|
||||
kvm_apic_set_version(vcpu);
|
||||
}
|
||||
|
||||
static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = {
|
||||
[LVT_TIMER] = LVT_MASK, /* timer mode mask added at runtime */
|
||||
[LVT_THERMAL_MONITOR] = LVT_MASK | APIC_MODE_MASK,
|
||||
[LVT_PERFORMANCE_COUNTER] = LVT_MASK | APIC_MODE_MASK,
|
||||
[LVT_LINT0] = LINT_MASK,
|
||||
[LVT_LINT1] = LINT_MASK,
|
||||
[LVT_ERROR] = LVT_MASK,
|
||||
[LVT_CMCI] = LVT_MASK | APIC_MODE_MASK
|
||||
};
|
||||
|
||||
static int find_highest_vector(void *bitmap)
|
||||
|
@ -518,14 +554,11 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
|
|||
|
||||
static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
|
||||
{
|
||||
struct kvm_vcpu *vcpu;
|
||||
|
||||
vcpu = apic->vcpu;
|
||||
|
||||
if (unlikely(vcpu->arch.apicv_active)) {
|
||||
if (unlikely(apic->apicv_active)) {
|
||||
/* need to update RVI */
|
||||
kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
|
||||
static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
|
||||
static_call_cond(kvm_x86_hwapic_irr_update)(apic->vcpu,
|
||||
apic_find_highest_irr(apic));
|
||||
} else {
|
||||
apic->irr_pending = false;
|
||||
kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
|
||||
|
@ -542,20 +575,16 @@ EXPORT_SYMBOL_GPL(kvm_apic_clear_irr);
|
|||
|
||||
static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
|
||||
{
|
||||
struct kvm_vcpu *vcpu;
|
||||
|
||||
if (__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
|
||||
return;
|
||||
|
||||
vcpu = apic->vcpu;
|
||||
|
||||
/*
|
||||
* With APIC virtualization enabled, all caching is disabled
|
||||
* because the processor can modify ISR under the hood. Instead
|
||||
* just set SVI.
|
||||
*/
|
||||
if (unlikely(vcpu->arch.apicv_active))
|
||||
static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, vec);
|
||||
if (unlikely(apic->apicv_active))
|
||||
static_call_cond(kvm_x86_hwapic_isr_update)(vec);
|
||||
else {
|
||||
++apic->isr_count;
|
||||
BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
|
||||
|
@ -589,12 +618,9 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic)
|
|||
|
||||
static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
|
||||
{
|
||||
struct kvm_vcpu *vcpu;
|
||||
if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
|
||||
return;
|
||||
|
||||
vcpu = apic->vcpu;
|
||||
|
||||
/*
|
||||
* We do get here for APIC virtualization enabled if the guest
|
||||
* uses the Hyper-V APIC enlightenment. In this case we may need
|
||||
|
@ -602,8 +628,8 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
|
|||
* on the other hand isr_count and highest_isr_cache are unused
|
||||
* and must be left alone.
|
||||
*/
|
||||
if (unlikely(vcpu->arch.apicv_active))
|
||||
static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
|
||||
if (unlikely(apic->apicv_active))
|
||||
static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic));
|
||||
else {
|
||||
--apic->isr_count;
|
||||
BUG_ON(apic->isr_count < 0);
|
||||
|
@ -801,17 +827,17 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
|
|||
if (kvm_apic_broadcast(apic, mda))
|
||||
return true;
|
||||
|
||||
if (apic_x2apic_mode(apic))
|
||||
return mda == kvm_x2apic_id(apic);
|
||||
|
||||
/*
|
||||
* Hotplug hack: Make LAPIC in xAPIC mode also accept interrupts as if
|
||||
* it were in x2APIC mode. Hotplugged VCPUs start in xAPIC mode and
|
||||
* this allows unique addressing of VCPUs with APIC ID over 0xff.
|
||||
* The 0xff condition is needed because writeable xAPIC ID.
|
||||
* Hotplug hack: Accept interrupts for vCPUs in xAPIC mode as if they
|
||||
* were in x2APIC mode if the target APIC ID can't be encoded as an
|
||||
* xAPIC ID. This allows unique addressing of hotplugged vCPUs (which
|
||||
* start in xAPIC mode) with an APIC ID that is unaddressable in xAPIC
|
||||
* mode. Match the x2APIC ID if and only if the target APIC ID can't
|
||||
* be encoded in xAPIC to avoid spurious matches against a vCPU that
|
||||
* changed its (addressable) xAPIC ID (which is writable).
|
||||
*/
|
||||
if (kvm_x2apic_id(apic) > 0xff && mda == kvm_x2apic_id(apic))
|
||||
return true;
|
||||
if (apic_x2apic_mode(apic) || mda > 0xff)
|
||||
return mda == kvm_x2apic_id(apic);
|
||||
|
||||
return mda == kvm_xapic_id(apic);
|
||||
}
|
||||
|
@ -1325,7 +1351,7 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high)
|
|||
if (apic_x2apic_mode(apic))
|
||||
irq.dest_id = icr_high;
|
||||
else
|
||||
irq.dest_id = GET_APIC_DEST_FIELD(icr_high);
|
||||
irq.dest_id = GET_XAPIC_DEST_FIELD(icr_high);
|
||||
|
||||
trace_kvm_apic_ipi(icr_low, irq.dest_id);
|
||||
|
||||
|
@ -1444,6 +1470,9 @@ static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
|
|||
APIC_REG_MASK(APIC_TMCCT) |
|
||||
APIC_REG_MASK(APIC_TDCR);
|
||||
|
||||
if (kvm_lapic_lvt_supported(apic, LVT_CMCI))
|
||||
valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI);
|
||||
|
||||
/*
|
||||
* ARBPRI and ICR2 are not valid in x2APIC mode. WARN if KVM reads ICR
|
||||
* in x2APIC mode as it's an 8-byte register in x2APIC and needs to be
|
||||
|
@ -1583,7 +1612,7 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
|
|||
int vec = reg & APIC_VECTOR_MASK;
|
||||
void *bitmap = apic->regs + APIC_ISR;
|
||||
|
||||
if (vcpu->arch.apicv_active)
|
||||
if (apic->apicv_active)
|
||||
bitmap = apic->regs + APIC_IRR;
|
||||
|
||||
if (apic_test_vector(vec, bitmap))
|
||||
|
@ -1602,7 +1631,7 @@ static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles)
|
|||
* that __delay() uses delay_tsc whenever the hardware has TSC, thus
|
||||
* always for VMX enabled hardware.
|
||||
*/
|
||||
if (vcpu->arch.tsc_scaling_ratio == kvm_default_tsc_scaling_ratio) {
|
||||
if (vcpu->arch.tsc_scaling_ratio == kvm_caps.default_tsc_scaling_ratio) {
|
||||
__delay(min(guest_cycles,
|
||||
nsec_to_cycles(vcpu, timer_advance_ns)));
|
||||
} else {
|
||||
|
@ -1700,7 +1729,7 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn)
|
|||
if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use)
|
||||
ktimer->expired_tscdeadline = ktimer->tscdeadline;
|
||||
|
||||
if (!from_timer_fn && vcpu->arch.apicv_active) {
|
||||
if (!from_timer_fn && apic->apicv_active) {
|
||||
WARN_ON(kvm_get_running_vcpu() != vcpu);
|
||||
kvm_apic_inject_pending_timer_irqs(apic);
|
||||
return;
|
||||
|
@ -2052,6 +2081,16 @@ static void kvm_lapic_xapic_id_updated(struct kvm_lapic *apic)
|
|||
kvm_set_apicv_inhibit(apic->vcpu->kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
|
||||
}
|
||||
|
||||
static int get_lvt_index(u32 reg)
|
||||
{
|
||||
if (reg == APIC_LVTCMCI)
|
||||
return LVT_CMCI;
|
||||
if (reg < APIC_LVTT || reg > APIC_LVTERR)
|
||||
return -1;
|
||||
return array_index_nospec(
|
||||
(reg - APIC_LVTT) >> 4, KVM_APIC_MAX_NR_LVT_ENTRIES);
|
||||
}
|
||||
|
||||
static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
|
||||
{
|
||||
int ret = 0;
|
||||
|
@ -2098,13 +2137,10 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
|
|||
apic_set_spiv(apic, val & mask);
|
||||
if (!(val & APIC_SPIV_APIC_ENABLED)) {
|
||||
int i;
|
||||
u32 lvt_val;
|
||||
|
||||
for (i = 0; i < KVM_APIC_LVT_NUM; i++) {
|
||||
lvt_val = kvm_lapic_get_reg(apic,
|
||||
APIC_LVTT + 0x10 * i);
|
||||
kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i,
|
||||
lvt_val | APIC_LVT_MASKED);
|
||||
for (i = 0; i < apic->nr_lvt_entries; i++) {
|
||||
kvm_lapic_set_reg(apic, APIC_LVTx(i),
|
||||
kvm_lapic_get_reg(apic, APIC_LVTx(i)) | APIC_LVT_MASKED);
|
||||
}
|
||||
apic_update_lvtt(apic);
|
||||
atomic_set(&apic->lapic_timer.pending, 0);
|
||||
|
@ -2133,16 +2169,15 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
|
|||
case APIC_LVTTHMR:
|
||||
case APIC_LVTPC:
|
||||
case APIC_LVT1:
|
||||
case APIC_LVTERR: {
|
||||
/* TODO: Check vector */
|
||||
size_t size;
|
||||
u32 index;
|
||||
|
||||
case APIC_LVTERR:
|
||||
case APIC_LVTCMCI: {
|
||||
u32 index = get_lvt_index(reg);
|
||||
if (!kvm_lapic_lvt_supported(apic, index)) {
|
||||
ret = 1;
|
||||
break;
|
||||
}
|
||||
if (!kvm_apic_sw_enabled(apic))
|
||||
val |= APIC_LVT_MASKED;
|
||||
size = ARRAY_SIZE(apic_lvt_mask);
|
||||
index = array_index_nospec(
|
||||
(reg - APIC_LVTT) >> 4, size);
|
||||
val &= apic_lvt_mask[index];
|
||||
kvm_lapic_set_reg(apic, reg, val);
|
||||
break;
|
||||
|
@ -2246,10 +2281,26 @@ EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
|
|||
/* emulate APIC access in a trap manner */
|
||||
void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
|
||||
{
|
||||
u32 val = kvm_lapic_get_reg(vcpu->arch.apic, offset);
|
||||
struct kvm_lapic *apic = vcpu->arch.apic;
|
||||
u64 val;
|
||||
|
||||
/* TODO: optimize to just emulate side effect w/o one more write */
|
||||
kvm_lapic_reg_write(vcpu->arch.apic, offset, val);
|
||||
if (apic_x2apic_mode(apic))
|
||||
kvm_lapic_msr_read(apic, offset, &val);
|
||||
else
|
||||
val = kvm_lapic_get_reg(apic, offset);
|
||||
|
||||
/*
|
||||
* ICR is a single 64-bit register when x2APIC is enabled. For legacy
|
||||
* xAPIC, ICR writes need to go down the common (slightly slower) path
|
||||
* to get the upper half from ICR2.
|
||||
*/
|
||||
if (apic_x2apic_mode(apic) && offset == APIC_ICR) {
|
||||
kvm_apic_send_ipi(apic, (u32)val, (u32)(val >> 32));
|
||||
trace_kvm_apic_write(APIC_ICR, val);
|
||||
} else {
|
||||
/* TODO: optimize to just emulate side effect w/o one more write */
|
||||
kvm_lapic_reg_write(apic, offset, (u32)val);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode);
|
||||
|
||||
|
@ -2344,8 +2395,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
|
|||
if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE))
|
||||
kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
|
||||
|
||||
if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE))
|
||||
if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) {
|
||||
kvm_vcpu_update_apicv(vcpu);
|
||||
static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu);
|
||||
}
|
||||
|
||||
apic->base_address = apic->vcpu->arch.apic_base &
|
||||
MSR_IA32_APICBASE_BASE;
|
||||
|
@ -2361,7 +2414,7 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
|
|||
{
|
||||
struct kvm_lapic *apic = vcpu->arch.apic;
|
||||
|
||||
if (vcpu->arch.apicv_active) {
|
||||
if (apic->apicv_active) {
|
||||
/* irr_pending is always true when apicv is activated. */
|
||||
apic->irr_pending = true;
|
||||
apic->isr_count = 1;
|
||||
|
@ -2401,8 +2454,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
|
|||
kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
|
||||
kvm_apic_set_version(apic->vcpu);
|
||||
|
||||
for (i = 0; i < KVM_APIC_LVT_NUM; i++)
|
||||
kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
|
||||
for (i = 0; i < apic->nr_lvt_entries; i++)
|
||||
kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED);
|
||||
apic_update_lvtt(apic);
|
||||
if (kvm_vcpu_is_reset_bsp(vcpu) &&
|
||||
kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
|
||||
|
@ -2436,10 +2489,10 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
|
|||
|
||||
vcpu->arch.pv_eoi.msr_val = 0;
|
||||
apic_update_ppr(apic);
|
||||
if (vcpu->arch.apicv_active) {
|
||||
if (apic->apicv_active) {
|
||||
static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
|
||||
static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1);
|
||||
static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, -1);
|
||||
static_call_cond(kvm_x86_hwapic_isr_update)(-1);
|
||||
}
|
||||
|
||||
vcpu->arch.apic_arb_prio = 0;
|
||||
|
@ -2532,6 +2585,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
|
|||
}
|
||||
apic->vcpu = vcpu;
|
||||
|
||||
apic->nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu);
|
||||
|
||||
hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
|
||||
HRTIMER_MODE_ABS_HARD);
|
||||
apic->lapic_timer.timer.function = apic_timer_fn;
|
||||
|
@ -2716,10 +2771,10 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
|
|||
kvm_lapic_set_reg(apic, APIC_TMCCT, 0);
|
||||
kvm_apic_update_apicv(vcpu);
|
||||
apic->highest_isr_cache = -1;
|
||||
if (vcpu->arch.apicv_active) {
|
||||
if (apic->apicv_active) {
|
||||
static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
|
||||
static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
|
||||
static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
|
||||
static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic));
|
||||
}
|
||||
kvm_make_request(KVM_REQ_EVENT, vcpu);
|
||||
if (ioapic_in_kernel(vcpu->kvm))
|
||||
|
|
|
@@ -10,7 +10,6 @@
#define KVM_APIC_INIT 0
#define KVM_APIC_SIPI 1
#define KVM_APIC_LVT_NUM 6

#define APIC_SHORT_MASK 0xc0000
#define APIC_DEST_NOSHORT 0x0

@@ -29,6 +28,20 @@ enum lapic_mode {
LAPIC_MODE_X2APIC = MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE,
};

enum lapic_lvt_entry {
LVT_TIMER,
LVT_THERMAL_MONITOR,
LVT_PERFORMANCE_COUNTER,
LVT_LINT0,
LVT_LINT1,
LVT_ERROR,
LVT_CMCI,

KVM_APIC_MAX_NR_LVT_ENTRIES,
};

#define APIC_LVTx(x) ((x) == LVT_CMCI ? APIC_LVTCMCI : APIC_LVTT + 0x10 * (x))
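The APIC_LVTx() macro above maps the new LVT index enum onto local APIC register offsets, with the CMCI entry living at a non-contiguous offset. A quick standalone check of that mapping, assuming the standard LAPIC offsets (LVTT at 0x320 through LVTERR at 0x370, CMCI at 0x2f0); the enum mirrors the lapic_lvt_entry order in the hunk:

/* Sketch: prints LVT index -> register offset for the mapping above. */
#include <stdio.h>

#define APIC_LVTCMCI	0x2f0
#define APIC_LVTT	0x320

enum { LVT_TIMER, LVT_THERMAL_MONITOR, LVT_PERFORMANCE_COUNTER,
       LVT_LINT0, LVT_LINT1, LVT_ERROR, LVT_CMCI };

#define APIC_LVTx(x) ((x) == LVT_CMCI ? APIC_LVTCMCI : APIC_LVTT + 0x10 * (x))

int main(void)
{
	for (int i = LVT_TIMER; i <= LVT_CMCI; i++)
		printf("LVT %d -> reg 0x%x\n", i, APIC_LVTx(i));
	return 0;
}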
struct kvm_timer {
|
||||
struct hrtimer timer;
|
||||
s64 period; /* unit: ns */
|
||||
|
@ -48,6 +61,7 @@ struct kvm_lapic {
|
|||
struct kvm_timer lapic_timer;
|
||||
u32 divide_count;
|
||||
struct kvm_vcpu *vcpu;
|
||||
bool apicv_active;
|
||||
bool sw_enabled;
|
||||
bool irr_pending;
|
||||
bool lvt0_in_nmi_mode;
|
||||
|
@ -65,6 +79,7 @@ struct kvm_lapic {
|
|||
struct gfn_to_hva_cache vapic_cache;
|
||||
unsigned long pending_events;
|
||||
unsigned int sipi_vector;
|
||||
int nr_lvt_entries;
|
||||
};
|
||||
|
||||
struct dest_map;
|
||||
|
@ -84,6 +99,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
|
|||
u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
|
||||
void kvm_recalculate_apic_map(struct kvm *kvm);
|
||||
void kvm_apic_set_version(struct kvm_vcpu *vcpu);
|
||||
void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu);
|
||||
bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
|
||||
int shorthand, unsigned int dest, int dest_mode);
|
||||
int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
|
||||
|
@ -204,7 +220,7 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic)
|
|||
|
||||
static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
return vcpu->arch.apic && vcpu->arch.apicv_active;
|
||||
return lapic_in_kernel(vcpu) && vcpu->arch.apic->apicv_active;
|
||||
}
|
||||
|
||||
static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
|
||||
|
|
|
@ -6,11 +6,6 @@
|
|||
#include "kvm_cache_regs.h"
|
||||
#include "cpuid.h"
|
||||
|
||||
#define PT64_PT_BITS 9
|
||||
#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
|
||||
#define PT32_PT_BITS 10
|
||||
#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
|
||||
|
||||
#define PT_WRITABLE_SHIFT 1
|
||||
#define PT_USER_SHIFT 2
|
||||
|
||||
|
@ -34,11 +29,6 @@
|
|||
#define PT_DIR_PAT_SHIFT 12
|
||||
#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
|
||||
|
||||
#define PT32_DIR_PSE36_SIZE 4
|
||||
#define PT32_DIR_PSE36_SHIFT 13
|
||||
#define PT32_DIR_PSE36_MASK \
|
||||
(((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
|
||||
|
||||
#define PT64_ROOT_5LEVEL 5
|
||||
#define PT64_ROOT_4LEVEL 4
|
||||
#define PT32_ROOT_LEVEL 2
|
||||
|
|
File diff suppressed because it is too large
@@ -20,6 +20,20 @@ extern bool dbg;
#define MMU_WARN_ON(x) do { } while (0)
#endif

/* Page table builder macros common to shadow (host) PTEs and guest PTEs. */
#define __PT_LEVEL_SHIFT(level, bits_per_level) \
(PAGE_SHIFT + ((level) - 1) * (bits_per_level))
#define __PT_INDEX(address, level, bits_per_level) \
(((address) >> __PT_LEVEL_SHIFT(level, bits_per_level)) & ((1 << (bits_per_level)) - 1))

#define __PT_LVL_ADDR_MASK(base_addr_mask, level, bits_per_level) \
((base_addr_mask) & ~((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1))

#define __PT_LVL_OFFSET_MASK(base_addr_mask, level, bits_per_level) \
((base_addr_mask) & ((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1))

#define __PT_ENT_PER_PAGE(bits_per_level) (1 << (bits_per_level))
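These builders parameterize the shift, index and mask calculations by bits-per-level (9 for 64-bit and EPT page tables, 10 for 32-bit), which is what lets the shadow and guest PTE namespaces share one set of macros. A small standalone check (not kernel code, PAGE_SHIFT assumed to be 12) of what they produce for 4-level 64-bit paging:

/* Sketch: level shifts 12/21/30/39 and 9-bit indexes for a sample GPA. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

#define __PT_LEVEL_SHIFT(level, bits_per_level) \
	(PAGE_SHIFT + ((level) - 1) * (bits_per_level))
#define __PT_INDEX(address, level, bits_per_level) \
	(((address) >> __PT_LEVEL_SHIFT(level, bits_per_level)) & ((1 << (bits_per_level)) - 1))

int main(void)
{
	uint64_t gpa = 0x7f12345678ULL;

	for (int level = 1; level <= 4; level++)
		printf("level %d: shift %d, index %llu\n", level,
		       __PT_LEVEL_SHIFT(level, 9),
		       (unsigned long long)__PT_INDEX(gpa, level, 9));
	return 0;
}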
/*
|
||||
* Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT
|
||||
* bit, and thus are guaranteed to be non-zero when valid. And, when a guest
|
||||
|
@ -53,8 +67,21 @@ struct kvm_mmu_page {
|
|||
gfn_t gfn;
|
||||
|
||||
u64 *spt;
|
||||
/* hold the gfn of each spte inside spt */
|
||||
gfn_t *gfns;
|
||||
|
||||
/*
|
||||
* Stores the result of the guest translation being shadowed by each
|
||||
* SPTE. KVM shadows two types of guest translations: nGPA -> GPA
|
||||
* (shadow EPT/NPT) and GVA -> GPA (traditional shadow paging). In both
|
||||
* cases the result of the translation is a GPA and a set of access
|
||||
* constraints.
|
||||
*
|
||||
* The GFN is stored in the upper bits (PAGE_SHIFT) and the shadowed
|
||||
* access permissions are stored in the lower bits. Note, for
|
||||
* convenience and uniformity across guests, the access permissions are
|
||||
* stored in KVM format (e.g. ACC_EXEC_MASK) not the raw guest format.
|
||||
*/
|
||||
u64 *shadowed_translation;
|
||||
|
||||
/* Currently serving as active root */
|
||||
union {
|
||||
int root_count;
|
||||
|
@ -141,9 +168,9 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
|
|||
unsigned int pte_list_count(struct kvm_rmap_head *rmap_head);
|
||||
|
||||
extern int nx_huge_pages;
|
||||
static inline bool is_nx_huge_page_enabled(void)
|
||||
static inline bool is_nx_huge_page_enabled(struct kvm *kvm)
|
||||
{
|
||||
return READ_ONCE(nx_huge_pages);
|
||||
return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages;
|
||||
}
|
||||
|
||||
struct kvm_page_fault {
|
||||
|
@ -242,7 +269,8 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
|
|||
.user = err & PFERR_USER_MASK,
|
||||
.prefetch = prefetch,
|
||||
.is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault),
|
||||
.nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(),
|
||||
.nx_huge_page_workaround_enabled =
|
||||
is_nx_huge_page_enabled(vcpu->kvm),
|
||||
|
||||
.max_level = KVM_MAX_HUGEPAGE_LEVEL,
|
||||
.req_level = PG_LEVEL_4K,
|
||||
|
@ -281,7 +309,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
|
|||
|
||||
int kvm_mmu_max_mapping_level(struct kvm *kvm,
|
||||
const struct kvm_memory_slot *slot, gfn_t gfn,
|
||||
kvm_pfn_t pfn, int max_level);
|
||||
int max_level);
|
||||
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
|
||||
void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);
|
||||
|
||||
|
|
|
@ -1,14 +0,0 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/* Shadow paging constants/helpers that don't need to be #undef'd. */
|
||||
#ifndef __KVM_X86_PAGING_H
|
||||
#define __KVM_X86_PAGING_H
|
||||
|
||||
#define GUEST_PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
|
||||
#define PT64_LVL_ADDR_MASK(level) \
|
||||
(GUEST_PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
|
||||
* PT64_LEVEL_BITS))) - 1))
|
||||
#define PT64_LVL_OFFSET_MASK(level) \
|
||||
(GUEST_PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
|
||||
* PT64_LEVEL_BITS))) - 1))
|
||||
#endif /* __KVM_X86_PAGING_H */
|
||||
|
|
@@ -16,25 +16,21 @@
 */

/*
 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
 * so the code in this file is compiled twice, once per pte size.
 * The MMU needs to be able to access/walk 32-bit and 64-bit guest page tables,
 * as well as guest EPT tables, so the code in this file is compiled thrice,
 * once per guest PTE type. The per-type defines are #undef'd at the end.
 */

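The three instantiations come from textual inclusion in mmu.c, roughly as sketched below (simplified; the exact ordering and surrounding defines in mmu.c may differ):

/* In paging_tmpl.h, FNAME() expands per PTTYPE, e.g. paging64_walk_addr(). */

#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE

#define PTTYPE 32
#include "paging_tmpl.h"
#undef PTTYPE

#define PTTYPE PTTYPE_EPT
#include "paging_tmpl.h"
#undef PTTYPE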
#if PTTYPE == 64
|
||||
#define pt_element_t u64
|
||||
#define guest_walker guest_walker64
|
||||
#define FNAME(name) paging##64_##name
|
||||
#define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK
|
||||
#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
|
||||
#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
|
||||
#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
|
||||
#define PT_LEVEL_BITS PT64_LEVEL_BITS
|
||||
#define PT_LEVEL_BITS 9
|
||||
#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
|
||||
#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
|
||||
#define PT_HAVE_ACCESSED_DIRTY(mmu) true
|
||||
#ifdef CONFIG_X86_64
|
||||
#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
|
||||
#define CMPXCHG "cmpxchgq"
|
||||
#else
|
||||
#define PT_MAX_FULL_LEVELS 2
|
||||
#endif
|
||||
|
@ -42,36 +38,35 @@
|
|||
#define pt_element_t u32
|
||||
#define guest_walker guest_walker32
|
||||
#define FNAME(name) paging##32_##name
|
||||
#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
|
||||
#define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
|
||||
#define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
|
||||
#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
|
||||
#define PT_LEVEL_BITS PT32_LEVEL_BITS
|
||||
#define PT_LEVEL_BITS 10
|
||||
#define PT_MAX_FULL_LEVELS 2
|
||||
#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
|
||||
#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
|
||||
#define PT_HAVE_ACCESSED_DIRTY(mmu) true
|
||||
#define CMPXCHG "cmpxchgl"
|
||||
|
||||
#define PT32_DIR_PSE36_SIZE 4
|
||||
#define PT32_DIR_PSE36_SHIFT 13
|
||||
#define PT32_DIR_PSE36_MASK \
|
||||
(((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
|
||||
#elif PTTYPE == PTTYPE_EPT
|
||||
#define pt_element_t u64
|
||||
#define guest_walker guest_walkerEPT
|
||||
#define FNAME(name) ept_##name
|
||||
#define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK
|
||||
#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
|
||||
#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
|
||||
#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
|
||||
#define PT_LEVEL_BITS PT64_LEVEL_BITS
|
||||
#define PT_LEVEL_BITS 9
|
||||
#define PT_GUEST_DIRTY_SHIFT 9
|
||||
#define PT_GUEST_ACCESSED_SHIFT 8
|
||||
#define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled)
|
||||
#ifdef CONFIG_X86_64
|
||||
#define CMPXCHG "cmpxchgq"
|
||||
#endif
|
||||
#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
|
||||
#else
|
||||
#error Invalid PTTYPE value
|
||||
#endif
|
||||
|
||||
/* Common logic, but per-type values. These also need to be undefined. */
|
||||
#define PT_BASE_ADDR_MASK ((pt_element_t)(((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
|
||||
#define PT_LVL_ADDR_MASK(lvl) __PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
|
||||
#define PT_LVL_OFFSET_MASK(lvl) __PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
|
||||
#define PT_INDEX(addr, lvl) __PT_INDEX(addr, lvl, PT_LEVEL_BITS)
|
||||
|
||||
#define PT_GUEST_DIRTY_MASK (1 << PT_GUEST_DIRTY_SHIFT)
|
||||
#define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT)
|
||||
|
||||
|
@@ -97,6 +92,15 @@ struct guest_walker {
	struct x86_exception fault;
};

#if PTTYPE == 32
static inline gfn_t pse36_gfn_delta(u32 gpte)
{
	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;

	return (gpte & PT32_DIR_PSE36_MASK) << shift;
}
#endif

static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
{
	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
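Worked example of the PSE-36 math above: with PAGE_SHIFT == 12 and PT32_DIR_PSE36_SHIFT == 13 the shift is 32 - 13 - 12 = 7, so PDE bits 13..16 become gfn bits 20..23, i.e. physical address bits 32..35. A self-contained check:

#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT		12
#define PT32_DIR_PSE36_SIZE	4
#define PT32_DIR_PSE36_SHIFT	13
#define PT32_DIR_PSE36_MASK \
	(((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)

static uint64_t pse36_gfn_delta(uint32_t gpte)
{
	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;

	return (gpte & PT32_DIR_PSE36_MASK) << shift;
}

int main(void)
{
	/* PDE with PSE-36 bits 13..16 = 0x5 -> gfn delta of 0x5 << 20. */
	assert(pse36_gfn_delta(0x5u << PT32_DIR_PSE36_SHIFT) == (0x5ULL << 20));
	return 0;
}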
@ -374,7 +378,7 @@ retry_walk:
|
|||
* information to fix the exit_qualification or exit_info_1
|
||||
* fields.
|
||||
*/
|
||||
if (unlikely(real_gpa == UNMAPPED_GVA))
|
||||
if (unlikely(real_gpa == INVALID_GPA))
|
||||
return 0;
|
||||
|
||||
host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gpa_to_gfn(real_gpa),
|
||||
|
@ -421,11 +425,13 @@ retry_walk:
|
|||
gfn = gpte_to_gfn_lvl(pte, walker->level);
|
||||
gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
|
||||
|
||||
if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36())
|
||||
#if PTTYPE == 32
|
||||
if (walker->level > PG_LEVEL_4K && is_cpuid_PSE36())
|
||||
gfn += pse36_gfn_delta(pte);
|
||||
#endif
|
||||
|
||||
real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault);
|
||||
if (real_gpa == UNMAPPED_GVA)
|
||||
if (real_gpa == INVALID_GPA)
|
||||
return 0;
|
||||
|
||||
walker->gfn = real_gpa >> PAGE_SHIFT;
|
||||
|
@ -589,7 +595,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
|
|||
if (sp->role.direct)
|
||||
return __direct_pte_prefetch(vcpu, sp, sptep);
|
||||
|
||||
i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
|
||||
i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1);
|
||||
spte = sp->spt + i;
|
||||
|
||||
for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
|
||||
|
@ -642,14 +648,13 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
|
|||
gfn_t table_gfn;
|
||||
|
||||
clear_sp_write_flooding_count(it.sptep);
|
||||
drop_large_spte(vcpu, it.sptep);
|
||||
|
||||
sp = NULL;
|
||||
if (!is_shadow_present_pte(*it.sptep)) {
|
||||
table_gfn = gw->table_gfn[it.level - 2];
|
||||
access = gw->pt_access[it.level - 2];
|
||||
sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr,
|
||||
it.level-1, false, access);
|
||||
table_gfn = gw->table_gfn[it.level - 2];
|
||||
access = gw->pt_access[it.level - 2];
|
||||
sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn,
|
||||
false, access);
|
||||
|
||||
if (sp != ERR_PTR(-EEXIST)) {
|
||||
/*
|
||||
* We must synchronize the pagetable before linking it
|
||||
* because the guest doesn't need to flush tlb when
|
||||
|
@ -678,7 +683,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
|
|||
if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
|
||||
goto out_gpte_changed;
|
||||
|
||||
if (sp)
|
||||
if (sp != ERR_PTR(-EEXIST))
|
||||
link_shadow_page(vcpu, it.sptep, sp);
|
||||
}
|
||||
|
||||
|
@ -702,16 +707,15 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
|
|||
|
||||
validate_direct_spte(vcpu, it.sptep, direct_access);
|
||||
|
||||
drop_large_spte(vcpu, it.sptep);
|
||||
sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn,
|
||||
true, direct_access);
|
||||
if (sp == ERR_PTR(-EEXIST))
|
||||
continue;
|
||||
|
||||
if (!is_shadow_present_pte(*it.sptep)) {
|
||||
sp = kvm_mmu_get_page(vcpu, base_gfn, fault->addr,
|
||||
it.level - 1, true, direct_access);
|
||||
link_shadow_page(vcpu, it.sptep, sp);
|
||||
if (fault->huge_page_disallowed &&
|
||||
fault->req_level >= it.level)
|
||||
account_huge_nx_page(vcpu->kvm, sp);
|
||||
}
|
||||
link_shadow_page(vcpu, it.sptep, sp);
|
||||
if (fault->huge_page_disallowed &&
|
||||
fault->req_level >= it.level)
|
||||
account_huge_nx_page(vcpu->kvm, sp);
|
||||
}
|
||||
|
||||
if (WARN_ON_ONCE(it.level != fault->goal_level))
|
||||
|
@ -888,7 +892,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
|
|||
WARN_ON(sp->role.level != PG_LEVEL_4K);
|
||||
|
||||
if (PTTYPE == 32)
|
||||
offset = sp->role.quadrant << PT64_LEVEL_BITS;
|
||||
offset = sp->role.quadrant << SPTE_LEVEL_BITS;
|
||||
|
||||
return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
|
||||
}
|
||||
|
@ -929,7 +933,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
|
|||
break;
|
||||
|
||||
pte_gpa = FNAME(get_level1_sp_gpa)(sp);
|
||||
pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
|
||||
pte_gpa += spte_index(sptep) * sizeof(pt_element_t);
|
||||
|
||||
mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL);
|
||||
if (is_shadow_present_pte(old_spte))
|
||||
|
@ -958,7 +962,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
|
|||
struct x86_exception *exception)
|
||||
{
|
||||
struct guest_walker walker;
|
||||
gpa_t gpa = UNMAPPED_GVA;
|
||||
gpa_t gpa = INVALID_GPA;
|
||||
int r;
|
||||
|
||||
#ifndef CONFIG_X86_64
|
||||
|
@ -978,7 +982,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
|
|||
}
|
||||
|
||||
/*
|
||||
* Using the cached information from sp->gfns is safe because:
|
||||
* Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is
|
||||
* safe because:
|
||||
* - The spte has a reference to the struct page, so the pfn for a given gfn
|
||||
* can't change unless all sptes pointing to it are nuked first.
|
||||
*
|
||||
|
@ -1023,7 +1028,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
|
|||
|
||||
first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
|
||||
|
||||
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
|
||||
for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
|
||||
u64 *sptep, spte;
|
||||
struct kvm_memory_slot *slot;
|
||||
unsigned pte_access;
|
||||
|
@ -1053,12 +1058,23 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
|
|||
if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
|
||||
continue;
|
||||
|
||||
if (gfn != sp->gfns[i]) {
|
||||
/*
|
||||
* Drop the SPTE if the new protections would result in a RWX=0
|
||||
* SPTE or if the gfn is changing. The RWX=0 case only affects
|
||||
* EPT with execute-only support, i.e. EPT without an effective
|
||||
* "present" bit, as all other paging modes will create a
|
||||
* read-only SPTE if pte_access is zero.
|
||||
*/
|
||||
if ((!pte_access && !shadow_present_mask) ||
|
||||
gfn != kvm_mmu_page_get_gfn(sp, i)) {
|
||||
drop_spte(vcpu->kvm, &sp->spt[i]);
|
||||
flush = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Update the shadowed access bits in case they changed. */
|
||||
kvm_mmu_page_set_access(sp, i, pte_access);
|
||||
|
||||
sptep = &sp->spt[i];
|
||||
spte = *sptep;
|
||||
host_writable = spte & shadow_host_writable_mask;
|
||||
|
@ -1070,6 +1086,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
|
|||
flush |= mmu_spte_update(sptep, spte);
|
||||
}
|
||||
|
||||
/*
|
||||
* Note, any flush is purely for KVM's correctness, e.g. when dropping
|
||||
* an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier
|
||||
* unmap or dirty logging event doesn't fail to flush. The guest is
|
||||
* responsible for flushing the TLB to ensure any changes in protection
|
||||
* bits are recognized, i.e. until the guest flushes or page faults on
|
||||
* a relevant address, KVM is architecturally allowed to let vCPUs use
|
||||
* cached translations with the old protection bits.
|
||||
*/
|
||||
return flush;
|
||||
}
|
||||
|
||||
|
@ -1084,7 +1109,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
|
|||
#undef PT_MAX_FULL_LEVELS
|
||||
#undef gpte_to_gfn
|
||||
#undef gpte_to_gfn_lvl
|
||||
#undef CMPXCHG
|
||||
#undef PT_GUEST_ACCESSED_MASK
|
||||
#undef PT_GUEST_DIRTY_MASK
|
||||
#undef PT_GUEST_DIRTY_SHIFT
|
||||
|
|
|
@ -33,6 +33,7 @@ u64 __read_mostly shadow_mmio_value;
|
|||
u64 __read_mostly shadow_mmio_mask;
|
||||
u64 __read_mostly shadow_mmio_access_mask;
|
||||
u64 __read_mostly shadow_present_mask;
|
||||
u64 __read_mostly shadow_memtype_mask;
|
||||
u64 __read_mostly shadow_me_value;
|
||||
u64 __read_mostly shadow_me_mask;
|
||||
u64 __read_mostly shadow_acc_track_mask;
|
||||
|
@ -129,6 +130,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
|
|||
u64 spte = SPTE_MMU_PRESENT_MASK;
|
||||
bool wrprot = false;
|
||||
|
||||
WARN_ON_ONCE(!pte_access && !shadow_present_mask);
|
||||
|
||||
if (sp->role.ad_disabled)
|
||||
spte |= SPTE_TDP_AD_DISABLED_MASK;
|
||||
else if (kvm_mmu_page_ad_need_write_protect(sp))
|
||||
|
@ -145,7 +148,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
|
|||
spte |= spte_shadow_accessed_mask(spte);
|
||||
|
||||
if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
|
||||
is_nx_huge_page_enabled()) {
|
||||
is_nx_huge_page_enabled(vcpu->kvm)) {
|
||||
pte_access &= ~ACC_EXEC_MASK;
|
||||
}
|
||||
|
||||
|
@ -159,10 +162,10 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
|
|||
|
||||
if (level > PG_LEVEL_4K)
|
||||
spte |= PT_PAGE_SIZE_MASK;
|
||||
if (tdp_enabled)
|
||||
spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn,
|
||||
kvm_is_mmio_pfn(pfn));
|
||||
|
||||
if (shadow_memtype_mask)
|
||||
spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn,
|
||||
kvm_is_mmio_pfn(pfn));
|
||||
if (host_writable)
|
||||
spte |= shadow_host_writable_mask;
|
||||
else
|
||||
|
@ -244,10 +247,10 @@ static u64 make_spte_executable(u64 spte)
|
|||
* This is used during huge page splitting to build the SPTEs that make up the
|
||||
* new page table.
|
||||
*/
|
||||
u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index)
|
||||
u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page_role role,
|
||||
int index)
|
||||
{
|
||||
u64 child_spte;
|
||||
int child_level;
|
||||
|
||||
if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte)))
|
||||
return 0;
|
||||
|
@ -256,23 +259,23 @@ u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index)
|
|||
return 0;
|
||||
|
||||
child_spte = huge_spte;
|
||||
child_level = huge_level - 1;
|
||||
|
||||
/*
|
||||
* The child_spte already has the base address of the huge page being
|
||||
* split. So we just have to OR in the offset to the page at the next
|
||||
* lower level for the given index.
|
||||
*/
|
||||
child_spte |= (index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT;
|
||||
child_spte |= (index * KVM_PAGES_PER_HPAGE(role.level)) << PAGE_SHIFT;
|
||||
|
||||
if (child_level == PG_LEVEL_4K) {
|
||||
if (role.level == PG_LEVEL_4K) {
|
||||
child_spte &= ~PT_PAGE_SIZE_MASK;
|
||||
|
||||
/*
|
||||
* When splitting to a 4K page, mark the page executable as the
|
||||
* NX hugepage mitigation no longer applies.
|
||||
* When splitting to a 4K page where execution is allowed, mark
|
||||
* the page executable as the NX hugepage mitigation no longer
|
||||
* applies.
|
||||
*/
|
||||
if (is_nx_huge_page_enabled())
|
||||
if ((role.access & ACC_EXEC_MASK) && is_nx_huge_page_enabled(kvm))
|
||||
child_spte = make_spte_executable(child_spte);
|
||||
}
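Worked example of the offset math above: splitting a 1 GiB SPTE into 2 MiB children (role.level == PG_LEVEL_2M, KVM_PAGES_PER_HPAGE(level) == 512), child index i adds i * 512 pages, i.e. i * 2 MiB; splitting 2 MiB into 4 KiB children adds a single page per index. A standalone sketch of just that computation, with the constants restated and assumed to match x86:

#include <stdint.h>

#define PAGE_SHIFT	12
#define PG_LEVEL_4K	1
/* Pages covered by one entry at @level: 4K -> 1, 2M -> 512, 1G -> 512 * 512. */
#define KVM_PAGES_PER_HPAGE(level)	(1ULL << (((level) - 1) * 9))

/* Byte offset, within the huge page, of child @index at @child_level. */
static inline uint64_t split_child_offset(int child_level, int index)
{
	return ((uint64_t)index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT;
}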
@ -299,7 +302,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
|
|||
{
|
||||
u64 new_spte;
|
||||
|
||||
new_spte = old_spte & ~PT64_BASE_ADDR_MASK;
|
||||
new_spte = old_spte & ~SPTE_BASE_ADDR_MASK;
|
||||
new_spte |= (u64)new_pfn << PAGE_SHIFT;
|
||||
|
||||
new_spte &= ~PT_WRITABLE_MASK;
|
||||
|
@ -389,6 +392,13 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
|
|||
shadow_nx_mask = 0ull;
|
||||
shadow_x_mask = VMX_EPT_EXECUTABLE_MASK;
|
||||
shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK;
|
||||
/*
|
||||
* EPT overrides the host MTRRs, and so KVM must program the desired
|
||||
* memtype directly into the SPTEs. Note, this mask is just the mask
|
||||
* of all bits that factor into the memtype, the actual memtype must be
|
||||
* dynamically calculated, e.g. to ensure host MMIO is mapped UC.
|
||||
*/
|
||||
shadow_memtype_mask = VMX_EPT_MT_MASK | VMX_EPT_IPAT_BIT;
|
||||
shadow_acc_track_mask = VMX_EPT_RWX_MASK;
|
||||
shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
|
||||
shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE;
|
||||
|
@ -439,6 +449,13 @@ void kvm_mmu_reset_all_pte_masks(void)
|
|||
shadow_nx_mask = PT64_NX_MASK;
|
||||
shadow_x_mask = 0;
|
||||
shadow_present_mask = PT_PRESENT_MASK;
|
||||
|
||||
/*
|
||||
* For shadow paging and NPT, KVM uses PAT entry '0' to encode WB
|
||||
* memtype in the SPTEs, i.e. relies on host MTRRs to provide the
|
||||
* correct memtype (WB is the "weakest" memtype).
|
||||
*/
|
||||
shadow_memtype_mask = 0;
|
||||
shadow_acc_track_mask = 0;
|
||||
shadow_me_mask = 0;
|
||||
shadow_me_value = 0;
|
||||
|
|
|
@ -36,12 +36,12 @@ extern bool __read_mostly enable_mmio_caching;
|
|||
static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
|
||||
|
||||
#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
|
||||
#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
|
||||
#define SPTE_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
|
||||
#else
|
||||
#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
|
||||
#define SPTE_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
|
||||
#endif
|
||||
|
||||
#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
|
||||
#define SPTE_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
|
||||
| shadow_x_mask | shadow_nx_mask | shadow_me_mask)
|
||||
|
||||
#define ACC_EXEC_MASK 1
|
||||
|
@ -50,17 +50,13 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
|
|||
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
|
||||
|
||||
/* The mask for the R/X bits in EPT PTEs */
|
||||
#define PT64_EPT_READABLE_MASK 0x1ull
|
||||
#define PT64_EPT_EXECUTABLE_MASK 0x4ull
|
||||
#define SPTE_EPT_READABLE_MASK 0x1ull
|
||||
#define SPTE_EPT_EXECUTABLE_MASK 0x4ull
|
||||
|
||||
#define PT64_LEVEL_BITS 9
|
||||
|
||||
#define PT64_LEVEL_SHIFT(level) \
|
||||
(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
|
||||
|
||||
#define PT64_INDEX(address, level)\
|
||||
(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
|
||||
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
|
||||
#define SPTE_LEVEL_BITS 9
|
||||
#define SPTE_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, SPTE_LEVEL_BITS)
|
||||
#define SPTE_INDEX(address, level) __PT_INDEX(address, level, SPTE_LEVEL_BITS)
|
||||
#define SPTE_ENT_PER_PAGE __PT_ENT_PER_PAGE(SPTE_LEVEL_BITS)
|
||||
|
||||
/*
|
||||
* The mask/shift to use for saving the original R/X bits when marking the PTE
|
||||
|
@ -69,8 +65,8 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
|
|||
* restored only when a write is attempted to the page. This mask obviously
|
||||
* must not overlap the A/D type mask.
|
||||
*/
|
||||
#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \
|
||||
PT64_EPT_EXECUTABLE_MASK)
|
||||
#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (SPTE_EPT_READABLE_MASK | \
|
||||
SPTE_EPT_EXECUTABLE_MASK)
|
||||
#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54
|
||||
#define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \
|
||||
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
|
||||
|
@ -151,6 +147,7 @@ extern u64 __read_mostly shadow_mmio_value;
|
|||
extern u64 __read_mostly shadow_mmio_mask;
|
||||
extern u64 __read_mostly shadow_mmio_access_mask;
|
||||
extern u64 __read_mostly shadow_present_mask;
|
||||
extern u64 __read_mostly shadow_memtype_mask;
|
||||
extern u64 __read_mostly shadow_me_value;
|
||||
extern u64 __read_mostly shadow_me_mask;
|
||||
|
||||
|
@@ -194,6 +191,12 @@ static inline bool is_removed_spte(u64 spte)
	return spte == REMOVED_SPTE;
}

/* Get an SPTE's index into its parent's page table (and the spt array). */
static inline int spte_index(u64 *sptep)
{
	return ((unsigned long)sptep / sizeof(*sptep)) & (SPTE_ENT_PER_PAGE - 1);
}

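Worked example of spte_index() above: sp->spt is a page-aligned array of 512 8-byte SPTEs, so dividing the pointer value by 8 and masking with 511 recovers the entry index without needing a pointer back to the owning page. A standalone sketch:

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define SPTE_ENT_PER_PAGE	512

static inline int spte_index(uint64_t *sptep)
{
	return ((uintptr_t)sptep / sizeof(*sptep)) & (SPTE_ENT_PER_PAGE - 1);
}

int main(void)
{
	/* Page-aligned shadow page table, as sp->spt is in KVM. */
	uint64_t *spt = aligned_alloc(4096, 4096);

	assert(spte_index(&spt[0]) == 0);
	assert(spte_index(&spt[137]) == 137);
	free(spt);
	return 0;
}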
/*
|
||||
* In some cases, we need to preserve the GFN of a non-present or reserved
|
||||
* SPTE when we usurp the upper five bits of the physical address space to
|
||||
|
@ -282,7 +285,7 @@ static inline bool is_executable_pte(u64 spte)
|
|||
|
||||
static inline kvm_pfn_t spte_to_pfn(u64 pte)
|
||||
{
|
||||
return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
|
||||
return (pte & SPTE_BASE_ADDR_MASK) >> PAGE_SHIFT;
|
||||
}
|
||||
|
||||
static inline bool is_accessed_spte(u64 spte)
|
||||
|
@ -425,7 +428,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
|
|||
unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
|
||||
u64 old_spte, bool prefetch, bool can_unsync,
|
||||
bool host_writable, u64 *new_spte);
|
||||
u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index);
|
||||
u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte,
|
||||
union kvm_mmu_page_role role, int index);
|
||||
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
|
||||
u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
|
||||
u64 mark_spte_for_access_track(u64 spte);
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
|
||||
{
|
||||
iter->sptep = iter->pt_path[iter->level - 1] +
|
||||
SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
|
||||
SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
|
||||
iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
|
||||
}
|
||||
|
||||
|
@ -116,8 +116,8 @@ static bool try_step_side(struct tdp_iter *iter)
|
|||
* Check if the iterator is already at the end of the current page
|
||||
* table.
|
||||
*/
|
||||
if (SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level) ==
|
||||
(PT64_ENT_PER_PAGE - 1))
|
||||
if (SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level) ==
|
||||
(SPTE_ENT_PER_PAGE - 1))
|
||||
return false;
|
||||
|
||||
iter->gfn += KVM_PAGES_PER_HPAGE(iter->level);
|
||||
|
@ -145,15 +145,6 @@ static bool try_step_up(struct tdp_iter *iter)
|
|||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Step the iterator back up a level in the paging structure. Should only be
|
||||
* used when the iterator is below the root level.
|
||||
*/
|
||||
void tdp_iter_step_up(struct tdp_iter *iter)
|
||||
{
|
||||
WARN_ON(!try_step_up(iter));
|
||||
}
|
||||
|
||||
/*
|
||||
* Step to the next SPTE in a pre-order traversal of the paging structure.
|
||||
* To get to the next SPTE, the iterator either steps down towards the goal
|
||||
|
|
|
@ -114,6 +114,5 @@ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
|
|||
int min_level, gfn_t next_last_level_gfn);
|
||||
void tdp_iter_next(struct tdp_iter *iter);
|
||||
void tdp_iter_restart(struct tdp_iter *iter);
|
||||
void tdp_iter_step_up(struct tdp_iter *iter);
|
||||
|
||||
#endif /* __KVM_X86_MMU_TDP_ITER_H */
|
||||
|
|
|
@ -425,7 +425,7 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
|
|||
|
||||
tdp_mmu_unlink_sp(kvm, sp, shared);
|
||||
|
||||
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
|
||||
for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
|
||||
tdp_ptep_t sptep = pt + i;
|
||||
gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
|
||||
u64 old_spte;
|
||||
|
@ -633,7 +633,6 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
|
|||
u64 new_spte)
|
||||
{
|
||||
u64 *sptep = rcu_dereference(iter->sptep);
|
||||
u64 old_spte;
|
||||
|
||||
/*
|
||||
* The caller is responsible for ensuring the old SPTE is not a REMOVED
|
||||
|
@ -649,17 +648,8 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
|
|||
* Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
|
||||
* does not hold the mmu_lock.
|
||||
*/
|
||||
old_spte = cmpxchg64(sptep, iter->old_spte, new_spte);
|
||||
if (old_spte != iter->old_spte) {
|
||||
/*
|
||||
* The page table entry was modified by a different logical
|
||||
* CPU. Refresh iter->old_spte with the current value so the
|
||||
* caller operates on fresh data, e.g. if it retries
|
||||
* tdp_mmu_set_spte_atomic().
|
||||
*/
|
||||
iter->old_spte = old_spte;
|
||||
if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
|
||||
return -EBUSY;
|
||||
}
|
||||
|
||||
__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
|
||||
new_spte, iter->level, true);
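The simplification above relies on try_cmpxchg64() rewriting the expected value (iter->old_spte) with the current SPTE when the exchange fails, which makes the old manual refresh redundant. A minimal sketch of the same pattern in C11 atomics, purely illustrative rather than the kernel primitive:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/*
 * Returns true and installs @new if *@ptr still equals *@expected;
 * otherwise returns false and rewrites *@expected with the current value,
 * mirroring what try_cmpxchg64() does for iter->old_spte.
 */
static bool try_cmpxchg64_like(_Atomic uint64_t *ptr, uint64_t *expected, uint64_t new)
{
	return atomic_compare_exchange_strong(ptr, expected, new);
}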
@ -934,9 +924,6 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
|
|||
}
|
||||
|
||||
/*
|
||||
* Zap leafs SPTEs for the range of gfns, [start, end). Returns true if SPTEs
|
||||
* have been cleared and a TLB flush is needed before releasing the MMU lock.
|
||||
*
|
||||
* If can_yield is true, will release the MMU lock and reschedule if the
|
||||
* scheduler needs the CPU or there is contention on the MMU lock. If this
|
||||
* function cannot yield, it will not release the MMU lock or reschedule and
|
||||
|
@ -979,10 +966,9 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
|
|||
}
|
||||
|
||||
/*
|
||||
* Tears down the mappings for the range of gfns, [start, end), and frees the
|
||||
* non-root pages mapping GFNs strictly within that range. Returns true if
|
||||
* SPTEs have been cleared and a TLB flush is needed before releasing the
|
||||
* MMU lock.
|
||||
* Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
|
||||
* true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
|
||||
* more SPTEs were zapped since the MMU lock was last acquired.
|
||||
*/
|
||||
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
|
||||
bool can_yield, bool flush)
|
||||
|
@ -1487,8 +1473,8 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
|
|||
* No need for atomics when writing to sp->spt since the page table has
|
||||
* not been linked in yet and thus is not reachable from any other CPU.
|
||||
*/
|
||||
for (i = 0; i < PT64_ENT_PER_PAGE; i++)
|
||||
sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i);
|
||||
for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
|
||||
sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);
|
||||
|
||||
/*
|
||||
* Replace the huge spte with a pointer to the populated lower level
|
||||
|
@ -1507,7 +1493,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
|
|||
* are overwriting from the page stats. But we have to manually update
|
||||
* the page stats with the new present child pages.
|
||||
*/
|
||||
kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE);
|
||||
kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
|
||||
|
||||
out:
|
||||
trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
|
||||
|
@ -1731,10 +1717,6 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
|
|||
clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
|
||||
}
|
||||
|
||||
/*
|
||||
* Clear leaf entries which could be replaced by large mappings, for
|
||||
* GFNs within the slot.
|
||||
*/
|
||||
static void zap_collapsible_spte_range(struct kvm *kvm,
|
||||
struct kvm_mmu_page *root,
|
||||
const struct kvm_memory_slot *slot)
|
||||
|
@ -1743,61 +1725,52 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
|
|||
gfn_t end = start + slot->npages;
|
||||
struct tdp_iter iter;
|
||||
int max_mapping_level;
|
||||
kvm_pfn_t pfn;
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
tdp_root_for_each_pte(iter, root, start, end) {
|
||||
for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
|
||||
retry:
|
||||
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
|
||||
continue;
|
||||
|
||||
if (!is_shadow_present_pte(iter.old_spte) ||
|
||||
!is_last_spte(iter.old_spte, iter.level))
|
||||
if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
|
||||
!is_shadow_present_pte(iter.old_spte))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* This is a leaf SPTE. Check if the PFN it maps can
|
||||
* be mapped at a higher level.
|
||||
* Don't zap leaf SPTEs, if a leaf SPTE could be replaced with
|
||||
* a large page size, then its parent would have been zapped
|
||||
* instead of stepping down.
|
||||
*/
|
||||
pfn = spte_to_pfn(iter.old_spte);
|
||||
if (is_last_spte(iter.old_spte, iter.level))
|
||||
continue;
|
||||
|
||||
if (kvm_is_reserved_pfn(pfn))
|
||||
/*
|
||||
* If iter.gfn resides outside of the slot, i.e. the page for
|
||||
* the current level overlaps but is not contained by the slot,
|
||||
* then the SPTE can't be made huge. More importantly, trying
|
||||
* to query that info from slot->arch.lpage_info will cause an
|
||||
* out-of-bounds access.
|
||||
*/
|
||||
if (iter.gfn < start || iter.gfn >= end)
|
||||
continue;
|
||||
|
||||
max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
|
||||
iter.gfn, pfn, PG_LEVEL_NUM);
|
||||
|
||||
WARN_ON(max_mapping_level < iter.level);
|
||||
|
||||
/*
|
||||
* If this page is already mapped at the highest
|
||||
* viable level, there's nothing more to do.
|
||||
*/
|
||||
if (max_mapping_level == iter.level)
|
||||
iter.gfn, PG_LEVEL_NUM);
|
||||
if (max_mapping_level < iter.level)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* The page can be remapped at a higher level, so step
|
||||
* up to zap the parent SPTE.
|
||||
*/
|
||||
while (max_mapping_level > iter.level)
|
||||
tdp_iter_step_up(&iter);
|
||||
|
||||
/* Note, a successful atomic zap also does a remote TLB flush. */
|
||||
tdp_mmu_zap_spte_atomic(kvm, &iter);
|
||||
|
||||
/*
|
||||
* If the atomic zap fails, the iter will recurse back into
|
||||
* the same subtree to retry.
|
||||
*/
|
||||
if (tdp_mmu_zap_spte_atomic(kvm, &iter))
|
||||
goto retry;
|
||||
}
|
||||
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
/*
|
||||
* Clear non-leaf entries (and free associated page tables) which could
|
||||
* be replaced by large mappings, for GFNs within the slot.
|
||||
* Zap non-leaf SPTEs (and free their associated page tables) which could
|
||||
* be replaced by huge pages, for GFNs within the slot.
|
||||
*/
|
||||
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
|
||||
const struct kvm_memory_slot *slot)
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
#include <linux/bsearch.h>
|
||||
#include <linux/sort.h>
|
||||
#include <asm/perf_event.h>
|
||||
#include <asm/cpu_device_id.h>
|
||||
#include "x86.h"
|
||||
#include "cpuid.h"
|
||||
#include "lapic.h"
|
||||
|
@ -24,6 +25,15 @@
|
|||
/* This is enough to filter the vast majority of currently defined events. */
|
||||
#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300
|
||||
|
||||
struct x86_pmu_capability __read_mostly kvm_pmu_cap;
|
||||
EXPORT_SYMBOL_GPL(kvm_pmu_cap);
|
||||
|
||||
static const struct x86_cpu_id vmx_icl_pebs_cpu[] = {
|
||||
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
|
||||
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL),
|
||||
{}
|
||||
};
|
||||
|
||||
/* NOTE:
|
||||
* - Each perf counter is defined as "struct kvm_pmc";
|
||||
* - There are two types of perf counters: general purpose (gp) and fixed.
|
||||
|
@ -34,7 +44,9 @@
|
|||
* However AMD doesn't support fixed-counters;
|
||||
* - There are three types of index to access perf counters (PMC):
|
||||
* 1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
|
||||
* has MSR_K7_PERFCTRn.
|
||||
* has MSR_K7_PERFCTRn and, for families 15H and later,
|
||||
* MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are
|
||||
* aliased to MSR_K7_PERFCTRn.
|
||||
* 2. MSR Index (named idx): This normally is used by RDPMC instruction.
|
||||
* For instance AMD RDPMC instruction uses 0000_0003h in ECX to access
|
||||
* C001_0007h (MSR_K7_PERCTR3). Intel has a similar mechanism, except
|
||||
|
@ -46,7 +58,8 @@
|
|||
* between pmc and perf counters is as the following:
|
||||
* * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters
|
||||
* [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
|
||||
* * AMD: [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters
|
||||
* * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
|
||||
* and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
|
||||
*/
|
||||
|
||||
static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;
|
||||
|
@ -86,15 +99,22 @@ static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
|
|||
static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
|
||||
{
|
||||
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
|
||||
bool skip_pmi = false;
|
||||
|
||||
/* Ignore counters that have been reprogrammed already. */
|
||||
if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi))
|
||||
return;
|
||||
|
||||
__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
|
||||
if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
|
||||
/* Indicate PEBS overflow PMI to guest. */
|
||||
skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
|
||||
(unsigned long *)&pmu->global_status);
|
||||
} else {
|
||||
__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
|
||||
}
|
||||
kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
|
||||
|
||||
if (!pmc->intr)
|
||||
if (!pmc->intr || skip_pmi)
|
||||
return;
|
||||
|
||||
/*
|
||||
|
@ -124,6 +144,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
|
|||
u64 config, bool exclude_user,
|
||||
bool exclude_kernel, bool intr)
|
||||
{
|
||||
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
|
||||
struct perf_event *event;
|
||||
struct perf_event_attr attr = {
|
||||
.type = type,
|
||||
|
@ -135,9 +156,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
|
|||
.exclude_kernel = exclude_kernel,
|
||||
.config = config,
|
||||
};
|
||||
|
||||
if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX)
|
||||
return;
|
||||
bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);
|
||||
|
||||
attr.sample_period = get_sample_period(pmc, pmc->counter);
|
||||
|
||||
|
@ -150,6 +169,25 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
|
|||
*/
|
||||
attr.sample_period = 0;
|
||||
}
|
||||
if (pebs) {
|
||||
/*
|
||||
* The non-zero precision level of guest event makes the ordinary
|
||||
* guest event becomes a guest PEBS event and triggers the host
|
||||
* PEBS PMI handler to determine whether the PEBS overflow PMI
|
||||
* comes from the host counters or the guest.
|
||||
*
|
||||
* For most PEBS hardware events, the difference in the software
|
||||
* precision levels of guest and host PEBS events will not affect
|
||||
* the accuracy of the PEBS profiling result, because the "event IP"
|
||||
* in the PEBS record is calibrated on the guest side.
|
||||
*
|
||||
* On Icelake everything is fine. Other hardware (GLC+, TNT+) that
|
||||
* could possibly care here is unsupported and needs changes.
|
||||
*/
|
||||
attr.precise_ip = 1;
|
||||
if (x86_match_cpu(vmx_icl_pebs_cpu) && pmc->idx == 32)
|
||||
attr.precise_ip = 3;
|
||||
}
|
||||
|
||||
event = perf_event_create_kernel_counter(&attr, -1, current,
|
||||
kvm_perf_overflow, pmc);
|
||||
|
@ -163,7 +201,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
|
|||
pmc_to_pmu(pmc)->event_count++;
|
||||
clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
|
||||
pmc->is_paused = false;
|
||||
pmc->intr = intr;
|
||||
pmc->intr = intr || pebs;
|
||||
}
|
||||
|
||||
static void pmc_pause_counter(struct kvm_pmc *pmc)
|
||||
|
@ -189,6 +227,10 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
|
|||
get_sample_period(pmc, pmc->counter)))
|
||||
return false;
|
||||
|
||||
if (!test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) &&
|
||||
pmc->perf_event->attr.precise_ip)
|
||||
return false;
|
||||
|
||||
/* reuse perf_event to serve as pmc_reprogram_counter() does*/
|
||||
perf_event_enable(pmc->perf_event);
|
||||
pmc->is_paused = false;
|
||||
|
@ -205,115 +247,83 @@ static int cmp_u64(const void *pa, const void *pb)
|
|||
return (a > b) - (a < b);
|
||||
}
|
||||
|
||||
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
|
||||
static bool check_pmu_event_filter(struct kvm_pmc *pmc)
|
||||
{
|
||||
u64 config;
|
||||
u32 type = PERF_TYPE_RAW;
|
||||
struct kvm *kvm = pmc->vcpu->kvm;
|
||||
struct kvm_pmu_event_filter *filter;
|
||||
struct kvm_pmu *pmu = vcpu_to_pmu(pmc->vcpu);
|
||||
struct kvm *kvm = pmc->vcpu->kvm;
|
||||
bool allow_event = true;
|
||||
__u64 key;
|
||||
int idx;
|
||||
|
||||
if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
|
||||
printk_once("kvm pmu: pin control bit is ignored\n");
|
||||
|
||||
pmc->eventsel = eventsel;
|
||||
|
||||
pmc_pause_counter(pmc);
|
||||
|
||||
if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc))
|
||||
return;
|
||||
if (!static_call(kvm_x86_pmu_hw_event_available)(pmc))
|
||||
return false;
|
||||
|
||||
filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
|
||||
if (filter) {
|
||||
__u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;
|
||||
if (!filter)
|
||||
goto out;
|
||||
|
||||
if (pmc_is_gp(pmc)) {
|
||||
key = pmc->eventsel & AMD64_RAW_EVENT_MASK_NB;
|
||||
if (bsearch(&key, filter->events, filter->nevents,
|
||||
sizeof(__u64), cmp_u64))
|
||||
allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
|
||||
else
|
||||
allow_event = filter->action == KVM_PMU_EVENT_DENY;
|
||||
}
|
||||
if (!allow_event)
|
||||
return;
|
||||
|
||||
if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
|
||||
ARCH_PERFMON_EVENTSEL_INV |
|
||||
ARCH_PERFMON_EVENTSEL_CMASK |
|
||||
HSW_IN_TX |
|
||||
HSW_IN_TX_CHECKPOINTED))) {
|
||||
config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc);
|
||||
if (config != PERF_COUNT_HW_MAX)
|
||||
type = PERF_TYPE_HARDWARE;
|
||||
} else {
|
||||
idx = pmc->idx - INTEL_PMC_IDX_FIXED;
|
||||
if (filter->action == KVM_PMU_EVENT_DENY &&
|
||||
test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
|
||||
allow_event = false;
|
||||
if (filter->action == KVM_PMU_EVENT_ALLOW &&
|
||||
!test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
|
||||
allow_event = false;
|
||||
}
|
||||
|
||||
if (type == PERF_TYPE_RAW)
|
||||
config = eventsel & pmu->raw_event_mask;
|
||||
|
||||
if (pmc->current_config == eventsel && pmc_resume_counter(pmc))
|
||||
return;
|
||||
|
||||
pmc_release_perf_event(pmc);
|
||||
|
||||
pmc->current_config = eventsel;
|
||||
pmc_reprogram_counter(pmc, type, config,
|
||||
!(eventsel & ARCH_PERFMON_EVENTSEL_USR),
|
||||
!(eventsel & ARCH_PERFMON_EVENTSEL_OS),
|
||||
eventsel & ARCH_PERFMON_EVENTSEL_INT);
|
||||
out:
|
||||
return allow_event;
|
||||
}
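A condensed sketch of the decision the new helper makes for general-purpose counters: the key masks the event select and unit mask (as AMD64_RAW_EVENT_MASK_NB does), the events array is assumed to be sorted by userspace, and the action decides how a hit or miss is interpreted. Names are simplified for illustration:

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

#define EVENT_ALLOW	0
#define EVENT_DENY	1

static int cmp_u64(const void *pa, const void *pb)
{
	uint64_t a = *(const uint64_t *)pa, b = *(const uint64_t *)pb;

	return (a > b) - (a < b);
}

/* Allow the counter iff the (sorted) filter list and its action agree. */
static bool gp_event_allowed(uint64_t key, const uint64_t *events,
			     size_t nevents, int action)
{
	bool found = bsearch(&key, events, nevents, sizeof(key), cmp_u64);

	return found ? action == EVENT_ALLOW : action == EVENT_DENY;
}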
EXPORT_SYMBOL_GPL(reprogram_gp_counter);
|
||||
|
||||
void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
|
||||
void reprogram_counter(struct kvm_pmc *pmc)
|
||||
{
|
||||
unsigned en_field = ctrl & 0x3;
|
||||
bool pmi = ctrl & 0x8;
|
||||
struct kvm_pmu_event_filter *filter;
|
||||
struct kvm *kvm = pmc->vcpu->kvm;
|
||||
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
|
||||
u64 eventsel = pmc->eventsel;
|
||||
u64 new_config = eventsel;
|
||||
u8 fixed_ctr_ctrl;
|
||||
|
||||
pmc_pause_counter(pmc);
|
||||
|
||||
if (!en_field || !pmc_is_enabled(pmc))
|
||||
if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc))
|
||||
return;
|
||||
|
||||
filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
|
||||
if (filter) {
|
||||
if (filter->action == KVM_PMU_EVENT_DENY &&
|
||||
test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
|
||||
return;
|
||||
if (filter->action == KVM_PMU_EVENT_ALLOW &&
|
||||
!test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
|
||||
return;
|
||||
if (!check_pmu_event_filter(pmc))
|
||||
return;
|
||||
|
||||
if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
|
||||
printk_once("kvm pmu: pin control bit is ignored\n");
|
||||
|
||||
if (pmc_is_fixed(pmc)) {
|
||||
fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
|
||||
pmc->idx - INTEL_PMC_IDX_FIXED);
|
||||
if (fixed_ctr_ctrl & 0x1)
|
||||
eventsel |= ARCH_PERFMON_EVENTSEL_OS;
|
||||
if (fixed_ctr_ctrl & 0x2)
|
||||
eventsel |= ARCH_PERFMON_EVENTSEL_USR;
|
||||
if (fixed_ctr_ctrl & 0x8)
|
||||
eventsel |= ARCH_PERFMON_EVENTSEL_INT;
|
||||
new_config = (u64)fixed_ctr_ctrl;
|
||||
}
|
||||
|
||||
if (pmc->current_config == (u64)ctrl && pmc_resume_counter(pmc))
|
||||
if (pmc->current_config == new_config && pmc_resume_counter(pmc))
|
||||
return;
|
||||
|
||||
pmc_release_perf_event(pmc);
|
||||
|
||||
pmc->current_config = (u64)ctrl;
|
||||
pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
|
||||
static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc),
|
||||
!(en_field & 0x2), /* exclude user */
|
||||
!(en_field & 0x1), /* exclude kernel */
|
||||
pmi);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(reprogram_fixed_counter);
|
||||
|
||||
void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
|
||||
{
|
||||
struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, pmc_idx);
|
||||
|
||||
if (!pmc)
|
||||
return;
|
||||
|
||||
if (pmc_is_gp(pmc))
|
||||
reprogram_gp_counter(pmc, pmc->eventsel);
|
||||
else {
|
||||
int idx = pmc_idx - INTEL_PMC_IDX_FIXED;
|
||||
u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx);
|
||||
|
||||
reprogram_fixed_counter(pmc, ctrl, idx);
|
||||
}
|
||||
pmc->current_config = new_config;
|
||||
pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
|
||||
(eventsel & pmu->raw_event_mask),
|
||||
!(eventsel & ARCH_PERFMON_EVENTSEL_USR),
|
||||
!(eventsel & ARCH_PERFMON_EVENTSEL_OS),
|
||||
eventsel & ARCH_PERFMON_EVENTSEL_INT);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(reprogram_counter);
|
||||
|
||||
|
@ -329,8 +339,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
|
|||
clear_bit(bit, pmu->reprogram_pmi);
|
||||
continue;
|
||||
}
|
||||
|
||||
reprogram_counter(pmu, bit);
|
||||
reprogram_counter(pmc);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -471,17 +480,6 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
|
|||
kvm_pmu_refresh(vcpu);
|
||||
}
|
||||
|
||||
static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
|
||||
{
|
||||
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
|
||||
|
||||
if (pmc_is_fixed(pmc))
|
||||
return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
|
||||
pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3;
|
||||
|
||||
return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
|
||||
}
|
||||
|
||||
/* Release perf_events for vPMCs that have been unused for a full time slice. */
|
||||
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
|
@ -514,13 +512,12 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
|
|||
|
||||
static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
|
||||
{
|
||||
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
|
||||
u64 prev_count;
|
||||
|
||||
prev_count = pmc->counter;
|
||||
pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);
|
||||
|
||||
reprogram_counter(pmu, pmc->idx);
|
||||
reprogram_counter(pmc);
|
||||
if (pmc->counter < prev_count)
|
||||
__kvm_perf_overflow(pmc, false);
|
||||
}
|
||||
|
@ -528,13 +525,8 @@ static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
|
|||
static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
|
||||
unsigned int perf_hw_id)
|
||||
{
|
||||
u64 old_eventsel = pmc->eventsel;
|
||||
unsigned int config;
|
||||
|
||||
pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK);
|
||||
config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc);
|
||||
pmc->eventsel = old_eventsel;
|
||||
return config == perf_hw_id;
|
||||
return !((pmc->eventsel ^ perf_get_hw_event_config(perf_hw_id)) &
|
||||
AMD64_RAW_EVENT_MASK_NB);
|
||||
}
|
||||
|
||||
static inline bool cpl_is_matched(struct kvm_pmc *pmc)
|
||||
|
|
|
@ -8,6 +8,9 @@
|
|||
#define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu))
|
||||
#define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu)
|
||||
|
||||
#define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \
|
||||
MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)
|
||||
|
||||
/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
|
||||
#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf)
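Worked example: with IA32_FIXED_CTR_CTRL == 0xb0, fixed counter 0's field is 0x0 (disabled) and fixed counter 1's field is 0xb, i.e. count in ring 0 (bit 0), count in ring 3 (bit 1) and raise a PMI on overflow (bit 3), which is exactly what the eventsel translation earlier in this series maps to OS/USR/INT:

#include <assert.h>
#include <stdint.h>

#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx) * 4)) & 0xf)

int main(void)
{
	uint64_t fixed_ctr_ctrl = 0xb0;	/* counter 0 off, counter 1 = 0xb */

	assert(fixed_ctrl_field(fixed_ctr_ctrl, 0) == 0x0);
	/* 0xb: bit 0 = ring-0, bit 1 = ring-3, bit 3 = PMI on overflow. */
	assert(fixed_ctrl_field(fixed_ctr_ctrl, 1) == 0xb);
	return 0;
}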
@ -22,7 +25,7 @@ struct kvm_event_hw_type_mapping {
|
|||
};
|
||||
|
||||
struct kvm_pmu_ops {
|
||||
unsigned int (*pmc_perf_hw_id)(struct kvm_pmc *pmc);
|
||||
bool (*hw_event_available)(struct kvm_pmc *pmc);
|
||||
bool (*pmc_is_enabled)(struct kvm_pmc *pmc);
|
||||
struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx);
|
||||
struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu,
|
||||
|
@ -144,9 +147,43 @@ static inline void pmc_update_sample_period(struct kvm_pmc *pmc)
|
|||
get_sample_period(pmc, pmc->counter));
|
||||
}
|
||||
|
||||
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel);
|
||||
void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx);
|
||||
void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx);
|
||||
static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
|
||||
{
|
||||
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
|
||||
|
||||
if (pmc_is_fixed(pmc))
|
||||
return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
|
||||
pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3;
|
||||
|
||||
return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
|
||||
}
|
||||
|
||||
extern struct x86_pmu_capability kvm_pmu_cap;
|
||||
|
||||
static inline void kvm_init_pmu_capability(void)
|
||||
{
|
||||
bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL;
|
||||
|
||||
perf_get_x86_pmu_capability(&kvm_pmu_cap);
|
||||
|
||||
/*
|
||||
* For Intel, only support guest architectural pmu
|
||||
* on a host with architectural pmu.
|
||||
*/
|
||||
if ((is_intel && !kvm_pmu_cap.version) || !kvm_pmu_cap.num_counters_gp)
|
||||
enable_pmu = false;
|
||||
|
||||
if (!enable_pmu) {
|
||||
memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap));
|
||||
return;
|
||||
}
|
||||
|
||||
kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2);
|
||||
kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed,
|
||||
KVM_PMC_MAX_FIXED);
|
||||
}
|
||||
|
||||
void reprogram_counter(struct kvm_pmc *pmc);
|
||||
|
||||
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
|
||||
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
|
||||
|
|
|
@ -40,6 +40,9 @@
|
|||
#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
|
||||
#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK)
|
||||
|
||||
static bool force_avic;
|
||||
module_param_unsafe(force_avic, bool, 0444);
|
||||
|
||||
/* Note:
|
||||
* This hash table is used to map VM_ID to a struct kvm_svm,
|
||||
* when handling AMD IOMMU GALOG notification to schedule in
|
||||
|
@ -50,6 +53,7 @@ static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
|
|||
static u32 next_vm_id = 0;
|
||||
static bool next_vm_id_wrapped = 0;
|
||||
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
|
||||
enum avic_modes avic_mode;
|
||||
|
||||
/*
|
||||
* This is a wrapper of struct amd_iommu_ir_data.
|
||||
|
@ -59,6 +63,54 @@ struct amd_svm_iommu_ir {
|
|||
void *data; /* Storing pointer to struct amd_ir_data */
|
||||
};
|
||||
|
||||
static void avic_activate_vmcb(struct vcpu_svm *svm)
|
||||
{
|
||||
struct vmcb *vmcb = svm->vmcb01.ptr;
|
||||
|
||||
vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
|
||||
vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
|
||||
|
||||
vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
|
||||
|
||||
	/* Note:
	 * KVM can support hybrid-AVIC mode, where KVM emulates x2APIC
	 * MSR accesses, while interrupt injection to a running vCPU
	 * can be achieved using AVIC doorbell. The AVIC hardware still
	 * accelerates MMIO accesses, but this does not cause any harm
	 * as the guest is not supposed to access xAPIC MMIO when it uses x2APIC.
	 */
|
||||
if (apic_x2apic_mode(svm->vcpu.arch.apic) &&
|
||||
avic_mode == AVIC_MODE_X2) {
|
||||
vmcb->control.int_ctl |= X2APIC_MODE_MASK;
|
||||
vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID;
|
||||
/* Disabling MSR intercept for x2APIC registers */
|
||||
svm_set_x2apic_msr_interception(svm, false);
|
||||
} else {
|
||||
/* For xAVIC and hybrid-xAVIC modes */
|
||||
vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
|
||||
/* Enabling MSR intercept for x2APIC registers */
|
||||
svm_set_x2apic_msr_interception(svm, true);
|
||||
}
|
||||
}
|
||||
|
||||
static void avic_deactivate_vmcb(struct vcpu_svm *svm)
|
||||
{
|
||||
struct vmcb *vmcb = svm->vmcb01.ptr;
|
||||
|
||||
vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
|
||||
vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
|
||||
|
||||
/*
|
||||
* If running nested and the guest uses its own MSR bitmap, there
|
||||
* is no need to update L0's msr bitmap
|
||||
*/
|
||||
if (is_guest_mode(&svm->vcpu) &&
|
||||
vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))
|
||||
return;
|
||||
|
||||
/* Enabling MSR intercept for x2APIC registers */
|
||||
svm_set_x2apic_msr_interception(svm, true);
|
||||
}
|
||||
|
||||
/* Note:
|
||||
* This function is called from IOMMU driver to notify
|
||||
|
@ -175,13 +227,12 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
|
|||
vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
|
||||
vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
|
||||
vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
|
||||
vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
|
||||
vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK;
|
||||
|
||||
if (kvm_apicv_activated(svm->vcpu.kvm))
|
||||
vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
|
||||
avic_activate_vmcb(svm);
|
||||
else
|
||||
vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
|
||||
avic_deactivate_vmcb(svm);
|
||||
}
|
||||
|
||||
static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
|
||||
|
@ -190,7 +241,8 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
|
|||
u64 *avic_physical_id_table;
|
||||
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
|
||||
|
||||
if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
|
||||
if ((avic_mode == AVIC_MODE_X1 && index > AVIC_MAX_PHYSICAL_ID) ||
|
||||
(avic_mode == AVIC_MODE_X2 && index > X2AVIC_MAX_PHYSICAL_ID))
|
||||
return NULL;
|
||||
|
||||
avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
|
||||
|
@ -237,7 +289,8 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
|
|||
int id = vcpu->vcpu_id;
|
||||
struct vcpu_svm *svm = to_svm(vcpu);
|
||||
|
||||
if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
|
||||
if ((avic_mode == AVIC_MODE_X1 && id > AVIC_MAX_PHYSICAL_ID) ||
|
||||
(avic_mode == AVIC_MODE_X2 && id > X2AVIC_MAX_PHYSICAL_ID))
|
||||
return -EINVAL;
|
||||
|
||||
if (!vcpu->arch.apic->regs)
|
||||
|
@ -279,8 +332,10 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu)
|
|||
*/
|
||||
int cpu = READ_ONCE(vcpu->cpu);
|
||||
|
||||
if (cpu != get_cpu())
|
||||
if (cpu != get_cpu()) {
|
||||
wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
|
||||
trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu));
|
||||
}
|
||||
put_cpu();
|
||||
}
|
||||
|
||||
|
@ -303,7 +358,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source
|
|||
if (apic_x2apic_mode(source))
|
||||
dest = icrh;
|
||||
else
|
||||
dest = GET_APIC_DEST_FIELD(icrh);
|
||||
dest = GET_XAPIC_DEST_FIELD(icrh);
|
||||
|
||||
if (dest_mode == APIC_DEST_PHYSICAL) {
|
||||
/* broadcast destination, use slow path */
|
||||
|
@ -345,9 +400,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source
|
|||
|
||||
logid_index = cluster + __ffs(bitmap);
|
||||
|
||||
if (apic_x2apic_mode(source)) {
|
||||
l1_physical_id = logid_index;
|
||||
} else {
|
||||
if (!apic_x2apic_mode(source)) {
|
||||
u32 *avic_logical_id_table =
|
||||
page_address(kvm_svm->avic_logical_id_table_page);
|
||||
|
||||
|
@ -362,6 +415,23 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source
|
|||
|
||||
l1_physical_id = logid_entry &
|
||||
AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
|
||||
} else {
|
||||
/*
|
||||
* For x2APIC logical mode, cannot leverage the index.
|
||||
* Instead, calculate physical ID from logical ID in ICRH.
|
||||
*/
|
||||
int cluster = (icrh & 0xffff0000) >> 16;
|
||||
int apic = ffs(icrh & 0xffff) - 1;
|
||||
|
||||
/*
|
||||
* If the x2APIC logical ID sub-field (i.e. icrh[15:0])
|
||||
* contains anything but a single bit, we cannot use the
|
||||
* fast path, because it is limited to a single vCPU.
|
||||
*/
|
||||
if (apic < 0 || icrh != (1 << apic))
|
||||
return -EINVAL;
|
||||
|
||||
l1_physical_id = (cluster << 4) + apic;
|
||||
}
|
||||
}
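Worked example of the decomposition above: an x2APIC logical ID is (cluster << 16) | (1 << n) with n in [0, 15], so icrh == 0x00000008 yields cluster 0, bit index 3 and a derived physical APIC ID of (0 << 4) + 3 = 3; any value that is not exactly a single low-16 bit makes the fast path bail. A standalone sketch of that math:

#include <assert.h>
#include <stdint.h>
#include <strings.h>	/* ffs() */

/* Derive the physical APIC ID the fast path would use, or -1 to bail. */
static int x2apic_logical_to_physical(uint32_t icrh)
{
	int cluster = (icrh & 0xffff0000) >> 16;
	int apic = ffs(icrh & 0xffff) - 1;

	if (apic < 0 || icrh != (1u << apic))
		return -1;	/* not a single-vCPU destination: slow path */

	return (cluster << 4) + apic;
}

int main(void)
{
	assert(x2apic_logical_to_physical(0x00000008) == 3);
	assert(x2apic_logical_to_physical(0x00000006) == -1);	/* two bits set */
	return 0;
}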
@ -396,9 +466,15 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
|
|||
* since entered the guest will have processed pending IRQs at VMRUN.
|
||||
*/
|
||||
kvm_for_each_vcpu(i, vcpu, kvm) {
|
||||
u32 dest;
|
||||
|
||||
if (apic_x2apic_mode(vcpu->arch.apic))
|
||||
dest = icrh;
|
||||
else
|
||||
dest = GET_XAPIC_DEST_FIELD(icrh);
|
||||
|
||||
if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
|
||||
GET_APIC_DEST_FIELD(icrh),
|
||||
icrl & APIC_DEST_MASK)) {
|
||||
dest, icrl & APIC_DEST_MASK)) {
|
||||
vcpu->arch.apic->irr_pending = true;
|
||||
svm_complete_interrupt_delivery(vcpu,
|
||||
icrl & APIC_MODE_MASK,
|
||||
|
@ -514,8 +590,13 @@ static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
|
|||
{
|
||||
struct vcpu_svm *svm = to_svm(vcpu);
|
||||
bool flat = svm->dfr_reg == APIC_DFR_FLAT;
|
||||
u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
|
||||
u32 *entry;
|
||||
|
||||
/* Note: x2AVIC does not use logical APIC ID table */
|
||||
if (apic_x2apic_mode(vcpu->arch.apic))
|
||||
return;
|
||||
|
||||
entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
|
||||
if (entry)
|
||||
clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
|
||||
}
|
||||
|
@ -527,6 +608,10 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
|
|||
u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
|
||||
u32 id = kvm_xapic_id(vcpu->arch.apic);
|
||||
|
||||
/* AVIC does not support LDR update for x2APIC */
|
||||
if (apic_x2apic_mode(vcpu->arch.apic))
|
||||
return 0;
|
||||
|
||||
if (ldr == svm->ldr_reg)
|
||||
return 0;
|
||||
|
||||
|
@ -654,6 +739,18 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
|
|||
avic_handle_ldr_update(vcpu);
|
||||
}
|
||||
|
||||
void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (!lapic_in_kernel(vcpu) || avic_mode == AVIC_MODE_NONE)
|
||||
return;
|
||||
|
||||
if (kvm_get_apic_mode(vcpu) == LAPIC_MODE_INVALID) {
|
||||
WARN_ONCE(true, "Invalid local APIC state (vcpu_id=%d)", vcpu->vcpu_id);
|
||||
return;
|
||||
}
|
||||
avic_refresh_apicv_exec_ctrl(vcpu);
|
||||
}
|
||||
|
||||
static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
|
||||
{
|
||||
int ret = 0;
|
||||
|
@ -906,7 +1003,6 @@ bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
|
|||
BIT(APICV_INHIBIT_REASON_NESTED) |
|
||||
BIT(APICV_INHIBIT_REASON_IRQWIN) |
|
||||
BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
|
||||
BIT(APICV_INHIBIT_REASON_X2APIC) |
|
||||
BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
|
||||
BIT(APICV_INHIBIT_REASON_SEV) |
|
||||
BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
|
||||
|
@ -968,7 +1064,6 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
|
|||
return;
|
||||
|
||||
entry = READ_ONCE(*(svm->avic_physical_id_cache));
|
||||
WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
|
||||
|
||||
entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
|
||||
entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
|
||||
|
@ -1016,9 +1111,9 @@ void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
|
|||
* accordingly before re-activating.
|
||||
*/
|
||||
avic_apicv_post_state_restore(vcpu);
|
||||
vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
|
||||
avic_activate_vmcb(svm);
|
||||
} else {
|
||||
vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
|
||||
avic_deactivate_vmcb(svm);
|
||||
}
|
||||
vmcb_mark_dirty(vmcb, VMCB_AVIC);
|
||||
|
||||
|
@ -1058,3 +1153,44 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
|
|||
|
||||
avic_vcpu_load(vcpu, vcpu->cpu);
|
||||
}
|
||||
|
||||
/*
|
||||
* Note:
|
||||
* - The module param avic enable both xAPIC and x2APIC mode.
|
||||
* - Hypervisor can support both xAVIC and x2AVIC in the same guest.
|
||||
* - The mode can be switched at run-time.
|
||||
*/
|
||||
bool avic_hardware_setup(struct kvm_x86_ops *x86_ops)
|
||||
{
|
||||
if (!npt_enabled)
|
||||
return false;
|
||||
|
||||
if (boot_cpu_has(X86_FEATURE_AVIC)) {
|
||||
avic_mode = AVIC_MODE_X1;
|
||||
pr_info("AVIC enabled\n");
|
||||
} else if (force_avic) {
|
||||
		/*
		 * Some older systems do not advertise AVIC support.
		 * See the Revision Guide for the specific AMD processor for more detail.
		 */
|
||||
avic_mode = AVIC_MODE_X1;
|
||||
pr_warn("AVIC is not supported in CPUID but force enabled");
|
||||
pr_warn("Your system might crash and burn");
|
||||
}
|
||||
|
||||
/* AVIC is a prerequisite for x2AVIC. */
|
||||
if (boot_cpu_has(X86_FEATURE_X2AVIC)) {
|
||||
if (avic_mode == AVIC_MODE_X1) {
|
||||
avic_mode = AVIC_MODE_X2;
|
||||
pr_info("x2AVIC enabled\n");
|
||||
} else {
|
||||
pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled");
|
||||
pr_warn(FW_BUG "Try enable AVIC using force_avic option");
|
||||
}
|
||||
}
|
||||
|
||||
if (avic_mode != AVIC_MODE_NONE)
|
||||
amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
|
||||
|
||||
return !!avic_mode;
|
||||
}
|
||||
|
|
|
@@ -230,6 +230,11 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
			break;

		p = msrpm_offsets[i];

		/* x2apic msrs are intercepted always for the nested guest */
		if (is_x2apic_msrpm_offset(p))
			continue;

		offset = svm->nested.ctl.msrpm_base_pa + (p * 4);

		if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))

@@ -320,7 +325,8 @@ static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu,
		return false;
	}

	if (CC(!kvm_is_valid_cr4(vcpu, save->cr4)))
	/* Note, SVM doesn't have any additional restrictions on CR4. */
	if (CC(!__kvm_is_valid_cr4(vcpu, save->cr4)))
		return false;

	if (CC(!kvm_valid_efer(vcpu, save->efer)))

@@ -371,6 +377,7 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
	to->nested_ctl          = from->nested_ctl;
	to->event_inj           = from->event_inj;
	to->event_inj_err       = from->event_inj_err;
	to->next_rip            = from->next_rip;
	to->nested_cr3          = from->nested_cr3;
	to->virt_ext            = from->virt_ext;
	to->pause_filter_count  = from->pause_filter_count;

@@ -608,7 +615,33 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
	}
}

static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
static inline bool is_evtinj_soft(u32 evtinj)
{
	u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;
	u8 vector = evtinj & SVM_EVTINJ_VEC_MASK;

	if (!(evtinj & SVM_EVTINJ_VALID))
		return false;

	if (type == SVM_EVTINJ_TYPE_SOFT)
		return true;

	return type == SVM_EVTINJ_TYPE_EXEPT && kvm_exception_is_soft(vector);
}

static bool is_evtinj_nmi(u32 evtinj)
{
	u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;

	if (!(evtinj & SVM_EVTINJ_VALID))
		return false;

	return type == SVM_EVTINJ_TYPE_NMI;
}

static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
					  unsigned long vmcb12_rip,
					  unsigned long vmcb12_csbase)
{
	u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK;
	u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;

@@ -650,7 +683,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)

	vmcb02->control.tsc_offset = vcpu->arch.tsc_offset;

	if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
	if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) {
		WARN_ON(!svm->tsc_scaling_enabled);
		nested_svm_update_tsc_ratio_msr(vcpu);
	}

@@ -664,6 +697,30 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
	vmcb02->control.event_inj = svm->nested.ctl.event_inj;
	vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err;

	/*
	 * next_rip is consumed on VMRUN as the return address pushed on the
	 * stack for injected soft exceptions/interrupts.  If nrips is exposed
	 * to L1, take it verbatim from vmcb12.  If nrips is supported in
	 * hardware but not exposed to L1, stuff the actual L2 RIP to emulate
	 * what a nrips=0 CPU would do (L1 is responsible for advancing RIP
	 * prior to injecting the event).
	 */
	if (svm->nrips_enabled)
		vmcb02->control.next_rip    = svm->nested.ctl.next_rip;
	else if (boot_cpu_has(X86_FEATURE_NRIPS))
		vmcb02->control.next_rip    = vmcb12_rip;

	svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj);
	if (is_evtinj_soft(vmcb02->control.event_inj)) {
		svm->soft_int_injected = true;
		svm->soft_int_csbase = vmcb12_csbase;
		svm->soft_int_old_rip = vmcb12_rip;
		if (svm->nrips_enabled)
			svm->soft_int_next_rip = svm->nested.ctl.next_rip;
		else
			svm->soft_int_next_rip = vmcb12_rip;
	}

	vmcb02->control.virt_ext            = vmcb01->control.virt_ext &
					      LBR_CTL_ENABLE_MASK;
	if (svm->lbrv_enabled)

@@ -745,7 +802,7 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
	nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr);

	svm_switch_vmcb(svm, &svm->nested.vmcb02);
	nested_vmcb02_prepare_control(svm);
	nested_vmcb02_prepare_control(svm, vmcb12->save.rip, vmcb12->save.cs.base);
	nested_vmcb02_prepare_save(svm, vmcb12);

	ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3,

@@ -834,6 +891,8 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)

out_exit_err:
	svm->nested.nested_run_pending = 0;
	svm->nmi_l1_to_l2 = false;
	svm->soft_int_injected = false;

	svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
	svm->vmcb->control.exit_code_hi = 0;

@@ -982,7 +1041,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
		vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
	}

	if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
	if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) {
		WARN_ON(!svm->tsc_scaling_enabled);
		vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
		__svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);

@@ -1421,6 +1480,7 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst,
	dst->nested_ctl           = from->nested_ctl;
	dst->event_inj            = from->event_inj;
	dst->event_inj_err        = from->event_inj_err;
	dst->next_rip             = from->next_rip;
	dst->nested_cr3           = from->nested_cr3;
	dst->virt_ext              = from->virt_ext;
	dst->pause_filter_count   = from->pause_filter_count;

@@ -1605,7 +1665,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
	nested_copy_vmcb_control_to_cache(svm, ctl);

	svm_switch_vmcb(svm, &svm->nested.vmcb02);
	nested_vmcb02_prepare_control(svm);
	nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip, svm->vmcb->save.cs.base);

	/*
	 * While the nested guest CR3 is already checked and set by

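For readers unfamiliar with the EVTINJ encoding used by the new is_evtinj_soft()/is_evtinj_nmi() helpers above, here is a minimal sketch. It is not part of the patch and assumes the standard SVM_EVTINJ_* layout from svm.h (vector in bits 7:0, type in bits 10:8, valid in bit 31):

	/* Hypothetical example: L1 injects INT 0x80, a software interrupt. */
	u32 evtinj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_SOFT | 0x80;

	/* is_evtinj_soft(evtinj) -> true:  valid bit set and type is SOFT   */
	/* is_evtinj_nmi(evtinj)  -> false: type is not SVM_EVTINJ_TYPE_NMI  */
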
@ -33,34 +33,6 @@ enum index {
|
|||
INDEX_ERROR,
|
||||
};
|
||||
|
||||
/* duplicated from amd_perfmon_event_map, K7 and above should work. */
|
||||
static struct kvm_event_hw_type_mapping amd_event_mapping[] = {
|
||||
[0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES },
|
||||
[1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
|
||||
[2] = { 0x7d, 0x07, PERF_COUNT_HW_CACHE_REFERENCES },
|
||||
[3] = { 0x7e, 0x07, PERF_COUNT_HW_CACHE_MISSES },
|
||||
[4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
|
||||
[5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
|
||||
[6] = { 0xd0, 0x00, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
|
||||
[7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
|
||||
};
|
||||
|
||||
/* duplicated from amd_f17h_perfmon_event_map. */
|
||||
static struct kvm_event_hw_type_mapping amd_f17h_event_mapping[] = {
|
||||
[0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES },
|
||||
[1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
|
||||
[2] = { 0x60, 0xff, PERF_COUNT_HW_CACHE_REFERENCES },
|
||||
[3] = { 0x64, 0x09, PERF_COUNT_HW_CACHE_MISSES },
|
||||
[4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
|
||||
[5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
|
||||
[6] = { 0x87, 0x02, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
|
||||
[7] = { 0x87, 0x01, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
|
||||
};
|
||||
|
||||
/* amd_pmc_perf_hw_id depends on these being the same size */
|
||||
static_assert(ARRAY_SIZE(amd_event_mapping) ==
|
||||
ARRAY_SIZE(amd_f17h_event_mapping));
|
||||
|
||||
static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type)
|
||||
{
|
||||
struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
|
||||
|
@@ -154,31 +126,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
	return &pmu->gp_counters[msr_to_index(msr)];
}

static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc)
static bool amd_hw_event_available(struct kvm_pmc *pmc)
{
	struct kvm_event_hw_type_mapping *event_mapping;
	u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
	u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
	int i;

	/* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */
	if (WARN_ON(pmc_is_fixed(pmc)))
		return PERF_COUNT_HW_MAX;

	if (guest_cpuid_family(pmc->vcpu) >= 0x17)
		event_mapping = amd_f17h_event_mapping;
	else
		event_mapping = amd_event_mapping;

	for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++)
		if (event_mapping[i].eventsel == event_select
		    && event_mapping[i].unit_mask == unit_mask)
			break;

	if (i == ARRAY_SIZE(amd_event_mapping))
		return PERF_COUNT_HW_MAX;

	return event_mapping[i].event_type;
	return true;
}

/* check if a PMC is enabled by comparing it against global_ctrl bits. Because

@@ -286,8 +236,10 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
	pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
	if (pmc) {
		data &= ~pmu->reserved_bits;
		if (data != pmc->eventsel)
			reprogram_gp_counter(pmc, data);
		if (data != pmc->eventsel) {
			pmc->eventsel = data;
			reprogram_counter(pmc);
		}
		return 0;
	}

@@ -343,7 +295,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu)
}

struct kvm_pmu_ops amd_pmu_ops __initdata = {
	.pmc_perf_hw_id = amd_pmc_perf_hw_id,
	.hw_event_available = amd_hw_event_available,
	.pmc_is_enabled = amd_pmc_is_enabled,
	.pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
	.rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc,

@ -603,6 +603,9 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
|
|||
save->xss = svm->vcpu.arch.ia32_xss;
|
||||
save->dr6 = svm->vcpu.arch.dr6;
|
||||
|
||||
pr_debug("Virtual Machine Save Area (VMSA):\n");
|
||||
print_hex_dump(KERN_CONT, "", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -1606,38 +1609,35 @@ static int sev_lock_vcpus_for_migration(struct kvm *kvm,
|
|||
{
|
||||
struct kvm_vcpu *vcpu;
|
||||
unsigned long i, j;
|
||||
bool first = true;
|
||||
|
||||
kvm_for_each_vcpu(i, vcpu, kvm) {
|
||||
if (mutex_lock_killable_nested(&vcpu->mutex, role))
|
||||
goto out_unlock;
|
||||
|
||||
if (first) {
|
||||
#ifdef CONFIG_PROVE_LOCKING
|
||||
if (!i)
|
||||
/*
|
||||
* Reset the role to one that avoids colliding with
|
||||
* the role used for the first vcpu mutex.
|
||||
*/
|
||||
role = SEV_NR_MIGRATION_ROLES;
|
||||
first = false;
|
||||
} else {
|
||||
else
|
||||
mutex_release(&vcpu->mutex.dep_map, _THIS_IP_);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
out_unlock:
|
||||
|
||||
first = true;
|
||||
kvm_for_each_vcpu(j, vcpu, kvm) {
|
||||
if (i == j)
|
||||
break;
|
||||
|
||||
if (first)
|
||||
first = false;
|
||||
else
|
||||
#ifdef CONFIG_PROVE_LOCKING
|
||||
if (j)
|
||||
mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_);
|
||||
|
||||
#endif
|
||||
|
||||
mutex_unlock(&vcpu->mutex);
|
||||
}
|
||||
|
|
|
@ -74,6 +74,8 @@ static uint64_t osvw_len = 4, osvw_status;
|
|||
|
||||
static DEFINE_PER_CPU(u64, current_tsc_ratio);
|
||||
|
||||
#define X2APIC_MSR(x) (APIC_BASE_MSR + (x >> 4))
|
||||
|
||||
static const struct svm_direct_access_msrs {
|
||||
u32 index; /* Index of the MSR */
|
||||
bool always; /* True if intercept is initially cleared */
|
||||
|
@ -100,6 +102,38 @@ static const struct svm_direct_access_msrs {
|
|||
{ .index = MSR_IA32_CR_PAT, .always = false },
|
||||
{ .index = MSR_AMD64_SEV_ES_GHCB, .always = true },
|
||||
{ .index = MSR_TSC_AUX, .always = false },
|
||||
{ .index = X2APIC_MSR(APIC_ID), .always = false },
|
||||
{ .index = X2APIC_MSR(APIC_LVR), .always = false },
|
||||
{ .index = X2APIC_MSR(APIC_TASKPRI), .always = false },
|
||||
{ .index = X2APIC_MSR(APIC_ARBPRI), .always = false },
|
||||
{ .index = X2APIC_MSR(APIC_PROCPRI), .always = false },
|
||||
{ .index = X2APIC_MSR(APIC_EOI), .always = false },
|
||||
{ .index = X2APIC_MSR(APIC_RRR), .always = false },
|
||||
{ .index = X2APIC_MSR(APIC_LDR), .always = false },
|
||||
{ .index = X2APIC_MSR(APIC_DFR), .always = false },
|
||||
{ .index = X2APIC_MSR(APIC_SPIV), .always = false },
|
||||
{ .index = X2APIC_MSR(APIC_ISR), .always = false },
|
||||
{ .index = X2APIC_MSR(APIC_TMR), .always = false },
|
||||
{ .index = X2APIC_MSR(APIC_IRR), .always = false },
|
||||
{ .index = X2APIC_MSR(APIC_ESR), .always = false },
|
||||
{ .index = X2APIC_MSR(APIC_ICR), .always = false },
|
||||
{ .index = X2APIC_MSR(APIC_ICR2), .always = false },
|
||||
|
||||
/*
|
||||
* Note:
|
||||
* AMD does not virtualize APIC TSC-deadline timer mode, but it is
|
||||
* emulated by KVM. When setting APIC LVTT (0x832) register bit 18,
|
||||
* the AVIC hardware would generate GP fault. Therefore, always
|
||||
* intercept the MSR 0x832, and do not setup direct_access_msr.
|
||||
*/
|
||||
{ .index = X2APIC_MSR(APIC_LVTTHMR), .always = false },
|
||||
{ .index = X2APIC_MSR(APIC_LVTPC), .always = false },
|
||||
{ .index = X2APIC_MSR(APIC_LVT0), .always = false },
|
||||
{ .index = X2APIC_MSR(APIC_LVT1), .always = false },
|
||||
{ .index = X2APIC_MSR(APIC_LVTERR), .always = false },
|
||||
{ .index = X2APIC_MSR(APIC_TMICT), .always = false },
|
||||
{ .index = X2APIC_MSR(APIC_TMCCT), .always = false },
|
||||
{ .index = X2APIC_MSR(APIC_TDCR), .always = false },
|
||||
{ .index = MSR_INVALID, .always = false },
|
||||
};
|
||||
|
||||
|
@ -188,9 +222,6 @@ module_param(tsc_scaling, int, 0444);
|
|||
static bool avic;
|
||||
module_param(avic, bool, 0444);
|
||||
|
||||
static bool force_avic;
|
||||
module_param_unsafe(force_avic, bool, 0444);
|
||||
|
||||
bool __read_mostly dump_invalid_vmcb;
|
||||
module_param(dump_invalid_vmcb, bool, 0644);
|
||||
|
||||
|
@ -342,9 +373,11 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
|
|||
|
||||
}
|
||||
|
||||
static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
|
||||
static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
|
||||
bool commit_side_effects)
|
||||
{
|
||||
struct vcpu_svm *svm = to_svm(vcpu);
|
||||
unsigned long old_rflags;
|
||||
|
||||
/*
|
||||
* SEV-ES does not expose the next RIP. The RIP update is controlled by
|
||||
|
@ -359,18 +392,75 @@ static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
|
|||
}
|
||||
|
||||
if (!svm->next_rip) {
|
||||
if (unlikely(!commit_side_effects))
|
||||
old_rflags = svm->vmcb->save.rflags;
|
||||
|
||||
if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
|
||||
return 0;
|
||||
|
||||
if (unlikely(!commit_side_effects))
|
||||
svm->vmcb->save.rflags = old_rflags;
|
||||
} else {
|
||||
kvm_rip_write(vcpu, svm->next_rip);
|
||||
}
|
||||
|
||||
done:
|
||||
svm_set_interrupt_shadow(vcpu, 0);
|
||||
if (likely(commit_side_effects))
|
||||
svm_set_interrupt_shadow(vcpu, 0);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
return __svm_skip_emulated_instruction(vcpu, true);
|
||||
}
|
||||
|
||||
static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
unsigned long rip, old_rip = kvm_rip_read(vcpu);
|
||||
struct vcpu_svm *svm = to_svm(vcpu);
|
||||
|
||||
/*
|
||||
* Due to architectural shortcomings, the CPU doesn't always provide
|
||||
* NextRIP, e.g. if KVM intercepted an exception that occurred while
|
||||
* the CPU was vectoring an INTO/INT3 in the guest. Temporarily skip
|
||||
* the instruction even if NextRIP is supported to acquire the next
|
||||
* RIP so that it can be shoved into the NextRIP field, otherwise
|
||||
* hardware will fail to advance guest RIP during event injection.
|
||||
* Drop the exception/interrupt if emulation fails and effectively
|
||||
* retry the instruction, it's the least awful option. If NRIPS is
|
||||
* in use, the skip must not commit any side effects such as clearing
|
||||
* the interrupt shadow or RFLAGS.RF.
|
||||
*/
|
||||
if (!__svm_skip_emulated_instruction(vcpu, !nrips))
|
||||
return -EIO;
|
||||
|
||||
rip = kvm_rip_read(vcpu);
|
||||
|
||||
/*
|
||||
* Save the injection information, even when using next_rip, as the
|
||||
* VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
|
||||
* doesn't complete due to a VM-Exit occurring while the CPU is
|
||||
* vectoring the event. Decoding the instruction isn't guaranteed to
|
||||
* work as there may be no backing instruction, e.g. if the event is
|
||||
* being injected by L1 for L2, or if the guest is patching INT3 into
|
||||
* a different instruction.
|
||||
*/
|
||||
svm->soft_int_injected = true;
|
||||
svm->soft_int_csbase = svm->vmcb->save.cs.base;
|
||||
svm->soft_int_old_rip = old_rip;
|
||||
svm->soft_int_next_rip = rip;
|
||||
|
||||
if (nrips)
|
||||
kvm_rip_write(vcpu, old_rip);
|
||||
|
||||
if (static_cpu_has(X86_FEATURE_NRIPS))
|
||||
svm->vmcb->control.next_rip = rip;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void svm_queue_exception(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct vcpu_svm *svm = to_svm(vcpu);
|
||||
|
@ -380,21 +470,9 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
|
|||
|
||||
kvm_deliver_exception_payload(vcpu);
|
||||
|
||||
if (nr == BP_VECTOR && !nrips) {
|
||||
unsigned long rip, old_rip = kvm_rip_read(vcpu);
|
||||
|
||||
/*
|
||||
* For guest debugging where we have to reinject #BP if some
|
||||
* INT3 is guest-owned:
|
||||
* Emulate nRIP by moving RIP forward. Will fail if injection
|
||||
* raises a fault that is not intercepted. Still better than
|
||||
* failing in all cases.
|
||||
*/
|
||||
(void)svm_skip_emulated_instruction(vcpu);
|
||||
rip = kvm_rip_read(vcpu);
|
||||
svm->int3_rip = rip + svm->vmcb->save.cs.base;
|
||||
svm->int3_injected = rip - old_rip;
|
||||
}
|
||||
if (kvm_exception_is_soft(nr) &&
|
||||
svm_update_soft_interrupt_rip(vcpu))
|
||||
return;
|
||||
|
||||
svm->vmcb->control.event_inj = nr
|
||||
| SVM_EVTINJ_VALID
|
||||
|
@ -736,6 +814,29 @@ void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
|
|||
}
|
||||
}
|
||||
|
||||
void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (intercept == svm->x2avic_msrs_intercepted)
|
||||
return;
|
||||
|
||||
if (avic_mode != AVIC_MODE_X2 ||
|
||||
!apic_x2apic_mode(svm->vcpu.arch.apic))
|
||||
return;
|
||||
|
||||
for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) {
|
||||
int index = direct_access_msrs[i].index;
|
||||
|
||||
if ((index < APIC_BASE_MSR) ||
|
||||
(index > APIC_BASE_MSR + 0xff))
|
||||
continue;
|
||||
set_msr_interception(&svm->vcpu, svm->msrpm, index,
|
||||
!intercept, !intercept);
|
||||
}
|
||||
|
||||
svm->x2avic_msrs_intercepted = intercept;
|
||||
}
|
||||
|
||||
void svm_vcpu_free_msrpm(u32 *msrpm)
|
||||
{
|
||||
|
@ -1231,7 +1332,7 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
|
|||
|
||||
svm_init_osvw(vcpu);
|
||||
vcpu->arch.microcode_version = 0x01000065;
|
||||
svm->tsc_ratio_msr = kvm_default_tsc_scaling_ratio;
|
||||
svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;
|
||||
|
||||
if (sev_es_guest(vcpu->kvm))
|
||||
sev_es_vcpu_reset(svm);
|
||||
|
@ -1299,6 +1400,8 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)
|
|||
goto error_free_vmsa_page;
|
||||
}
|
||||
|
||||
svm->x2avic_msrs_intercepted = true;
|
||||
|
||||
svm->vmcb01.ptr = page_address(vmcb01_page);
|
||||
svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
|
||||
svm_switch_vmcb(svm, &svm->vmcb01);
|
||||
|
@ -2345,6 +2448,7 @@ static int task_switch_interception(struct kvm_vcpu *vcpu)
|
|||
kvm_clear_exception_queue(vcpu);
|
||||
break;
|
||||
case SVM_EXITINTINFO_TYPE_INTR:
|
||||
case SVM_EXITINTINFO_TYPE_SOFT:
|
||||
kvm_clear_interrupt_queue(vcpu);
|
||||
break;
|
||||
default:
|
||||
|
@ -3375,35 +3479,49 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
|
|||
struct vcpu_svm *svm = to_svm(vcpu);
|
||||
|
||||
svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
|
||||
|
||||
if (svm->nmi_l1_to_l2)
|
||||
return;
|
||||
|
||||
vcpu->arch.hflags |= HF_NMI_MASK;
|
||||
if (!sev_es_guest(vcpu->kvm))
|
||||
svm_set_intercept(svm, INTERCEPT_IRET);
|
||||
++vcpu->stat.nmi_injections;
|
||||
}
|
||||
|
||||
static void svm_inject_irq(struct kvm_vcpu *vcpu)
|
||||
static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
|
||||
{
|
||||
struct vcpu_svm *svm = to_svm(vcpu);
|
||||
u32 type;
|
||||
|
||||
BUG_ON(!(gif_set(svm)));
|
||||
if (vcpu->arch.interrupt.soft) {
|
||||
if (svm_update_soft_interrupt_rip(vcpu))
|
||||
return;
|
||||
|
||||
trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
|
||||
type = SVM_EVTINJ_TYPE_SOFT;
|
||||
} else {
|
||||
type = SVM_EVTINJ_TYPE_INTR;
|
||||
}
|
||||
|
||||
trace_kvm_inj_virq(vcpu->arch.interrupt.nr,
|
||||
vcpu->arch.interrupt.soft, reinjected);
|
||||
++vcpu->stat.irq_injections;
|
||||
|
||||
svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
|
||||
SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
|
||||
SVM_EVTINJ_VALID | type;
|
||||
}
|
||||
|
||||
void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
|
||||
int trig_mode, int vector)
|
||||
{
|
||||
/*
|
||||
* vcpu->arch.apicv_active must be read after vcpu->mode.
|
||||
* apic->apicv_active must be read after vcpu->mode.
|
||||
* Pairs with smp_store_release in vcpu_enter_guest.
|
||||
*/
|
||||
bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);
|
||||
|
||||
if (!READ_ONCE(vcpu->arch.apicv_active)) {
|
||||
/* Note, this is called iff the local APIC is in-kernel. */
|
||||
if (!READ_ONCE(vcpu->arch.apic->apicv_active)) {
|
||||
/* Process the interrupt via inject_pending_event */
|
||||
kvm_make_request(KVM_REQ_EVENT, vcpu);
|
||||
kvm_vcpu_kick(vcpu);
|
||||
|
@ -3668,15 +3786,49 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
|
|||
svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
|
||||
}
|
||||
|
||||
static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
|
||||
int type)
|
||||
{
|
||||
bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT);
|
||||
bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT);
|
||||
struct vcpu_svm *svm = to_svm(vcpu);
|
||||
|
||||
/*
|
||||
* If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's
|
||||
* associated with the original soft exception/interrupt. next_rip is
|
||||
* cleared on all exits that can occur while vectoring an event, so KVM
|
||||
* needs to manually set next_rip for re-injection. Unlike the !nrips
|
||||
* case below, this needs to be done if and only if KVM is re-injecting
|
||||
* the same event, i.e. if the event is a soft exception/interrupt,
|
||||
* otherwise next_rip is unused on VMRUN.
|
||||
*/
|
||||
if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) &&
|
||||
kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase))
|
||||
svm->vmcb->control.next_rip = svm->soft_int_next_rip;
|
||||
/*
|
||||
* If NRIPS isn't enabled, KVM must manually advance RIP prior to
|
||||
* injecting the soft exception/interrupt. That advancement needs to
|
||||
* be unwound if vectoring didn't complete. Note, the new event may
|
||||
* not be the injected event, e.g. if KVM injected an INTn, the INTn
|
||||
* hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
|
||||
* be the reported vectored event, but RIP still needs to be unwound.
|
||||
*/
|
||||
else if (!nrips && (is_soft || is_exception) &&
|
||||
kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase))
|
||||
kvm_rip_write(vcpu, svm->soft_int_old_rip);
|
||||
}
|
||||
|
||||
static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct vcpu_svm *svm = to_svm(vcpu);
|
||||
u8 vector;
|
||||
int type;
|
||||
u32 exitintinfo = svm->vmcb->control.exit_int_info;
|
||||
unsigned int3_injected = svm->int3_injected;
|
||||
bool nmi_l1_to_l2 = svm->nmi_l1_to_l2;
|
||||
bool soft_int_injected = svm->soft_int_injected;
|
||||
|
||||
svm->int3_injected = 0;
|
||||
svm->nmi_l1_to_l2 = false;
|
||||
svm->soft_int_injected = false;
|
||||
|
||||
/*
|
||||
* If we've made progress since setting HF_IRET_MASK, we've
|
||||
|
@ -3701,9 +3853,13 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
|
|||
vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
|
||||
type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
|
||||
|
||||
if (soft_int_injected)
|
||||
svm_complete_soft_interrupt(vcpu, vector, type);
|
||||
|
||||
switch (type) {
|
||||
case SVM_EXITINTINFO_TYPE_NMI:
|
||||
vcpu->arch.nmi_injected = true;
|
||||
svm->nmi_l1_to_l2 = nmi_l1_to_l2;
|
||||
break;
|
||||
case SVM_EXITINTINFO_TYPE_EXEPT:
|
||||
/*
|
||||
|
@ -3712,18 +3868,6 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
|
|||
if (vector == X86_TRAP_VC)
|
||||
break;
|
||||
|
||||
/*
|
||||
* In case of software exceptions, do not reinject the vector,
|
||||
* but re-execute the instruction instead. Rewind RIP first
|
||||
* if we emulated INT3 before.
|
||||
*/
|
||||
if (kvm_exception_is_soft(vector)) {
|
||||
if (vector == BP_VECTOR && int3_injected &&
|
||||
kvm_is_linear_rip(vcpu, svm->int3_rip))
|
||||
kvm_rip_write(vcpu,
|
||||
kvm_rip_read(vcpu) - int3_injected);
|
||||
break;
|
||||
}
|
||||
if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
|
||||
u32 err = svm->vmcb->control.exit_int_info_err;
|
||||
kvm_requeue_exception_e(vcpu, vector, err);
|
||||
|
@ -3734,9 +3878,13 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
|
|||
case SVM_EXITINTINFO_TYPE_INTR:
|
||||
kvm_queue_interrupt(vcpu, vector, false);
|
||||
break;
|
||||
case SVM_EXITINTINFO_TYPE_SOFT:
|
||||
kvm_queue_interrupt(vcpu, vector, true);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static void svm_cancel_injection(struct kvm_vcpu *vcpu)
|
||||
|
@ -3952,7 +4100,7 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
|
|||
hv_track_root_tdp(vcpu, root_hpa);
|
||||
|
||||
cr3 = vcpu->arch.cr3;
|
||||
} else if (vcpu->arch.mmu->root_role.level >= PT64_ROOT_4LEVEL) {
|
||||
} else if (root_level >= PT64_ROOT_4LEVEL) {
|
||||
cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
|
||||
} else {
|
||||
/* PCID in the guest should be impossible with a 32-bit MMU. */
|
||||
|
@ -4013,16 +4161,10 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
|
|||
return true;
|
||||
}
|
||||
|
||||
static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct vcpu_svm *svm = to_svm(vcpu);
|
||||
struct kvm_cpuid_entry2 *best;
|
||||
struct kvm *kvm = vcpu->kvm;
|
||||
|
||||
vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
|
||||
boot_cpu_has(X86_FEATURE_XSAVE) &&
|
||||
|
@ -4049,19 +4191,11 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
|
|||
|
||||
/* For sev guests, the memory encryption bit is not reserved in CR3. */
|
||||
if (sev_guest(vcpu->kvm)) {
|
||||
best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0);
|
||||
best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
|
||||
if (best)
|
||||
vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
|
||||
}
|
||||
|
||||
if (kvm_vcpu_apicv_active(vcpu)) {
|
||||
/*
|
||||
* AVIC does not work with an x2APIC mode guest. If the X2APIC feature
|
||||
* is exposed to the guest, disable AVIC.
|
||||
*/
|
||||
if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
|
||||
kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC);
|
||||
}
|
||||
init_vmcb_after_set_cpuid(vcpu);
|
||||
}
|
||||
|
||||
|
@ -4673,11 +4807,11 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
|
|||
.enable_nmi_window = svm_enable_nmi_window,
|
||||
.enable_irq_window = svm_enable_irq_window,
|
||||
.update_cr8_intercept = svm_update_cr8_intercept,
|
||||
.set_virtual_apic_mode = avic_set_virtual_apic_mode,
|
||||
.refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
|
||||
.check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
|
||||
.apicv_post_state_restore = avic_apicv_post_state_restore,
|
||||
|
||||
.get_mt_mask = svm_get_mt_mask,
|
||||
.get_exit_info = svm_get_exit_info,
|
||||
|
||||
.vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
|
||||
|
@ -4773,7 +4907,7 @@ static __init void svm_set_cpu_caps(void)
|
|||
{
|
||||
kvm_set_cpu_caps();
|
||||
|
||||
supported_xss = 0;
|
||||
kvm_caps.supported_xss = 0;
|
||||
|
||||
/* CPUID 0x80000001 and 0x8000000A (SVM features) */
|
||||
if (nested) {
|
||||
|
@ -4849,7 +4983,8 @@ static __init int svm_hardware_setup(void)
|
|||
|
||||
init_msrpm_offsets();
|
||||
|
||||
supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
|
||||
kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
|
||||
XFEATURE_MASK_BNDCSR);
|
||||
|
||||
if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
|
||||
kvm_enable_efer_bits(EFER_FFXSR);
|
||||
|
@ -4859,11 +4994,11 @@ static __init int svm_hardware_setup(void)
|
|||
tsc_scaling = false;
|
||||
} else {
|
||||
pr_info("TSC scaling supported\n");
|
||||
kvm_has_tsc_control = true;
|
||||
kvm_caps.has_tsc_control = true;
|
||||
}
|
||||
}
|
||||
kvm_max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
|
||||
kvm_tsc_scaling_ratio_frac_bits = 32;
|
||||
kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
|
||||
kvm_caps.tsc_scaling_ratio_frac_bits = 32;
|
||||
|
||||
tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
|
||||
|
||||
|
@ -4917,17 +5052,9 @@ static __init int svm_hardware_setup(void)
|
|||
nrips = false;
|
||||
}
|
||||
|
||||
enable_apicv = avic = avic && npt_enabled && (boot_cpu_has(X86_FEATURE_AVIC) || force_avic);
|
||||
enable_apicv = avic = avic && avic_hardware_setup(&svm_x86_ops);
|
||||
|
||||
if (enable_apicv) {
|
||||
if (!boot_cpu_has(X86_FEATURE_AVIC)) {
|
||||
pr_warn("AVIC is not supported in CPUID but force enabled");
|
||||
pr_warn("Your system might crash and burn");
|
||||
} else
|
||||
pr_info("AVIC enabled\n");
|
||||
|
||||
amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
|
||||
} else {
|
||||
if (!enable_apicv) {
|
||||
svm_x86_ops.vcpu_blocking = NULL;
|
||||
svm_x86_ops.vcpu_unblocking = NULL;
|
||||
svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
|
||||
|
|
|
@@ -29,13 +29,21 @@
#define IOPM_SIZE PAGE_SIZE * 3
#define MSRPM_SIZE PAGE_SIZE * 2

#define MAX_DIRECT_ACCESS_MSRS	21
#define MSRPM_OFFSETS	16
#define MAX_DIRECT_ACCESS_MSRS	46
#define MSRPM_OFFSETS	32
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
extern int vgif;
extern bool intercept_smi;

enum avic_modes {
	AVIC_MODE_NONE = 0,
	AVIC_MODE_X1,
	AVIC_MODE_X2,
};

extern enum avic_modes avic_mode;

/*
 * Clean bits in VMCB.
 * VMCB_ALL_CLEAN_MASK might also need to

@@ -139,6 +147,7 @@ struct vmcb_ctrl_area_cached {
	u64 nested_ctl;
	u32 event_inj;
	u32 event_inj_err;
	u64 next_rip;
	u64 nested_cr3;
	u64 virt_ext;
	u32 clean;

@@ -228,9 +237,12 @@ struct vcpu_svm {

	bool nmi_singlestep;
	u64 nmi_singlestep_guest_rflags;
	bool nmi_l1_to_l2;

	unsigned int3_injected;
	unsigned long int3_rip;
	unsigned long soft_int_csbase;
	unsigned long soft_int_old_rip;
	unsigned long soft_int_next_rip;
	bool soft_int_injected;

	/* optional nested SVM features that are enabled for this guest */
	bool nrips_enabled                : 1;

@@ -264,6 +276,8 @@ struct vcpu_svm {
	struct vcpu_sev_es_state sev_es;

	bool guest_state_loaded;

	bool x2avic_msrs_intercepted;
};

struct svm_cpu_data {

@@ -509,6 +523,15 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm)
	return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
}

static inline bool is_x2apic_msrpm_offset(u32 offset)
{
	/* 4 msrs per u8, and 4 u8 in u32 */
	u32 msr = offset * 16;

	return (msr >= APIC_BASE_MSR) &&
	       (msr < (APIC_BASE_MSR + 0x100));
}

/* svm.c */
#define MSR_INVALID				0xffffffffU

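A quick worked example of the offset-to-MSR arithmetic in is_x2apic_msrpm_offset() above (not part of the patch): each u32 of the MSR permission map covers 16 MSRs, so the helper converts an offset back to the first MSR it covers by multiplying by 16 and then tests it against the x2APIC range.

	/* Hypothetical value: the msrpm offset covering the start of the x2APIC range. */
	u32 offset = 0x80;	/* 0x80 * 16 == 0x800 == APIC_BASE_MSR */

	bool hit = is_x2apic_msrpm_offset(offset);	/* true: 0x800 is within [0x800, 0x900) */
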
@ -534,6 +557,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value);
|
|||
int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code);
|
||||
void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
|
||||
int read, int write);
|
||||
void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool disable);
|
||||
void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
|
||||
int trig_mode, int vec);
|
||||
|
||||
|
@ -603,6 +627,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
|
|||
|
||||
/* avic.c */
|
||||
|
||||
bool avic_hardware_setup(struct kvm_x86_ops *ops);
|
||||
int avic_ga_log_notifier(u32 ga_tag);
|
||||
void avic_vm_destroy(struct kvm *kvm);
|
||||
int avic_vm_init(struct kvm *kvm);
|
||||
|
@ -613,18 +638,16 @@ int avic_init_vcpu(struct vcpu_svm *svm);
|
|||
void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
|
||||
void avic_vcpu_put(struct kvm_vcpu *vcpu);
|
||||
void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
|
||||
void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
|
||||
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
|
||||
bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason);
|
||||
void avic_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr);
|
||||
void avic_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr);
|
||||
bool avic_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu);
|
||||
int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
|
||||
uint32_t guest_irq, bool set);
|
||||
void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
|
||||
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
|
||||
void avic_ring_doorbell(struct kvm_vcpu *vcpu);
|
||||
unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu);
|
||||
void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
|
||||
|
||||
|
||||
/* sev.c */
|
||||
|
||||
|
|
|
@ -154,7 +154,7 @@ TRACE_EVENT(kvm_xen_hypercall,
|
|||
|
||||
TRACE_EVENT(kvm_pio,
|
||||
TP_PROTO(unsigned int rw, unsigned int port, unsigned int size,
|
||||
unsigned int count, void *data),
|
||||
unsigned int count, const void *data),
|
||||
TP_ARGS(rw, port, size, count, data),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
|
@@ -333,18 +333,24 @@ TRACE_EVENT_KVM_EXIT(kvm_exit);
 * Tracepoint for kvm interrupt injection:
 */
TRACE_EVENT(kvm_inj_virq,
	TP_PROTO(unsigned int irq),
	TP_ARGS(irq),
	TP_PROTO(unsigned int vector, bool soft, bool reinjected),
	TP_ARGS(vector, soft, reinjected),

	TP_STRUCT__entry(
		__field(	unsigned int,	irq		)
		__field(	unsigned int,	vector		)
		__field(	bool,		soft		)
		__field(	bool,		reinjected	)
	),

	TP_fast_assign(
		__entry->irq		= irq;
		__entry->vector		= vector;
		__entry->soft		= soft;
		__entry->reinjected	= reinjected;
	),

	TP_printk("irq %u", __entry->irq)
	TP_printk("%s 0x%x%s",
		  __entry->soft ? "Soft/INTn" : "IRQ", __entry->vector,
		  __entry->reinjected ? " [reinjected]" : "")
);

#define EXS(x) { x##_VECTOR, "#" #x }

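Based on the new TP_printk format above, the kvm_inj_virq tracepoint now distinguishes hardware IRQs from software interrupts and flags re-injection. Two hypothetical output lines for illustration only (the vector values are made up):

	kvm_inj_virq: IRQ 0x30
	kvm_inj_virq: Soft/INTn 0x80 [reinjected]
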
@ -358,25 +364,30 @@ TRACE_EVENT(kvm_inj_virq,
|
|||
* Tracepoint for kvm interrupt injection:
|
||||
*/
|
||||
TRACE_EVENT(kvm_inj_exception,
|
||||
TP_PROTO(unsigned exception, bool has_error, unsigned error_code),
|
||||
TP_ARGS(exception, has_error, error_code),
|
||||
TP_PROTO(unsigned exception, bool has_error, unsigned error_code,
|
||||
bool reinjected),
|
||||
TP_ARGS(exception, has_error, error_code, reinjected),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field( u8, exception )
|
||||
__field( u8, has_error )
|
||||
__field( u32, error_code )
|
||||
__field( bool, reinjected )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->exception = exception;
|
||||
__entry->has_error = has_error;
|
||||
__entry->error_code = error_code;
|
||||
__entry->reinjected = reinjected;
|
||||
),
|
||||
|
||||
TP_printk("%s (0x%x)",
|
||||
TP_printk("%s%s%s%s%s",
|
||||
__print_symbolic(__entry->exception, kvm_trace_sym_exc),
|
||||
/* FIXME: don't print error_code if not present */
|
||||
__entry->has_error ? __entry->error_code : 0)
|
||||
!__entry->has_error ? "" : " (",
|
||||
!__entry->has_error ? "" : __print_symbolic(__entry->error_code, { }),
|
||||
!__entry->has_error ? "" : ")",
|
||||
__entry->reinjected ? " [reinjected]" : "")
|
||||
);
|
||||
|
||||
/*
|
||||
|
@ -1479,6 +1490,24 @@ TRACE_EVENT(kvm_avic_kick_vcpu_slowpath,
|
|||
__entry->icrh, __entry->icrl, __entry->index)
|
||||
);
|
||||
|
||||
TRACE_EVENT(kvm_avic_doorbell,
|
||||
TP_PROTO(u32 vcpuid, u32 apicid),
|
||||
TP_ARGS(vcpuid, apicid),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(u32, vcpuid)
|
||||
__field(u32, apicid)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->vcpuid = vcpuid;
|
||||
__entry->apicid = apicid;
|
||||
),
|
||||
|
||||
TP_printk("vcpuid=%u, apicid=%u",
|
||||
__entry->vcpuid, __entry->apicid)
|
||||
);
|
||||
|
||||
TRACE_EVENT(kvm_hv_timer_state,
|
||||
TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use),
|
||||
TP_ARGS(vcpu_id, hv_timer_in_use),
|
||||
|
|
|
@ -6,6 +6,8 @@
|
|||
|
||||
#include "../lapic.h"
|
||||
#include "../x86.h"
|
||||
#include "../pmu.h"
|
||||
#include "../cpuid.h"
|
||||
|
||||
extern bool __read_mostly enable_vpid;
|
||||
extern bool __read_mostly flexpriority_enabled;
|
||||
|
@ -13,6 +15,7 @@ extern bool __read_mostly enable_ept;
|
|||
extern bool __read_mostly enable_unrestricted_guest;
|
||||
extern bool __read_mostly enable_ept_ad_bits;
|
||||
extern bool __read_mostly enable_pml;
|
||||
extern bool __read_mostly enable_ipiv;
|
||||
extern int __read_mostly pt_mode;
|
||||
|
||||
#define PT_MODE_SYSTEM 0
|
||||
|
@ -59,6 +62,7 @@ struct vmcs_config {
|
|||
u32 pin_based_exec_ctrl;
|
||||
u32 cpu_based_exec_ctrl;
|
||||
u32 cpu_based_2nd_exec_ctrl;
|
||||
u64 cpu_based_3rd_exec_ctrl;
|
||||
u32 vmexit_ctrl;
|
||||
u32 vmentry_ctrl;
|
||||
struct nested_vmx_msrs nested;
|
||||
|
@ -94,20 +98,17 @@ static inline bool cpu_has_vmx_posted_intr(void)
|
|||
|
||||
static inline bool cpu_has_load_ia32_efer(void)
|
||||
{
|
||||
return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER) &&
|
||||
(vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_EFER);
|
||||
return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER;
|
||||
}
|
||||
|
||||
static inline bool cpu_has_load_perf_global_ctrl(void)
|
||||
{
|
||||
return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
|
||||
(vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
|
||||
return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
|
||||
}
|
||||
|
||||
static inline bool cpu_has_vmx_mpx(void)
|
||||
{
|
||||
return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
|
||||
(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
|
||||
return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS;
|
||||
}
|
||||
|
||||
static inline bool cpu_has_vmx_tpr_shadow(void)
|
||||
|
@ -131,6 +132,12 @@ static inline bool cpu_has_secondary_exec_ctrls(void)
|
|||
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
|
||||
}
|
||||
|
||||
static inline bool cpu_has_tertiary_exec_ctrls(void)
|
||||
{
|
||||
return vmcs_config.cpu_based_exec_ctrl &
|
||||
CPU_BASED_ACTIVATE_TERTIARY_CONTROLS;
|
||||
}
|
||||
|
||||
static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
|
||||
{
|
||||
return vmcs_config.cpu_based_2nd_exec_ctrl &
|
||||
|
@ -276,6 +283,11 @@ static inline bool cpu_has_vmx_apicv(void)
|
|||
cpu_has_vmx_posted_intr();
|
||||
}
|
||||
|
||||
static inline bool cpu_has_vmx_ipiv(void)
|
||||
{
|
||||
return vmcs_config.cpu_based_3rd_exec_ctrl & TERTIARY_EXEC_IPI_VIRT;
|
||||
}
|
||||
|
||||
static inline bool cpu_has_vmx_flexpriority(void)
|
||||
{
|
||||
return cpu_has_vmx_tpr_shadow() &&
|
||||
|
@ -363,7 +375,6 @@ static inline bool cpu_has_vmx_intel_pt(void)
|
|||
rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
|
||||
return (vmx_msr & MSR_IA32_VMX_MISC_INTEL_PT) &&
|
||||
(vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) &&
|
||||
(vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_IA32_RTIT_CTL) &&
|
||||
(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL);
|
||||
}
|
||||
|
||||
|
@@ -385,23 +396,31 @@ static inline bool vmx_pt_mode_is_host_guest(void)
	return pt_mode == PT_MODE_HOST_GUEST;
}

static inline bool vmx_pebs_supported(void)
{
	return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept;
}

static inline u64 vmx_get_perf_capabilities(void)
{
	u64 perf_cap = 0;
	u64 perf_cap = PMU_CAP_FW_WRITES;
	u64 host_perf_cap = 0;

	if (!enable_pmu)
		return perf_cap;
		return 0;

	if (boot_cpu_has(X86_FEATURE_PDCM))
		rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap);
		rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);

	perf_cap &= PMU_CAP_LBR_FMT;
	perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;

	/*
	 * Since counters are virtualized, KVM would support full
	 * width counting unconditionally, even if the host lacks it.
	 */
	return PMU_CAP_FW_WRITES | perf_cap;
	if (vmx_pebs_supported()) {
		perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
		if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4)
			perf_cap &= ~PERF_CAP_PEBS_BASELINE;
	}

	return perf_cap;
}

static inline u64 vmx_supported_debugctl(void)

@@ -417,4 +436,10 @@ static inline u64 vmx_supported_debugctl(void)
	return debugctl;
}

static inline bool cpu_has_notify_vmexit(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_NOTIFY_VM_EXITING;
}

#endif /* __KVM_X86_VMX_CAPS_H */

@ -297,8 +297,10 @@ const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
|
|||
#if IS_ENABLED(CONFIG_HYPERV)
|
||||
__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
|
||||
{
|
||||
vmcs_conf->cpu_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_EXEC_CTRL;
|
||||
vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL;
|
||||
vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
|
||||
vmcs_conf->cpu_based_3rd_exec_ctrl = 0;
|
||||
|
||||
vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
|
||||
vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
|
||||
|
|
|
@ -50,6 +50,7 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs);
|
|||
*/
|
||||
#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \
|
||||
PIN_BASED_VMX_PREEMPTION_TIMER)
|
||||
#define EVMCS1_UNSUPPORTED_EXEC_CTRL (CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
|
||||
#define EVMCS1_UNSUPPORTED_2NDEXEC \
|
||||
(SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \
|
||||
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \
|
||||
|
|
|
@@ -311,11 +311,12 @@ static void free_nested(struct kvm_vcpu *vcpu)
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
	vmx->nested.cached_shadow_vmcs12 = NULL;
	/* Unpin physical memory we referred to in the vmcs02 */
	if (vmx->nested.apic_access_page) {
		kvm_release_page_clean(vmx->nested.apic_access_page);
		vmx->nested.apic_access_page = NULL;
	}
	/*
	 * Unpin physical memory we referred to in the vmcs02.  The APIC access
	 * page's backing page (yeah, confusing) shouldn't actually be accessed,
	 * and if it is written, the contents are irrelevant.
	 */
	kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false);
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
	vmx->nested.pi_desc = NULL;
@ -1223,7 +1224,7 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
|
|||
BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
|
||||
/* reserved */
|
||||
BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
|
||||
u64 vmx_basic = vmx->nested.msrs.basic;
|
||||
u64 vmx_basic = vmcs_config.nested.basic;
|
||||
|
||||
if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
|
||||
return -EINVAL;
|
||||
|
@ -1246,36 +1247,42 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
|
||||
static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
|
||||
u32 **low, u32 **high)
|
||||
{
|
||||
u64 supported;
|
||||
u32 *lowp, *highp;
|
||||
|
||||
switch (msr_index) {
|
||||
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
|
||||
lowp = &vmx->nested.msrs.pinbased_ctls_low;
|
||||
highp = &vmx->nested.msrs.pinbased_ctls_high;
|
||||
*low = &msrs->pinbased_ctls_low;
|
||||
*high = &msrs->pinbased_ctls_high;
|
||||
break;
|
||||
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
|
||||
lowp = &vmx->nested.msrs.procbased_ctls_low;
|
||||
highp = &vmx->nested.msrs.procbased_ctls_high;
|
||||
*low = &msrs->procbased_ctls_low;
|
||||
*high = &msrs->procbased_ctls_high;
|
||||
break;
|
||||
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
|
||||
lowp = &vmx->nested.msrs.exit_ctls_low;
|
||||
highp = &vmx->nested.msrs.exit_ctls_high;
|
||||
*low = &msrs->exit_ctls_low;
|
||||
*high = &msrs->exit_ctls_high;
|
||||
break;
|
||||
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
|
||||
lowp = &vmx->nested.msrs.entry_ctls_low;
|
||||
highp = &vmx->nested.msrs.entry_ctls_high;
|
||||
*low = &msrs->entry_ctls_low;
|
||||
*high = &msrs->entry_ctls_high;
|
||||
break;
|
||||
case MSR_IA32_VMX_PROCBASED_CTLS2:
|
||||
lowp = &vmx->nested.msrs.secondary_ctls_low;
|
||||
highp = &vmx->nested.msrs.secondary_ctls_high;
|
||||
*low = &msrs->secondary_ctls_low;
|
||||
*high = &msrs->secondary_ctls_high;
|
||||
break;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
|
||||
{
|
||||
u32 *lowp, *highp;
|
||||
u64 supported;
|
||||
|
||||
vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);
|
||||
|
||||
supported = vmx_control_msr(*lowp, *highp);
|
||||
|
||||
|
@ -1287,6 +1294,7 @@ vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
|
|||
if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
|
||||
return -EINVAL;
|
||||
|
||||
vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
|
||||
*lowp = data;
|
||||
*highp = data >> 32;
|
||||
return 0;
|
||||
|
@ -1300,10 +1308,8 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
|
|||
BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
|
||||
/* reserved */
|
||||
GENMASK_ULL(13, 9) | BIT_ULL(31);
|
||||
u64 vmx_misc;
|
||||
|
||||
vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
|
||||
vmx->nested.msrs.misc_high);
|
||||
u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
|
||||
vmcs_config.nested.misc_high);
|
||||
|
||||
if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
|
||||
return -EINVAL;
|
||||
|
@ -1331,10 +1337,8 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
|
|||
|
||||
static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
|
||||
{
|
||||
u64 vmx_ept_vpid_cap;
|
||||
|
||||
vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
|
||||
vmx->nested.msrs.vpid_caps);
|
||||
u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps,
|
||||
vmcs_config.nested.vpid_caps);
|
||||
|
||||
/* Every bit is either reserved or a feature bit. */
|
||||
if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
|
||||
|
@ -1345,20 +1349,21 @@ static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
|
||||
static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index)
|
||||
{
|
||||
u64 *msr;
|
||||
|
||||
switch (msr_index) {
|
||||
case MSR_IA32_VMX_CR0_FIXED0:
|
||||
msr = &vmx->nested.msrs.cr0_fixed0;
|
||||
break;
|
||||
return &msrs->cr0_fixed0;
|
||||
case MSR_IA32_VMX_CR4_FIXED0:
|
||||
msr = &vmx->nested.msrs.cr4_fixed0;
|
||||
break;
|
||||
return &msrs->cr4_fixed0;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
|
||||
{
|
||||
const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index);
|
||||
|
||||
/*
|
||||
* 1 bits (which indicates bits which "must-be-1" during VMX operation)
|
||||
|
@ -1367,7 +1372,7 @@ static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
|
|||
if (!is_bitwise_subset(data, *msr, -1ULL))
|
||||
return -EINVAL;
|
||||
|
||||
*msr = data;
|
||||
*vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -1428,7 +1433,7 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
|
|||
vmx->nested.msrs.vmcs_enum = data;
|
||||
return 0;
|
||||
case MSR_IA32_VMX_VMFUNC:
|
||||
if (data & ~vmx->nested.msrs.vmfunc_controls)
|
||||
if (data & ~vmcs_config.nested.vmfunc_controls)
|
||||
return -EINVAL;
|
||||
vmx->nested.msrs.vmfunc_controls = data;
|
||||
return 0;
|
||||
|
@ -2133,6 +2138,8 @@ static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
|
|||
|
||||
static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
|
||||
{
|
||||
struct kvm *kvm = vmx->vcpu.kvm;
|
||||
|
||||
/*
|
||||
* If vmcs02 hasn't been initialized, set the constant vmcs02 state
|
||||
* according to L0's settings (vmcs12 is irrelevant here). Host
|
||||
|
@ -2175,6 +2182,9 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
|
|||
if (cpu_has_vmx_encls_vmexit())
|
||||
vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA);
|
||||
|
||||
if (kvm_notify_vmexit_enabled(kvm))
|
||||
vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
|
||||
|
||||
/*
|
||||
* Set the MSR load/store lists to match L0's settings. Only the
|
||||
* addresses are constant (for vmcs02), the counts can change based
|
||||
|
@ -2514,11 +2524,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
|
|||
vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
|
||||
} else {
|
||||
kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
|
||||
vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
|
||||
vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
|
||||
}
|
||||
if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
|
||||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
|
||||
vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
|
||||
vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs);
|
||||
vmx_set_rflags(vcpu, vmcs12->guest_rflags);
|
||||
|
||||
/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
|
||||
|
@ -2547,7 +2557,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
|
|||
vmx_get_l2_tsc_multiplier(vcpu));
|
||||
|
||||
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
|
||||
if (kvm_has_tsc_control)
|
||||
if (kvm_caps.has_tsc_control)
|
||||
vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
|
||||
|
||||
nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
|
||||
|
@ -2613,6 +2623,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
|
|||
}
|
||||
|
||||
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
|
||||
intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
|
||||
WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
|
||||
vmcs12->guest_ia32_perf_global_ctrl))) {
|
||||
*entry_failure_code = ENTRY_FAIL_DEFAULT;
|
||||
|
@ -3158,8 +3169,6 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
|
|||
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
|
||||
struct vcpu_vmx *vmx = to_vmx(vcpu);
|
||||
struct kvm_host_map *map;
|
||||
struct page *page;
|
||||
u64 hpa;
|
||||
|
||||
if (!vcpu->arch.pdptrs_from_userspace &&
|
||||
!nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
|
||||
|
@ -3174,23 +3183,12 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
|
|||
|
||||
|
||||
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
|
||||
/*
|
||||
* Translate L1 physical address to host physical
|
||||
* address for vmcs02. Keep the page pinned, so this
|
||||
* physical address remains valid. We keep a reference
|
||||
* to it so we can release it later.
|
||||
*/
|
||||
-			if (vmx->nested.apic_access_page) { /* shouldn't happen */
-				kvm_release_page_clean(vmx->nested.apic_access_page);
-				vmx->nested.apic_access_page = NULL;
-			}
-			page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
-			if (!is_error_page(page)) {
-				vmx->nested.apic_access_page = page;
-				hpa = page_to_phys(vmx->nested.apic_access_page);
-				vmcs_write64(APIC_ACCESS_ADDR, hpa);
+			map = &vmx->nested.apic_access_page_map;
+
+			if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) {
+				vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn));
 			} else {
-				pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n",
+				pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n",
 						     __func__);
 				vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 				vcpu->run->internal.suberror =
@@ -3373,11 +3371,13 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
 	if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
 		evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
 
-	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
-		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+	if (!vmx->nested.nested_run_pending ||
+	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
+		vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
 	if (kvm_mpx_supported() &&
-		!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
-		vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+	    (!vmx->nested.nested_run_pending ||
+	     !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
+		vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
 
 	/*
 	 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
@@ -4096,8 +4096,6 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
 	vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
 	vmcs12->guest_pending_dbg_exceptions =
 		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
-	if (kvm_mpx_supported())
-		vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
 
 	vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
 }
@@ -4336,7 +4334,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 		vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
 		vcpu->arch.pat = vmcs12->host_ia32_pat;
 	}
-	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+	if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
+	    intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)))
 		WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
 					 vmcs12->host_ia32_perf_global_ctrl));
 
@@ -4609,7 +4608,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
 	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
-	if (kvm_has_tsc_control)
+	if (kvm_caps.has_tsc_control)
 		vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
 
 	if (vmx->nested.l1_tpr_threshold != -1)
@@ -4626,10 +4625,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
 	}
 
 	/* Unpin physical memory we referred to in vmcs02 */
-	if (vmx->nested.apic_access_page) {
-		kvm_release_page_clean(vmx->nested.apic_access_page);
-		vmx->nested.apic_access_page = NULL;
-	}
+	kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false);
 	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
 	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
 	vmx->nested.pi_desc = NULL;
@@ -4828,28 +4824,6 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
 	return 0;
 }
 
-void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu,
-			    bool vcpu_has_perf_global_ctrl)
-{
-	struct vcpu_vmx *vmx;
-
-	if (!nested_vmx_allowed(vcpu))
-		return;
-
-	vmx = to_vmx(vcpu);
-	if (vcpu_has_perf_global_ctrl) {
-		vmx->nested.msrs.entry_ctls_high |=
-				VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
-		vmx->nested.msrs.exit_ctls_high |=
-				VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
-	} else {
-		vmx->nested.msrs.entry_ctls_high &=
-				~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
-		vmx->nested.msrs.exit_ctls_high &=
-				~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
-	}
-}
-
 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer,
 				int *ret)
 {
@@ -4952,7 +4926,7 @@ out_vmcs02:
 }
 
 /* Emulate the VMXON instruction. */
-static int handle_vmon(struct kvm_vcpu *vcpu)
+static int handle_vmxon(struct kvm_vcpu *vcpu)
 {
 	int ret;
 	gpa_t vmptr;
@@ -4962,20 +4936,25 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 		| FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
 
 	/*
-	 * The Intel VMX Instruction Reference lists a bunch of bits that are
-	 * prerequisite to running VMXON, most notably cr4.VMXE must be set to
-	 * 1 (see vmx_is_valid_cr4() for when we allow the guest to set this).
-	 * Otherwise, we should fail with #UD. But most faulting conditions
-	 * have already been checked by hardware, prior to the VM-exit for
-	 * VMXON. We do test guest cr4.VMXE because processor CR4 always has
-	 * that bit set to 1 in non-root mode.
+	 * Note, KVM cannot rely on hardware to perform the CR0/CR4 #UD checks
+	 * that have higher priority than VM-Exit (see Intel SDM's pseudocode
+	 * for VMXON), as KVM must load valid CR0/CR4 values into hardware while
+	 * running the guest, i.e. KVM needs to check the _guest_ values.
+	 *
+	 * Rely on hardware for the other two pre-VM-Exit checks, !VM86 and
+	 * !COMPATIBILITY modes. KVM may run the guest in VM86 to emulate Real
+	 * Mode, but KVM will never take the guest out of those modes.
 	 */
-	if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
+	if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
+	    !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
 		kvm_queue_exception(vcpu, UD_VECTOR);
 		return 1;
 	}
 
-	/* CPL=0 must be checked manually. */
+	/*
+	 * CPL=0 and all other checks that are lower priority than VM-Exit must
+	 * be checked manually.
+	 */
 	if (vmx_get_cpl(vcpu)) {
 		kvm_inject_gp(vcpu, 0);
 		return 1;
@@ -5044,7 +5023,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
 }
 
 /* Emulate the VMXOFF instruction */
-static int handle_vmoff(struct kvm_vcpu *vcpu)
+static int handle_vmxoff(struct kvm_vcpu *vcpu)
 {
 	if (!nested_vmx_check_permission(vcpu))
 		return 1;
@@ -6111,6 +6090,9 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
 			SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
 	case EXIT_REASON_ENCLS:
 		return nested_vmx_exit_handled_encls(vcpu, vmcs12);
+	case EXIT_REASON_NOTIFY:
+		/* Notify VM exit is not exposed to L1 */
+		return false;
 	default:
 		return true;
 	}
@@ -6775,6 +6757,9 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
 	rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
 	rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
 
+	if (vmx_umip_emulated())
+		msrs->cr4_fixed1 |= X86_CR4_UMIP;
+
 	msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
 }
 
@@ -6818,8 +6803,8 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
 	exit_handlers[EXIT_REASON_VMREAD] = handle_vmread;
 	exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume;
 	exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite;
-	exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff;
-	exit_handlers[EXIT_REASON_VMON] = handle_vmon;
+	exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff;
+	exit_handlers[EXIT_REASON_VMON] = handle_vmxon;
 	exit_handlers[EXIT_REASON_INVEPT] = handle_invept;
 	exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
 	exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;

@@ -32,8 +32,6 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
 			u32 vmx_instruction_info, bool wr, int len, gva_t *ret);
-void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu,
-			    bool vcpu_has_perf_global_ctrl);
 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu);
 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
 				 int size);
@@ -281,7 +279,8 @@ static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
 	u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0;
 	u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1;
 
-	return fixed_bits_valid(val, fixed0, fixed1);
+	return fixed_bits_valid(val, fixed0, fixed1) &&
+	       __kvm_is_valid_cr4(vcpu, val);
 }
 
 /* No difference in the restrictions on guest and host CR4 in VMX operation. */
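The nested_cr4_valid() change above layers KVM's own CR4 validity check on top of the VMX fixed-bit rule. The fixed-bit rule itself comes from the IA32_VMX_*_FIXED0/FIXED1 MSRs: every bit set in FIXED0 must be 1 in the register, and every bit clear in FIXED1 must be 0. A standalone sketch of that rule; the helper name and the example constraint values are illustrative, not taken from the kernel:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* A value is acceptable iff all FIXED0 bits are 1 and no bit outside FIXED1 is 1. */
    static bool fixed_bits_ok(uint64_t val, uint64_t fixed0, uint64_t fixed1)
    {
        return ((val & fixed0) == fixed0) && ((val & ~fixed1) == 0);
    }

    int main(void)
    {
        /* Hypothetical CR4 constraints: bit 13 (VMXE) must be set, only bits 0..21 allowed. */
        uint64_t fixed0 = 1ull << 13;
        uint64_t fixed1 = (1ull << 22) - 1;

        printf("%d\n", fixed_bits_ok((1ull << 13) | (1ull << 5), fixed0, fixed1));  /* 1 */
        printf("%d\n", fixed_bits_ok(1ull << 5, fixed0, fixed1));                   /* 0: VMXE clear */
        printf("%d\n", fixed_bits_ok((1ull << 13) | (1ull << 30), fixed0, fixed1)); /* 0: bit 30 not allowed */
        return 0;
    }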
@@ -37,23 +37,35 @@ static int fixed_pmc_events[] = {1, 0, 7};
 
 static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
 {
+	struct kvm_pmc *pmc;
+	u8 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl;
 	int i;
 
+	pmu->fixed_ctr_ctrl = data;
 	for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
 		u8 new_ctrl = fixed_ctrl_field(data, i);
-		u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i);
-		struct kvm_pmc *pmc;
-
-		pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
+		u8 old_ctrl = fixed_ctrl_field(old_fixed_ctr_ctrl, i);
 
 		if (old_ctrl == new_ctrl)
 			continue;
 
-		__set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use);
-		reprogram_fixed_counter(pmc, new_ctrl, i);
-	}
+		pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
 
-	pmu->fixed_ctr_ctrl = data;
+		__set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use);
+		reprogram_counter(pmc);
+	}
 }
 
+static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
+{
+	if (pmc_idx < INTEL_PMC_IDX_FIXED) {
+		return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx,
+				  MSR_P6_EVNTSEL0);
+	} else {
+		u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED;
+
+		return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0);
+	}
+}
+
 /* function is called when global control register has been updated. */
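In the reprogram_fixed_counters() rework above, each fixed counter owns a 4-bit field of IA32_FIXED_CTR_CTRL and is reprogrammed only when its field actually changes. A minimal userspace sketch of that field extraction and change detection; the control values are made up, and fixed_ctrl_field() here simply mirrors the shift-and-mask the kernel macro performs:

    #include <stdint.h>
    #include <stdio.h>

    /* Fixed counter i is controlled by bits [4*i+3 : 4*i] of IA32_FIXED_CTR_CTRL. */
    static uint8_t fixed_ctrl_field(uint64_t ctrl, int idx)
    {
        return (ctrl >> (idx * 4)) & 0xf;
    }

    int main(void)
    {
        uint64_t old_ctrl = 0x0b0;   /* counter 1 enabled (OS+USR+PMI), others off */
        uint64_t new_ctrl = 0x0b3;   /* counter 0 now enabled for OS+USR as well */
        int nr_fixed = 3;

        for (int i = 0; i < nr_fixed; i++) {
            uint8_t o = fixed_ctrl_field(old_ctrl, i);
            uint8_t n = fixed_ctrl_field(new_ctrl, i);

            if (o != n)
                printf("fixed counter %d changed: 0x%x -> 0x%x, reprogram\n", i, o, n);
        }
        return 0;
    }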
@@ -61,14 +73,18 @@ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
 {
 	int bit;
 	u64 diff = pmu->global_ctrl ^ data;
+	struct kvm_pmc *pmc;
 
 	pmu->global_ctrl = data;
 
-	for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
-		reprogram_counter(pmu, bit);
+	for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) {
+		pmc = intel_pmc_idx_to_pmc(pmu, bit);
+		if (pmc)
+			reprogram_counter(pmc);
+	}
 }
 
-static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc)
+static bool intel_hw_event_available(struct kvm_pmc *pmc)
 {
 	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
 	u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
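global_ctrl_changed() above XORs the previous and new GLOBAL_CTRL values so that only counters whose enable bit flipped are revisited. The same diff-and-walk pattern in a self-contained form; the example values and the counter-index interpretation are illustrative only:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t old_ctrl = 0x0000000700000003ull; /* GP0-1 + fixed0-2 enabled */
        uint64_t new_ctrl = 0x0000000300000007ull; /* GP0-2 + fixed0-1 enabled */
        uint64_t diff = old_ctrl ^ new_ctrl;

        /* Visit only the bits whose enable state changed. */
        while (diff) {
            int bit = __builtin_ctzll(diff);

            diff &= diff - 1;  /* clear lowest set bit */
            printf("counter index %d changed state, reprogram it\n", bit);
        }
        return 0;
    }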
@@ -82,15 +98,12 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc)
 
 		/* disable event that reported as not present by cpuid */
 		if ((i < 7) && !(pmu->available_event_types & (1 << i)))
-			return PERF_COUNT_HW_MAX + 1;
+			return false;
 
 		break;
 	}
 
-	if (i == ARRAY_SIZE(intel_arch_events))
-		return PERF_COUNT_HW_MAX;
-
-	return intel_arch_events[i].event_type;
+	return true;
 }
 
 /* check if a PMC is enabled by comparing it with globl_ctrl bits. */
@@ -98,21 +111,12 @@ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc)
 {
 	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
 
+	if (!intel_pmu_has_perf_global_ctrl(pmu))
+		return true;
+
 	return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
 }
 
-static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
-{
-	if (pmc_idx < INTEL_PMC_IDX_FIXED)
-		return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx,
-				  MSR_P6_EVNTSEL0);
-	else {
-		u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED;
-
-		return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0);
-	}
-}
-
 static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
 {
 	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -167,16 +171,6 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr)
 	return get_gp_pmc(pmu, msr, MSR_IA32_PMC0);
 }
 
-bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu)
-{
-	/*
-	 * As a first step, a guest could only enable LBR feature if its
-	 * cpu model is the same as the host because the LBR registers
-	 * would be pass-through to the guest and they're model specific.
-	 */
-	return boot_cpu_data.x86_model == guest_cpuid_model(vcpu);
-}
-
 bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu)
 {
 	struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu);
@@ -205,6 +199,7 @@ static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index)
 static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
 {
 	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	u64 perf_capabilities;
 	int ret;
 
 	switch (msr) {
@@ -212,7 +207,18 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
 	case MSR_CORE_PERF_GLOBAL_STATUS:
 	case MSR_CORE_PERF_GLOBAL_CTRL:
 	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
-		ret = pmu->version > 1;
+		return intel_pmu_has_perf_global_ctrl(pmu);
 		break;
+	case MSR_IA32_PEBS_ENABLE:
+		ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT;
+		break;
+	case MSR_IA32_DS_AREA:
+		ret = guest_cpuid_has(vcpu, X86_FEATURE_DS);
+		break;
+	case MSR_PEBS_DATA_CFG:
+		perf_capabilities = vcpu_get_perf_capabilities(vcpu);
+		ret = (perf_capabilities & PERF_CAP_PEBS_BASELINE) &&
+			((perf_capabilities & PERF_CAP_PEBS_FORMAT) > 3);
+		break;
 	default:
 		ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
@@ -361,6 +367,15 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
 		msr_info->data = 0;
 		return 0;
+	case MSR_IA32_PEBS_ENABLE:
+		msr_info->data = pmu->pebs_enable;
+		return 0;
+	case MSR_IA32_DS_AREA:
+		msr_info->data = pmu->ds_area;
+		return 0;
+	case MSR_PEBS_DATA_CFG:
+		msr_info->data = pmu->pebs_data_cfg;
+		return 0;
 	default:
 		if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
 		    (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
@@ -395,7 +410,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_CORE_PERF_FIXED_CTR_CTRL:
 		if (pmu->fixed_ctr_ctrl == data)
 			return 0;
-		if (!(data & 0xfffffffffffff444ull)) {
+		if (!(data & pmu->fixed_ctr_ctrl_mask)) {
 			reprogram_fixed_counters(pmu, data);
 			return 0;
 		}
@@ -421,6 +436,29 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 0;
 		}
 		break;
+	case MSR_IA32_PEBS_ENABLE:
+		if (pmu->pebs_enable == data)
+			return 0;
+		if (!(data & pmu->pebs_enable_mask)) {
+			pmu->pebs_enable = data;
+			return 0;
+		}
+		break;
+	case MSR_IA32_DS_AREA:
+		if (msr_info->host_initiated && data && !guest_cpuid_has(vcpu, X86_FEATURE_DS))
+			return 1;
+		if (is_noncanonical_address(data, vcpu))
+			return 1;
+		pmu->ds_area = data;
+		return 0;
+	case MSR_PEBS_DATA_CFG:
+		if (pmu->pebs_data_cfg == data)
+			return 0;
+		if (!(data & pmu->pebs_data_cfg_mask)) {
+			pmu->pebs_data_cfg = data;
+			return 0;
+		}
+		break;
 	default:
 		if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
 		    (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
@@ -445,7 +483,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			    (pmu->raw_event_mask & HSW_IN_TX_CHECKPOINTED))
 				reserved_bits ^= HSW_IN_TX_CHECKPOINTED;
 			if (!(data & reserved_bits)) {
-				reprogram_gp_counter(pmc, data);
+				pmc->eventsel = data;
+				reprogram_counter(pmc);
 				return 0;
 			}
 		} else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false))
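The new PEBS and DS-area MSR handlers above follow the same validation pattern as the existing fixed-counter control write: the refresh path precomputes a mask of bits the guest may not set, and a write is accepted only if it touches none of them. A generic sketch of that check, with an assumed mask value:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Accept the write only if no reserved/unsupported bit is set. */
    static bool msr_write_ok(uint64_t data, uint64_t reserved_mask)
    {
        return (data & reserved_mask) == 0;
    }

    int main(void)
    {
        /* Pretend only the low 4 enable bits are implemented. */
        uint64_t reserved_mask = ~0xfull;

        printf("%d\n", msr_write_ok(0x3, reserved_mask));   /* 1: accepted */
        printf("%d\n", msr_write_ok(0x100, reserved_mask)); /* 0: rejected, guest gets #GP */
        return 0;
    }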
@@ -474,11 +513,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
 {
 	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
 	struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
-
-	struct x86_pmu_capability x86_pmu;
 	struct kvm_cpuid_entry2 *entry;
 	union cpuid10_eax eax;
 	union cpuid10_edx edx;
+	u64 perf_capabilities;
+	u64 counter_mask;
+	int i;
 
 	pmu->nr_arch_gp_counters = 0;
 	pmu->nr_arch_fixed_counters = 0;
@@ -487,8 +527,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
 	pmu->version = 0;
 	pmu->reserved_bits = 0xffffffff00200000ull;
 	pmu->raw_event_mask = X86_RAW_EVENT_MASK;
+	pmu->global_ctrl_mask = ~0ull;
+	pmu->global_ovf_ctrl_mask = ~0ull;
+	pmu->fixed_ctr_ctrl_mask = ~0ull;
+	pmu->pebs_enable_mask = ~0ull;
+	pmu->pebs_data_cfg_mask = ~0ull;
 
-	entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
+	entry = kvm_find_cpuid_entry(vcpu, 0xa);
 	if (!entry || !vcpu->kvm->arch.enable_pmu)
 		return;
 	eax.full = entry->eax;
@@ -498,13 +543,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
 	if (!pmu->version)
 		return;
 
-	perf_get_x86_pmu_capability(&x86_pmu);
-
 	pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
-					 x86_pmu.num_counters_gp);
-	eax.split.bit_width = min_t(int, eax.split.bit_width, x86_pmu.bit_width_gp);
+					 kvm_pmu_cap.num_counters_gp);
+	eax.split.bit_width = min_t(int, eax.split.bit_width,
+				    kvm_pmu_cap.bit_width_gp);
 	pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1;
-	eax.split.mask_length = min_t(int, eax.split.mask_length, x86_pmu.events_mask_len);
+	eax.split.mask_length = min_t(int, eax.split.mask_length,
+				      kvm_pmu_cap.events_mask_len);
 	pmu->available_event_types = ~entry->ebx &
 					((1ull << eax.split.mask_length) - 1);
 
@@ -514,17 +559,19 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
 		pmu->nr_arch_fixed_counters =
 			min3(ARRAY_SIZE(fixed_pmc_events),
 			     (size_t) edx.split.num_counters_fixed,
-			     (size_t) x86_pmu.num_counters_fixed);
-		edx.split.bit_width_fixed = min_t(int,
-			edx.split.bit_width_fixed, x86_pmu.bit_width_fixed);
+			     (size_t)kvm_pmu_cap.num_counters_fixed);
+		edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed,
+						  kvm_pmu_cap.bit_width_fixed);
 		pmu->counter_bitmask[KVM_PMC_FIXED] =
 			((u64)1 << edx.split.bit_width_fixed) - 1;
 		setup_fixed_pmc_eventsel(pmu);
 	}
 
-	pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) |
-		(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
-	pmu->global_ctrl_mask = ~pmu->global_ctrl;
+	for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
+		pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4));
+	counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
+		(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED));
+	pmu->global_ctrl_mask = counter_mask;
 	pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask
 			& ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF |
 			    MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD);
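counter_mask above is the complement of every architecturally valid counter-enable bit in GLOBAL_CTRL: general-purpose counters occupy the low bits and fixed counters start at bit 32 (INTEL_PMC_IDX_FIXED). A worked example of that arithmetic with assumed counter counts:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        int nr_gp = 8, nr_fixed = 3;    /* assumed counter counts */
        const int fixed_idx_base = 32;  /* fixed counters start at bit 32 of GLOBAL_CTRL */

        uint64_t valid = ((1ull << nr_gp) - 1) |
                         (((1ull << nr_fixed) - 1) << fixed_idx_base);
        uint64_t counter_mask = ~valid; /* bits a guest write must not set */

        printf("valid bits:   0x%016" PRIx64 "\n", valid);        /* 0x00000007000000ff */
        printf("counter mask: 0x%016" PRIx64 "\n", counter_mask); /* 0xfffffff8ffffff00 */
        return 0;
    }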
@@ -532,7 +579,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
 		pmu->global_ovf_ctrl_mask &=
 				~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI;
 
-	entry = kvm_find_cpuid_entry(vcpu, 7, 0);
+	entry = kvm_find_cpuid_entry_index(vcpu, 7, 0);
 	if (entry &&
 	    (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
 	    (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) {
@@ -545,16 +592,29 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
 	bitmap_set(pmu->all_valid_pmc_idx,
 		INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters);
 
-	nested_vmx_pmu_refresh(vcpu,
-			       intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL));
-
-	if (intel_pmu_lbr_is_compatible(vcpu))
+	if (cpuid_model_is_consistent(vcpu))
 		x86_perf_get_lbr(&lbr_desc->records);
 	else
 		lbr_desc->records.nr = 0;
 
 	if (lbr_desc->records.nr)
 		bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1);
+
+	perf_capabilities = vcpu_get_perf_capabilities(vcpu);
+	if (perf_capabilities & PERF_CAP_PEBS_FORMAT) {
+		if (perf_capabilities & PERF_CAP_PEBS_BASELINE) {
+			pmu->pebs_enable_mask = counter_mask;
+			pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE;
+			for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+				pmu->fixed_ctr_ctrl_mask &=
+					~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4));
+			}
+			pmu->pebs_data_cfg_mask = ~0xff00000full;
+		} else {
+			pmu->pebs_enable_mask =
+				~((1ull << pmu->nr_arch_gp_counters) - 1);
+		}
+	}
 }
 
 static void intel_pmu_init(struct kvm_vcpu *vcpu)
@@ -719,8 +779,28 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
 	intel_pmu_release_guest_lbr_event(vcpu);
 }
 
+void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu)
+{
+	struct kvm_pmc *pmc = NULL;
+	int bit;
+
+	for_each_set_bit(bit, (unsigned long *)&pmu->global_ctrl,
+			 X86_PMC_IDX_MAX) {
+		pmc = intel_pmc_idx_to_pmc(pmu, bit);
+
+		if (!pmc || !pmc_speculative_in_use(pmc) ||
+		    !intel_pmc_is_enabled(pmc))
+			continue;
+
+		if (pmc->perf_event && pmc->idx != pmc->perf_event->hw.idx) {
+			pmu->host_cross_mapped_mask |=
+				BIT_ULL(pmc->perf_event->hw.idx);
+		}
+	}
+}
+
 struct kvm_pmu_ops intel_pmu_ops __initdata = {
-	.pmc_perf_hw_id = intel_pmc_perf_hw_id,
+	.hw_event_available = intel_hw_event_available,
 	.pmc_is_enabled = intel_pmc_is_enabled,
 	.pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
 	.rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc,

@@ -34,7 +34,7 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
 	return &(to_vmx(vcpu)->pi_desc);
 }
 
-static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new)
+static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new)
 {
 	/*
 	 * PID.ON can be set at any time by a different vCPU or by hardware,
@@ -42,7 +42,7 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new)
 	 * update must be retried with a fresh snapshot an ON change causes
 	 * the cmpxchg to fail.
 	 */
-	if (cmpxchg64(&pi_desc->control, old, new) != old)
+	if (!try_cmpxchg64(&pi_desc->control, pold, new))
 		return -EBUSY;
 
 	return 0;
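The switch from cmpxchg64() to try_cmpxchg64() above changes the helper to take a pointer to the expected value; on failure the current value is written back through that pointer, so the caller's retry loop no longer has to re-read the descriptor itself. C11 atomics have the same shape; a minimal userspace sketch of the pattern, with a purely illustrative field layout:

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static _Atomic uint64_t control;

    /* Atomically set the low byte (a stand-in for the notification-vector field),
     * preserving whatever other bits concurrent writers may have set. */
    static void set_low_byte(uint8_t nv)
    {
        uint64_t old = atomic_load(&control);
        uint64_t new;

        do {
            new = (old & ~0xffull) | nv;
            /* On failure, 'old' is refreshed with the current value automatically. */
        } while (!atomic_compare_exchange_weak(&control, &old, new));
    }

    int main(void)
    {
        atomic_store(&control, 0xabcd00ull);
        set_low_byte(0xf2);
        printf("0x%llx\n", (unsigned long long)atomic_load(&control)); /* 0xabcdf2 */
        return 0;
    }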
@@ -96,8 +96,9 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 	if (!x2apic_mode)
 		dest = (dest << 8) & 0xFF00;
 
+	old.control = READ_ONCE(pi_desc->control);
 	do {
-		old.control = new.control = READ_ONCE(pi_desc->control);
+		new.control = old.control;
 
 		/*
 		 * Clear SN (as above) and refresh the destination APIC ID to
@@ -111,7 +112,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 		 * descriptor was modified on "put" to use the wakeup vector.
 		 */
 		new.nv = POSTED_INTR_VECTOR;
-	} while (pi_try_set_control(pi_desc, old.control, new.control));
+	} while (pi_try_set_control(pi_desc, &old.control, new.control));
 
 	local_irq_restore(flags);
 
@@ -156,12 +157,12 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
 
 	WARN(pi_desc->sn, "PI descriptor SN field set before blocking");
 
+	old.control = READ_ONCE(pi_desc->control);
 	do {
-		old.control = new.control = READ_ONCE(pi_desc->control);
-
 		/* set 'NV' to 'wakeup vector' */
+		new.control = old.control;
 		new.nv = POSTED_INTR_WAKEUP_VECTOR;
-	} while (pi_try_set_control(pi_desc, old.control, new.control));
+	} while (pi_try_set_control(pi_desc, &old.control, new.control));
 
 	/*
 	 * Send a wakeup IPI to this CPU if an interrupt may have been posted
@@ -177,11 +178,24 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
 	local_irq_restore(flags);
 }
 
+static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * The default posted interrupt vector does nothing when
+	 * invoked outside guest mode. Return whether a blocked vCPU
+	 * can be the target of posted interrupts, as is the case when
+	 * using either IPI virtualization or VT-d PI, so that the
+	 * notification vector is switched to the one that calls
+	 * back to the pi_wakeup_handler() function.
+	 */
+	return vmx_can_use_ipiv(vcpu) || vmx_can_use_vtd_pi(vcpu->kvm);
+}
+
 void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
 {
 	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
 
-	if (!vmx_can_use_vtd_pi(vcpu->kvm))
+	if (!vmx_needs_pi_wakeup(vcpu))
 		return;
 
 	if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu))

@@ -5,6 +5,8 @@
 #define POSTED_INTR_ON 0
 #define POSTED_INTR_SN 1
 
+#define PID_TABLE_ENTRY_VALID 1
+
 /* Posted-Interrupt Descriptor */
 struct pi_desc {
 	u32 pir[8]; /* Posted interrupt requested */