KVM: s390: Updates for 5.13

- properly handle MVPG in nesting KVM (vsie)
 - allow to forward the yield_to hypercall (diagnose 9c)
 - fixes
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v2.0.22 (GNU/Linux)
 
 iQIcBAABAgAGBQJgdFvgAAoJEBF7vIC1phx8+IIP/0OdF4I5VqBJ1C9Roc3l4P+4
 b95OZX4nBLQ0L1JnPMeJqNo3V6JH/5356dwpIplQXv5wraS3+sQGX2D1xW00QnLE
 M6L3368uT30JmEVWnnrulUdLWwUqExJ17BEX9p4rmJQAm+7rLOJsVsWIKwclupyR
 BacDMG2q5aG+/eaceimBdEPyfE6YHJzbtD9BEBe12/Y+B0PyCyinAOiGALcugDkY
 kSqdqBcHFqXJuF37DsQn2gSlBFGByfvWlaYa0dKhdGFp4ps3TDhmC+qyoBAjHJFu
 nzTNOFdjgMlatUe92OsgwqilV0OUgdNZ+deKSyGHdmht+RknuLsJU0LqCvN66cTA
 H58D5s3PrM8868e/bflX47Lt0fbJSA7ZXZqJuyP84tEqTgQmAH43VvQg8t9bybTp
 dY2UUx19ZHpktVjL+FIylUcxyLXFSX8KTI0a/JxlMUUjE+NAaB22iCyBMMIoogSj
 ozqKGq7VwPJftoxLiUaGEUL4NyXlo7+XivZNTHFIjh0sjDZooH9IZ9LK/17684ra
 GLCAnw2hhB4xegNPuJWawo/vNJ5dAtiKVQ6Hwgr6ORaCEBLGtIlyYhm1XYAwb7f4
 vAfQ60lqbL1dpGtKnf4cMySrgNczotura4KPreXkDJ68eqNJCjbDUVnN+0XsBIC8
 7+SaOJRmJRd0VzeEPBg3
 =8wV0
 -----END PGP SIGNATURE-----

Merge tag 'kvm-s390-next-5.13-1' of git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux into HEAD

KVM: s390: Updates for 5.13

- properly handle MVPG in nesting KVM (vsie)
- allow to forward the yield_to hypercall (diagnose 9c)
- fixes
This commit is contained in:
Paolo Bonzini 2021-04-15 13:02:13 -04:00
commit 6c377b02a8
10 changed files with 250 additions and 32 deletions

View File

@ -84,3 +84,36 @@ If the function code specifies 0x501, breakpoint functions may be performed.
This function code is handled by userspace.
This diagnose function code has no subfunctions and uses no parameters.
DIAGNOSE function code 'X'9C - Voluntary Time Slice Yield
---------------------------------------------------------
General register 1 contains the target CPU address.
In a guest of a hypervisor like LPAR, KVM or z/VM using shared host CPUs,
DIAGNOSE with function code 0x9c may improve system performance by
yielding the host CPU on which the guest CPU is running to be assigned
to another guest CPU, preferably the logical CPU containing the specified
target CPU.
DIAG 'X'9C forwarding
+++++++++++++++++++++
The guest may send a DIAGNOSE 0x9c in order to yield to a certain
other vcpu. An example is a Linux guest that tries to yield to the vcpu
that is currently holding a spinlock, but not running.
However, on the host the real cpu backing the vcpu may itself not be
running.
Forwarding the DIAGNOSE 0x9c initially sent by the guest to yield to
the backing cpu will hopefully cause that cpu, and thus subsequently
the guest's vcpu, to be scheduled.
diag9c_forwarding_hz
KVM kernel parameter allowing to specify the maximum number of DIAGNOSE
0x9c forwarding per second in the purpose of avoiding a DIAGNOSE 0x9c
forwarding storm.
A value of 0 turns the forwarding off.

View File

@ -454,6 +454,7 @@ struct kvm_vcpu_stat {
u64 diagnose_44;
u64 diagnose_9c;
u64 diagnose_9c_ignored;
u64 diagnose_9c_forward;
u64 diagnose_258;
u64 diagnose_308;
u64 diagnose_500;

View File

@ -63,5 +63,6 @@ extern void __noreturn cpu_die(void);
extern void __cpu_die(unsigned int cpu);
extern int __cpu_disable(void);
extern void schedule_mcck_handler(void);
void notrace smp_yield_cpu(int cpu);
#endif /* __ASM_SMP_H */

View File

@ -429,6 +429,7 @@ void notrace smp_yield_cpu(int cpu)
asm volatile("diag %0,0,0x9c"
: : "d" (pcpu_devices[cpu].address));
}
EXPORT_SYMBOL_GPL(smp_yield_cpu);
/*
* Send cpus emergency shutdown signal. This gives the cpus the

View File

@ -150,6 +150,19 @@ static int __diag_time_slice_end(struct kvm_vcpu *vcpu)
return 0;
}
static int forward_cnt;
static unsigned long cur_slice;
static int diag9c_forwarding_overrun(void)
{
/* Reset the count on a new slice */
if (time_after(jiffies, cur_slice)) {
cur_slice = jiffies;
forward_cnt = diag9c_forwarding_hz / HZ;
}
return forward_cnt-- <= 0 ? 1 : 0;
}
static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
{
struct kvm_vcpu *tcpu;
@ -167,9 +180,21 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
if (!tcpu)
goto no_yield;
/* target already running */
if (READ_ONCE(tcpu->cpu) >= 0)
goto no_yield;
/* target guest VCPU already running */
if (READ_ONCE(tcpu->cpu) >= 0) {
if (!diag9c_forwarding_hz || diag9c_forwarding_overrun())
goto no_yield;
/* target host CPU already running */
if (!vcpu_is_preempted(tcpu->cpu))
goto no_yield;
smp_yield_cpu(tcpu->cpu);
VCPU_EVENT(vcpu, 5,
"diag time slice end directed to %d: yield forwarded",
tid);
vcpu->stat.diagnose_9c_forward++;
return 0;
}
if (kvm_vcpu_yield_to(tcpu) <= 0)
goto no_yield;

View File

@ -976,7 +976,9 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra)
* kvm_s390_shadow_tables - walk the guest page table and create shadow tables
* @sg: pointer to the shadow guest address space structure
* @saddr: faulting address in the shadow gmap
* @pgt: pointer to the page table address result
* @pgt: pointer to the beginning of the page table for the given address if
* successful (return value 0), or to the first invalid DAT entry in
* case of exceptions (return value > 0)
* @fake: pgt references contiguous guest memory block, not a pgtable
*/
static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
@ -1034,6 +1036,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
rfte.val = ptr;
goto shadow_r2t;
}
*pgt = ptr + vaddr.rfx * 8;
rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val);
if (rc)
return rc;
@ -1060,6 +1063,7 @@ shadow_r2t:
rste.val = ptr;
goto shadow_r3t;
}
*pgt = ptr + vaddr.rsx * 8;
rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val);
if (rc)
return rc;
@ -1087,6 +1091,7 @@ shadow_r3t:
rtte.val = ptr;
goto shadow_sgt;
}
*pgt = ptr + vaddr.rtx * 8;
rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val);
if (rc)
return rc;
@ -1123,6 +1128,7 @@ shadow_sgt:
ste.val = ptr;
goto shadow_pgt;
}
*pgt = ptr + vaddr.sx * 8;
rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val);
if (rc)
return rc;
@ -1157,6 +1163,8 @@ shadow_pgt:
* @vcpu: virtual cpu
* @sg: pointer to the shadow guest address space structure
* @saddr: faulting address in the shadow gmap
* @datptr: will contain the address of the faulting DAT table entry, or of
* the valid leaf, plus some flags
*
* Returns: - 0 if the shadow fault was successfully resolved
* - > 0 (pgm exception code) on exceptions while faulting
@ -1165,11 +1173,11 @@ shadow_pgt:
* - -ENOMEM if out of memory
*/
int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
unsigned long saddr)
unsigned long saddr, unsigned long *datptr)
{
union vaddress vaddr;
union page_table_entry pte;
unsigned long pgt;
unsigned long pgt = 0;
int dat_protection, fake;
int rc;
@ -1191,8 +1199,20 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
pte.val = pgt + vaddr.px * PAGE_SIZE;
goto shadow_page;
}
if (!rc)
rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val);
switch (rc) {
case PGM_SEGMENT_TRANSLATION:
case PGM_REGION_THIRD_TRANS:
case PGM_REGION_SECOND_TRANS:
case PGM_REGION_FIRST_TRANS:
pgt |= PEI_NOT_PTE;
break;
case 0:
pgt += vaddr.px * 8;
rc = gmap_read_table(sg->parent, pgt, &pte.val);
}
if (datptr)
*datptr = pgt | dat_protection * PEI_DAT_PROT;
if (!rc && pte.i)
rc = PGM_PAGE_TRANSLATION;
if (!rc && pte.z)

View File

@ -16,6 +16,23 @@
#include <linux/ptrace.h>
#include "kvm-s390.h"
/**
* kvm_s390_real_to_abs - convert guest real address to guest absolute address
* @prefix - guest prefix
* @gra - guest real address
*
* Returns the guest absolute address that corresponds to the passed guest real
* address @gra of by applying the given prefix.
*/
static inline unsigned long _kvm_s390_real_to_abs(u32 prefix, unsigned long gra)
{
if (gra < 2 * PAGE_SIZE)
gra += prefix;
else if (gra >= prefix && gra < prefix + 2 * PAGE_SIZE)
gra -= prefix;
return gra;
}
/**
* kvm_s390_real_to_abs - convert guest real address to guest absolute address
* @vcpu - guest virtual cpu
@ -27,13 +44,30 @@
static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
unsigned long gra)
{
unsigned long prefix = kvm_s390_get_prefix(vcpu);
return _kvm_s390_real_to_abs(kvm_s390_get_prefix(vcpu), gra);
}
if (gra < 2 * PAGE_SIZE)
gra += prefix;
else if (gra >= prefix && gra < prefix + 2 * PAGE_SIZE)
gra -= prefix;
return gra;
/**
* _kvm_s390_logical_to_effective - convert guest logical to effective address
* @psw: psw of the guest
* @ga: guest logical address
*
* Convert a guest logical address to an effective address by applying the
* rules of the addressing mode defined by bits 31 and 32 of the given PSW
* (extendended/basic addressing mode).
*
* Depending on the addressing mode, the upper 40 bits (24 bit addressing
* mode), 33 bits (31 bit addressing mode) or no bits (64 bit addressing
* mode) of @ga will be zeroed and the remaining bits will be returned.
*/
static inline unsigned long _kvm_s390_logical_to_effective(psw_t *psw,
unsigned long ga)
{
if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT)
return ga;
if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT)
return ga & ((1UL << 31) - 1);
return ga & ((1UL << 24) - 1);
}
/**
@ -52,13 +86,7 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu,
static inline unsigned long kvm_s390_logical_to_effective(struct kvm_vcpu *vcpu,
unsigned long ga)
{
psw_t *psw = &vcpu->arch.sie_block->gpsw;
if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT)
return ga;
if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT)
return ga & ((1UL << 31) - 1);
return ga & ((1UL << 24) - 1);
return _kvm_s390_logical_to_effective(&vcpu->arch.sie_block->gpsw, ga);
}
/*
@ -359,7 +387,11 @@ void ipte_unlock(struct kvm_vcpu *vcpu);
int ipte_lock_held(struct kvm_vcpu *vcpu);
int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
/* MVPG PEI indication bits */
#define PEI_DAT_PROT 2
#define PEI_NOT_PTE 4
int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow,
unsigned long saddr);
unsigned long saddr, unsigned long *datptr);
#endif /* __KVM_S390_GACCESS_H */

View File

@ -158,6 +158,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
VCPU_STAT("instruction_diag_44", diagnose_44),
VCPU_STAT("instruction_diag_9c", diagnose_9c),
VCPU_STAT("diag_9c_ignored", diagnose_9c_ignored),
VCPU_STAT("diag_9c_forward", diagnose_9c_forward),
VCPU_STAT("instruction_diag_258", diagnose_258),
VCPU_STAT("instruction_diag_308", diagnose_308),
VCPU_STAT("instruction_diag_500", diagnose_500),
@ -185,6 +186,11 @@ static bool use_gisa = true;
module_param(use_gisa, bool, 0644);
MODULE_PARM_DESC(use_gisa, "Use the GISA if the host supports it.");
/* maximum diag9c forwarding per second */
unsigned int diag9c_forwarding_hz;
module_param(diag9c_forwarding_hz, uint, 0644);
MODULE_PARM_DESC(diag9c_forwarding_hz, "Maximum diag9c forwarding per second, 0 to turn off");
/*
* For now we handle at most 16 double words as this is what the s390 base
* kernel handles and stores in the prefix page. If we ever need to go beyond
@ -4542,7 +4548,7 @@ int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
/*
* As we are starting a second VCPU, we have to disable
* the IBS facility on all VCPUs to remove potentially
* oustanding ENABLE requests.
* outstanding ENABLE requests.
*/
__disable_ibs_on_all_vcpus(vcpu->kvm);
}

View File

@ -471,4 +471,12 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu,
* @kvm: the KVM guest
*/
void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm);
/**
* diag9c_forwarding_hz
*
* Set the maximum number of diag9c forwarding per second
*/
extern unsigned int diag9c_forwarding_hz;
#endif

View File

@ -417,11 +417,6 @@ static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
memcpy((void *)((u64)scb_o + 0xc0),
(void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
break;
case ICPT_PARTEXEC:
/* MVPG only */
memcpy((void *)((u64)scb_o + 0xc0),
(void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0);
break;
}
if (scb_s->ihcpu != 0xffffU)
@ -620,10 +615,10 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
/* with mso/msl, the prefix lies at offset *mso* */
prefix += scb_s->mso;
rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix);
rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL);
if (!rc && (scb_s->ecb & ECB_TE))
rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
prefix + PAGE_SIZE);
prefix + PAGE_SIZE, NULL);
/*
* We don't have to mprotect, we will be called for all unshadows.
* SIE will detect if protection applies and trigger a validity.
@ -914,7 +909,7 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
current->thread.gmap_addr, 1);
rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
current->thread.gmap_addr);
current->thread.gmap_addr, NULL);
if (rc > 0) {
rc = inject_fault(vcpu, rc,
current->thread.gmap_addr,
@ -936,7 +931,7 @@ static void handle_last_fault(struct kvm_vcpu *vcpu,
{
if (vsie_page->fault_addr)
kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
vsie_page->fault_addr);
vsie_page->fault_addr, NULL);
vsie_page->fault_addr = 0;
}
@ -983,6 +978,98 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
return 0;
}
/*
* Get a register for a nested guest.
* @vcpu the vcpu of the guest
* @vsie_page the vsie_page for the nested guest
* @reg the register number, the upper 4 bits are ignored.
* returns: the value of the register.
*/
static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, u8 reg)
{
/* no need to validate the parameter and/or perform error handling */
reg &= 0xf;
switch (reg) {
case 15:
return vsie_page->scb_s.gg15;
case 14:
return vsie_page->scb_s.gg14;
default:
return vcpu->run->s.regs.gprs[reg];
}
}
static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
{
struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
unsigned long pei_dest, pei_src, src, dest, mask, prefix;
u64 *pei_block = &vsie_page->scb_o->mcic;
int edat, rc_dest, rc_src;
union ctlreg0 cr0;
cr0.val = vcpu->arch.sie_block->gcr[0];
edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
mask = _kvm_s390_logical_to_effective(&scb_s->gpsw, PAGE_MASK);
prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
dest = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 20) & mask;
dest = _kvm_s390_real_to_abs(prefix, dest) + scb_s->mso;
src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask;
src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso;
rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest);
rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src);
/*
* Either everything went well, or something non-critical went wrong
* e.g. because of a race. In either case, simply retry.
*/
if (rc_dest == -EAGAIN || rc_src == -EAGAIN || (!rc_dest && !rc_src)) {
retry_vsie_icpt(vsie_page);
return -EAGAIN;
}
/* Something more serious went wrong, propagate the error */
if (rc_dest < 0)
return rc_dest;
if (rc_src < 0)
return rc_src;
/* The only possible suppressing exception: just deliver it */
if (rc_dest == PGM_TRANSLATION_SPEC || rc_src == PGM_TRANSLATION_SPEC) {
clear_vsie_icpt(vsie_page);
rc_dest = kvm_s390_inject_program_int(vcpu, PGM_TRANSLATION_SPEC);
WARN_ON_ONCE(rc_dest);
return 1;
}
/*
* Forward the PEI intercept to the guest if it was a page fault, or
* also for segment and region table faults if EDAT applies.
*/
if (edat) {
rc_dest = rc_dest == PGM_ASCE_TYPE ? rc_dest : 0;
rc_src = rc_src == PGM_ASCE_TYPE ? rc_src : 0;
} else {
rc_dest = rc_dest != PGM_PAGE_TRANSLATION ? rc_dest : 0;
rc_src = rc_src != PGM_PAGE_TRANSLATION ? rc_src : 0;
}
if (!rc_dest && !rc_src) {
pei_block[0] = pei_dest;
pei_block[1] = pei_src;
return 1;
}
retry_vsie_icpt(vsie_page);
/*
* The host has edat, and the guest does not, or it was an ASCE type
* exception. The host needs to inject the appropriate DAT interrupts
* into the guest.
*/
if (rc_dest)
return inject_fault(vcpu, rc_dest, dest, 1);
return inject_fault(vcpu, rc_src, src, 0);
}
/*
* Run the vsie on a shadow scb and a shadow gmap, without any further
* sanity checks, handling SIE faults.
@ -1071,6 +1158,10 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
if ((scb_s->ipa & 0xf000) != 0xf000)
scb_s->ipa += 0x1000;
break;
case ICPT_PARTEXEC:
if (scb_s->ipa == 0xb254)
rc = vsie_handle_mvpg(vcpu, vsie_page);
break;
}
return rc;
}