Fixes for PPC KVM:

- Close a hole which could possibly lead to the host timebase getting
   out of sync.
 
 - Three fixes relating to PTEs and TLB entries for radix guests.
 
 - Fix a bug which could lead to an interrupt never getting delivered
   to the guest, if it is pending for a guest vCPU when the vCPU gets
   offlined.
 -----BEGIN PGP SIGNATURE-----
 
 iQFGBAABCgAwFiEEv0VLfXa2m9eKuaRpnZrqdyxjcZ8FAlsGTWMSHHBhdWx1c0Bv
 emxhYnMub3JnAAoJEJ2a6ncsY3GfPKQH/3dopz+qjpZqvhgvqfC0wkLlGLcTxmKK
 +y77M5YStFEeytYB52hyrAs4KptM1If5+BfShX4tTzGY5MGS4RMvzY7tLNzLlmFg
 S/ghzlFCh4dIz+LTk58FIyFmyn7GrvJRP33FoiAPCCp1AkRL7MlSD5cu3N6fHo6P
 GU5lHLLyaGEIkC4KxLQdr4smV3tKNk1k6iz4eMHwDOeLoxcLnz0LbiM7xBr/Txmu
 miF68B29hU/peKM/GbtSAh5TpWY6WlcPTBUEiHXghcuYmXqgW43fjGleuL330mN4
 HtSONLuapa6VNSJy3UuGBlI1puIEbUrtTPfy0UxKQG3Em7L8UnxO2wk=
 =7/7b
 -----END PGP SIGNATURE-----

Merge tag 'kvm-ppc-fixes-4.17-1' of git://git.kernel.org/pub/scm/linux/kernel/git/paulus/powerpc

Fixes for PPC KVM:

- Close a hole which could possibly lead to the host timebase getting
  out of sync.

- Three fixes relating to PTEs and TLB entries for radix guests.

- Fix a bug which could lead to an interrupt never getting delivered
  to the guest, if it is pending for a guest vCPU when the vCPU gets
  offlined.
This commit is contained in:
Radim Krčmář 2018-05-24 16:48:05 +02:00
commit b09efdc250
6 changed files with 159 additions and 55 deletions

View File

@ -96,6 +96,7 @@ struct kvmppc_vcore {
struct kvm_vcpu *runner; struct kvm_vcpu *runner;
struct kvm *kvm; struct kvm *kvm;
u64 tb_offset; /* guest timebase - host timebase */ u64 tb_offset; /* guest timebase - host timebase */
u64 tb_offset_applied; /* timebase offset currently in force */
ulong lpcr; ulong lpcr;
u32 arch_compat; u32 arch_compat;
ulong pcr; ulong pcr;

View File

@ -562,6 +562,7 @@ int main(void)
OFFSET(VCORE_NAPPING_THREADS, kvmppc_vcore, napping_threads); OFFSET(VCORE_NAPPING_THREADS, kvmppc_vcore, napping_threads);
OFFSET(VCORE_KVM, kvmppc_vcore, kvm); OFFSET(VCORE_KVM, kvmppc_vcore, kvm);
OFFSET(VCORE_TB_OFFSET, kvmppc_vcore, tb_offset); OFFSET(VCORE_TB_OFFSET, kvmppc_vcore, tb_offset);
OFFSET(VCORE_TB_OFFSET_APPL, kvmppc_vcore, tb_offset_applied);
OFFSET(VCORE_LPCR, kvmppc_vcore, lpcr); OFFSET(VCORE_LPCR, kvmppc_vcore, lpcr);
OFFSET(VCORE_PCR, kvmppc_vcore, pcr); OFFSET(VCORE_PCR, kvmppc_vcore, pcr);
OFFSET(VCORE_DPDES, kvmppc_vcore, dpdes); OFFSET(VCORE_DPDES, kvmppc_vcore, dpdes);

View File

@ -162,7 +162,7 @@ static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG))
asm volatile(PPC_TLBIE_5(%0, %1, 0, 0, 1) asm volatile(PPC_TLBIE_5(%0, %1, 0, 0, 1)
: : "r" (addr), "r" (kvm->arch.lpid) : "memory"); : : "r" (addr), "r" (kvm->arch.lpid) : "memory");
asm volatile("ptesync": : :"memory"); asm volatile("eieio ; tlbsync ; ptesync": : :"memory");
} }
static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned long addr) static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned long addr)
@ -173,7 +173,7 @@ static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned long addr)
/* RIC=1 PRS=0 R=1 IS=2 */ /* RIC=1 PRS=0 R=1 IS=2 */
asm volatile(PPC_TLBIE_5(%0, %1, 1, 0, 1) asm volatile(PPC_TLBIE_5(%0, %1, 1, 0, 1)
: : "r" (rb), "r" (kvm->arch.lpid) : "memory"); : : "r" (rb), "r" (kvm->arch.lpid) : "memory");
asm volatile("ptesync": : :"memory"); asm volatile("eieio ; tlbsync ; ptesync": : :"memory");
} }
unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
@ -584,7 +584,7 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
if (ptep && pte_present(*ptep)) { if (ptep && pte_present(*ptep)) {
old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0, old = kvmppc_radix_update_pte(kvm, ptep, ~0UL, 0,
gpa, shift); gpa, shift);
kvmppc_radix_tlbie_page(kvm, gpa, shift); kvmppc_radix_tlbie_page(kvm, gpa, shift);
if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) { if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) {

View File

@ -2441,6 +2441,7 @@ static void init_vcore_to_run(struct kvmppc_vcore *vc)
vc->in_guest = 0; vc->in_guest = 0;
vc->napping_threads = 0; vc->napping_threads = 0;
vc->conferring_threads = 0; vc->conferring_threads = 0;
vc->tb_offset_applied = 0;
} }
static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)

View File

@ -692,6 +692,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
22: ld r8,VCORE_TB_OFFSET(r5) 22: ld r8,VCORE_TB_OFFSET(r5)
cmpdi r8,0 cmpdi r8,0
beq 37f beq 37f
std r8, VCORE_TB_OFFSET_APPL(r5)
mftb r6 /* current host timebase */ mftb r6 /* current host timebase */
add r8,r8,r6 add r8,r8,r6
mtspr SPRN_TBU40,r8 /* update upper 40 bits */ mtspr SPRN_TBU40,r8 /* update upper 40 bits */
@ -940,18 +941,6 @@ FTR_SECTION_ELSE
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
8: 8:
/*
* Set the decrementer to the guest decrementer.
*/
ld r8,VCPU_DEC_EXPIRES(r4)
/* r8 is a host timebase value here, convert to guest TB */
ld r5,HSTATE_KVM_VCORE(r13)
ld r6,VCORE_TB_OFFSET(r5)
add r8,r8,r6
mftb r7
subf r3,r7,r8
mtspr SPRN_DEC,r3
ld r5, VCPU_SPRG0(r4) ld r5, VCPU_SPRG0(r4)
ld r6, VCPU_SPRG1(r4) ld r6, VCPU_SPRG1(r4)
ld r7, VCPU_SPRG2(r4) ld r7, VCPU_SPRG2(r4)
@ -1005,6 +994,18 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
mtspr SPRN_LPCR,r8 mtspr SPRN_LPCR,r8
isync isync
/*
* Set the decrementer to the guest decrementer.
*/
ld r8,VCPU_DEC_EXPIRES(r4)
/* r8 is a host timebase value here, convert to guest TB */
ld r5,HSTATE_KVM_VCORE(r13)
ld r6,VCORE_TB_OFFSET_APPL(r5)
add r8,r8,r6
mftb r7
subf r3,r7,r8
mtspr SPRN_DEC,r3
/* Check if HDEC expires soon */ /* Check if HDEC expires soon */
mfspr r3, SPRN_HDEC mfspr r3, SPRN_HDEC
EXTEND_HDEC(r3) EXTEND_HDEC(r3)
@ -1597,8 +1598,27 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
guest_bypass: guest_bypass:
stw r12, STACK_SLOT_TRAP(r1) stw r12, STACK_SLOT_TRAP(r1)
mr r3, r12
/* Save DEC */
/* Do this before kvmhv_commence_exit so we know TB is guest TB */
ld r3, HSTATE_KVM_VCORE(r13)
mfspr r5,SPRN_DEC
mftb r6
/* On P9, if the guest has large decr enabled, don't sign extend */
BEGIN_FTR_SECTION
ld r4, VCORE_LPCR(r3)
andis. r4, r4, LPCR_LD@h
bne 16f
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
extsw r5,r5
16: add r5,r5,r6
/* r5 is a guest timebase value here, convert to host TB */
ld r4,VCORE_TB_OFFSET_APPL(r3)
subf r5,r4,r5
std r5,VCPU_DEC_EXPIRES(r9)
/* Increment exit count, poke other threads to exit */ /* Increment exit count, poke other threads to exit */
mr r3, r12
bl kvmhv_commence_exit bl kvmhv_commence_exit
nop nop
ld r9, HSTATE_KVM_VCPU(r13) ld r9, HSTATE_KVM_VCPU(r13)
@ -1639,23 +1659,6 @@ guest_bypass:
mtspr SPRN_PURR,r3 mtspr SPRN_PURR,r3
mtspr SPRN_SPURR,r4 mtspr SPRN_SPURR,r4
/* Save DEC */
ld r3, HSTATE_KVM_VCORE(r13)
mfspr r5,SPRN_DEC
mftb r6
/* On P9, if the guest has large decr enabled, don't sign extend */
BEGIN_FTR_SECTION
ld r4, VCORE_LPCR(r3)
andis. r4, r4, LPCR_LD@h
bne 16f
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
extsw r5,r5
16: add r5,r5,r6
/* r5 is a guest timebase value here, convert to host TB */
ld r4,VCORE_TB_OFFSET(r3)
subf r5,r4,r5
std r5,VCPU_DEC_EXPIRES(r9)
BEGIN_FTR_SECTION BEGIN_FTR_SECTION
b 8f b 8f
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
@ -1905,6 +1908,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
cmpwi cr2, r0, 0 cmpwi cr2, r0, 0
beq cr2, 4f beq cr2, 4f
/*
* Radix: do eieio; tlbsync; ptesync sequence in case we
* interrupted the guest between a tlbie and a ptesync.
*/
eieio
tlbsync
ptesync
/* Radix: Handle the case where the guest used an illegal PID */ /* Radix: Handle the case where the guest used an illegal PID */
LOAD_REG_ADDR(r4, mmu_base_pid) LOAD_REG_ADDR(r4, mmu_base_pid)
lwz r3, VCPU_GUEST_PID(r9) lwz r3, VCPU_GUEST_PID(r9)
@ -2017,9 +2028,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
27: 27:
/* Subtract timebase offset from timebase */ /* Subtract timebase offset from timebase */
ld r8,VCORE_TB_OFFSET(r5) ld r8, VCORE_TB_OFFSET_APPL(r5)
cmpdi r8,0 cmpdi r8,0
beq 17f beq 17f
li r0, 0
std r0, VCORE_TB_OFFSET_APPL(r5)
mftb r6 /* current guest timebase */ mftb r6 /* current guest timebase */
subf r8,r8,r6 subf r8,r8,r6
mtspr SPRN_TBU40,r8 /* update upper 40 bits */ mtspr SPRN_TBU40,r8 /* update upper 40 bits */
@ -2700,7 +2713,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
add r3, r3, r5 add r3, r3, r5
ld r4, HSTATE_KVM_VCPU(r13) ld r4, HSTATE_KVM_VCPU(r13)
ld r5, HSTATE_KVM_VCORE(r13) ld r5, HSTATE_KVM_VCORE(r13)
ld r6, VCORE_TB_OFFSET(r5) ld r6, VCORE_TB_OFFSET_APPL(r5)
subf r3, r6, r3 /* convert to host TB value */ subf r3, r6, r3 /* convert to host TB value */
std r3, VCPU_DEC_EXPIRES(r4) std r3, VCPU_DEC_EXPIRES(r4)
@ -2799,7 +2812,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
/* Restore guest decrementer */ /* Restore guest decrementer */
ld r3, VCPU_DEC_EXPIRES(r4) ld r3, VCPU_DEC_EXPIRES(r4)
ld r5, HSTATE_KVM_VCORE(r13) ld r5, HSTATE_KVM_VCORE(r13)
ld r6, VCORE_TB_OFFSET(r5) ld r6, VCORE_TB_OFFSET_APPL(r5)
add r3, r3, r6 /* convert host TB to guest TB value */ add r3, r3, r6 /* convert host TB to guest TB value */
mftb r7 mftb r7
subf r3, r7, r3 subf r3, r7, r3
@ -3606,12 +3619,9 @@ kvmppc_fix_pmao:
*/ */
kvmhv_start_timing: kvmhv_start_timing:
ld r5, HSTATE_KVM_VCORE(r13) ld r5, HSTATE_KVM_VCORE(r13)
lbz r6, VCORE_IN_GUEST(r5) ld r6, VCORE_TB_OFFSET_APPL(r5)
cmpwi r6, 0 mftb r5
beq 5f /* if in guest, need to */ subf r5, r6, r5 /* subtract current timebase offset */
ld r6, VCORE_TB_OFFSET(r5) /* subtract timebase offset */
5: mftb r5
subf r5, r6, r5
std r3, VCPU_CUR_ACTIVITY(r4) std r3, VCPU_CUR_ACTIVITY(r4)
std r5, VCPU_ACTIVITY_START(r4) std r5, VCPU_ACTIVITY_START(r4)
blr blr
@ -3622,15 +3632,12 @@ kvmhv_start_timing:
*/ */
kvmhv_accumulate_time: kvmhv_accumulate_time:
ld r5, HSTATE_KVM_VCORE(r13) ld r5, HSTATE_KVM_VCORE(r13)
lbz r8, VCORE_IN_GUEST(r5) ld r8, VCORE_TB_OFFSET_APPL(r5)
cmpwi r8, 0 ld r5, VCPU_CUR_ACTIVITY(r4)
beq 4f /* if in guest, need to */
ld r8, VCORE_TB_OFFSET(r5) /* subtract timebase offset */
4: ld r5, VCPU_CUR_ACTIVITY(r4)
ld r6, VCPU_ACTIVITY_START(r4) ld r6, VCPU_ACTIVITY_START(r4)
std r3, VCPU_CUR_ACTIVITY(r4) std r3, VCPU_CUR_ACTIVITY(r4)
mftb r7 mftb r7
subf r7, r8, r7 subf r7, r8, r7 /* subtract current timebase offset */
std r7, VCPU_ACTIVITY_START(r4) std r7, VCPU_ACTIVITY_START(r4)
cmpdi r5, 0 cmpdi r5, 0
beqlr beqlr

View File

@ -11,6 +11,9 @@
#define XGLUE(a,b) a##b #define XGLUE(a,b) a##b
#define GLUE(a,b) XGLUE(a,b) #define GLUE(a,b) XGLUE(a,b)
/* Dummy interrupt used when taking interrupts out of a queue in H_CPPR */
#define XICS_DUMMY 1
static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc) static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc)
{ {
u8 cppr; u8 cppr;
@ -205,6 +208,10 @@ skip_ipi:
goto skip_ipi; goto skip_ipi;
} }
/* If it's the dummy interrupt, continue searching */
if (hirq == XICS_DUMMY)
goto skip_ipi;
/* If fetching, update queue pointers */ /* If fetching, update queue pointers */
if (scan_type == scan_fetch) { if (scan_type == scan_fetch) {
q->idx = idx; q->idx = idx;
@ -385,9 +392,76 @@ static void GLUE(X_PFX,push_pending_to_hw)(struct kvmppc_xive_vcpu *xc)
__x_writeb(prio, __x_tima + TM_SPC_SET_OS_PENDING); __x_writeb(prio, __x_tima + TM_SPC_SET_OS_PENDING);
} }
static void GLUE(X_PFX,scan_for_rerouted_irqs)(struct kvmppc_xive *xive,
struct kvmppc_xive_vcpu *xc)
{
unsigned int prio;
/* For each priority that is now masked */
for (prio = xc->cppr; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
struct xive_q *q = &xc->queues[prio];
struct kvmppc_xive_irq_state *state;
struct kvmppc_xive_src_block *sb;
u32 idx, toggle, entry, irq, hw_num;
struct xive_irq_data *xd;
__be32 *qpage;
u16 src;
idx = q->idx;
toggle = q->toggle;
qpage = READ_ONCE(q->qpage);
if (!qpage)
continue;
/* For each interrupt in the queue */
for (;;) {
entry = be32_to_cpup(qpage + idx);
/* No more ? */
if ((entry >> 31) == toggle)
break;
irq = entry & 0x7fffffff;
/* Skip dummies and IPIs */
if (irq == XICS_DUMMY || irq == XICS_IPI)
goto next;
sb = kvmppc_xive_find_source(xive, irq, &src);
if (!sb)
goto next;
state = &sb->irq_state[src];
/* Has it been rerouted ? */
if (xc->server_num == state->act_server)
goto next;
/*
* Allright, it *has* been re-routed, kill it from
* the queue.
*/
qpage[idx] = cpu_to_be32((entry & 0x80000000) | XICS_DUMMY);
/* Find the HW interrupt */
kvmppc_xive_select_irq(state, &hw_num, &xd);
/* If it's not an LSI, set PQ to 11 the EOI will force a resend */
if (!(xd->flags & XIVE_IRQ_FLAG_LSI))
GLUE(X_PFX,esb_load)(xd, XIVE_ESB_SET_PQ_11);
/* EOI the source */
GLUE(X_PFX,source_eoi)(hw_num, xd);
next:
idx = (idx + 1) & q->msk;
if (idx == 0)
toggle ^= 1;
}
}
}
X_STATIC int GLUE(X_PFX,h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr) X_STATIC int GLUE(X_PFX,h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr)
{ {
struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
u8 old_cppr; u8 old_cppr;
pr_devel("H_CPPR(cppr=%ld)\n", cppr); pr_devel("H_CPPR(cppr=%ld)\n", cppr);
@ -407,14 +481,34 @@ X_STATIC int GLUE(X_PFX,h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr)
*/ */
smp_mb(); smp_mb();
/* if (cppr > old_cppr) {
* We are masking less, we need to look for pending things /*
* to deliver and set VP pending bits accordingly to trigger * We are masking less, we need to look for pending things
* a new interrupt otherwise we might miss MFRR changes for * to deliver and set VP pending bits accordingly to trigger
* which we have optimized out sending an IPI signal. * a new interrupt otherwise we might miss MFRR changes for
*/ * which we have optimized out sending an IPI signal.
if (cppr > old_cppr) */
GLUE(X_PFX,push_pending_to_hw)(xc); GLUE(X_PFX,push_pending_to_hw)(xc);
} else {
/*
* We are masking more, we need to check the queue for any
* interrupt that has been routed to another CPU, take
* it out (replace it with the dummy) and retrigger it.
*
* This is necessary since those interrupts may otherwise
* never be processed, at least not until this CPU restores
* its CPPR.
*
* This is in theory racy vs. HW adding new interrupts to
* the queue. In practice this works because the interesting
* cases are when the guest has done a set_xive() to move the
* interrupt away, which flushes the xive, followed by the
* target CPU doing a H_CPPR. So any new interrupt coming into
* the queue must still be routed to us and isn't a source
* of concern.
*/
GLUE(X_PFX,scan_for_rerouted_irqs)(xive, xc);
}
/* Apply new CPPR */ /* Apply new CPPR */
xc->hw_cppr = cppr; xc->hw_cppr = cppr;