Commit f0888f70 authored by Paul Mackerras's avatar Paul Mackerras Committed by Avi Kivity

KVM: PPC: Book3S HV: Make secondary threads more robust against stray IPIs

Currently on POWER7, if we are running the guest on a core and we don't
need all the hardware threads, we do nothing to ensure that the unused
threads aren't executing in the kernel (other than checking that they
are offline).  We just assume they're napping and we don't do anything
to stop them trying to enter the kernel while the guest is running.
This means that a stray IPI can wake up the hardware thread and it will
then try to enter the kernel, but since the core is in guest context,
it will execute code from the guest in hypervisor mode once it turns the
MMU on, which tends to lead to crashes or hangs in the host.

This fixes the problem by adding two new one-byte flags in the
kvmppc_host_state structure in the PACA which are used to interlock
between the primary thread and the unused secondary threads when entering
the guest.  With these flags, the primary thread can ensure that the
unused secondaries are not already in kernel mode (i.e. handling a stray
IPI) and then indicate that they should not try to enter the kernel
if they do get woken for any reason.  Instead they will go into KVM code,
find that there is no vcpu to run, acknowledge and clear the IPI and go
back to nap mode.
Signed-off-by: default avatarPaul Mackerras <paulus@samba.org>
Signed-off-by: default avatarAlexander Graf <agraf@suse.de>
Signed-off-by: default avatarAvi Kivity <avi@redhat.com>
parent f6127716
...@@ -79,6 +79,9 @@ struct kvmppc_host_state { ...@@ -79,6 +79,9 @@ struct kvmppc_host_state {
u8 napping; u8 napping;
#ifdef CONFIG_KVM_BOOK3S_64_HV #ifdef CONFIG_KVM_BOOK3S_64_HV
u8 hwthread_req;
u8 hwthread_state;
struct kvm_vcpu *kvm_vcpu; struct kvm_vcpu *kvm_vcpu;
struct kvmppc_vcore *kvm_vcore; struct kvmppc_vcore *kvm_vcore;
unsigned long xics_phys; unsigned long xics_phys;
...@@ -122,4 +125,9 @@ struct kvmppc_book3s_shadow_vcpu { ...@@ -122,4 +125,9 @@ struct kvmppc_book3s_shadow_vcpu {
#endif /*__ASSEMBLY__ */ #endif /*__ASSEMBLY__ */
/* Values for kvm_state */
#define KVM_HWTHREAD_IN_KERNEL 0
#define KVM_HWTHREAD_IN_NAP 1
#define KVM_HWTHREAD_IN_KVM 2
#endif /* __ASM_KVM_BOOK3S_ASM_H__ */ #endif /* __ASM_KVM_BOOK3S_ASM_H__ */
...@@ -540,6 +540,8 @@ int main(void) ...@@ -540,6 +540,8 @@ int main(void)
HSTATE_FIELD(HSTATE_IN_GUEST, in_guest); HSTATE_FIELD(HSTATE_IN_GUEST, in_guest);
HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5); HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5);
HSTATE_FIELD(HSTATE_NAPPING, napping); HSTATE_FIELD(HSTATE_NAPPING, napping);
HSTATE_FIELD(HSTATE_HWTHREAD_REQ, hwthread_req);
HSTATE_FIELD(HSTATE_HWTHREAD_STATE, hwthread_state);
#ifdef CONFIG_KVM_BOOK3S_64_HV #ifdef CONFIG_KVM_BOOK3S_64_HV
HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu); HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
......
...@@ -63,11 +63,13 @@ BEGIN_FTR_SECTION ...@@ -63,11 +63,13 @@ BEGIN_FTR_SECTION
GET_PACA(r13) GET_PACA(r13)
#ifdef CONFIG_KVM_BOOK3S_64_HV #ifdef CONFIG_KVM_BOOK3S_64_HV
lbz r0,PACAPROCSTART(r13) li r0,KVM_HWTHREAD_IN_KERNEL
cmpwi r0,0x80 stb r0,HSTATE_HWTHREAD_STATE(r13)
bne 1f /* Order setting hwthread_state vs. testing hwthread_req */
li r0,1 sync
stb r0,PACAPROCSTART(r13) lbz r0,HSTATE_HWTHREAD_REQ(r13)
cmpwi r0,0
beq 1f
b kvm_start_guest b kvm_start_guest
1: 1:
#endif #endif
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <asm/asm-offsets.h> #include <asm/asm-offsets.h>
#include <asm/ppc-opcode.h> #include <asm/ppc-opcode.h>
#include <asm/hw_irq.h> #include <asm/hw_irq.h>
#include <asm/kvm_book3s_asm.h>
#undef DEBUG #undef DEBUG
...@@ -81,6 +82,12 @@ _GLOBAL(power7_idle) ...@@ -81,6 +82,12 @@ _GLOBAL(power7_idle)
std r9,_MSR(r1) std r9,_MSR(r1)
std r1,PACAR1(r13) std r1,PACAR1(r13)
#ifdef CONFIG_KVM_BOOK3S_64_HV
/* Tell KVM we're napping */
li r4,KVM_HWTHREAD_IN_NAP
stb r4,HSTATE_HWTHREAD_STATE(r13)
#endif
/* Magic NAP mode enter sequence */ /* Magic NAP mode enter sequence */
std r0,0(r1) std r0,0(r1)
ptesync ptesync
......
...@@ -569,6 +569,45 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, ...@@ -569,6 +569,45 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
list_del(&vcpu->arch.run_list); list_del(&vcpu->arch.run_list);
} }
static int kvmppc_grab_hwthread(int cpu)
{
struct paca_struct *tpaca;
long timeout = 1000;
tpaca = &paca[cpu];
/* Ensure the thread won't go into the kernel if it wakes */
tpaca->kvm_hstate.hwthread_req = 1;
/*
* If the thread is already executing in the kernel (e.g. handling
* a stray interrupt), wait for it to get back to nap mode.
* The smp_mb() is to ensure that our setting of hwthread_req
* is visible before we look at hwthread_state, so if this
* races with the code at system_reset_pSeries and the thread
* misses our setting of hwthread_req, we are sure to see its
* setting of hwthread_state, and vice versa.
*/
smp_mb();
while (tpaca->kvm_hstate.hwthread_state == KVM_HWTHREAD_IN_KERNEL) {
if (--timeout <= 0) {
pr_err("KVM: couldn't grab cpu %d\n", cpu);
return -EBUSY;
}
udelay(1);
}
return 0;
}
static void kvmppc_release_hwthread(int cpu)
{
struct paca_struct *tpaca;
tpaca = &paca[cpu];
tpaca->kvm_hstate.hwthread_req = 0;
tpaca->kvm_hstate.kvm_vcpu = NULL;
}
static void kvmppc_start_thread(struct kvm_vcpu *vcpu) static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
{ {
int cpu; int cpu;
...@@ -588,8 +627,7 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu) ...@@ -588,8 +627,7 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
smp_wmb(); smp_wmb();
#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
if (vcpu->arch.ptid) { if (vcpu->arch.ptid) {
tpaca->cpu_start = 0x80; kvmppc_grab_hwthread(cpu);
wmb();
xics_wake_cpu(cpu); xics_wake_cpu(cpu);
++vc->n_woken; ++vc->n_woken;
} }
...@@ -639,7 +677,7 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) ...@@ -639,7 +677,7 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
struct kvm_vcpu *vcpu, *vcpu0, *vnext; struct kvm_vcpu *vcpu, *vcpu0, *vnext;
long ret; long ret;
u64 now; u64 now;
int ptid; int ptid, i;
/* don't start if any threads have a signal pending */ /* don't start if any threads have a signal pending */
list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
...@@ -686,12 +724,17 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) ...@@ -686,12 +724,17 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
vc->napping_threads = 0; vc->napping_threads = 0;
list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
kvmppc_start_thread(vcpu); kvmppc_start_thread(vcpu);
/* Grab any remaining hw threads so they can't go into the kernel */
for (i = ptid; i < threads_per_core; ++i)
kvmppc_grab_hwthread(vc->pcpu + i);
preempt_disable(); preempt_disable();
spin_unlock(&vc->lock); spin_unlock(&vc->lock);
kvm_guest_enter(); kvm_guest_enter();
__kvmppc_vcore_entry(NULL, vcpu0); __kvmppc_vcore_entry(NULL, vcpu0);
for (i = 0; i < threads_per_core; ++i)
kvmppc_release_hwthread(vc->pcpu + i);
spin_lock(&vc->lock); spin_lock(&vc->lock);
/* disable sending of IPIs on virtual external irqs */ /* disable sending of IPIs on virtual external irqs */
......
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
#include <asm/hvcall.h> #include <asm/hvcall.h>
#include <asm/asm-offsets.h> #include <asm/asm-offsets.h>
#include <asm/exception-64s.h> #include <asm/exception-64s.h>
#include <asm/kvm_book3s_asm.h>
/***************************************************************************** /*****************************************************************************
* * * *
...@@ -82,6 +83,7 @@ _GLOBAL(kvmppc_hv_entry_trampoline) ...@@ -82,6 +83,7 @@ _GLOBAL(kvmppc_hv_entry_trampoline)
#define XICS_XIRR 4 #define XICS_XIRR 4
#define XICS_QIRR 0xc #define XICS_QIRR 0xc
#define XICS_IPI 2 /* interrupt source # for IPIs */
/* /*
* We come in here when wakened from nap mode on a secondary hw thread. * We come in here when wakened from nap mode on a secondary hw thread.
...@@ -94,26 +96,54 @@ kvm_start_guest: ...@@ -94,26 +96,54 @@ kvm_start_guest:
subi r1,r1,STACK_FRAME_OVERHEAD subi r1,r1,STACK_FRAME_OVERHEAD
ld r2,PACATOC(r13) ld r2,PACATOC(r13)
/* were we napping due to cede? */ li r0,KVM_HWTHREAD_IN_KVM
lbz r0,HSTATE_NAPPING(r13) stb r0,HSTATE_HWTHREAD_STATE(r13)
cmpwi r0,0
bne kvm_end_cede
/* get vcpu pointer */ /* NV GPR values from power7_idle() will no longer be valid */
ld r4, HSTATE_KVM_VCPU(r13) li r0,1
stb r0,PACA_NAPSTATELOST(r13)
/* We got here with an IPI; clear it */ /* get vcpu pointer, NULL if we have no vcpu to run */
ld r5, HSTATE_XICS_PHYS(r13) ld r4,HSTATE_KVM_VCPU(r13)
li r0, 0xff cmpdi cr1,r4,0
li r6, XICS_QIRR
li r7, XICS_XIRR /* Check the wake reason in SRR1 to see why we got here */
lwzcix r8, r5, r7 /* ack the interrupt */ mfspr r3,SPRN_SRR1
rlwinm r3,r3,44-31,0x7 /* extract wake reason field */
cmpwi r3,4 /* was it an external interrupt? */
bne 27f
/*
* External interrupt - for now assume it is an IPI, since we
* should never get any other interrupts sent to offline threads.
* Only do this for secondary threads.
*/
beq cr1,25f
lwz r3,VCPU_PTID(r4)
cmpwi r3,0
beq 27f
25: ld r5,HSTATE_XICS_PHYS(r13)
li r0,0xff
li r6,XICS_QIRR
li r7,XICS_XIRR
lwzcix r8,r5,r7 /* get and ack the interrupt */
sync sync
stbcix r0, r5, r6 /* clear it */ clrldi. r9,r8,40 /* get interrupt source ID. */
stwcix r8, r5, r7 /* EOI it */ beq 27f /* none there? */
cmpwi r9,XICS_IPI
bne 26f
stbcix r0,r5,r6 /* clear IPI */
26: stwcix r8,r5,r7 /* EOI the interrupt */
/* NV GPR values from power7_idle() will no longer be valid */ 27: /* XXX should handle hypervisor maintenance interrupts etc. here */
stb r0, PACA_NAPSTATELOST(r13)
/* if we have no vcpu to run, go back to sleep */
beq cr1,kvm_no_guest
/* were we napping due to cede? */
lbz r0,HSTATE_NAPPING(r13)
cmpwi r0,0
bne kvm_end_cede
.global kvmppc_hv_entry .global kvmppc_hv_entry
kvmppc_hv_entry: kvmppc_hv_entry:
...@@ -1445,8 +1475,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) ...@@ -1445,8 +1475,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
* Take a nap until a decrementer or external interrupt occurs, * Take a nap until a decrementer or external interrupt occurs,
* with PECE1 (wake on decr) and PECE0 (wake on external) set in LPCR * with PECE1 (wake on decr) and PECE0 (wake on external) set in LPCR
*/ */
li r0,0x80 li r0,1
stb r0,PACAPROCSTART(r13) stb r0,HSTATE_HWTHREAD_REQ(r13)
mfspr r5,SPRN_LPCR mfspr r5,SPRN_LPCR
ori r5,r5,LPCR_PECE0 | LPCR_PECE1 ori r5,r5,LPCR_PECE0 | LPCR_PECE1
mtspr SPRN_LPCR,r5 mtspr SPRN_LPCR,r5
...@@ -1463,26 +1493,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) ...@@ -1463,26 +1493,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
kvm_end_cede: kvm_end_cede:
/* Woken by external or decrementer interrupt */ /* Woken by external or decrementer interrupt */
ld r1, HSTATE_HOST_R1(r13) ld r1, HSTATE_HOST_R1(r13)
ld r2, PACATOC(r13)
/* If we're a secondary thread and we got here by an IPI, ack it */
ld r4,HSTATE_KVM_VCPU(r13)
lwz r3,VCPU_PTID(r4)
cmpwi r3,0
beq 27f
mfspr r3,SPRN_SRR1
rlwinm r3,r3,44-31,0x7 /* extract wake reason field */
cmpwi r3,4 /* was it an external interrupt? */
bne 27f
ld r5, HSTATE_XICS_PHYS(r13)
li r0,0xff
li r6,XICS_QIRR
li r7,XICS_XIRR
lwzcix r8,r5,r7 /* ack the interrupt */
sync
stbcix r0,r5,r6 /* clear it */
stwcix r8,r5,r7 /* EOI it */
27:
/* load up FP state */ /* load up FP state */
bl kvmppc_load_fp bl kvmppc_load_fp
...@@ -1580,12 +1591,17 @@ secondary_nap: ...@@ -1580,12 +1591,17 @@ secondary_nap:
stwcx. r3, 0, r4 stwcx. r3, 0, r4
bne 51b bne 51b
kvm_no_guest:
li r0, KVM_HWTHREAD_IN_NAP
stb r0, HSTATE_HWTHREAD_STATE(r13)
li r0, 0
std r0, HSTATE_KVM_VCPU(r13)
li r3, LPCR_PECE0 li r3, LPCR_PECE0
mfspr r4, SPRN_LPCR mfspr r4, SPRN_LPCR
rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1 rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1
mtspr SPRN_LPCR, r4 mtspr SPRN_LPCR, r4
isync isync
li r0, 0
std r0, HSTATE_SCRATCH0(r13) std r0, HSTATE_SCRATCH0(r13)
ptesync ptesync
ld r0, HSTATE_SCRATCH0(r13) ld r0, HSTATE_SCRATCH0(r13)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment