Commit a29ebeaf authored by Paul Mackerras, committed by Michael Ellerman

KVM: PPC: Book3S HV: Invalidate TLB on radix guest vcpu movement

With radix, the guest can do TLB invalidations itself using the tlbie
(global) and tlbiel (local) TLB invalidation instructions.  Linux guests
use local TLB invalidations for translations that have only ever been
accessed on one vcpu.  However, that doesn't mean that the translations
have only been accessed on one physical cpu (pcpu) since vcpus can move
around from one pcpu to another.  Thus a tlbiel might leave behind stale
TLB entries on a pcpu where the vcpu previously ran, and if that task
then moves back to that previous pcpu, it could see those stale TLB
entries and thus access memory incorrectly.  The usual symptom of this
is random segfaults in userspace programs in the guest.
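
As a toy illustration of that guest-side choice (plain C, compilable on its own; the bitmask and helper are invented for this example and are not the guest kernel's code), note that "local" is judged per vcpu, which is exactly what goes wrong once a vcpu has run on more than one pcpu:

#include <stdbool.h>
#include <stdint.h>

/* Bit i of vcpu_mask is set if vcpu i has ever touched the mapping. */
static bool can_use_local_tlbiel(uint64_t vcpu_mask, int this_vcpu)
{
	/* Only this vcpu has ever used the mapping, so tlbiel looks safe... */
	return vcpu_mask == (1ULL << this_vcpu);
	/* ...but nothing here says how many physical cpus that vcpu ran on. */
}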

To cope with this, we detect when a vcpu is about to start executing on
a thread in a core that is a different core from the last time it
executed.  If that is the case, then we mark the core as needing a
TLB flush and then send an interrupt to any thread in the core that is
currently running a vcpu from the same guest.  This will get those vcpus
out of the guest, and the first one to re-enter the guest will do the
TLB flush.  The reason for interrupting the vcpus executing on the old
core is to cope with the following scenario:

	CPU 0			CPU 1			CPU 4
	(core 0)			(core 0)			(core 1)

	VCPU 0 runs task X      VCPU 1 runs
	core 0 TLB gets
	entries from task X
	VCPU 0 moves to CPU 4
							VCPU 0 runs task X
							Unmap pages of task X
							tlbiel

				(still VCPU 1)			task X moves to VCPU 1
				task X runs
				task X sees stale TLB
				entries

That is, as soon as the VCPU starts executing on the new core, it
could unmap and tlbiel some page table entries, and then the task
could migrate to one of the VCPUs running on the old core and
potentially see stale TLB entries.
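
In C terms the detection is roughly the following (a condensed sketch of the kvmppc_start_thread() / radix_flush_cpu() changes in the diff below, with comments added; not a drop-in excerpt):

	if (kvm_is_radix(kvm) && vcpu->arch.prev_cpu != cpu) {
		if (vcpu->arch.prev_cpu >= 0 &&
		    cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
		    cpu_first_thread_sibling(cpu)) {
			/* Mark the old core (first-thread bit) as needing a
			 * TLB flush, then IPI any of its threads still in
			 * this guest so they re-enter and do the flush. */
			radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
		}
		vcpu->arch.prev_cpu = cpu;
	}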

Since the TLB is shared between all the threads in a core, we only
use the bit of kvm->arch.need_tlb_flush corresponding to the first
thread in the core.  To ensure that we don't have a window where we
can miss a flush, this moves the clearing of the bit from before the
actual flush to after it.  This way, two threads might both do the
flush, but we prevent the situation where one thread can enter the
guest before the flush is finished.
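
Sketched in C, the guest-entry ordering this gives us is (illustrative only; flush_guest_tlb() stands in for the tlbiel loop in the assembly change below):

	int core_bit = cpu_first_thread_sibling(raw_smp_processor_id());

	if (cpumask_test_cpu(core_bit, &kvm->arch.need_tlb_flush)) {
		flush_guest_tlb();	/* invalidate all TLB sets for this LPID */
		/* Clear only after the flush: another thread may redo the
		 * flush, but no thread can enter the guest while a needed
		 * flush is still pending. */
		cpumask_clear_cpu(core_bit, &kvm->arch.need_tlb_flush);
	}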
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
parent 65dae540
@@ -263,6 +263,7 @@ struct kvm_arch {
 	unsigned long hpt_mask;
 	atomic_t hpte_mod_interest;
 	cpumask_t need_tlb_flush;
+	cpumask_t cpu_in_guest;
 	int hpt_cma_alloc;
 	u8 radix;
 	pgd_t *pgtable;
@@ -661,6 +662,7 @@ struct kvm_vcpu_arch {
 	int state;
 	int ptid;
 	int thread_cpu;
+	int prev_cpu;
 	bool timer_running;
 	wait_queue_head_t cpu_run;
...
@@ -1821,6 +1821,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
 	vcpu->arch.vcore = vcore;
 	vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
 	vcpu->arch.thread_cpu = -1;
+	vcpu->arch.prev_cpu = -1;

 	vcpu->arch.cpu_type = KVM_CPU_3S_64;
 	kvmppc_sanity_check(vcpu);
@@ -1950,11 +1951,33 @@ static void kvmppc_release_hwthread(int cpu)
 	tpaca->kvm_hstate.kvm_split_mode = NULL;
 }

+static void do_nothing(void *x)
+{
+}
+
+static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
+{
+	int i;
+
+	cpu = cpu_first_thread_sibling(cpu);
+	cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
+	/*
+	 * Make sure setting of bit in need_tlb_flush precedes
+	 * testing of cpu_in_guest bits.  The matching barrier on
+	 * the other side is the first smp_mb() in kvmppc_run_core().
+	 */
+	smp_mb();
+	for (i = 0; i < threads_per_core; ++i)
+		if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest))
+			smp_call_function_single(cpu + i, do_nothing, NULL, 1);
+}
+
 static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
 {
 	int cpu;
 	struct paca_struct *tpaca;
 	struct kvmppc_vcore *mvc = vc->master_vcore;
+	struct kvm *kvm = vc->kvm;

 	cpu = vc->pcpu;
 	if (vcpu) {
@@ -1965,6 +1988,27 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
 		cpu += vcpu->arch.ptid;
 		vcpu->cpu = mvc->pcpu;
 		vcpu->arch.thread_cpu = cpu;
+
+		/*
+		 * With radix, the guest can do TLB invalidations itself,
+		 * and it could choose to use the local form (tlbiel) if
+		 * it is invalidating a translation that has only ever been
+		 * used on one vcpu.  However, that doesn't mean it has
+		 * only ever been used on one physical cpu, since vcpus
+		 * can move around between pcpus.  To cope with this, when
+		 * a vcpu moves from one pcpu to another, we need to tell
+		 * any vcpus running on the same core as this vcpu previously
+		 * ran to flush the TLB.  The TLB is shared between threads,
+		 * so we use a single bit in .need_tlb_flush for all 4 threads.
+		 */
+		if (kvm_is_radix(kvm) && vcpu->arch.prev_cpu != cpu) {
+			if (vcpu->arch.prev_cpu >= 0 &&
+			    cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
+			    cpu_first_thread_sibling(cpu))
+				radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
+			vcpu->arch.prev_cpu = cpu;
+		}
+		cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest);
 	}
 	tpaca = &paca[cpu];
 	tpaca->kvm_hstate.kvm_vcpu = vcpu;
@@ -2552,6 +2596,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 		kvmppc_release_hwthread(pcpu + i);
 		if (sip && sip->napped[i])
 			kvmppc_ipi_thread(pcpu + i);
+		cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest);
 	}

 	kvmppc_set_host_core(pcpu);
...
@@ -43,6 +43,7 @@ static void *real_vmalloc_addr(void *x)
 static int global_invalidates(struct kvm *kvm, unsigned long flags)
 {
 	int global;
+	int cpu;

 	/*
 	 * If there is only one vcore, and it's currently running,
@@ -60,8 +61,14 @@ static int global_invalidates(struct kvm *kvm, unsigned long flags)
 		/* any other core might now have stale TLB entries... */
 		smp_wmb();
 		cpumask_setall(&kvm->arch.need_tlb_flush);
-		cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu,
-				  &kvm->arch.need_tlb_flush);
+		cpu = local_paca->kvm_hstate.kvm_vcore->pcpu;
+		/*
+		 * On POWER9, threads are independent but the TLB is shared,
+		 * so use the bit for the first thread to represent the core.
+		 */
+		if (cpu_has_feature(CPU_FTR_ARCH_300))
+			cpu = cpu_first_thread_sibling(cpu);
+		cpumask_clear_cpu(cpu, &kvm->arch.need_tlb_flush);
 	}

 	return global;
...
@@ -598,30 +598,44 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 	/* See if we need to flush the TLB */
 	lhz	r6,PACAPACAINDEX(r13)	/* test_bit(cpu, need_tlb_flush) */
+BEGIN_FTR_SECTION
+	/*
+	 * On POWER9, individual threads can come in here, but the
+	 * TLB is shared between the 4 threads in a core, hence
+	 * invalidating on one thread invalidates for all.
+	 * Thus we make all 4 threads use the same bit here.
+	 */
+	clrrdi	r6,r6,2
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	clrldi	r7,r6,64-6		/* extract bit number (6 bits) */
 	srdi	r6,r6,6			/* doubleword number */
 	sldi	r6,r6,3			/* address offset */
 	add	r6,r6,r9
 	addi	r6,r6,KVM_NEED_FLUSH	/* dword in kvm->arch.need_tlb_flush */
-	li	r0,1
-	sld	r0,r0,r7
+	li	r8,1
+	sld	r8,r8,r7
 	ld	r7,0(r6)
-	and.	r7,r7,r0
+	and.	r7,r7,r8
 	beq	22f
-23:	ldarx	r7,0,r6			/* if set, clear the bit */
-	andc	r7,r7,r0
-	stdcx.	r7,0,r6
-	bne	23b
 	/* Flush the TLB of any entries for this LPID */
-	lwz	r6,KVM_TLB_SETS(r9)
-	li	r0,0			/* RS for P9 version of tlbiel */
-	mtctr	r6
+	lwz	r0,KVM_TLB_SETS(r9)
+	mtctr	r0
 	li	r7,0x800		/* IS field = 0b10 */
 	ptesync
-28:	tlbiel	r7
+	li	r0,0			/* RS for P9 version of tlbiel */
+	bne	cr7, 29f
+28:	tlbiel	r7			/* On P9, rs=0, RIC=0, PRS=0, R=0 */
 	addi	r7,r7,0x1000
 	bdnz	28b
-	ptesync
+	b	30f
+29:	PPC_TLBIEL(7,0,2,1,1)		/* for radix, RIC=2, PRS=1, R=1 */
+	addi	r7,r7,0x1000
+	bdnz	29b
+30:	ptesync
+23:	ldarx	r7,0,r6			/* clear the bit after TLB flushed */
+	andc	r7,r7,r8
+	stdcx.	r7,0,r6
+	bne	23b

 	/* Add timebase offset onto timebase */
 22:	ld	r8,VCORE_TB_OFFSET(r5)
...