Commit ff0d6be4 authored by Michael Ellerman's avatar Michael Ellerman

Merge branch 'topic/ppc-kvm' into next

This merges Nick's big P9 KVM series; the original cover letter follows:

KVM: PPC: Book3S HV P9: entry/exit optimisations

This reduces radix guest full entry/exit latency on POWER9 and POWER10
by 2x.

Nested HV guests should see smaller improvements in their L1 entry/exit,
but this is also combined with most L0 speedups also applying to nested
entry. nginx localhost throughput test in a SMP nested guest is improved
about 10% (in a direct guest it doesn't change much because it uses XIVE
for IPIs) when L0 and L1 are patched.

It does this in several main ways:

- Rearrange code to optimise SPR accesses. Mainly, avoid scoreboard
  stalls.

- Test SPR values to avoid mtSPRs where possible. mtSPRs are expensive.

- Reduce mftb. mftb is expensive.

- Demand fault certain facilities to avoid saving and/or restoring them
  (at the cost of a fault when they are used, but this is mitigated over
  a number of entries, as is done for these facilities when context
  switching processes). PM, TM, and EBB so far.

- Defer some sequences that are made just in case a guest is interrupted
  in the middle of a critical section to the case where the guest is
  scheduled on a different CPU, rather than every time (at the cost of
  an extra IPI in this case). Namely the tlbsync sequence for radix with
  GTSE, which is very expensive.

- Reduce locking, barriers, atomics related to the vcpus-per-vcore > 1
  handling that the P9 path does not require.
parents 13605725 9c5a432a
......@@ -4144,6 +4144,14 @@
Override pmtimer IOPort with a hex value.
e.g. pmtmr=0x508
pmu_override= [PPC] Override the PMU.
This option takes over the PMU facility, so it is no
longer usable by perf. Setting this option starts the
PMU counters by setting MMCR0 to 0 (the FC bit is
cleared). If a number is given, then MMCR1 is set to
that number, otherwise (e.g., 'pmu_override=on'), MMCR1
remains 0.
pm_debug_messages [SUSPEND,KNL]
Enable suspend/resume debug messages during boot up.
......
......@@ -141,11 +141,6 @@ static inline void kvmppc_restore_tm_hv(struct kvm_vcpu *vcpu, u64 msr,
bool preserve_nv) { }
#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
void kvmhv_save_host_pmu(void);
void kvmhv_load_host_pmu(void);
void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use);
void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu);
void kvmppc_p9_enter_guest(struct kvm_vcpu *vcpu);
long kvmppc_h_set_dabr(struct kvm_vcpu *vcpu, unsigned long dabr);
......
......@@ -79,6 +79,7 @@
#define BOOK3S_INTERRUPT_FP_UNAVAIL 0x800
#define BOOK3S_INTERRUPT_DECREMENTER 0x900
#define BOOK3S_INTERRUPT_HV_DECREMENTER 0x980
#define BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER 0x1980
#define BOOK3S_INTERRUPT_DOORBELL 0xa00
#define BOOK3S_INTERRUPT_SYSCALL 0xc00
#define BOOK3S_INTERRUPT_TRACE 0xd00
......
......@@ -406,6 +406,12 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
return vcpu->arch.fault_dar;
}
/*
 * Return the vcpu's DEC expiry time expressed relative to the host
 * timebase: the stored dec_expires value with the vcore's tb_offset
 * subtracted from it.
 */
static inline u64 kvmppc_dec_expires_host_tb(struct kvm_vcpu *vcpu)
{
	u64 expiry = vcpu->arch.dec_expires;
	u64 offset = vcpu->arch.vcore->tb_offset;

	return expiry - offset;
}
static inline bool is_kvmppc_resume_guest(int r)
{
return (r == RESUME_GUEST || r == RESUME_GUEST_NV);
......
......@@ -44,7 +44,6 @@ struct kvm_nested_guest {
struct mutex tlb_lock; /* serialize page faults and tlbies */
struct kvm_nested_guest *next;
cpumask_t need_tlb_flush;
cpumask_t cpu_in_guest;
short prev_cpu[NR_CPUS];
u8 radix; /* is this nested guest radix */
};
......@@ -154,7 +153,9 @@ static inline bool kvmhv_vcpu_is_radix(struct kvm_vcpu *vcpu)
return radix;
}
int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr);
unsigned long kvmppc_msr_hard_disable_set_facilities(struct kvm_vcpu *vcpu, unsigned long msr);
int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr, u64 *tb);
#define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */
#endif
......
......@@ -287,7 +287,6 @@ struct kvm_arch {
u32 online_vcores;
atomic_t hpte_mod_interest;
cpumask_t need_tlb_flush;
cpumask_t cpu_in_guest;
u8 radix;
u8 fwnmi_enabled;
u8 secure_guest;
......@@ -579,6 +578,10 @@ struct kvm_vcpu_arch {
ulong cfar;
ulong ppr;
u32 pspb;
u8 load_ebb;
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
u8 load_tm;
#endif
ulong fscr;
ulong shadow_fscr;
ulong ebbhr;
......@@ -741,7 +744,7 @@ struct kvm_vcpu_arch {
struct hrtimer dec_timer;
u64 dec_jiffies;
u64 dec_expires;
u64 dec_expires; /* Relative to guest timebase. */
unsigned long pending_exceptions;
u8 ceded;
u8 prodded;
......
......@@ -552,8 +552,7 @@ extern void kvm_hv_vm_activated(void);
extern void kvm_hv_vm_deactivated(void);
extern bool kvm_hv_mode_active(void);
extern void kvmppc_check_need_tlb_flush(struct kvm *kvm, int pcpu,
struct kvm_nested_guest *nested);
extern void kvmppc_check_need_tlb_flush(struct kvm *kvm, int pcpu);
#else
static inline void __init kvm_cma_reserve(void)
......@@ -760,6 +759,7 @@ void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu);
void kvmppc_subcore_enter_guest(void);
void kvmppc_subcore_exit_guest(void);
long kvmppc_realmode_hmi_handler(void);
long kvmppc_p9_realmode_hmi_handler(struct kvm_vcpu *vcpu);
long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
long pte_index, unsigned long pteh, unsigned long ptel);
long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
......
......@@ -112,6 +112,9 @@ static inline void clear_task_ebb(struct task_struct *t)
#endif
}
void kvmppc_save_user_regs(void);
void kvmppc_save_current_sprs(void);
extern int set_thread_tidr(struct task_struct *t);
#endif /* _ASM_POWERPC_SWITCH_TO_H */
......@@ -18,6 +18,8 @@
#include <asm/vdso/timebase.h>
/* time.c */
extern u64 decrementer_max;
extern unsigned long tb_ticks_per_jiffy;
extern unsigned long tb_ticks_per_usec;
extern unsigned long tb_ticks_per_sec;
......@@ -97,19 +99,16 @@ extern void div128_by_32(u64 dividend_high, u64 dividend_low,
extern void secondary_cpu_time_init(void);
extern void __init time_init(void);
#ifdef CONFIG_PPC64
static inline unsigned long test_irq_work_pending(void)
{
unsigned long x;
DECLARE_PER_CPU(u64, decrementers_next_tb);
asm volatile("lbz %0,%1(13)"
: "=r" (x)
: "i" (offsetof(struct paca_struct, irq_work_pending)));
return x;
/* Return the value of the decrementers_next_tb per-CPU variable for this CPU. */
static inline u64 timer_get_next_tb(void)
{
return __this_cpu_read(decrementers_next_tb);
}
#endif
DECLARE_PER_CPU(u64, decrementers_next_tb);
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
void timer_rearm_host_dec(u64 now);
#endif
/* Convert timebase ticks to nanoseconds */
unsigned long long tb_to_ns(unsigned long long tb_ticks);
......
......@@ -109,7 +109,7 @@ static void init_PMU_HV_ISA207(void)
static void init_PMU(void)
{
mtspr(SPRN_MMCRA, 0);
mtspr(SPRN_MMCR0, 0);
mtspr(SPRN_MMCR0, MMCR0_FC);
mtspr(SPRN_MMCR1, 0);
mtspr(SPRN_MMCR2, 0);
}
......@@ -123,7 +123,7 @@ static void init_PMU_ISA31(void)
{
mtspr(SPRN_MMCR3, 0);
mtspr(SPRN_MMCRA, MMCRA_BHRB_DISABLE);
mtspr(SPRN_MMCR0, MMCR0_PMCCEXT);
mtspr(SPRN_MMCR0, MMCR0_FC | MMCR0_PMCCEXT);
}
/*
......@@ -137,6 +137,7 @@ void __setup_cpu_power7(unsigned long offset, struct cpu_spec *t)
return;
mtspr(SPRN_LPID, 0);
mtspr(SPRN_AMOR, ~0);
mtspr(SPRN_PCR, PCR_MASK);
init_LPCR_ISA206(mfspr(SPRN_LPCR), LPCR_LPES1 >> LPCR_LPES_SH);
}
......@@ -150,6 +151,7 @@ void __restore_cpu_power7(void)
return;
mtspr(SPRN_LPID, 0);
mtspr(SPRN_AMOR, ~0);
mtspr(SPRN_PCR, PCR_MASK);
init_LPCR_ISA206(mfspr(SPRN_LPCR), LPCR_LPES1 >> LPCR_LPES_SH);
}
......@@ -164,6 +166,7 @@ void __setup_cpu_power8(unsigned long offset, struct cpu_spec *t)
return;
mtspr(SPRN_LPID, 0);
mtspr(SPRN_AMOR, ~0);
mtspr(SPRN_PCR, PCR_MASK);
init_LPCR_ISA206(mfspr(SPRN_LPCR) | LPCR_PECEDH, 0); /* LPES = 0 */
init_HFSCR();
......@@ -184,6 +187,7 @@ void __restore_cpu_power8(void)
return;
mtspr(SPRN_LPID, 0);
mtspr(SPRN_AMOR, ~0);
mtspr(SPRN_PCR, PCR_MASK);
init_LPCR_ISA206(mfspr(SPRN_LPCR) | LPCR_PECEDH, 0); /* LPES = 0 */
init_HFSCR();
......@@ -202,6 +206,7 @@ void __setup_cpu_power9(unsigned long offset, struct cpu_spec *t)
mtspr(SPRN_PSSCR, 0);
mtspr(SPRN_LPID, 0);
mtspr(SPRN_PID, 0);
mtspr(SPRN_AMOR, ~0);
mtspr(SPRN_PCR, PCR_MASK);
init_LPCR_ISA300((mfspr(SPRN_LPCR) | LPCR_PECEDH | LPCR_PECE_HVEE |\
LPCR_HVICE | LPCR_HEIC) & ~(LPCR_UPRT | LPCR_HR), 0);
......@@ -223,6 +228,7 @@ void __restore_cpu_power9(void)
mtspr(SPRN_PSSCR, 0);
mtspr(SPRN_LPID, 0);
mtspr(SPRN_PID, 0);
mtspr(SPRN_AMOR, ~0);
mtspr(SPRN_PCR, PCR_MASK);
init_LPCR_ISA300((mfspr(SPRN_LPCR) | LPCR_PECEDH | LPCR_PECE_HVEE |\
LPCR_HVICE | LPCR_HEIC) & ~(LPCR_UPRT | LPCR_HR), 0);
......@@ -242,6 +248,7 @@ void __setup_cpu_power10(unsigned long offset, struct cpu_spec *t)
mtspr(SPRN_PSSCR, 0);
mtspr(SPRN_LPID, 0);
mtspr(SPRN_PID, 0);
mtspr(SPRN_AMOR, ~0);
mtspr(SPRN_PCR, PCR_MASK);
init_LPCR_ISA300((mfspr(SPRN_LPCR) | LPCR_PECEDH | LPCR_PECE_HVEE |\
LPCR_HVICE | LPCR_HEIC) & ~(LPCR_UPRT | LPCR_HR), 0);
......@@ -264,6 +271,7 @@ void __restore_cpu_power10(void)
mtspr(SPRN_PSSCR, 0);
mtspr(SPRN_LPID, 0);
mtspr(SPRN_PID, 0);
mtspr(SPRN_AMOR, ~0);
mtspr(SPRN_PCR, PCR_MASK);
init_LPCR_ISA300((mfspr(SPRN_LPCR) | LPCR_PECEDH | LPCR_PECE_HVEE |\
LPCR_HVICE | LPCR_HEIC) & ~(LPCR_UPRT | LPCR_HR), 0);
......
......@@ -80,6 +80,7 @@ static void __restore_cpu_cpufeatures(void)
mtspr(SPRN_LPCR, system_registers.lpcr);
if (hv_mode) {
mtspr(SPRN_LPID, 0);
mtspr(SPRN_AMOR, ~0);
mtspr(SPRN_HFSCR, system_registers.hfscr);
mtspr(SPRN_PCR, system_registers.pcr);
}
......@@ -216,6 +217,7 @@ static int __init feat_enable_hv(struct dt_cpu_feature *f)
}
mtspr(SPRN_LPID, 0);
mtspr(SPRN_AMOR, ~0);
lpcr = mfspr(SPRN_LPCR);
lpcr &= ~LPCR_LPES0; /* HV external interrupts */
......@@ -351,7 +353,7 @@ static void init_pmu_power8(void)
}
mtspr(SPRN_MMCRA, 0);
mtspr(SPRN_MMCR0, 0);
mtspr(SPRN_MMCR0, MMCR0_FC);
mtspr(SPRN_MMCR1, 0);
mtspr(SPRN_MMCR2, 0);
mtspr(SPRN_MMCRS, 0);
......@@ -390,7 +392,7 @@ static void init_pmu_power9(void)
mtspr(SPRN_MMCRC, 0);
mtspr(SPRN_MMCRA, 0);
mtspr(SPRN_MMCR0, 0);
mtspr(SPRN_MMCR0, MMCR0_FC);
mtspr(SPRN_MMCR1, 0);
mtspr(SPRN_MMCR2, 0);
}
......@@ -426,7 +428,7 @@ static void init_pmu_power10(void)
mtspr(SPRN_MMCR3, 0);
mtspr(SPRN_MMCRA, MMCRA_BHRB_DISABLE);
mtspr(SPRN_MMCR0, MMCR0_PMCCEXT);
mtspr(SPRN_MMCR0, MMCR0_FC | MMCR0_PMCCEXT);
}
static int __init feat_enable_pmu_power10(struct dt_cpu_feature *f)
......
......@@ -1156,6 +1156,40 @@ static inline void save_sprs(struct thread_struct *t)
#endif
}
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
/*
 * Save user register state of the current task according to the facility
 * bits set in its saved user MSR:
 *   - MSR_FP:  call save_fpu(current)
 *   - MSR_VEC: call save_altivec(current)
 *   - MSR_TM:  (CONFIG_PPC_TRANSACTIONAL_MEM only) copy TFHAR, TFIAR and
 *              TEXASR into the thread struct and clear MSR_TM in the
 *              saved user MSR.
 *
 * Does nothing when current->thread.regs is NULL.
 */
void kvmppc_save_user_regs(void)
{
	unsigned long msr_bits;

	if (current->thread.regs) {
		/* Snapshot the user MSR once; every test below uses this copy. */
		msr_bits = current->thread.regs->msr;

		if (msr_bits & MSR_FP)
			save_fpu(current);

		if (msr_bits & MSR_VEC)
			save_altivec(current);

#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
		if (msr_bits & MSR_TM) {
			current->thread.tm_tfhar = mfspr(SPRN_TFHAR);
			current->thread.tm_tfiar = mfspr(SPRN_TFIAR);
			current->thread.tm_texasr = mfspr(SPRN_TEXASR);
			current->thread.regs->msr &= ~MSR_TM;
		}
#endif
	}
}
EXPORT_SYMBOL_GPL(kvmppc_save_user_regs);
/*
* Exported wrapper that calls save_sprs() on the current task's thread
* struct.
*/
void kvmppc_save_current_sprs(void)
{
save_sprs(&current->thread);
}
EXPORT_SYMBOL_GPL(kvmppc_save_current_sprs);
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
static inline void restore_sprs(struct thread_struct *old_thread,
struct thread_struct *new_thread)
{
......
......@@ -88,6 +88,7 @@ static struct clocksource clocksource_timebase = {
#define DECREMENTER_DEFAULT_MAX 0x7FFFFFFF
u64 decrementer_max = DECREMENTER_DEFAULT_MAX;
EXPORT_SYMBOL_GPL(decrementer_max); /* for KVM HDEC */
static int decrementer_set_next_event(unsigned long evt,
struct clock_event_device *dev);
......@@ -107,6 +108,7 @@ struct clock_event_device decrementer_clockevent = {
EXPORT_SYMBOL(decrementer_clockevent);
DEFINE_PER_CPU(u64, decrementers_next_tb);
EXPORT_SYMBOL_GPL(decrementers_next_tb);
static DEFINE_PER_CPU(struct clock_event_device, decrementers);
#define XSEC_PER_SEC (1024*1024)
......@@ -496,6 +498,16 @@ EXPORT_SYMBOL(profile_pc);
* 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
*/
#ifdef CONFIG_PPC64
/*
* Return the irq_work_pending byte of the PACA.
*
* The byte is loaded with a single lbz using r13 as the base register and
* the offset of irq_work_pending within struct paca_struct as the
* displacement.
*/
static inline unsigned long test_irq_work_pending(void)
{
unsigned long x;
asm volatile("lbz %0,%1(13)"
: "=r" (x)
: "i" (offsetof(struct paca_struct, irq_work_pending)));
return x;
}
static inline void set_irq_work_pending_flag(void)
{
asm volatile("stb %0,%1(13)" : :
......@@ -539,13 +551,44 @@ void arch_irq_work_raise(void)
preempt_enable();
}
/*
* Program the decrementer with val, then re-check for pending irq work.
* If irq work is pending after the decrementer was set, the decrementer
* is reprogrammed to 1 instead.
*/
static void set_dec_or_work(u64 val)
{
set_dec(val);
/* We may have raced with new irq work */
if (unlikely(test_irq_work_pending()))
set_dec(1);
}
#else /* CONFIG_IRQ_WORK */
#define test_irq_work_pending() 0
#define clear_irq_work_pending()
/*
* !CONFIG_IRQ_WORK variant: there is no irq work to race with, so just
* program the decrementer with val.
*/
static void set_dec_or_work(u64 val)
{
set_dec(val);
}
#endif /* CONFIG_IRQ_WORK */
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
/*
 * timer_rearm_host_dec - re-arm the decrementer for the next host event.
 * @now: current timebase value, in the same units as decrementers_next_tb.
 *
 * Must be called with interrupts disabled (both soft-disabled and MSR[EE]
 * clear); a warning is emitted once otherwise.
 *
 * If this CPU's next event time has already been reached, a decrementer
 * interrupt is marked pending in the PACA instead of touching the
 * decrementer. Otherwise the decrementer is programmed with the remaining
 * time.
 *
 * The remaining time is clamped to decrementer_max rather than skipped when
 * it is out of range: the decrementer must always be reprogrammed here,
 * otherwise whatever value was previously left in it stays live and the
 * irq-work race check in set_dec_or_work() is bypassed. With the clamp the
 * decrementer simply fires early and the timer interrupt re-arms it.
 */
void timer_rearm_host_dec(u64 now)
{
	u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
	u64 delta;

	WARN_ON_ONCE(!arch_irqs_disabled());
	WARN_ON_ONCE(mfmsr() & MSR_EE);

	if (now >= *next_tb) {
		local_paca->irq_happened |= PACA_IRQ_DEC;
		return;
	}

	delta = *next_tb - now;
	if (delta > decrementer_max)
		delta = decrementer_max;
	set_dec_or_work(delta);
}
EXPORT_SYMBOL_GPL(timer_rearm_host_dec);
#endif
/*
* timer_interrupt - gets called when the decrementer overflows,
* with interrupts disabled.
......@@ -606,10 +649,7 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt)
} else {
now = *next_tb - now;
if (now <= decrementer_max)
set_dec(now);
/* We may have raced with new irq work */
if (test_irq_work_pending())
set_dec(1);
set_dec_or_work(now);
__this_cpu_inc(irq_stat.timer_irqs_others);
}
......@@ -843,11 +883,7 @@ static int decrementer_set_next_event(unsigned long evt,
struct clock_event_device *dev)
{
__this_cpu_write(decrementers_next_tb, get_tb() + evt);
set_dec(evt);
/* We may have raced with new irq work */
if (test_irq_work_pending())
set_dec(1);
set_dec_or_work(evt);
return 0;
}
......
......@@ -130,6 +130,21 @@ config KVM_BOOK3S_HV_EXIT_TIMING
If unsure, say N.
config KVM_BOOK3S_HV_NESTED_PMU_WORKAROUND
bool "Nested L0 host workaround for L1 KVM host PMU handling bug" if EXPERT
depends on KVM_BOOK3S_HV_POSSIBLE
default !EXPERT
help
Old nested HV capable Linux guests have a bug where they don't
reflect the PMU in-use status of their L2 guest to the L0 host
while the L2 PMU registers are live. This can result in loss
of L2 PMU register state, causing perf to not work correctly in
L2 guests.
Selecting this option for the L0 host implements a workaround for
those buggy L1s which saves the L2 state, at the cost of performance
in all nested-capable guest entry/exit.
config KVM_BOOKE_HV
bool
......
......@@ -374,11 +374,16 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
BEGIN_FTR_SECTION
mtspr SPRN_DAWRX1,r10
END_FTR_SECTION_IFSET(CPU_FTR_DAWR1)
mtspr SPRN_PID,r10
/*
* Switch to host MMU mode
* Switch to host MMU mode (don't have the real host PID but we aren't
* going back to userspace).
*/
hwsync
isync
mtspr SPRN_PID,r10
ld r10, HSTATE_KVM_VCPU(r13)
ld r10, VCPU_KVM(r10)
lwz r10, KVM_HOST_LPID(r10)
......@@ -389,6 +394,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_DAWR1)
ld r10, KVM_HOST_LPCR(r10)
mtspr SPRN_LPCR,r10
isync
/*
* Set GUEST_MODE_NONE so the handler won't branch to KVM, and clear
* MSR_RI in r12 ([H]SRR1) so the handler won't try to return.
......
......@@ -57,6 +57,8 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
preempt_disable();
asm volatile("hwsync" ::: "memory");
isync();
/* switch the lpid first to avoid running host with unallocated pid */
old_lpid = mfspr(SPRN_LPID);
if (old_lpid != lpid)
......@@ -75,6 +77,8 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
ret = __copy_to_user_inatomic((void __user *)to, from, n);
pagefault_enable();
asm volatile("hwsync" ::: "memory");
isync();
/* switch the pid first to avoid running host with unallocated pid */
if (quadrant == 1 && pid != old_pid)
mtspr(SPRN_PID, old_pid);
......
This diff is collapsed.
// SPDX-License-Identifier: GPL-2.0-only
/*
* Privileged (non-hypervisor) host registers to save.
*
* A pointer to this structure is passed to the load_vcpu_state(),
* save/restore_p9_host_os_sprs() and switch_pmu_to_guest/host() helpers
* declared below.
*/
struct p9_host_os_sprs {
/* Authority mask registers (IAMR, AMR) */
unsigned long iamr;
unsigned long amr;
/* Performance monitor counters PMC1-PMC6, held as 32-bit values */
unsigned int pmc1;
unsigned int pmc2;
unsigned int pmc3;
unsigned int pmc4;
unsigned int pmc5;
unsigned int pmc6;
/* Performance monitor control registers MMCR0-MMCR3 and MMCRA */
unsigned long mmcr0;
unsigned long mmcr1;
unsigned long mmcr2;
unsigned long mmcr3;
unsigned long mmcra;
/* Performance monitor sampling registers SIAR, SIER1-SIER3 and SDAR */
unsigned long siar;
unsigned long sier1;
unsigned long sier2;
unsigned long sier3;
unsigned long sdar;
};
/*
 * Report whether nested virtualisation is usable for this VM: the VM's
 * nested_enable flag must be set and the VM must be radix.
 */
static inline bool nesting_enabled(struct kvm *kvm)
{
	if (!kvm->arch.nested_enable)
		return false;

	return kvm_is_radix(kvm);
}
bool load_vcpu_state(struct kvm_vcpu *vcpu,
struct p9_host_os_sprs *host_os_sprs);
void store_vcpu_state(struct kvm_vcpu *vcpu);
void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs);
void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu,
struct p9_host_os_sprs *host_os_sprs);
void switch_pmu_to_guest(struct kvm_vcpu *vcpu,
struct p9_host_os_sprs *host_os_sprs);
void switch_pmu_to_host(struct kvm_vcpu *vcpu,
struct p9_host_os_sprs *host_os_sprs);
......@@ -649,6 +649,8 @@ void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu)
int ext;
unsigned long lpcr;
WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
/* Insert EXTERNAL bit into LPCR at the MER bit position */
ext = (vcpu->arch.pending_exceptions >> BOOK3S_IRQPRIO_EXTERNAL) & 1;
lpcr = mfspr(SPRN_LPCR);
......@@ -682,57 +684,23 @@ static void flush_guest_tlb(struct kvm *kvm)
unsigned long rb, set;
rb = PPC_BIT(52); /* IS = 2 */
if (kvm_is_radix(kvm)) {
/* R=1 PRS=1 RIC=2 */
for (set = 0; set < kvm->arch.tlb_sets; ++set) {
/* R=0 PRS=0 RIC=0 */
asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
: : "r" (rb), "i" (1), "i" (1), "i" (2),
: : "r" (rb), "i" (0), "i" (0), "i" (0),
"r" (0) : "memory");
for (set = 1; set < kvm->arch.tlb_sets; ++set) {
rb += PPC_BIT(51); /* increment set number */
/* R=1 PRS=1 RIC=0 */
asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
: : "r" (rb), "i" (1), "i" (1), "i" (0),
"r" (0) : "memory");
}
asm volatile("ptesync": : :"memory");
asm volatile(PPC_RADIX_INVALIDATE_ERAT_GUEST : : :"memory");
} else {
for (set = 0; set < kvm->arch.tlb_sets; ++set) {
/* R=0 PRS=0 RIC=0 */
asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
: : "r" (rb), "i" (0), "i" (0), "i" (0),
"r" (0) : "memory");
rb += PPC_BIT(51); /* increment set number */
}
asm volatile("ptesync": : :"memory");
asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT : : :"memory");
rb += PPC_BIT(51); /* increment set number */
}
asm volatile("ptesync": : :"memory");
}
void kvmppc_check_need_tlb_flush(struct kvm *kvm, int pcpu,
struct kvm_nested_guest *nested)
void kvmppc_check_need_tlb_flush(struct kvm *kvm, int pcpu)
{
cpumask_t *need_tlb_flush;
/*
* On POWER9, individual threads can come in here, but the
* TLB is shared between the 4 threads in a core, hence
* invalidating on one thread invalidates for all.
* Thus we make all 4 threads use the same bit.
*/
if (cpu_has_feature(CPU_FTR_ARCH_300))
pcpu = cpu_first_tlb_thread_sibling(pcpu);
if (nested)
need_tlb_flush = &nested->need_tlb_flush;
else
need_tlb_flush = &kvm->arch.need_tlb_flush;
if (cpumask_test_cpu(pcpu, need_tlb_flush)) {
if (cpumask_test_cpu(pcpu, &kvm->arch.need_tlb_flush)) {
flush_guest_tlb(kvm);
/* Clear the bit after the TLB flush */
cpumask_clear_cpu(pcpu, need_tlb_flush);
cpumask_clear_cpu(pcpu, &kvm->arch.need_tlb_flush);
}
}
EXPORT_SYMBOL_GPL(kvmppc_check_need_tlb_flush);
......@@ -20,10 +20,15 @@ void wait_for_subcore_guest_exit(void)
/*
* NULL bitmap pointer indicates that KVM module hasn't
* been loaded yet and hence no guests are running.
* been loaded yet and hence no guests are running, or running
* on POWER9 or newer CPU.
*
* If no KVM is in use, no need to co-ordinate among threads
* as all of them will always be in host and no one is going
* to modify TB other than the opal hmi handler.
*
* POWER9 and newer don't need this synchronisation.
*
* Hence, just return from here.
*/
if (!local_paca->sibling_subcore_state)
......
......@@ -104,7 +104,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
mtlr r0
blr
_GLOBAL(kvmhv_save_host_pmu)
/*
* void kvmhv_save_host_pmu(void)
*/
kvmhv_save_host_pmu:
BEGIN_FTR_SECTION
/* Work around P8 PMAE bug */
li r3, -1
......@@ -138,14 +141,6 @@ BEGIN_FTR_SECTION
std r8, HSTATE_MMCR2(r13)
std r9, HSTATE_SIER(r13)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
BEGIN_FTR_SECTION
mfspr r5, SPRN_MMCR3
mfspr r6, SPRN_SIER2
mfspr r7, SPRN_SIER3
std r5, HSTATE_MMCR3(r13)
std r6, HSTATE_SIER2(r13)
std r7, HSTATE_SIER3(r13)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_31)
mfspr r3, SPRN_PMC1
mfspr r5, SPRN_PMC2
mfspr r6, SPRN_PMC3
......
......@@ -358,6 +358,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
/* convert TB values/offsets to host (L0) values */
hdec_exp = l2_hv.hdec_expiry - vc->tb_offset;
vc->tb_offset += l2_hv.tb_offset;
vcpu->arch.dec_expires += l2_hv.tb_offset;
/* set L1 state to L2 state */
vcpu->arch.nested = l2;
......@@ -374,11 +375,6 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
vcpu->arch.ret = RESUME_GUEST;
vcpu->arch.trap = 0;
do {
if (mftb() >= hdec_exp) {
vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
r = RESUME_HOST;
break;
}
r = kvmhv_run_single_vcpu(vcpu, hdec_exp, lpcr);
} while (is_kvmppc_resume_guest(r));
......@@ -399,6 +395,8 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
if (l2_regs.msr & MSR_TS_MASK)
vcpu->arch.shregs.msr |= MSR_TS_S;
vc->tb_offset = saved_l1_hv.tb_offset;
/* XXX: is this always the same delta as saved_l1_hv.tb_offset? */
vcpu->arch.dec_expires -= l2_hv.tb_offset;
restore_hv_regs(vcpu, &saved_l1_hv);
vcpu->arch.purr += delta_purr;
vcpu->arch.spurr += delta_spurr;
......
This diff is collapsed.
......@@ -136,6 +136,60 @@ void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu)
vcpu->arch.mce_evt = mce_evt;
}
/*
* HMI handler for the POWER9 KVM path.
*
* Removes any applied guest timebase offset, runs the HMI handling
* (debug trigger check, then the platform's early HMI hook if one is
* registered), and finally re-applies the vcore's timebase offset.
*
* Returns 1 if hmi_handle_debugtrig() reported a value >= 0 (in which case
* the platform early hook is skipped), otherwise 0.
*/
long kvmppc_p9_realmode_hmi_handler(struct kvm_vcpu *vcpu)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
long ret = 0;
/*
* Unapply and clear the offset first. That way, if the TB was not
* resynced then it will remain in host-offset, and if it was resynced
* then it is brought into host-offset. Then the tb offset is
* re-applied before continuing with the KVM exit.
*
* This way, we don't need to actually know whether or not OPAL
* resynced the timebase or do any of the complicated dance that the
* P7/8 path requires.
*/
if (vc->tb_offset_applied) {
u64 new_tb = mftb() - vc->tb_offset_applied;
mtspr(SPRN_TBU40, new_tb);
/*
* If the low 24 bits of the TB read back are below the low 24 bits
* of the value just written, bump the target by 1 << 24 and write
* again. NOTE(review): presumably this compensates for the low 24
* bits having wrapped around the TBU40 write - confirm against the
* ISA.
*/
if ((mftb() & 0xffffff) < (new_tb & 0xffffff)) {
new_tb += 0x1000000;
mtspr(SPRN_TBU40, new_tb);
}
vc->tb_offset_applied = 0;
}
/* Account this HMI in the per-CPU PACA counter. */
local_paca->hmi_irqs++;
if (hmi_handle_debugtrig(NULL) >= 0) {
ret = 1;
goto out;
}
/* The platform hook is optional; only call it when one is set. */
if (ppc_md.hmi_exception_early)
ppc_md.hmi_exception_early(NULL);
out:
/*
* Re-apply the vcore's timebase offset (same write/check/rewrite
* sequence as above) and record it as applied.
*/
if (vc->tb_offset) {
u64 new_tb = mftb() + vc->tb_offset;
mtspr(SPRN_TBU40, new_tb);
if ((mftb() & 0xffffff) < (new_tb & 0xffffff)) {
new_tb += 0x1000000;
mtspr(SPRN_TBU40, new_tb);
}
vc->tb_offset_applied = vc->tb_offset;
}
return ret;
}
/*
* The following subcore HMI handling is all only for pre-POWER9 CPUs.
*/
/* Check if dynamic split is in force and return subcore size accordingly. */
static inline int kvmppc_cur_subcore_size(void)
{
......
......@@ -55,12 +55,6 @@ static int global_invalidates(struct kvm *kvm)
smp_wmb();
cpumask_setall(&kvm->arch.need_tlb_flush);
cpu = local_paca->kvm_hstate.kvm_vcore->pcpu;
/*
* On POWER9, threads are independent but the TLB is shared,
* so use the bit for the first thread to represent the core.
*/
if (cpu_has_feature(CPU_FTR_ARCH_300))
cpu = cpu_first_tlb_thread_sibling(cpu);
cpumask_clear_cpu(cpu, &kvm->arch.need_tlb_flush);
}
......
......@@ -778,17 +778,14 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
/* Restore AMR and UAMOR, set AMOR to all 1s */
ld r5,VCPU_AMR(r4)
ld r6,VCPU_UAMOR(r4)
li r7,-1
mtspr SPRN_AMR,r5
mtspr SPRN_UAMOR,r6
mtspr SPRN_AMOR,r7
/* Restore state of CTRL run bit; assume 1 on entry */
/* Restore state of CTRL run bit; the host currently has it set to 1 */
lwz r5,VCPU_CTRL(r4)
andi. r5,r5,1
bne 4f
mfspr r6,SPRN_CTRLF
clrrdi r6,r6,1
li r6,0
mtspr SPRN_CTRLT,r6
4:
/* Secondary threads wait for primary to have done partition switch */
......@@ -817,10 +814,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
* Set the decrementer to the guest decrementer.
*/
ld r8,VCPU_DEC_EXPIRES(r4)
/* r8 is a host timebase value here, convert to guest TB */
ld r5,HSTATE_KVM_VCORE(r13)
ld r6,VCORE_TB_OFFSET_APPL(r5)
add r8,r8,r6
mftb r7
subf r3,r7,r8
mtspr SPRN_DEC,r3
......@@ -1195,9 +1188,6 @@ guest_bypass:
mftb r6
extsw r5,r5
16: add r5,r5,r6
/* r5 is a guest timebase value here, convert to host TB */
ld r4,VCORE_TB_OFFSET_APPL(r3)
subf r5,r4,r5
std r5,VCPU_DEC_EXPIRES(r9)
/* Increment exit count, poke other threads to exit */
......@@ -1211,12 +1201,12 @@ guest_bypass:
stw r0, VCPU_CPU(r9)
stw r0, VCPU_THREAD_CPU(r9)
/* Save guest CTRL register, set runlatch to 1 */
/* Save guest CTRL register, set runlatch to 1 if it was clear */
mfspr r6,SPRN_CTRLF
stw r6,VCPU_CTRL(r9)
andi. r0,r6,1
bne 4f
ori r6,r6,1
li r6,1
mtspr SPRN_CTRLT,r6
4:
/*
......@@ -2163,9 +2153,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_TM)
/* save expiry time of guest decrementer */
add r3, r3, r5
ld r4, HSTATE_KVM_VCPU(r13)
ld r5, HSTATE_KVM_VCORE(r13)
ld r6, VCORE_TB_OFFSET_APPL(r5)
subf r3, r6, r3 /* convert to host TB value */
std r3, VCPU_DEC_EXPIRES(r4)
#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
......@@ -2186,8 +2173,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_TM)
* Also clear the runlatch bit before napping.
*/
kvm_do_nap:
mfspr r0, SPRN_CTRLF
clrrdi r0, r0, 1
li r0,0
mtspr SPRN_CTRLT, r0
li r0,1
......@@ -2206,8 +2192,7 @@ kvm_nap_sequence: /* desired LPCR value in r5 */
bl isa206_idle_insn_mayloss
mfspr r0, SPRN_CTRLF
ori r0, r0, 1
li r0,1
mtspr SPRN_CTRLT, r0
mtspr SPRN_SRR1, r3
......@@ -2264,9 +2249,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_TM)
/* Restore guest decrementer */
ld r3, VCPU_DEC_EXPIRES(r4)
ld r5, HSTATE_KVM_VCORE(r13)
ld r6, VCORE_TB_OFFSET_APPL(r5)
add r3, r3, r6 /* convert host TB to guest TB value */
mftb r7
subf r3, r7, r3
mtspr SPRN_DEC, r3
......@@ -2778,10 +2760,11 @@ kvmppc_msr_interrupt:
blr
/*
* void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu)
*
* Load up guest PMU state. R3 points to the vcpu struct.
*/
_GLOBAL(kvmhv_load_guest_pmu)
EXPORT_SYMBOL_GPL(kvmhv_load_guest_pmu)
kvmhv_load_guest_pmu:
mr r4, r3
mflr r0
li r3, 1
......@@ -2815,27 +2798,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
mtspr SPRN_MMCRA, r6
mtspr SPRN_SIAR, r7
mtspr SPRN_SDAR, r8
BEGIN_FTR_SECTION
ld r5, VCPU_MMCR + 24(r4)
ld r6, VCPU_SIER + 8(r4)
ld r7, VCPU_SIER + 16(r4)
mtspr SPRN_MMCR3, r5
mtspr SPRN_SIER2, r6
mtspr SPRN_SIER3, r7
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_31)
BEGIN_FTR_SECTION
ld r5, VCPU_MMCR + 16(r4)
ld r6, VCPU_SIER(r4)
mtspr SPRN_MMCR2, r5
mtspr SPRN_SIER, r6
BEGIN_FTR_SECTION_NESTED(96)
lwz r7, VCPU_PMC + 24(r4)
lwz r8, VCPU_PMC + 28(r4)
ld r9, VCPU_MMCRS(r4)
mtspr SPRN_SPMC1, r7
mtspr SPRN_SPMC2, r8
mtspr SPRN_MMCRS, r9
END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mtspr SPRN_MMCR0, r3
isync
......@@ -2843,10 +2816,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
blr
/*
* void kvmhv_load_host_pmu(void)
*
* Reload host PMU state saved in the PACA by kvmhv_save_host_pmu.
*/
_GLOBAL(kvmhv_load_host_pmu)
EXPORT_SYMBOL_GPL(kvmhv_load_host_pmu)
kvmhv_load_host_pmu:
mflr r0
lbz r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */
cmpwi r4, 0
......@@ -2884,25 +2858,18 @@ BEGIN_FTR_SECTION
mtspr SPRN_MMCR2, r8
mtspr SPRN_SIER, r9
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
BEGIN_FTR_SECTION
ld r5, HSTATE_MMCR3(r13)
ld r6, HSTATE_SIER2(r13)
ld r7, HSTATE_SIER3(r13)
mtspr SPRN_MMCR3, r5
mtspr SPRN_SIER2, r6
mtspr SPRN_SIER3, r7
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_31)
mtspr SPRN_MMCR0, r3
isync
mtlr r0
23: blr
/*
* void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use)
*
* Save guest PMU state into the vcpu struct.
* r3 = vcpu, r4 = full save flag (PMU in use flag set in VPA)
*/
_GLOBAL(kvmhv_save_guest_pmu)
EXPORT_SYMBOL_GPL(kvmhv_save_guest_pmu)
kvmhv_save_guest_pmu:
mr r9, r3
mr r8, r4
BEGIN_FTR_SECTION
......@@ -2951,14 +2918,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
BEGIN_FTR_SECTION
std r10, VCPU_MMCR + 16(r9)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
BEGIN_FTR_SECTION
mfspr r5, SPRN_MMCR3
mfspr r6, SPRN_SIER2
mfspr r7, SPRN_SIER3
std r5, VCPU_MMCR + 24(r9)
std r6, VCPU_SIER + 8(r9)
std r7, VCPU_SIER + 16(r9)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_31)
std r7, VCPU_SIAR(r9)
std r8, VCPU_SDAR(r9)
mfspr r3, SPRN_PMC1
......@@ -2976,7 +2935,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_31)
BEGIN_FTR_SECTION
mfspr r5, SPRN_SIER
std r5, VCPU_SIER(r9)
BEGIN_FTR_SECTION_NESTED(96)
mfspr r6, SPRN_SPMC1
mfspr r7, SPRN_SPMC2
mfspr r8, SPRN_MMCRS
......@@ -2985,7 +2943,6 @@ BEGIN_FTR_SECTION_NESTED(96)
std r8, VCPU_MMCRS(r9)
lis r4, 0x8000
mtspr SPRN_MMCRS, r4
END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
22: blr
......
......@@ -572,18 +572,6 @@ void __init radix__early_init_devtree(void)
return;
}
static void radix_init_amor(void)
{
/*
* In HV mode, we init AMOR (Authority Mask Override Register) so that
* the hypervisor and guest can setup IAMR (Instruction Authority Mask
* Register), enable key 0 and set it to 1.
*
* AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
*/
mtspr(SPRN_AMOR, (3ul << 62));
}
void __init radix__early_init_mmu(void)
{
unsigned long lpcr;
......@@ -644,7 +632,6 @@ void __init radix__early_init_mmu(void)
lpcr = mfspr(SPRN_LPCR);
mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
radix_init_partition_table();
radix_init_amor();
} else {
radix_init_pseries();
}
......@@ -668,8 +655,6 @@ void radix__early_init_mmu_secondary(void)
set_ptcr_when_no_uv(__pa(partition_tb) |
(PATB_SIZE_SHIFT - 12));
radix_init_amor();
}
radix__switch_mmu_context(NULL, &init_mm);
......
......@@ -2419,8 +2419,24 @@ int register_power_pmu(struct power_pmu *pmu)
}
#ifdef CONFIG_PPC64
static bool pmu_override = false;
static unsigned long pmu_override_val;
/*
 * Per-CPU callback for the pmu_override= option (run via on_each_cpu()).
 *
 * Marks the PMU as in use, writes pmu_override_val to MMCR1 when it is
 * non-zero, and then clears the FC bit in MMCR0. @data is unused.
 */
static void do_pmu_override(void *data)
{
	unsigned long mmcr0;

	ppc_set_pmu_inuse(1);

	if (pmu_override_val != 0)
		mtspr(SPRN_MMCR1, pmu_override_val);

	/* Read-modify-write MMCR0 with only the FC bit cleared. */
	mmcr0 = mfspr(SPRN_MMCR0);
	mtspr(SPRN_MMCR0, mmcr0 & ~MMCR0_FC);
}
static int __init init_ppc64_pmu(void)
{
if (cpu_has_feature(CPU_FTR_HVMODE) && pmu_override) {
pr_warn("disabling perf due to pmu_override= command line option.\n");
on_each_cpu(do_pmu_override, NULL, 1);
return 0;
}
/* run through all the pmu drivers one at a time */
if (!init_power5_pmu())
return 0;
......@@ -2442,4 +2458,23 @@ static int __init init_ppc64_pmu(void)
return init_generic_compat_pmu();
}
early_initcall(init_ppc64_pmu);
/*
 * Parse the "pmu_override=" kernel command line option.
 *
 * Returns 0 without recording anything when the CPU does not have
 * CPU_FTR_HVMODE. Otherwise sets pmu_override, stores the parsed number in
 * pmu_override_val (0 when @str is not a valid number, e.g. "on"), and
 * returns 1.
 */
static int __init pmu_setup(char *str)
{
	unsigned long mmcr1 = 0;

	if (!early_cpu_has_feature(CPU_FTR_HVMODE))
		return 0;

	pmu_override = true;

	/* A non-numeric argument selects the override with MMCR1 left at 0. */
	if (kstrtoul(str, 0, &mmcr1) != 0)
		mmcr1 = 0;
	pmu_override_val = mmcr1;

	return 1;
}
__setup("pmu_override=", pmu_setup);
#endif
......@@ -306,8 +306,8 @@ struct p7_sprs {
/* per thread SPRs that get lost in shallow states */
u64 amr;
u64 iamr;
u64 amor;
u64 uamor;
/* amor is restored to constant ~0 */
};
static unsigned long power7_idle_insn(unsigned long type)
......@@ -378,7 +378,6 @@ static unsigned long power7_idle_insn(unsigned long type)
if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
sprs.amr = mfspr(SPRN_AMR);
sprs.iamr = mfspr(SPRN_IAMR);
sprs.amor = mfspr(SPRN_AMOR);
sprs.uamor = mfspr(SPRN_UAMOR);
}
......@@ -397,7 +396,7 @@ static unsigned long power7_idle_insn(unsigned long type)
*/
mtspr(SPRN_AMR, sprs.amr);
mtspr(SPRN_IAMR, sprs.iamr);
mtspr(SPRN_AMOR, sprs.amor);
mtspr(SPRN_AMOR, ~0);
mtspr(SPRN_UAMOR, sprs.uamor);
}
}
......@@ -589,7 +588,6 @@ struct p9_sprs {
u64 purr;
u64 spurr;
u64 dscr;
u64 wort;
u64 ciabr;
u64 mmcra;
......@@ -687,7 +685,6 @@ static unsigned long power9_idle_stop(unsigned long psscr)
sprs.amr = mfspr(SPRN_AMR);
sprs.iamr = mfspr(SPRN_IAMR);
sprs.amor = mfspr(SPRN_AMOR);
sprs.uamor = mfspr(SPRN_UAMOR);
srr1 = isa300_idle_stop_mayloss(psscr); /* go idle */
......@@ -708,7 +705,7 @@ static unsigned long power9_idle_stop(unsigned long psscr)
*/
mtspr(SPRN_AMR, sprs.amr);
mtspr(SPRN_IAMR, sprs.iamr);
mtspr(SPRN_AMOR, sprs.amor);
mtspr(SPRN_AMOR, ~0);
mtspr(SPRN_UAMOR, sprs.uamor);
/*
......
......@@ -2107,8 +2107,14 @@ static void dump_300_sprs(void)
if (!cpu_has_feature(CPU_FTR_ARCH_300))
return;
printf("pidr = %.16lx tidr = %.16lx\n",
mfspr(SPRN_PID), mfspr(SPRN_TIDR));
if (cpu_has_feature(CPU_FTR_P9_TIDR)) {
printf("pidr = %.16lx tidr = %.16lx\n",
mfspr(SPRN_PID), mfspr(SPRN_TIDR));
} else {
printf("pidr = %.16lx\n",
mfspr(SPRN_PID));
}
printf("psscr = %.16lx\n",
hv ? mfspr(SPRN_PSSCR) : mfspr(SPRN_PSSCR_PR));
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment