Commit d2b9b207 authored by Radim Krčmář

Merge tag 'kvm-ppc-next-4.16-1' of git://git.kernel.org/pub/scm/linux/kernel/git/paulus/powerpc

PPC KVM update for 4.16

- Allow HPT guests to run on a radix host on POWER9 v2.2 CPUs
  without requiring the complex thread synchronization that earlier
  CPU versions required.

- A series from Ben Herrenschmidt to improve the handling of
  escalation interrupts with the XIVE interrupt controller.

- Provide for the decrementer register to be copied across on
  migration (see the sketch below).

- Various minor cleanups and bugfixes.
parents 7bf14c28 9b9b13a6
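The decrementer state is migrated through the new KVM_REG_PPC_DEC_EXPIRY one-reg (see the api.txt and uapi/asm/kvm.h hunks below), read and written with the standard KVM_GET_ONE_REG/KVM_SET_ONE_REG vcpu ioctls. The exposed value is in guest-timebase terms, since the HV get/set handlers below add and subtract the vcore timebase offset. A minimal userspace sketch, not part of this series, assuming vcpu file descriptors obtained from KVM_CREATE_VCPU:

#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Matches the definition added to arch/powerpc/include/uapi/asm/kvm.h */
#ifndef KVM_REG_PPC_DEC_EXPIRY
#define KVM_REG_PPC_DEC_EXPIRY (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe)
#endif

/* Copy the decrementer expiry value from a source vcpu to a destination
 * vcpu, e.g. when transferring state to the migration target. */
static int copy_dec_expiry(int src_vcpu_fd, int dst_vcpu_fd)
{
        uint64_t dec_expiry;
        struct kvm_one_reg reg = {
                .id   = KVM_REG_PPC_DEC_EXPIRY,
                .addr = (uintptr_t)&dec_expiry,
        };

        if (ioctl(src_vcpu_fd, KVM_GET_ONE_REG, &reg) < 0) {
                perror("KVM_GET_ONE_REG(DEC_EXPIRY)");
                return -1;
        }
        if (ioctl(dst_vcpu_fd, KVM_SET_ONE_REG, &reg) < 0) {
                perror("KVM_SET_ONE_REG(DEC_EXPIRY)");
                return -1;
        }
        return 0;
}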
...@@ -1841,6 +1841,7 @@ registers, find a list below: ...@@ -1841,6 +1841,7 @@ registers, find a list below:
PPC | KVM_REG_PPC_DBSR | 32 PPC | KVM_REG_PPC_DBSR | 32
PPC | KVM_REG_PPC_TIDR | 64 PPC | KVM_REG_PPC_TIDR | 64
PPC | KVM_REG_PPC_PSSCR | 64 PPC | KVM_REG_PPC_PSSCR | 64
PPC | KVM_REG_PPC_DEC_EXPIRY | 64
PPC | KVM_REG_PPC_TM_GPR0 | 64 PPC | KVM_REG_PPC_TM_GPR0 | 64
... ...
PPC | KVM_REG_PPC_TM_GPR31 | 64 PPC | KVM_REG_PPC_TM_GPR31 | 64
......
...@@ -42,4 +42,8 @@ extern void wait_for_tb_resync(void); ...@@ -42,4 +42,8 @@ extern void wait_for_tb_resync(void);
static inline void wait_for_subcore_guest_exit(void) { } static inline void wait_for_subcore_guest_exit(void) { }
static inline void wait_for_tb_resync(void) { } static inline void wait_for_tb_resync(void) { }
#endif #endif
struct pt_regs;
extern long hmi_handle_debugtrig(struct pt_regs *regs);
#endif /* __ASM_PPC64_HMI_H__ */ #endif /* __ASM_PPC64_HMI_H__ */
...@@ -122,13 +122,13 @@ static inline int kvmppc_hpte_page_shifts(unsigned long h, unsigned long l) ...@@ -122,13 +122,13 @@ static inline int kvmppc_hpte_page_shifts(unsigned long h, unsigned long l)
lphi = (l >> 16) & 0xf; lphi = (l >> 16) & 0xf;
switch ((l >> 12) & 0xf) { switch ((l >> 12) & 0xf) {
case 0: case 0:
return !lphi ? 24 : -1; /* 16MB */ return !lphi ? 24 : 0; /* 16MB */
break; break;
case 1: case 1:
return 16; /* 64kB */ return 16; /* 64kB */
break; break;
case 3: case 3:
return !lphi ? 34 : -1; /* 16GB */ return !lphi ? 34 : 0; /* 16GB */
break; break;
case 7: case 7:
return (16 << 8) + 12; /* 64kB in 4kB */ return (16 << 8) + 12; /* 64kB in 4kB */
...@@ -140,7 +140,7 @@ static inline int kvmppc_hpte_page_shifts(unsigned long h, unsigned long l) ...@@ -140,7 +140,7 @@ static inline int kvmppc_hpte_page_shifts(unsigned long h, unsigned long l)
return (24 << 8) + 12; /* 16MB in 4kB */ return (24 << 8) + 12; /* 16MB in 4kB */
break; break;
} }
return -1; return 0;
} }
static inline int kvmppc_hpte_base_page_shift(unsigned long h, unsigned long l) static inline int kvmppc_hpte_base_page_shift(unsigned long h, unsigned long l)
...@@ -159,7 +159,11 @@ static inline int kvmppc_hpte_actual_page_shift(unsigned long h, unsigned long l ...@@ -159,7 +159,11 @@ static inline int kvmppc_hpte_actual_page_shift(unsigned long h, unsigned long l
static inline unsigned long kvmppc_actual_pgsz(unsigned long v, unsigned long r) static inline unsigned long kvmppc_actual_pgsz(unsigned long v, unsigned long r)
{ {
return 1ul << kvmppc_hpte_actual_page_shift(v, r); int shift = kvmppc_hpte_actual_page_shift(v, r);
if (shift)
return 1ul << shift;
return 0;
} }
static inline int kvmppc_pgsize_lp_encoding(int base_shift, int actual_shift) static inline int kvmppc_pgsize_lp_encoding(int base_shift, int actual_shift)
...@@ -232,7 +236,7 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, ...@@ -232,7 +236,7 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
va_low ^= v >> (SID_SHIFT_1T - 16); va_low ^= v >> (SID_SHIFT_1T - 16);
va_low &= 0x7ff; va_low &= 0x7ff;
if (b_pgshift == 12) { if (b_pgshift <= 12) {
if (a_pgshift > 12) { if (a_pgshift > 12) {
sllp = (a_pgshift == 16) ? 5 : 4; sllp = (a_pgshift == 16) ? 5 : 4;
rb |= sllp << 5; /* AP field */ rb |= sllp << 5; /* AP field */
......
...@@ -709,6 +709,7 @@ struct kvm_vcpu_arch { ...@@ -709,6 +709,7 @@ struct kvm_vcpu_arch {
u8 ceded; u8 ceded;
u8 prodded; u8 prodded;
u8 doorbell_request; u8 doorbell_request;
u8 irq_pending; /* Used by XIVE to signal pending guest irqs */
u32 last_inst; u32 last_inst;
struct swait_queue_head *wqp; struct swait_queue_head *wqp;
...@@ -738,8 +739,11 @@ struct kvm_vcpu_arch { ...@@ -738,8 +739,11 @@ struct kvm_vcpu_arch {
struct kvmppc_icp *icp; /* XICS presentation controller */ struct kvmppc_icp *icp; /* XICS presentation controller */
struct kvmppc_xive_vcpu *xive_vcpu; /* XIVE virtual CPU data */ struct kvmppc_xive_vcpu *xive_vcpu; /* XIVE virtual CPU data */
__be32 xive_cam_word; /* Cooked W2 in proper endian with valid bit */ __be32 xive_cam_word; /* Cooked W2 in proper endian with valid bit */
u32 xive_pushed; /* Is the VP pushed on the physical CPU ? */ u8 xive_pushed; /* Is the VP pushed on the physical CPU ? */
u8 xive_esc_on; /* Is the escalation irq enabled ? */
union xive_tma_w01 xive_saved_state; /* W0..1 of XIVE thread state */ union xive_tma_w01 xive_saved_state; /* W0..1 of XIVE thread state */
u64 xive_esc_raddr; /* Escalation interrupt ESB real addr */
u64 xive_esc_vaddr; /* Escalation interrupt ESB virt addr */
#endif #endif
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
......
...@@ -1073,6 +1073,7 @@ enum { ...@@ -1073,6 +1073,7 @@ enum {
/* Flags for OPAL_XIVE_GET/SET_VP_INFO */ /* Flags for OPAL_XIVE_GET/SET_VP_INFO */
enum { enum {
OPAL_XIVE_VP_ENABLED = 0x00000001, OPAL_XIVE_VP_ENABLED = 0x00000001,
OPAL_XIVE_VP_SINGLE_ESCALATION = 0x00000002,
}; };
/* "Any chip" replacement for chip ID for allocation functions */ /* "Any chip" replacement for chip ID for allocation functions */
......
...@@ -432,8 +432,9 @@ ...@@ -432,8 +432,9 @@
#define SPRN_LPID 0x13F /* Logical Partition Identifier */ #define SPRN_LPID 0x13F /* Logical Partition Identifier */
#endif #endif
#define LPID_RSVD 0x3ff /* Reserved LPID for partn switching */ #define LPID_RSVD 0x3ff /* Reserved LPID for partn switching */
#define SPRN_HMER 0x150 /* Hardware m? error recovery */ #define SPRN_HMER 0x150 /* Hypervisor maintenance exception reg */
#define SPRN_HMEER 0x151 /* Hardware m? enable error recovery */ #define HMER_DEBUG_TRIG (1ul << (63 - 17)) /* Debug trigger */
#define SPRN_HMEER 0x151 /* Hyp maintenance exception enable reg */
#define SPRN_PCR 0x152 /* Processor compatibility register */ #define SPRN_PCR 0x152 /* Processor compatibility register */
#define PCR_VEC_DIS (1ul << (63-0)) /* Vec. disable (bit NA since POWER8) */ #define PCR_VEC_DIS (1ul << (63-0)) /* Vec. disable (bit NA since POWER8) */
#define PCR_VSX_DIS (1ul << (63-1)) /* VSX disable (bit NA since POWER8) */ #define PCR_VSX_DIS (1ul << (63-1)) /* VSX disable (bit NA since POWER8) */
......
...@@ -9,6 +9,41 @@ ...@@ -9,6 +9,41 @@
#ifndef _ASM_POWERPC_XIVE_REGS_H #ifndef _ASM_POWERPC_XIVE_REGS_H
#define _ASM_POWERPC_XIVE_REGS_H #define _ASM_POWERPC_XIVE_REGS_H
/*
* "magic" Event State Buffer (ESB) MMIO offsets.
*
* Each interrupt source has a 2-bit state machine called ESB
* which can be controlled by MMIO. It's made of 2 bits, P and
* Q. P indicates that an interrupt is pending (has been sent
* to a queue and is waiting for an EOI). Q indicates that the
* interrupt has been triggered while pending.
*
* This acts as a coalescing mechanism in order to guarantee
* that a given interrupt only occurs at most once in a queue.
*
* When doing an EOI, the Q bit will indicate if the interrupt
* needs to be re-triggered.
*
* The following offsets into the ESB MMIO allow to read or
* manipulate the PQ bits. They must be used with an 8-bytes
* load instruction. They all return the previous state of the
* interrupt (atomically).
*
* Additionally, some ESB pages support doing an EOI via a
* store at 0 and some ESBs support doing a trigger via a
* separate trigger page.
*/
#define XIVE_ESB_STORE_EOI 0x400 /* Store */
#define XIVE_ESB_LOAD_EOI 0x000 /* Load */
#define XIVE_ESB_GET 0x800 /* Load */
#define XIVE_ESB_SET_PQ_00 0xc00 /* Load */
#define XIVE_ESB_SET_PQ_01 0xd00 /* Load */
#define XIVE_ESB_SET_PQ_10 0xe00 /* Load */
#define XIVE_ESB_SET_PQ_11 0xf00 /* Load */
#define XIVE_ESB_VAL_P 0x2
#define XIVE_ESB_VAL_Q 0x1
/* /*
* Thread Management (aka "TM") registers * Thread Management (aka "TM") registers
*/ */
......
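To make the ESB comment block above concrete, here is a minimal kernel-side sketch of driving a single source's ESB page through these offsets. It assumes an already-mapped ESB page passed in as a void __iomem pointer; the helper names are made up for illustration, and only the offsets and P/Q bit masks come from the header:

#include <linux/io.h>
#include <linux/types.h>
#include <asm/xive-regs.h>

/* Peek at the current PQ state: an 8-byte load at XIVE_ESB_GET returns
 * the previous state and leaves it unchanged. */
static u8 esb_get_pq(void __iomem *esb_mmio)
{
        return in_be64(esb_mmio + XIVE_ESB_GET) &
                (XIVE_ESB_VAL_P | XIVE_ESB_VAL_Q);
}

/* Mask the source by setting PQ to 01.  Because the load returns the
 * previous state, the caller can tell whether the interrupt was already
 * pending (P set) when the mask took effect; the KVM entry code later in
 * this series does exactly this check on its escalation interrupt. */
static bool esb_mask_was_pending(void __iomem *esb_mmio)
{
        return in_be64(esb_mmio + XIVE_ESB_SET_PQ_01) & XIVE_ESB_VAL_P;
}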
...@@ -58,6 +58,9 @@ struct xive_irq_data { ...@@ -58,6 +58,9 @@ struct xive_irq_data {
#define XIVE_IRQ_FLAG_EOI_FW 0x10 #define XIVE_IRQ_FLAG_EOI_FW 0x10
#define XIVE_IRQ_FLAG_H_INT_ESB 0x20 #define XIVE_IRQ_FLAG_H_INT_ESB 0x20
/* Special flag set by KVM for escalation interrupts */
#define XIVE_IRQ_NO_EOI 0x80
#define XIVE_INVALID_CHIP_ID -1 #define XIVE_INVALID_CHIP_ID -1
/* A queue tracking structure in a CPU */ /* A queue tracking structure in a CPU */
...@@ -72,41 +75,6 @@ struct xive_q { ...@@ -72,41 +75,6 @@ struct xive_q {
atomic_t pending_count; atomic_t pending_count;
}; };
/*
* "magic" Event State Buffer (ESB) MMIO offsets.
*
* Each interrupt source has a 2-bit state machine called ESB
* which can be controlled by MMIO. It's made of 2 bits, P and
* Q. P indicates that an interrupt is pending (has been sent
* to a queue and is waiting for an EOI). Q indicates that the
* interrupt has been triggered while pending.
*
* This acts as a coalescing mechanism in order to guarantee
* that a given interrupt only occurs at most once in a queue.
*
* When doing an EOI, the Q bit will indicate if the interrupt
* needs to be re-triggered.
*
* The following offsets into the ESB MMIO allow to read or
* manipulate the PQ bits. They must be used with an 8-bytes
* load instruction. They all return the previous state of the
* interrupt (atomically).
*
* Additionally, some ESB pages support doing an EOI via a
* store at 0 and some ESBs support doing a trigger via a
* separate trigger page.
*/
#define XIVE_ESB_STORE_EOI 0x400 /* Store */
#define XIVE_ESB_LOAD_EOI 0x000 /* Load */
#define XIVE_ESB_GET 0x800 /* Load */
#define XIVE_ESB_SET_PQ_00 0xc00 /* Load */
#define XIVE_ESB_SET_PQ_01 0xd00 /* Load */
#define XIVE_ESB_SET_PQ_10 0xe00 /* Load */
#define XIVE_ESB_SET_PQ_11 0xf00 /* Load */
#define XIVE_ESB_VAL_P 0x2
#define XIVE_ESB_VAL_Q 0x1
/* Global enable flags for the XIVE support */ /* Global enable flags for the XIVE support */
extern bool __xive_enabled; extern bool __xive_enabled;
...@@ -143,9 +111,10 @@ extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio); ...@@ -143,9 +111,10 @@ extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio);
extern void xive_native_sync_source(u32 hw_irq); extern void xive_native_sync_source(u32 hw_irq);
extern bool is_xive_irq(struct irq_chip *chip); extern bool is_xive_irq(struct irq_chip *chip);
extern int xive_native_enable_vp(u32 vp_id); extern int xive_native_enable_vp(u32 vp_id, bool single_escalation);
extern int xive_native_disable_vp(u32 vp_id); extern int xive_native_disable_vp(u32 vp_id);
extern int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id); extern int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id);
extern bool xive_native_has_single_escalation(void);
#else #else
......
...@@ -632,6 +632,8 @@ struct kvm_ppc_cpu_char { ...@@ -632,6 +632,8 @@ struct kvm_ppc_cpu_char {
#define KVM_REG_PPC_TIDR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbc) #define KVM_REG_PPC_TIDR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbc)
#define KVM_REG_PPC_PSSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbd) #define KVM_REG_PPC_PSSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbd)
#define KVM_REG_PPC_DEC_EXPIRY (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe)
/* Transactional Memory checkpointed state: /* Transactional Memory checkpointed state:
* This is all GPRs, all VSX regs and a subset of SPRs * This is all GPRs, all VSX regs and a subset of SPRs
*/ */
......
...@@ -519,6 +519,7 @@ int main(void) ...@@ -519,6 +519,7 @@ int main(void)
OFFSET(VCPU_PENDING_EXC, kvm_vcpu, arch.pending_exceptions); OFFSET(VCPU_PENDING_EXC, kvm_vcpu, arch.pending_exceptions);
OFFSET(VCPU_CEDED, kvm_vcpu, arch.ceded); OFFSET(VCPU_CEDED, kvm_vcpu, arch.ceded);
OFFSET(VCPU_PRODDED, kvm_vcpu, arch.prodded); OFFSET(VCPU_PRODDED, kvm_vcpu, arch.prodded);
OFFSET(VCPU_IRQ_PENDING, kvm_vcpu, arch.irq_pending);
OFFSET(VCPU_DBELL_REQ, kvm_vcpu, arch.doorbell_request); OFFSET(VCPU_DBELL_REQ, kvm_vcpu, arch.doorbell_request);
OFFSET(VCPU_MMCR, kvm_vcpu, arch.mmcr); OFFSET(VCPU_MMCR, kvm_vcpu, arch.mmcr);
OFFSET(VCPU_PMC, kvm_vcpu, arch.pmc); OFFSET(VCPU_PMC, kvm_vcpu, arch.pmc);
...@@ -738,6 +739,9 @@ int main(void) ...@@ -738,6 +739,9 @@ int main(void)
DEFINE(VCPU_XIVE_CAM_WORD, offsetof(struct kvm_vcpu, DEFINE(VCPU_XIVE_CAM_WORD, offsetof(struct kvm_vcpu,
arch.xive_cam_word)); arch.xive_cam_word));
DEFINE(VCPU_XIVE_PUSHED, offsetof(struct kvm_vcpu, arch.xive_pushed)); DEFINE(VCPU_XIVE_PUSHED, offsetof(struct kvm_vcpu, arch.xive_pushed));
DEFINE(VCPU_XIVE_ESC_ON, offsetof(struct kvm_vcpu, arch.xive_esc_on));
DEFINE(VCPU_XIVE_ESC_RADDR, offsetof(struct kvm_vcpu, arch.xive_esc_raddr));
DEFINE(VCPU_XIVE_ESC_VADDR, offsetof(struct kvm_vcpu, arch.xive_esc_vaddr));
#endif #endif
#ifdef CONFIG_KVM_EXIT_TIMING #ifdef CONFIG_KVM_EXIT_TIMING
......
...@@ -495,37 +495,123 @@ long machine_check_early(struct pt_regs *regs) ...@@ -495,37 +495,123 @@ long machine_check_early(struct pt_regs *regs)
return handled; return handled;
} }
long hmi_exception_realmode(struct pt_regs *regs) /* Possible meanings for HMER_DEBUG_TRIG bit being set on POWER9 */
static enum {
DTRIG_UNKNOWN,
DTRIG_VECTOR_CI, /* need to emulate vector CI load instr */
DTRIG_SUSPEND_ESCAPE, /* need to escape from TM suspend mode */
} hmer_debug_trig_function;
static int init_debug_trig_function(void)
{ {
__this_cpu_inc(irq_stat.hmi_exceptions); int pvr;
struct device_node *cpun;
#ifdef CONFIG_PPC_BOOK3S_64 struct property *prop = NULL;
/* Workaround for P9 vector CI loads (see p9_hmi_special_emu) */ const char *str;
if (pvr_version_is(PVR_POWER9)) {
unsigned long hmer = mfspr(SPRN_HMER); /* First look in the device tree */
preempt_disable();
/* Do we have the debug bit set */ cpun = of_get_cpu_node(smp_processor_id(), NULL);
if (hmer & PPC_BIT(17)) { if (cpun) {
hmer &= ~PPC_BIT(17); of_property_for_each_string(cpun, "ibm,hmi-special-triggers",
mtspr(SPRN_HMER, hmer); prop, str) {
if (strcmp(str, "bit17-vector-ci-load") == 0)
/* hmer_debug_trig_function = DTRIG_VECTOR_CI;
* Now to avoid problems with soft-disable we else if (strcmp(str, "bit17-tm-suspend-escape") == 0)
* only do the emulation if we are coming from hmer_debug_trig_function = DTRIG_SUSPEND_ESCAPE;
* user space
*/
if (user_mode(regs))
local_paca->hmi_p9_special_emu = 1;
/*
* Don't bother going to OPAL if that's the
* only relevant bit.
*/
if (!(hmer & mfspr(SPRN_HMEER)))
return local_paca->hmi_p9_special_emu;
} }
of_node_put(cpun);
}
preempt_enable();
/* If we found the property, don't look at PVR */
if (prop)
goto out;
pvr = mfspr(SPRN_PVR);
/* Check for POWER9 Nimbus (scale-out) */
if ((PVR_VER(pvr) == PVR_POWER9) && (pvr & 0xe000) == 0) {
/* DD2.2 and later */
if ((pvr & 0xfff) >= 0x202)
hmer_debug_trig_function = DTRIG_SUSPEND_ESCAPE;
/* DD2.0 and DD2.1 - used for vector CI load emulation */
else if ((pvr & 0xfff) >= 0x200)
hmer_debug_trig_function = DTRIG_VECTOR_CI;
}
out:
switch (hmer_debug_trig_function) {
case DTRIG_VECTOR_CI:
pr_debug("HMI debug trigger used for vector CI load\n");
break;
case DTRIG_SUSPEND_ESCAPE:
pr_debug("HMI debug trigger used for TM suspend escape\n");
break;
default:
break;
} }
#endif /* CONFIG_PPC_BOOK3S_64 */ return 0;
}
__initcall(init_debug_trig_function);
/*
* Handle HMIs that occur as a result of a debug trigger.
* Return values:
* -1 means this is not a HMI cause that we know about
* 0 means no further handling is required
* 1 means further handling is required
*/
long hmi_handle_debugtrig(struct pt_regs *regs)
{
unsigned long hmer = mfspr(SPRN_HMER);
long ret = 0;
/* HMER_DEBUG_TRIG bit is used for various workarounds on P9 */
if (!((hmer & HMER_DEBUG_TRIG)
&& hmer_debug_trig_function != DTRIG_UNKNOWN))
return -1;
hmer &= ~HMER_DEBUG_TRIG;
/* HMER is a write-AND register */
mtspr(SPRN_HMER, ~HMER_DEBUG_TRIG);
switch (hmer_debug_trig_function) {
case DTRIG_VECTOR_CI:
/*
* Now to avoid problems with soft-disable we
* only do the emulation if we are coming from
* host user space
*/
if (regs && user_mode(regs))
ret = local_paca->hmi_p9_special_emu = 1;
break;
default:
break;
}
/*
* See if any other HMI causes remain to be handled
*/
if (hmer & mfspr(SPRN_HMEER))
return -1;
return ret;
}
/*
* Return values:
*/
long hmi_exception_realmode(struct pt_regs *regs)
{
int ret;
__this_cpu_inc(irq_stat.hmi_exceptions);
ret = hmi_handle_debugtrig(regs);
if (ret >= 0)
return ret;
wait_for_subcore_guest_exit(); wait_for_subcore_guest_exit();
......
...@@ -573,7 +573,7 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, ...@@ -573,7 +573,7 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
j = i + 1; j = i + 1;
if (npages) { if (npages) {
set_dirty_bits(map, i, npages); set_dirty_bits(map, i, npages);
i = j + npages; j = i + npages;
} }
} }
return 0; return 0;
......
...@@ -118,6 +118,9 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, ...@@ -118,6 +118,9 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
#endif #endif
/* If set, the threads on each CPU core have to be in the same MMU mode */
static bool no_mixing_hpt_and_radix;
static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
...@@ -1497,6 +1500,10 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, ...@@ -1497,6 +1500,10 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
case KVM_REG_PPC_ARCH_COMPAT: case KVM_REG_PPC_ARCH_COMPAT:
*val = get_reg_val(id, vcpu->arch.vcore->arch_compat); *val = get_reg_val(id, vcpu->arch.vcore->arch_compat);
break; break;
case KVM_REG_PPC_DEC_EXPIRY:
*val = get_reg_val(id, vcpu->arch.dec_expires +
vcpu->arch.vcore->tb_offset);
break;
default: default:
r = -EINVAL; r = -EINVAL;
break; break;
...@@ -1724,6 +1731,10 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, ...@@ -1724,6 +1731,10 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
case KVM_REG_PPC_ARCH_COMPAT: case KVM_REG_PPC_ARCH_COMPAT:
r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val)); r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val));
break; break;
case KVM_REG_PPC_DEC_EXPIRY:
vcpu->arch.dec_expires = set_reg_val(id, *val) -
vcpu->arch.vcore->tb_offset;
break;
default: default:
r = -EINVAL; r = -EINVAL;
break; break;
...@@ -2378,8 +2389,8 @@ static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc) ...@@ -2378,8 +2389,8 @@ static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc)
static bool subcore_config_ok(int n_subcores, int n_threads) static bool subcore_config_ok(int n_subcores, int n_threads)
{ {
/* /*
* POWER9 "SMT4" cores are permanently in what is effectively a 4-way split-core * POWER9 "SMT4" cores are permanently in what is effectively a 4-way
* mode, with one thread per subcore. * split-core mode, with one thread per subcore.
*/ */
if (cpu_has_feature(CPU_FTR_ARCH_300)) if (cpu_has_feature(CPU_FTR_ARCH_300))
return n_subcores <= 4 && n_threads == 1; return n_subcores <= 4 && n_threads == 1;
...@@ -2415,8 +2426,8 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) ...@@ -2415,8 +2426,8 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
if (!cpu_has_feature(CPU_FTR_ARCH_207S)) if (!cpu_has_feature(CPU_FTR_ARCH_207S))
return false; return false;
/* POWER9 currently requires all threads to be in the same MMU mode */ /* Some POWER9 chips require all threads to be in the same MMU mode */
if (cpu_has_feature(CPU_FTR_ARCH_300) && if (no_mixing_hpt_and_radix &&
kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm)) kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm))
return false; return false;
...@@ -2679,9 +2690,11 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) ...@@ -2679,9 +2690,11 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
* threads are offline. Also check if the number of threads in this * threads are offline. Also check if the number of threads in this
* guest are greater than the current system threads per guest. * guest are greater than the current system threads per guest.
* On POWER9, we need to be not in independent-threads mode if * On POWER9, we need to be not in independent-threads mode if
* this is a HPT guest on a radix host. * this is a HPT guest on a radix host machine where the
* CPU threads may not be in different MMU modes.
*/ */
hpt_on_radix = radix_enabled() && !kvm_is_radix(vc->kvm); hpt_on_radix = no_mixing_hpt_and_radix && radix_enabled() &&
!kvm_is_radix(vc->kvm);
if (((controlled_threads > 1) && if (((controlled_threads > 1) &&
((vc->num_threads > threads_per_subcore) || !on_primary_thread())) || ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) ||
(hpt_on_radix && vc->kvm->arch.threads_indep)) { (hpt_on_radix && vc->kvm->arch.threads_indep)) {
...@@ -2831,7 +2844,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) ...@@ -2831,7 +2844,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
*/ */
if (!thr0_done) if (!thr0_done)
kvmppc_start_thread(NULL, pvc); kvmppc_start_thread(NULL, pvc);
thr += pvc->num_threads;
} }
/* /*
...@@ -2987,7 +2999,7 @@ static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu) ...@@ -2987,7 +2999,7 @@ static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
{ {
if (!xive_enabled()) if (!xive_enabled())
return false; return false;
return vcpu->arch.xive_saved_state.pipr < return vcpu->arch.irq_pending || vcpu->arch.xive_saved_state.pipr <
vcpu->arch.xive_saved_state.cppr; vcpu->arch.xive_saved_state.cppr;
} }
#else #else
...@@ -3176,17 +3188,8 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) ...@@ -3176,17 +3188,8 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
* this thread straight away and have it join in. * this thread straight away and have it join in.
*/ */
if (!signal_pending(current)) { if (!signal_pending(current)) {
if (vc->vcore_state == VCORE_PIGGYBACK) { if ((vc->vcore_state == VCORE_PIGGYBACK ||
if (spin_trylock(&vc->lock)) { vc->vcore_state == VCORE_RUNNING) &&
if (vc->vcore_state == VCORE_RUNNING &&
!VCORE_IS_EXITING(vc)) {
kvmppc_create_dtl_entry(vcpu, vc);
kvmppc_start_thread(vcpu, vc);
trace_kvm_guest_enter(vcpu);
}
spin_unlock(&vc->lock);
}
} else if (vc->vcore_state == VCORE_RUNNING &&
!VCORE_IS_EXITING(vc)) { !VCORE_IS_EXITING(vc)) {
kvmppc_create_dtl_entry(vcpu, vc); kvmppc_create_dtl_entry(vcpu, vc);
kvmppc_start_thread(vcpu, vc); kvmppc_start_thread(vcpu, vc);
...@@ -4448,6 +4451,19 @@ static int kvmppc_book3s_init_hv(void) ...@@ -4448,6 +4451,19 @@ static int kvmppc_book3s_init_hv(void)
if (kvmppc_radix_possible()) if (kvmppc_radix_possible())
r = kvmppc_radix_init(); r = kvmppc_radix_init();
/*
* POWER9 chips before version 2.02 can't have some threads in
* HPT mode and some in radix mode on the same core.
*/
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
unsigned int pvr = mfspr(SPRN_PVR);
if ((pvr >> 16) == PVR_POWER9 &&
(((pvr & 0xe000) == 0 && (pvr & 0xfff) < 0x202) ||
((pvr & 0xe000) == 0x2000 && (pvr & 0xfff) < 0x101)))
no_mixing_hpt_and_radix = true;
}
return r; return r;
} }
......
...@@ -268,17 +268,19 @@ static void kvmppc_tb_resync_done(void) ...@@ -268,17 +268,19 @@ static void kvmppc_tb_resync_done(void)
* secondary threads to proceed. * secondary threads to proceed.
* - All secondary threads will eventually call opal hmi handler on * - All secondary threads will eventually call opal hmi handler on
* their exit path. * their exit path.
*
* Returns 1 if the timebase offset should be applied, 0 if not.
*/ */
long kvmppc_realmode_hmi_handler(void) long kvmppc_realmode_hmi_handler(void)
{ {
int ptid = local_paca->kvm_hstate.ptid;
bool resync_req; bool resync_req;
/* This is only called on primary thread. */
BUG_ON(ptid != 0);
__this_cpu_inc(irq_stat.hmi_exceptions); __this_cpu_inc(irq_stat.hmi_exceptions);
if (hmi_handle_debugtrig(NULL) >= 0)
return 1;
/* /*
* By now primary thread has already completed guest->host * By now primary thread has already completed guest->host
* partition switch but haven't signaled secondaries yet. * partition switch but haven't signaled secondaries yet.
......
...@@ -617,13 +617,6 @@ kvmppc_hv_entry: ...@@ -617,13 +617,6 @@ kvmppc_hv_entry:
lbz r0, KVM_RADIX(r9) lbz r0, KVM_RADIX(r9)
cmpwi cr7, r0, 0 cmpwi cr7, r0, 0
/* Clear out SLB if hash */
bne cr7, 2f
li r6,0
slbmte r6,r6
slbia
ptesync
2:
/* /*
* POWER7/POWER8 host -> guest partition switch code. * POWER7/POWER8 host -> guest partition switch code.
* We don't have to lock against concurrent tlbies, * We don't have to lock against concurrent tlbies,
...@@ -738,19 +731,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) ...@@ -738,19 +731,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
10: cmpdi r4, 0 10: cmpdi r4, 0
beq kvmppc_primary_no_guest beq kvmppc_primary_no_guest
kvmppc_got_guest: kvmppc_got_guest:
/* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */
lwz r5,VCPU_SLB_MAX(r4)
cmpwi r5,0
beq 9f
mtctr r5
addi r6,r4,VCPU_SLB
1: ld r8,VCPU_SLB_E(r6)
ld r9,VCPU_SLB_V(r6)
slbmte r9,r8
addi r6,r6,VCPU_SLB_SIZE
bdnz 1b
9:
/* Increment yield count if they have a VPA */ /* Increment yield count if they have a VPA */
ld r3, VCPU_VPA(r4) ld r3, VCPU_VPA(r4)
cmpdi r3, 0 cmpdi r3, 0
...@@ -957,7 +937,6 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) ...@@ -957,7 +937,6 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
mftb r7 mftb r7
subf r3,r7,r8 subf r3,r7,r8
mtspr SPRN_DEC,r3 mtspr SPRN_DEC,r3
std r3,VCPU_DEC(r4)
ld r5, VCPU_SPRG0(r4) ld r5, VCPU_SPRG0(r4)
ld r6, VCPU_SPRG1(r4) ld r6, VCPU_SPRG1(r4)
...@@ -1018,6 +997,29 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) ...@@ -1018,6 +997,29 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
cmpdi r3, 512 /* 1 microsecond */ cmpdi r3, 512 /* 1 microsecond */
blt hdec_soon blt hdec_soon
/* For hash guest, clear out and reload the SLB */
ld r6, VCPU_KVM(r4)
lbz r0, KVM_RADIX(r6)
cmpwi r0, 0
bne 9f
li r6, 0
slbmte r6, r6
slbia
ptesync
/* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */
lwz r5,VCPU_SLB_MAX(r4)
cmpwi r5,0
beq 9f
mtctr r5
addi r6,r4,VCPU_SLB
1: ld r8,VCPU_SLB_E(r6)
ld r9,VCPU_SLB_V(r6)
slbmte r9,r8
addi r6,r6,VCPU_SLB_SIZE
bdnz 1b
9:
#ifdef CONFIG_KVM_XICS #ifdef CONFIG_KVM_XICS
/* We are entering the guest on that thread, push VCPU to XIVE */ /* We are entering the guest on that thread, push VCPU to XIVE */
ld r10, HSTATE_XIVE_TIMA_PHYS(r13) ld r10, HSTATE_XIVE_TIMA_PHYS(r13)
...@@ -1031,8 +1033,53 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) ...@@ -1031,8 +1033,53 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
li r9, TM_QW1_OS + TM_WORD2 li r9, TM_QW1_OS + TM_WORD2
stwcix r11,r9,r10 stwcix r11,r9,r10
li r9, 1 li r9, 1
stw r9, VCPU_XIVE_PUSHED(r4) stb r9, VCPU_XIVE_PUSHED(r4)
eieio eieio
/*
* We clear the irq_pending flag. There is a small chance of a
* race vs. the escalation interrupt happening on another
* processor setting it again, but the only consequence is to
* cause a spurious wakeup on the next H_CEDE which is not an
* issue.
*/
li r0,0
stb r0, VCPU_IRQ_PENDING(r4)
/*
* In single escalation mode, if the escalation interrupt is
* on, we mask it.
*/
lbz r0, VCPU_XIVE_ESC_ON(r4)
cmpwi r0,0
beq 1f
ld r10, VCPU_XIVE_ESC_RADDR(r4)
li r9, XIVE_ESB_SET_PQ_01
ldcix r0, r10, r9
sync
/* We have a possible subtle race here: The escalation interrupt might
* have fired and be on its way to the host queue while we mask it,
* and if we unmask it early enough (re-cede right away), there is
* a theoretical possibility that it fires again, thus landing in the
* target queue more than once which is a big no-no.
*
* Fortunately, solving this is rather easy. If the above load setting
* PQ to 01 returns a previous value where P is set, then we know the
* escalation interrupt is somewhere on its way to the host. In that
* case we simply don't clear the xive_esc_on flag below. It will be
* eventually cleared by the handler for the escalation interrupt.
*
* Then, when doing a cede, we check that flag again before re-enabling
* the escalation interrupt, and if set, we abort the cede.
*/
andi. r0, r0, XIVE_ESB_VAL_P
bne- 1f
/* Now P is 0, we can clear the flag */
li r0, 0
stb r0, VCPU_XIVE_ESC_ON(r4)
1:
no_xive: no_xive:
#endif /* CONFIG_KVM_XICS */ #endif /* CONFIG_KVM_XICS */
...@@ -1193,7 +1240,7 @@ hdec_soon: ...@@ -1193,7 +1240,7 @@ hdec_soon:
addi r3, r4, VCPU_TB_RMEXIT addi r3, r4, VCPU_TB_RMEXIT
bl kvmhv_accumulate_time bl kvmhv_accumulate_time
#endif #endif
b guest_exit_cont b guest_bypass
/****************************************************************************** /******************************************************************************
* * * *
...@@ -1423,15 +1470,35 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) ...@@ -1423,15 +1470,35 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
blt deliver_guest_interrupt blt deliver_guest_interrupt
guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
/* Save more register state */
mfdar r6
mfdsisr r7
std r6, VCPU_DAR(r9)
stw r7, VCPU_DSISR(r9)
/* don't overwrite fault_dar/fault_dsisr if HDSI */
cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
beq mc_cont
std r6, VCPU_FAULT_DAR(r9)
stw r7, VCPU_FAULT_DSISR(r9)
/* See if it is a machine check */
cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK
beq machine_check_realmode
mc_cont:
#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
addi r3, r9, VCPU_TB_RMEXIT
mr r4, r9
bl kvmhv_accumulate_time
#endif
#ifdef CONFIG_KVM_XICS #ifdef CONFIG_KVM_XICS
/* We are exiting, pull the VP from the XIVE */ /* We are exiting, pull the VP from the XIVE */
lwz r0, VCPU_XIVE_PUSHED(r9) lbz r0, VCPU_XIVE_PUSHED(r9)
cmpwi cr0, r0, 0 cmpwi cr0, r0, 0
beq 1f beq 1f
li r7, TM_SPC_PULL_OS_CTX li r7, TM_SPC_PULL_OS_CTX
li r6, TM_QW1_OS li r6, TM_QW1_OS
mfmsr r0 mfmsr r0
andi. r0, r0, MSR_IR /* in real mode? */ andi. r0, r0, MSR_DR /* in real mode? */
beq 2f beq 2f
ld r10, HSTATE_XIVE_TIMA_VIRT(r13) ld r10, HSTATE_XIVE_TIMA_VIRT(r13)
cmpldi cr0, r10, 0 cmpldi cr0, r10, 0
...@@ -1454,33 +1521,42 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ ...@@ -1454,33 +1521,42 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
/* Fixup some of the state for the next load */ /* Fixup some of the state for the next load */
li r10, 0 li r10, 0
li r0, 0xff li r0, 0xff
stw r10, VCPU_XIVE_PUSHED(r9) stb r10, VCPU_XIVE_PUSHED(r9)
stb r10, (VCPU_XIVE_SAVED_STATE+3)(r9) stb r10, (VCPU_XIVE_SAVED_STATE+3)(r9)
stb r0, (VCPU_XIVE_SAVED_STATE+4)(r9) stb r0, (VCPU_XIVE_SAVED_STATE+4)(r9)
eieio eieio
1: 1:
#endif /* CONFIG_KVM_XICS */ #endif /* CONFIG_KVM_XICS */
/* Save more register state */
mfdar r6
mfdsisr r7
std r6, VCPU_DAR(r9)
stw r7, VCPU_DSISR(r9)
/* don't overwrite fault_dar/fault_dsisr if HDSI */
cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
beq mc_cont
std r6, VCPU_FAULT_DAR(r9)
stw r7, VCPU_FAULT_DSISR(r9)
/* See if it is a machine check */ /* For hash guest, read the guest SLB and save it away */
cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK ld r5, VCPU_KVM(r9)
beq machine_check_realmode lbz r0, KVM_RADIX(r5)
mc_cont: li r5, 0
#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING cmpwi r0, 0
addi r3, r9, VCPU_TB_RMEXIT bne 3f /* for radix, save 0 entries */
mr r4, r9 lwz r0,VCPU_SLB_NR(r9) /* number of entries in SLB */
bl kvmhv_accumulate_time mtctr r0
#endif li r6,0
addi r7,r9,VCPU_SLB
1: slbmfee r8,r6
andis. r0,r8,SLB_ESID_V@h
beq 2f
add r8,r8,r6 /* put index in */
slbmfev r3,r6
std r8,VCPU_SLB_E(r7)
std r3,VCPU_SLB_V(r7)
addi r7,r7,VCPU_SLB_SIZE
addi r5,r5,1
2: addi r6,r6,1
bdnz 1b
/* Finally clear out the SLB */
li r0,0
slbmte r0,r0
slbia
ptesync
3: stw r5,VCPU_SLB_MAX(r9)
guest_bypass:
mr r3, r12 mr r3, r12
/* Increment exit count, poke other threads to exit */ /* Increment exit count, poke other threads to exit */
bl kvmhv_commence_exit bl kvmhv_commence_exit
...@@ -1501,31 +1577,6 @@ mc_cont: ...@@ -1501,31 +1577,6 @@ mc_cont:
ori r6,r6,1 ori r6,r6,1
mtspr SPRN_CTRLT,r6 mtspr SPRN_CTRLT,r6
4: 4:
/* Check if we are running hash or radix and store it in cr2 */
ld r5, VCPU_KVM(r9)
lbz r0, KVM_RADIX(r5)
cmpwi cr2,r0,0
/* Read the guest SLB and save it away */
li r5, 0
bne cr2, 3f /* for radix, save 0 entries */
lwz r0,VCPU_SLB_NR(r9) /* number of entries in SLB */
mtctr r0
li r6,0
addi r7,r9,VCPU_SLB
1: slbmfee r8,r6
andis. r0,r8,SLB_ESID_V@h
beq 2f
add r8,r8,r6 /* put index in */
slbmfev r3,r6
std r8,VCPU_SLB_E(r7)
std r3,VCPU_SLB_V(r7)
addi r7,r7,VCPU_SLB_SIZE
addi r5,r5,1
2: addi r6,r6,1
bdnz 1b
3: stw r5,VCPU_SLB_MAX(r9)
/* /*
* Save the guest PURR/SPURR * Save the guest PURR/SPURR
*/ */
...@@ -1803,7 +1854,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) ...@@ -1803,7 +1854,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
ld r5, VCPU_KVM(r9) ld r5, VCPU_KVM(r9)
lbz r0, KVM_RADIX(r5) lbz r0, KVM_RADIX(r5)
cmpwi cr2, r0, 0 cmpwi cr2, r0, 0
beq cr2, 3f beq cr2, 4f
/* Radix: Handle the case where the guest used an illegal PID */ /* Radix: Handle the case where the guest used an illegal PID */
LOAD_REG_ADDR(r4, mmu_base_pid) LOAD_REG_ADDR(r4, mmu_base_pid)
...@@ -1839,15 +1890,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) ...@@ -1839,15 +1890,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
BEGIN_FTR_SECTION BEGIN_FTR_SECTION
PPC_INVALIDATE_ERAT PPC_INVALIDATE_ERAT
END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1) END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1)
b 4f 4:
#endif /* CONFIG_PPC_RADIX_MMU */ #endif /* CONFIG_PPC_RADIX_MMU */
/* Hash: clear out SLB */
3: li r5,0
slbmte r5,r5
slbia
ptesync
4:
/* /*
* POWER7/POWER8 guest -> host partition switch code. * POWER7/POWER8 guest -> host partition switch code.
* We don't have to lock against tlbies but we do * We don't have to lock against tlbies but we do
...@@ -1908,16 +1953,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) ...@@ -1908,16 +1953,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
bne 27f bne 27f
bl kvmppc_realmode_hmi_handler bl kvmppc_realmode_hmi_handler
nop nop
cmpdi r3, 0
li r12, BOOK3S_INTERRUPT_HMI li r12, BOOK3S_INTERRUPT_HMI
/* /*
* At this point kvmppc_realmode_hmi_handler would have resync-ed * At this point kvmppc_realmode_hmi_handler may have resync-ed
* the TB. Hence it is not required to subtract guest timebase * the TB, and if it has, we must not subtract the guest timebase
* offset from timebase. So, skip it. * offset from the timebase. So, skip it.
* *
* Also, do not call kvmppc_subcore_exit_guest() because it has * Also, do not call kvmppc_subcore_exit_guest() because it has
* been invoked as part of kvmppc_realmode_hmi_handler(). * been invoked as part of kvmppc_realmode_hmi_handler().
*/ */
b 30f beq 30f
27: 27:
/* Subtract timebase offset from timebase */ /* Subtract timebase offset from timebase */
...@@ -2744,7 +2790,32 @@ kvm_cede_prodded: ...@@ -2744,7 +2790,32 @@ kvm_cede_prodded:
/* we've ceded but we want to give control to the host */ /* we've ceded but we want to give control to the host */
kvm_cede_exit: kvm_cede_exit:
ld r9, HSTATE_KVM_VCPU(r13) ld r9, HSTATE_KVM_VCPU(r13)
b guest_exit_cont #ifdef CONFIG_KVM_XICS
/* Abort if we still have a pending escalation */
lbz r5, VCPU_XIVE_ESC_ON(r9)
cmpwi r5, 0
beq 1f
li r0, 0
stb r0, VCPU_CEDED(r9)
1: /* Enable XIVE escalation */
li r5, XIVE_ESB_SET_PQ_00
mfmsr r0
andi. r0, r0, MSR_DR /* in real mode? */
beq 1f
ld r10, VCPU_XIVE_ESC_VADDR(r9)
cmpdi r10, 0
beq 3f
ldx r0, r10, r5
b 2f
1: ld r10, VCPU_XIVE_ESC_RADDR(r9)
cmpdi r10, 0
beq 3f
ldcix r0, r10, r5
2: sync
li r0, 1
stb r0, VCPU_XIVE_ESC_ON(r9)
#endif /* CONFIG_KVM_XICS */
3: b guest_exit_cont
/* Try to handle a machine check in real mode */ /* Try to handle a machine check in real mode */
machine_check_realmode: machine_check_realmode:
......
...@@ -84,12 +84,22 @@ static irqreturn_t xive_esc_irq(int irq, void *data) ...@@ -84,12 +84,22 @@ static irqreturn_t xive_esc_irq(int irq, void *data)
{ {
struct kvm_vcpu *vcpu = data; struct kvm_vcpu *vcpu = data;
/* We use the existing H_PROD mechanism to wake up the target */ vcpu->arch.irq_pending = 1;
vcpu->arch.prodded = 1;
smp_mb(); smp_mb();
if (vcpu->arch.ceded) if (vcpu->arch.ceded)
kvmppc_fast_vcpu_kick(vcpu); kvmppc_fast_vcpu_kick(vcpu);
/* Since we have the no-EOI flag, the interrupt is effectively
* disabled now. Clearing xive_esc_on means we won't bother
* doing so on the next entry.
*
* This also allows the entry code to know that if a PQ combination
* of 10 is observed while xive_esc_on is true, it means the queue
* contains an unprocessed escalation interrupt. We don't make use of
* that knowledge today but might (see comment in book3s_hv_rmhandlers.S)
*/
vcpu->arch.xive_esc_on = false;
return IRQ_HANDLED; return IRQ_HANDLED;
} }
...@@ -112,19 +122,21 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio) ...@@ -112,19 +122,21 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio)
return -EIO; return -EIO;
} }
/* if (xc->xive->single_escalation)
* Future improvement: start with them disabled name = kasprintf(GFP_KERNEL, "kvm-%d-%d",
* and handle DD2 and later scheme of merged escalation vcpu->kvm->arch.lpid, xc->server_num);
* interrupts else
*/ name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d",
name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d", vcpu->kvm->arch.lpid, xc->server_num, prio);
vcpu->kvm->arch.lpid, xc->server_num, prio);
if (!name) { if (!name) {
pr_err("Failed to allocate escalation irq name for queue %d of VCPU %d\n", pr_err("Failed to allocate escalation irq name for queue %d of VCPU %d\n",
prio, xc->server_num); prio, xc->server_num);
rc = -ENOMEM; rc = -ENOMEM;
goto error; goto error;
} }
pr_devel("Escalation %s irq %d (prio %d)\n", name, xc->esc_virq[prio], prio);
rc = request_irq(xc->esc_virq[prio], xive_esc_irq, rc = request_irq(xc->esc_virq[prio], xive_esc_irq,
IRQF_NO_THREAD, name, vcpu); IRQF_NO_THREAD, name, vcpu);
if (rc) { if (rc) {
...@@ -133,6 +145,25 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio) ...@@ -133,6 +145,25 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio)
goto error; goto error;
} }
xc->esc_virq_names[prio] = name; xc->esc_virq_names[prio] = name;
/* In single escalation mode, we grab the ESB MMIO of the
* interrupt and mask it. Also populate the VCPU v/raddr
* of the ESB page for use by asm entry/exit code. Finally
* set the XIVE_IRQ_NO_EOI flag which will prevent the
* core code from performing an EOI on the escalation
* interrupt, thus leaving it effectively masked after
* it fires once.
*/
if (xc->xive->single_escalation) {
struct irq_data *d = irq_get_irq_data(xc->esc_virq[prio]);
struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01);
vcpu->arch.xive_esc_raddr = xd->eoi_page;
vcpu->arch.xive_esc_vaddr = (__force u64)xd->eoi_mmio;
xd->flags |= XIVE_IRQ_NO_EOI;
}
return 0; return 0;
error: error:
irq_dispose_mapping(xc->esc_virq[prio]); irq_dispose_mapping(xc->esc_virq[prio]);
...@@ -191,12 +222,12 @@ static int xive_check_provisioning(struct kvm *kvm, u8 prio) ...@@ -191,12 +222,12 @@ static int xive_check_provisioning(struct kvm *kvm, u8 prio)
pr_devel("Provisioning prio... %d\n", prio); pr_devel("Provisioning prio... %d\n", prio);
/* Provision each VCPU and enable escalations */ /* Provision each VCPU and enable escalations if needed */
kvm_for_each_vcpu(i, vcpu, kvm) { kvm_for_each_vcpu(i, vcpu, kvm) {
if (!vcpu->arch.xive_vcpu) if (!vcpu->arch.xive_vcpu)
continue; continue;
rc = xive_provision_queue(vcpu, prio); rc = xive_provision_queue(vcpu, prio);
if (rc == 0) if (rc == 0 && !xive->single_escalation)
xive_attach_escalation(vcpu, prio); xive_attach_escalation(vcpu, prio);
if (rc) if (rc)
return rc; return rc;
...@@ -1082,6 +1113,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, ...@@ -1082,6 +1113,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
/* Allocate IPI */ /* Allocate IPI */
xc->vp_ipi = xive_native_alloc_irq(); xc->vp_ipi = xive_native_alloc_irq();
if (!xc->vp_ipi) { if (!xc->vp_ipi) {
pr_err("Failed to allocate xive irq for VCPU IPI\n");
r = -EIO; r = -EIO;
goto bail; goto bail;
} }
...@@ -1091,19 +1123,34 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, ...@@ -1091,19 +1123,34 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
if (r) if (r)
goto bail; goto bail;
/*
* Enable the VP first as the single escalation mode will
* affect escalation interrupts numbering
*/
r = xive_native_enable_vp(xc->vp_id, xive->single_escalation);
if (r) {
pr_err("Failed to enable VP in OPAL, err %d\n", r);
goto bail;
}
/* /*
* Initialize queues. Initially we set them all for no queueing * Initialize queues. Initially we set them all for no queueing
* and we enable escalation for queue 0 only which we'll use for * and we enable escalation for queue 0 only which we'll use for
* our mfrr change notifications. If the VCPU is hot-plugged, we * our mfrr change notifications. If the VCPU is hot-plugged, we
* do handle provisioning however. * do handle provisioning however based on the existing "map"
* of enabled queues.
*/ */
for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
struct xive_q *q = &xc->queues[i]; struct xive_q *q = &xc->queues[i];
/* Single escalation, no queue 7 */
if (i == 7 && xive->single_escalation)
break;
/* Is queue already enabled ? Provision it */ /* Is queue already enabled ? Provision it */
if (xive->qmap & (1 << i)) { if (xive->qmap & (1 << i)) {
r = xive_provision_queue(vcpu, i); r = xive_provision_queue(vcpu, i);
if (r == 0) if (r == 0 && !xive->single_escalation)
xive_attach_escalation(vcpu, i); xive_attach_escalation(vcpu, i);
if (r) if (r)
goto bail; goto bail;
...@@ -1123,11 +1170,6 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, ...@@ -1123,11 +1170,6 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
if (r) if (r)
goto bail; goto bail;
/* Enable the VP */
r = xive_native_enable_vp(xc->vp_id);
if (r)
goto bail;
/* Route the IPI */ /* Route the IPI */
r = xive_native_configure_irq(xc->vp_ipi, xc->vp_id, 0, XICS_IPI); r = xive_native_configure_irq(xc->vp_ipi, xc->vp_id, 0, XICS_IPI);
if (!r) if (!r)
...@@ -1474,6 +1516,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr) ...@@ -1474,6 +1516,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
pr_devel(" val=0x016%llx (server=0x%x, guest_prio=%d)\n", pr_devel(" val=0x016%llx (server=0x%x, guest_prio=%d)\n",
val, server, guest_prio); val, server, guest_prio);
/* /*
* If the source doesn't already have an IPI, allocate * If the source doesn't already have an IPI, allocate
* one and get the corresponding data * one and get the corresponding data
...@@ -1762,6 +1805,8 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) ...@@ -1762,6 +1805,8 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
if (xive->vp_base == XIVE_INVALID_VP) if (xive->vp_base == XIVE_INVALID_VP)
ret = -ENOMEM; ret = -ENOMEM;
xive->single_escalation = xive_native_has_single_escalation();
if (ret) { if (ret) {
kfree(xive); kfree(xive);
return ret; return ret;
...@@ -1795,6 +1840,7 @@ static int xive_debug_show(struct seq_file *m, void *private) ...@@ -1795,6 +1840,7 @@ static int xive_debug_show(struct seq_file *m, void *private)
kvm_for_each_vcpu(i, vcpu, kvm) { kvm_for_each_vcpu(i, vcpu, kvm) {
struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
unsigned int i;
if (!xc) if (!xc)
continue; continue;
...@@ -1804,6 +1850,33 @@ static int xive_debug_show(struct seq_file *m, void *private) ...@@ -1804,6 +1850,33 @@ static int xive_debug_show(struct seq_file *m, void *private)
xc->server_num, xc->cppr, xc->hw_cppr, xc->server_num, xc->cppr, xc->hw_cppr,
xc->mfrr, xc->pending, xc->mfrr, xc->pending,
xc->stat_rm_h_xirr, xc->stat_vm_h_xirr); xc->stat_rm_h_xirr, xc->stat_vm_h_xirr);
for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
struct xive_q *q = &xc->queues[i];
u32 i0, i1, idx;
if (!q->qpage && !xc->esc_virq[i])
continue;
seq_printf(m, " [q%d]: ", i);
if (q->qpage) {
idx = q->idx;
i0 = be32_to_cpup(q->qpage + idx);
idx = (idx + 1) & q->msk;
i1 = be32_to_cpup(q->qpage + idx);
seq_printf(m, "T=%d %08x %08x... \n", q->toggle, i0, i1);
}
if (xc->esc_virq[i]) {
struct irq_data *d = irq_get_irq_data(xc->esc_virq[i]);
struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
u64 pq = xive_vm_esb_load(xd, XIVE_ESB_GET);
seq_printf(m, "E:%c%c I(%d:%llx:%llx)",
(pq & XIVE_ESB_VAL_P) ? 'P' : 'p',
(pq & XIVE_ESB_VAL_Q) ? 'Q' : 'q',
xc->esc_virq[i], pq, xd->eoi_page);
seq_printf(m, "\n");
}
}
t_rm_h_xirr += xc->stat_rm_h_xirr; t_rm_h_xirr += xc->stat_rm_h_xirr;
t_rm_h_ipoll += xc->stat_rm_h_ipoll; t_rm_h_ipoll += xc->stat_rm_h_ipoll;
......
...@@ -120,6 +120,8 @@ struct kvmppc_xive { ...@@ -120,6 +120,8 @@ struct kvmppc_xive {
u32 q_order; u32 q_order;
u32 q_page_order; u32 q_page_order;
/* Flags */
u8 single_escalation;
}; };
#define KVMPPC_XIVE_Q_COUNT 8 #define KVMPPC_XIVE_Q_COUNT 8
...@@ -201,25 +203,20 @@ static inline struct kvmppc_xive_src_block *kvmppc_xive_find_source(struct kvmpp ...@@ -201,25 +203,20 @@ static inline struct kvmppc_xive_src_block *kvmppc_xive_find_source(struct kvmpp
* is as follow. * is as follow.
* *
* Guest request for 0...6 are honored. Guest request for anything * Guest request for 0...6 are honored. Guest request for anything
* higher results in a priority of 7 being applied. * higher results in a priority of 6 being applied.
*
* However, when XIRR is returned via H_XIRR, 7 is translated to 0xb
* in order to match AIX expectations
* *
* Similar mapping is done for CPPR values * Similar mapping is done for CPPR values
*/ */
static inline u8 xive_prio_from_guest(u8 prio) static inline u8 xive_prio_from_guest(u8 prio)
{ {
if (prio == 0xff || prio < 8) if (prio == 0xff || prio < 6)
return prio; return prio;
return 7; return 6;
} }
static inline u8 xive_prio_to_guest(u8 prio) static inline u8 xive_prio_to_guest(u8 prio)
{ {
if (prio == 0xff || prio < 7) return prio;
return prio;
return 0xb;
} }
static inline u32 __xive_read_eq(__be32 *qpage, u32 msk, u32 *idx, u32 *toggle) static inline u32 __xive_read_eq(__be32 *qpage, u32 msk, u32 *idx, u32 *toggle)
......
...@@ -763,7 +763,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) ...@@ -763,7 +763,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup;
vcpu->arch.dec_expires = ~(u64)0; vcpu->arch.dec_expires = get_tb();
#ifdef CONFIG_KVM_EXIT_TIMING #ifdef CONFIG_KVM_EXIT_TIMING
mutex_init(&vcpu->arch.exit_timing_lock); mutex_init(&vcpu->arch.exit_timing_lock);
...@@ -1106,11 +1106,9 @@ int kvmppc_handle_vsx_load(struct kvm_run *run, struct kvm_vcpu *vcpu, ...@@ -1106,11 +1106,9 @@ int kvmppc_handle_vsx_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
{ {
enum emulation_result emulated = EMULATE_DONE; enum emulation_result emulated = EMULATE_DONE;
/* Currently, mmio_vsx_copy_nums only allowed to be less than 4 */ /* Currently, mmio_vsx_copy_nums only allowed to be 4 or less */
if ( (vcpu->arch.mmio_vsx_copy_nums > 4) || if (vcpu->arch.mmio_vsx_copy_nums > 4)
(vcpu->arch.mmio_vsx_copy_nums < 0) ) {
return EMULATE_FAIL; return EMULATE_FAIL;
}
while (vcpu->arch.mmio_vsx_copy_nums) { while (vcpu->arch.mmio_vsx_copy_nums) {
emulated = __kvmppc_handle_load(run, vcpu, rt, bytes, emulated = __kvmppc_handle_load(run, vcpu, rt, bytes,
...@@ -1252,11 +1250,9 @@ int kvmppc_handle_vsx_store(struct kvm_run *run, struct kvm_vcpu *vcpu, ...@@ -1252,11 +1250,9 @@ int kvmppc_handle_vsx_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
vcpu->arch.io_gpr = rs; vcpu->arch.io_gpr = rs;
/* Currently, mmio_vsx_copy_nums only allowed to be less than 4 */ /* Currently, mmio_vsx_copy_nums only allowed to be 4 or less */
if ( (vcpu->arch.mmio_vsx_copy_nums > 4) || if (vcpu->arch.mmio_vsx_copy_nums > 4)
(vcpu->arch.mmio_vsx_copy_nums < 0) ) {
return EMULATE_FAIL; return EMULATE_FAIL;
}
while (vcpu->arch.mmio_vsx_copy_nums) { while (vcpu->arch.mmio_vsx_copy_nums) {
if (kvmppc_get_vsr_data(vcpu, rs, &val) == -1) if (kvmppc_get_vsr_data(vcpu, rs, &val) == -1)
......
...@@ -143,8 +143,7 @@ static int kvmppc_exit_timing_show(struct seq_file *m, void *private) ...@@ -143,8 +143,7 @@ static int kvmppc_exit_timing_show(struct seq_file *m, void *private)
int i; int i;
u64 min, max, sum, sum_quad; u64 min, max, sum, sum_quad;
seq_printf(m, "%s", "type count min max sum sum_squared\n"); seq_puts(m, "type count min max sum sum_squared\n");
for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) { for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) {
......
...@@ -367,7 +367,8 @@ static void xive_irq_eoi(struct irq_data *d) ...@@ -367,7 +367,8 @@ static void xive_irq_eoi(struct irq_data *d)
* EOI the source if it hasn't been disabled and hasn't * EOI the source if it hasn't been disabled and hasn't
* been passed-through to a KVM guest * been passed-through to a KVM guest
*/ */
if (!irqd_irq_disabled(d) && !irqd_is_forwarded_to_vcpu(d)) if (!irqd_irq_disabled(d) && !irqd_is_forwarded_to_vcpu(d) &&
!(xd->flags & XIVE_IRQ_NO_EOI))
xive_do_source_eoi(irqd_to_hwirq(d), xd); xive_do_source_eoi(irqd_to_hwirq(d), xd);
/* /*
......
...@@ -42,6 +42,7 @@ static u32 xive_provision_chip_count; ...@@ -42,6 +42,7 @@ static u32 xive_provision_chip_count;
static u32 xive_queue_shift; static u32 xive_queue_shift;
static u32 xive_pool_vps = XIVE_INVALID_VP; static u32 xive_pool_vps = XIVE_INVALID_VP;
static struct kmem_cache *xive_provision_cache; static struct kmem_cache *xive_provision_cache;
static bool xive_has_single_esc;
int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data) int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data)
{ {
...@@ -571,6 +572,10 @@ bool __init xive_native_init(void) ...@@ -571,6 +572,10 @@ bool __init xive_native_init(void)
break; break;
} }
/* Do we support single escalation */
if (of_get_property(np, "single-escalation-support", NULL) != NULL)
xive_has_single_esc = true;
/* Configure Thread Management areas for KVM */ /* Configure Thread Management areas for KVM */
for_each_possible_cpu(cpu) for_each_possible_cpu(cpu)
kvmppc_set_xive_tima(cpu, r.start, tima); kvmppc_set_xive_tima(cpu, r.start, tima);
...@@ -667,12 +672,15 @@ void xive_native_free_vp_block(u32 vp_base) ...@@ -667,12 +672,15 @@ void xive_native_free_vp_block(u32 vp_base)
} }
EXPORT_SYMBOL_GPL(xive_native_free_vp_block); EXPORT_SYMBOL_GPL(xive_native_free_vp_block);
int xive_native_enable_vp(u32 vp_id) int xive_native_enable_vp(u32 vp_id, bool single_escalation)
{ {
s64 rc; s64 rc;
u64 flags = OPAL_XIVE_VP_ENABLED;
if (single_escalation)
flags |= OPAL_XIVE_VP_SINGLE_ESCALATION;
for (;;) { for (;;) {
rc = opal_xive_set_vp_info(vp_id, OPAL_XIVE_VP_ENABLED, 0); rc = opal_xive_set_vp_info(vp_id, flags, 0);
if (rc != OPAL_BUSY) if (rc != OPAL_BUSY)
break; break;
msleep(1); msleep(1);
...@@ -710,3 +718,9 @@ int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id) ...@@ -710,3 +718,9 @@ int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id)
return 0; return 0;
} }
EXPORT_SYMBOL_GPL(xive_native_get_vp_info); EXPORT_SYMBOL_GPL(xive_native_get_vp_info);
bool xive_native_has_single_escalation(void)
{
return xive_has_single_esc;
}
EXPORT_SYMBOL_GPL(xive_native_has_single_escalation);
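Taken together with the header change above, the native layer now exposes a two-step pattern that the KVM XIVE device code earlier in this diff follows: probe firmware support once, then feed the result into xive_native_enable_vp(). A minimal sketch of such a caller; the wrapper name is hypothetical:

#include <linux/types.h>
#include <asm/xive.h>

/* Enable an already-allocated VP, opting into single-escalation mode
 * only when the firmware advertises "single-escalation-support". */
static int enable_vp(u32 vp_id)
{
        bool single_escalation = xive_native_has_single_escalation();

        return xive_native_enable_vp(vp_id, single_escalation);
}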