Commit ffcb09f2 authored by Radim Krčmář's avatar Radim Krčmář

Merge branch 'kvm-ppc-next' of git://git.kernel.org/pub/scm/linux/kernel/git/paulus/powerpc

PPC KVM update for 4.10:

 * Support for KVM guests on POWER9 using the hashed page table MMU.
 * Updates and improvements to the halt-polling support on PPC, from
   Suraj Jitindar Singh.
 * An optimization to speed up emulated MMIO, from Yongji Xie.
 * Various other minor cleanups.
parents bf65014d 6ccad8ce
......@@ -6,6 +6,8 @@ cpuid.txt
- KVM-specific cpuid leaves (x86).
devices/
- KVM_CAP_DEVICE_CTRL userspace API.
halt-polling.txt
- notes on halt-polling
hypercalls.txt
- KVM hypercalls.
locking.txt
......
......@@ -2023,6 +2023,8 @@ registers, find a list below:
PPC | KVM_REG_PPC_WORT | 64
PPC | KVM_REG_PPC_SPRG9 | 64
PPC | KVM_REG_PPC_DBSR | 32
PPC | KVM_REG_PPC_TIDR | 64
PPC | KVM_REG_PPC_PSSCR | 64
PPC | KVM_REG_PPC_TM_GPR0 | 64
...
PPC | KVM_REG_PPC_TM_GPR31 | 64
......@@ -2039,6 +2041,7 @@ registers, find a list below:
PPC | KVM_REG_PPC_TM_VSCR | 32
PPC | KVM_REG_PPC_TM_DSCR | 64
PPC | KVM_REG_PPC_TM_TAR | 64
PPC | KVM_REG_PPC_TM_XER | 64
| |
MIPS | KVM_REG_MIPS_R0 | 64
...
......
The KVM halt polling system
===========================
The KVM halt polling system provides a feature within KVM whereby the latency
of a guest can, under some circumstances, be reduced by polling in the host
for some time period after the guest has elected to no longer run by cedeing.
That is, when a guest vcpu has ceded, or in the case of powerpc when all of the
vcpus of a single vcore have ceded, the host kernel polls for wakeup conditions
before giving up the cpu to the scheduler in order to let something else run.
Polling provides a latency advantage in cases where the guest can be run again
very quickly by at least saving us a trip through the scheduler, normally on
the order of a few micro-seconds, although performance benefits are workload
dependant. In the event that no wakeup source arrives during the polling
interval or some other task on the runqueue is runnable the scheduler is
invoked. Thus halt polling is especially useful on workloads with very short
wakeup periods where the time spent halt polling is minimised and the time
savings of not invoking the scheduler are distinguishable.
The generic halt polling code is implemented in:
virt/kvm/kvm_main.c: kvm_vcpu_block()
The powerpc kvm-hv specific case is implemented in:
arch/powerpc/kvm/book3s_hv.c: kvmppc_vcore_blocked()
Halt Polling Interval
=====================
The maximum time for which to poll before invoking the scheduler, referred to
as the halt polling interval, is increased and decreased based on the perceived
effectiveness of the polling in an attempt to limit pointless polling.
This value is stored in either the vcpu struct:
kvm_vcpu->halt_poll_ns
or in the case of powerpc kvm-hv, in the vcore struct:
kvmppc_vcore->halt_poll_ns
Thus this is a per vcpu (or vcore) value.
During polling if a wakeup source is received within the halt polling interval,
the interval is left unchanged. In the event that a wakeup source isn't
received during the polling interval (and thus schedule is invoked) there are
two options, either the polling interval and total block time[0] were less than
the global max polling interval (see module params below), or the total block
time was greater than the global max polling interval.
In the event that both the polling interval and total block time were less than
the global max polling interval then the polling interval can be increased in
the hope that next time during the longer polling interval the wake up source
will be received while the host is polling and the latency benefits will be
received. The polling interval is grown in the function grow_halt_poll_ns() and
is multiplied by the module parameter halt_poll_ns_grow.
In the event that the total block time was greater than the global max polling
interval then the host will never poll for long enough (limited by the global
max) to wakeup during the polling interval so it may as well be shrunk in order
to avoid pointless polling. The polling interval is shrunk in the function
shrink_halt_poll_ns() and is divided by the module parameter
halt_poll_ns_shrink, or set to 0 iff halt_poll_ns_shrink == 0.
It is worth noting that this adjustment process attempts to hone in on some
steady state polling interval but will only really do a good job for wakeups
which come at an approximately constant rate, otherwise there will be constant
adjustment of the polling interval.
[0] total block time: the time between when the halt polling function is
invoked and a wakeup source received (irrespective of
whether the scheduler is invoked within that function).
Module Parameters
=================
The kvm module has 3 tuneable module parameters to adjust the global max
polling interval as well as the rate at which the polling interval is grown and
shrunk. These variables are defined in include/linux/kvm_host.h and as module
parameters in virt/kvm/kvm_main.c, or arch/powerpc/kvm/book3s_hv.c in the
powerpc kvm-hv case.
Module Parameter | Description | Default Value
--------------------------------------------------------------------------------
halt_poll_ns | The global max polling interval | KVM_HALT_POLL_NS_DEFAULT
| which defines the ceiling value |
| of the polling interval for | (per arch value)
| each vcpu. |
--------------------------------------------------------------------------------
halt_poll_ns_grow | The value by which the halt | 2
| polling interval is multiplied |
| in the grow_halt_poll_ns() |
| function. |
--------------------------------------------------------------------------------
halt_poll_ns_shrink | The value by which the halt | 0
| polling interval is divided in |
| the shrink_halt_poll_ns() |
| function. |
--------------------------------------------------------------------------------
These module parameters can be set from the debugfs files in:
/sys/module/kvm/parameters/
Note: that these module parameters are system wide values and are not able to
be tuned on a per vm basis.
Further Notes
=============
- Care should be taken when setting the halt_poll_ns module parameter as a
large value has the potential to drive the cpu usage to 100% on a machine which
would be almost entirely idle otherwise. This is because even if a guest has
wakeups during which very little work is done and which are quite far apart, if
the period is shorter than the global max polling interval (halt_poll_ns) then
the host will always poll for the entire block time and thus cpu utilisation
will go to 100%.
- Halt polling essentially presents a trade off between power usage and latency
and the module parameters should be used to tune the affinity for this. Idle
cpu time is essentially converted to host kernel time with the aim of decreasing
latency when entering the guest.
- Halt polling will only be conducted by the host when no other tasks are
runnable on that cpu, otherwise the polling will cease immediately and
schedule will be invoked to allow that other task to run. Thus this doesn't
allow a guest to denial of service the cpu.
......@@ -14,6 +14,9 @@
#include <linux/threads.h>
#include <linux/kprobes.h>
#ifdef CONFIG_KVM
#include <linux/kvm_host.h>
#endif
#include <uapi/asm/ucontext.h>
......@@ -109,4 +112,45 @@ void early_setup_secondary(void);
/* time */
void accumulate_stolen_time(void);
/* kvm */
#ifdef CONFIG_KVM
long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
unsigned long ioba, unsigned long tce);
long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
unsigned long liobn, unsigned long ioba,
unsigned long tce_list, unsigned long npages);
long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
unsigned long liobn, unsigned long ioba,
unsigned long tce_value, unsigned long npages);
long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
unsigned int yield_count);
long kvmppc_h_random(struct kvm_vcpu *vcpu);
void kvmhv_commence_exit(int trap);
long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu);
void kvmppc_subcore_enter_guest(void);
void kvmppc_subcore_exit_guest(void);
long kvmppc_realmode_hmi_handler(void);
long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
long pte_index, unsigned long pteh, unsigned long ptel);
long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
unsigned long pte_index, unsigned long avpn);
long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu);
long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
unsigned long pte_index, unsigned long avpn,
unsigned long va);
long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
unsigned long pte_index);
long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
unsigned long pte_index);
long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
unsigned long pte_index);
long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
unsigned long slb_v, unsigned int status, bool data);
unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu);
int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
unsigned long mfrr);
int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr);
int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr);
#endif
#endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */
......@@ -70,7 +70,9 @@
#define HPTE_V_SSIZE_SHIFT 62
#define HPTE_V_AVPN_SHIFT 7
#define HPTE_V_COMMON_BITS ASM_CONST(0x000fffffffffffff)
#define HPTE_V_AVPN ASM_CONST(0x3fffffffffffff80)
#define HPTE_V_AVPN_3_0 ASM_CONST(0x000fffffffffff80)
#define HPTE_V_AVPN_VAL(x) (((x) & HPTE_V_AVPN) >> HPTE_V_AVPN_SHIFT)
#define HPTE_V_COMPARE(x,y) (!(((x) ^ (y)) & 0xffffffffffffff80UL))
#define HPTE_V_BOLTED ASM_CONST(0x0000000000000010)
......@@ -80,14 +82,16 @@
#define HPTE_V_VALID ASM_CONST(0x0000000000000001)
/*
* ISA 3.0 have a different HPTE format.
* ISA 3.0 has a different HPTE format.
*/
#define HPTE_R_3_0_SSIZE_SHIFT 58
#define HPTE_R_3_0_SSIZE_MASK (3ull << HPTE_R_3_0_SSIZE_SHIFT)
#define HPTE_R_PP0 ASM_CONST(0x8000000000000000)
#define HPTE_R_TS ASM_CONST(0x4000000000000000)
#define HPTE_R_KEY_HI ASM_CONST(0x3000000000000000)
#define HPTE_R_RPN_SHIFT 12
#define HPTE_R_RPN ASM_CONST(0x0ffffffffffff000)
#define HPTE_R_RPN_3_0 ASM_CONST(0x01fffffffffff000)
#define HPTE_R_PP ASM_CONST(0x0000000000000003)
#define HPTE_R_PPP ASM_CONST(0x8000000000000003)
#define HPTE_R_N ASM_CONST(0x0000000000000004)
......@@ -316,11 +320,42 @@ static inline unsigned long hpte_encode_avpn(unsigned long vpn, int psize,
*/
v = (vpn >> (23 - VPN_SHIFT)) & ~(mmu_psize_defs[psize].avpnm);
v <<= HPTE_V_AVPN_SHIFT;
if (!cpu_has_feature(CPU_FTR_ARCH_300))
v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT;
return v;
}
/*
* ISA v3.0 defines a new HPTE format, which differs from the old
* format in having smaller AVPN and ARPN fields, and the B field
* in the second dword instead of the first.
*/
static inline unsigned long hpte_old_to_new_v(unsigned long v)
{
/* trim AVPN, drop B */
return v & HPTE_V_COMMON_BITS;
}
static inline unsigned long hpte_old_to_new_r(unsigned long v, unsigned long r)
{
/* move B field from 1st to 2nd dword, trim ARPN */
return (r & ~HPTE_R_3_0_SSIZE_MASK) |
(((v) >> HPTE_V_SSIZE_SHIFT) << HPTE_R_3_0_SSIZE_SHIFT);
}
static inline unsigned long hpte_new_to_old_v(unsigned long v, unsigned long r)
{
/* insert B field */
return (v & HPTE_V_COMMON_BITS) |
((r & HPTE_R_3_0_SSIZE_MASK) <<
(HPTE_V_SSIZE_SHIFT - HPTE_R_3_0_SSIZE_SHIFT));
}
static inline unsigned long hpte_new_to_old_r(unsigned long r)
{
/* clear out B field */
return r & ~HPTE_R_3_0_SSIZE_MASK;
}
/*
* This function sets the AVPN and L fields of the HPTE appropriately
* using the base page size and actual page size.
......@@ -341,12 +376,8 @@ static inline unsigned long hpte_encode_v(unsigned long vpn, int base_psize,
* aligned for the requested page size
*/
static inline unsigned long hpte_encode_r(unsigned long pa, int base_psize,
int actual_psize, int ssize)
int actual_psize)
{
if (cpu_has_feature(CPU_FTR_ARCH_300))
pa |= ((unsigned long) ssize) << HPTE_R_3_0_SSIZE_SHIFT;
/* A 4K page needs no special encoding */
if (actual_psize == MMU_PAGE_4K)
return pa & HPTE_R_RPN;
......
......@@ -99,6 +99,7 @@
#define BOOK3S_INTERRUPT_H_EMUL_ASSIST 0xe40
#define BOOK3S_INTERRUPT_HMI 0xe60
#define BOOK3S_INTERRUPT_H_DOORBELL 0xe80
#define BOOK3S_INTERRUPT_H_VIRT 0xea0
#define BOOK3S_INTERRUPT_PERFMON 0xf00
#define BOOK3S_INTERRUPT_ALTIVEC 0xf20
#define BOOK3S_INTERRUPT_VSX 0xf40
......
......@@ -48,7 +48,7 @@
#ifdef CONFIG_KVM_MMIO
#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
#endif
#define KVM_HALT_POLL_NS_DEFAULT 500000
#define KVM_HALT_POLL_NS_DEFAULT 10000 /* 10 us */
/* These values are internal and can be increased later */
#define KVM_NR_IRQCHIPS 1
......@@ -244,8 +244,10 @@ struct kvm_arch_memory_slot {
struct kvm_arch {
unsigned int lpid;
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
unsigned int tlb_sets;
unsigned long hpt_virt;
struct revmap_entry *revmap;
atomic64_t mmio_update;
unsigned int host_lpid;
unsigned long host_lpcr;
unsigned long sdr1;
......@@ -408,6 +410,24 @@ struct kvmppc_passthru_irqmap {
#define KVMPPC_IRQ_MPIC 1
#define KVMPPC_IRQ_XICS 2
#define MMIO_HPTE_CACHE_SIZE 4
struct mmio_hpte_cache_entry {
unsigned long hpte_v;
unsigned long hpte_r;
unsigned long rpte;
unsigned long pte_index;
unsigned long eaddr;
unsigned long slb_v;
long mmio_update;
unsigned int slb_base_pshift;
};
struct mmio_hpte_cache {
struct mmio_hpte_cache_entry entry[MMIO_HPTE_CACHE_SIZE];
unsigned int index;
};
struct openpic;
struct kvm_vcpu_arch {
......@@ -498,6 +518,8 @@ struct kvm_vcpu_arch {
ulong tcscr;
ulong acop;
ulong wort;
ulong tid;
ulong psscr;
ulong shadow_srr1;
#endif
u32 vrsave; /* also USPRG0 */
......@@ -546,6 +568,7 @@ struct kvm_vcpu_arch {
u64 tfiar;
u32 cr_tm;
u64 xer_tm;
u64 lr_tm;
u64 ctr_tm;
u64 amr_tm;
......@@ -655,9 +678,11 @@ struct kvm_vcpu_arch {
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
struct kvm_vcpu_arch_shared shregs;
struct mmio_hpte_cache mmio_cache;
unsigned long pgfault_addr;
long pgfault_index;
unsigned long pgfault_hpte[2];
struct mmio_hpte_cache_entry *pgfault_cache;
struct task_struct *run_task;
struct kvm_run *kvm_run;
......
......@@ -483,9 +483,10 @@ extern void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long guest_irq,
unsigned long host_irq);
extern void kvmppc_xics_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
unsigned long host_irq);
extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, u32 xirr,
extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, __be32 xirr,
struct kvmppc_irq_map *irq_map,
struct kvmppc_passthru_irqmap *pimap);
struct kvmppc_passthru_irqmap *pimap,
bool *again);
extern int h_ipi_redirect;
#else
static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap(
......
......@@ -208,6 +208,11 @@ extern u64 ppc64_rma_size;
/* Cleanup function used by kexec */
extern void mmu_cleanup_all(void);
extern void radix__mmu_cleanup_all(void);
/* Functions for creating and updating partition table on POWER9 */
extern void mmu_partition_table_init(void);
extern void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
unsigned long dw1);
#endif /* CONFIG_PPC64 */
struct mm_struct;
......
......@@ -220,9 +220,12 @@ int64_t opal_pci_set_power_state(uint64_t async_token, uint64_t id,
int64_t opal_pci_poll2(uint64_t id, uint64_t data);
int64_t opal_int_get_xirr(uint32_t *out_xirr, bool just_poll);
int64_t opal_rm_int_get_xirr(__be32 *out_xirr, bool just_poll);
int64_t opal_int_set_cppr(uint8_t cppr);
int64_t opal_int_eoi(uint32_t xirr);
int64_t opal_rm_int_eoi(uint32_t xirr);
int64_t opal_int_set_mfrr(uint32_t cpu, uint8_t mfrr);
int64_t opal_rm_int_set_mfrr(uint32_t cpu, uint8_t mfrr);
int64_t opal_pci_tce_kill(uint64_t phb_id, uint32_t kill_type,
uint32_t pe_num, uint32_t tce_size,
uint64_t dma_addr, uint32_t npages);
......
......@@ -153,6 +153,8 @@
#define PSSCR_EC 0x00100000 /* Exit Criterion */
#define PSSCR_ESL 0x00200000 /* Enable State Loss */
#define PSSCR_SD 0x00400000 /* Status Disable */
#define PSSCR_PLS 0xf000000000000000 /* Power-saving Level Status */
#define PSSCR_GUEST_VIS 0xf0000000000003ff /* Guest-visible PSSCR fields */
/* Floating Point Status and Control Register (FPSCR) Fields */
#define FPSCR_FX 0x80000000 /* FPU exception summary */
......@@ -236,6 +238,7 @@
#define SPRN_TEXASRU 0x83 /* '' '' '' Upper 32 */
#define TEXASR_FS __MASK(63-36) /* TEXASR Failure Summary */
#define SPRN_TFHAR 0x80 /* Transaction Failure Handler Addr */
#define SPRN_TIDR 144 /* Thread ID register */
#define SPRN_CTRLF 0x088
#define SPRN_CTRLT 0x098
#define CTRL_CT 0xc0000000 /* current thread */
......@@ -294,6 +297,7 @@
#define SPRN_HSRR1 0x13B /* Hypervisor Save/Restore 1 */
#define SPRN_LMRR 0x32D /* Load Monitor Region Register */
#define SPRN_LMSER 0x32E /* Load Monitor Section Enable Register */
#define SPRN_ASDR 0x330 /* Access segment descriptor register */
#define SPRN_IC 0x350 /* Virtual Instruction Count */
#define SPRN_VTB 0x351 /* Virtual Time Base */
#define SPRN_LDBAR 0x352 /* LD Base Address Register */
......@@ -305,6 +309,7 @@
/* HFSCR and FSCR bit numbers are the same */
#define FSCR_LM_LG 11 /* Enable Load Monitor Registers */
#define FSCR_MSGP_LG 10 /* Enable MSGP */
#define FSCR_TAR_LG 8 /* Enable Target Address Register */
#define FSCR_EBB_LG 7 /* Enable Event Based Branching */
#define FSCR_TM_LG 5 /* Enable Transactional Memory */
......@@ -320,6 +325,7 @@
#define FSCR_DSCR __MASK(FSCR_DSCR_LG)
#define SPRN_HFSCR 0xbe /* HV=1 Facility Status & Control Register */
#define HFSCR_LM __MASK(FSCR_LM_LG)
#define HFSCR_MSGP __MASK(FSCR_MSGP_LG)
#define HFSCR_TAR __MASK(FSCR_TAR_LG)
#define HFSCR_EBB __MASK(FSCR_EBB_LG)
#define HFSCR_TM __MASK(FSCR_TM_LG)
......@@ -355,8 +361,10 @@
#define LPCR_PECE0 ASM_CONST(0x0000000000004000) /* ext. exceptions can cause exit */
#define LPCR_PECE1 ASM_CONST(0x0000000000002000) /* decrementer can cause exit */
#define LPCR_PECE2 ASM_CONST(0x0000000000001000) /* machine check etc can cause exit */
#define LPCR_PECE_HVEE ASM_CONST(0x0000400000000000) /* P9 Wakeup on HV interrupts */
#define LPCR_MER ASM_CONST(0x0000000000000800) /* Mediated External Exception */
#define LPCR_MER_SH 11
#define LPCR_GTSE ASM_CONST(0x0000000000000400) /* Guest Translation Shootdown Enable */
#define LPCR_TC ASM_CONST(0x0000000000000200) /* Translation control */
#define LPCR_LPES 0x0000000c
#define LPCR_LPES0 ASM_CONST(0x0000000000000008) /* LPAR Env selector 0 */
......@@ -377,6 +385,12 @@
#define PCR_VEC_DIS (1ul << (63-0)) /* Vec. disable (bit NA since POWER8) */
#define PCR_VSX_DIS (1ul << (63-1)) /* VSX disable (bit NA since POWER8) */
#define PCR_TM_DIS (1ul << (63-2)) /* Trans. memory disable (POWER8) */
/*
* These bits are used in the function kvmppc_set_arch_compat() to specify and
* determine both the compatibility level which we want to emulate and the
* compatibility level which the host is capable of emulating.
*/
#define PCR_ARCH_207 0x8 /* Architecture 2.07 */
#define PCR_ARCH_206 0x4 /* Architecture 2.06 */
#define PCR_ARCH_205 0x2 /* Architecture 2.05 */
#define SPRN_HEIR 0x153 /* Hypervisor Emulated Instruction Register */
......@@ -1218,6 +1232,7 @@
#define PVR_ARCH_206 0x0f000003
#define PVR_ARCH_206p 0x0f100003
#define PVR_ARCH_207 0x0f000004
#define PVR_ARCH_300 0x0f000005
/* Macros for setting and retrieving special purpose registers */
#ifndef __ASSEMBLY__
......
......@@ -573,6 +573,10 @@ struct kvm_get_htab_header {
#define KVM_REG_PPC_SPRG9 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xba)
#define KVM_REG_PPC_DBSR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbb)
/* POWER9 registers */
#define KVM_REG_PPC_TIDR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbc)
#define KVM_REG_PPC_PSSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbd)
/* Transactional Memory checkpointed state:
* This is all GPRs, all VSX regs and a subset of SPRs
*/
......@@ -596,6 +600,7 @@ struct kvm_get_htab_header {
#define KVM_REG_PPC_TM_VSCR (KVM_REG_PPC_TM | KVM_REG_SIZE_U32 | 0x67)
#define KVM_REG_PPC_TM_DSCR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x68)
#define KVM_REG_PPC_TM_TAR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x69)
#define KVM_REG_PPC_TM_XER (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x6a)
/* PPC64 eXternal Interrupt Controller Specification */
#define KVM_DEV_XICS_GRP_SOURCES 1 /* 64-bit source attributes */
......
......@@ -487,6 +487,7 @@ int main(void)
/* book3s */
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
DEFINE(KVM_TLB_SETS, offsetof(struct kvm, arch.tlb_sets));
DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1));
DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid));
DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
......@@ -548,6 +549,8 @@ int main(void)
DEFINE(VCPU_TCSCR, offsetof(struct kvm_vcpu, arch.tcscr));
DEFINE(VCPU_ACOP, offsetof(struct kvm_vcpu, arch.acop));
DEFINE(VCPU_WORT, offsetof(struct kvm_vcpu, arch.wort));
DEFINE(VCPU_TID, offsetof(struct kvm_vcpu, arch.tid));
DEFINE(VCPU_PSSCR, offsetof(struct kvm_vcpu, arch.psscr));
DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_map));
DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads));
......@@ -569,6 +572,7 @@ int main(void)
DEFINE(VCPU_VRS_TM, offsetof(struct kvm_vcpu, arch.vr_tm.vr));
DEFINE(VCPU_VRSAVE_TM, offsetof(struct kvm_vcpu, arch.vrsave_tm));
DEFINE(VCPU_CR_TM, offsetof(struct kvm_vcpu, arch.cr_tm));
DEFINE(VCPU_XER_TM, offsetof(struct kvm_vcpu, arch.xer_tm));
DEFINE(VCPU_LR_TM, offsetof(struct kvm_vcpu, arch.lr_tm));
DEFINE(VCPU_CTR_TM, offsetof(struct kvm_vcpu, arch.ctr_tm));
DEFINE(VCPU_AMR_TM, offsetof(struct kvm_vcpu, arch.amr_tm));
......
......@@ -174,7 +174,7 @@ __init_FSCR:
__init_HFSCR:
mfspr r3,SPRN_HFSCR
ori r3,r3,HFSCR_TAR|HFSCR_TM|HFSCR_BHRB|HFSCR_PM|\
HFSCR_DSCR|HFSCR_VECVSX|HFSCR_FP|HFSCR_EBB
HFSCR_DSCR|HFSCR_VECVSX|HFSCR_FP|HFSCR_EBB|HFSCR_MSGP
mtspr SPRN_HFSCR,r3
blr
......
......@@ -88,6 +88,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
/* 128 (2**7) bytes in each HPTEG */
kvm->arch.hpt_mask = (1ul << (order - 7)) - 1;
atomic64_set(&kvm->arch.mmio_update, 0);
/* Allocate reverse map array */
rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte);
if (!rev) {
......@@ -255,7 +257,7 @@ static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
kvmppc_set_msr(vcpu, msr);
}
long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
long pte_index, unsigned long pteh,
unsigned long ptel, unsigned long *pte_idx_ret)
{
......@@ -312,7 +314,7 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
struct kvmppc_slb *slbe;
unsigned long slb_v;
unsigned long pp, key;
unsigned long v, gr;
unsigned long v, orig_v, gr;
__be64 *hptep;
int index;
int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);
......@@ -337,10 +339,12 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
return -ENOENT;
}
hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
if (cpu_has_feature(CPU_FTR_ARCH_300))
v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1]));
gr = kvm->arch.revmap[index].guest_rpte;
unlock_hpte(hptep, v);
unlock_hpte(hptep, orig_v);
preempt_enable();
gpte->eaddr = eaddr;
......@@ -438,6 +442,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
{
struct kvm *kvm = vcpu->kvm;
unsigned long hpte[3], r;
unsigned long hnow_v, hnow_r;
__be64 *hptep;
unsigned long mmu_seq, psize, pte_size;
unsigned long gpa_base, gfn_base;
......@@ -451,6 +456,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned int writing, write_ok;
struct vm_area_struct *vma;
unsigned long rcbits;
long mmio_update;
/*
* Real-mode code has already searched the HPT and found the
......@@ -460,6 +466,19 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
*/
if (ea != vcpu->arch.pgfault_addr)
return RESUME_GUEST;
if (vcpu->arch.pgfault_cache) {
mmio_update = atomic64_read(&kvm->arch.mmio_update);
if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) {
r = vcpu->arch.pgfault_cache->rpte;
psize = hpte_page_size(vcpu->arch.pgfault_hpte[0], r);
gpa_base = r & HPTE_R_RPN & ~(psize - 1);
gfn_base = gpa_base >> PAGE_SHIFT;
gpa = gpa_base | (ea & (psize - 1));
return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
dsisr & DSISR_ISSTORE);
}
}
index = vcpu->arch.pgfault_index;
hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
rev = &kvm->arch.revmap[index];
......@@ -472,6 +491,10 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
unlock_hpte(hptep, hpte[0]);
preempt_enable();
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
hpte[0] = hpte_new_to_old_v(hpte[0], hpte[1]);
hpte[1] = hpte_new_to_old_r(hpte[1]);
}
if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
hpte[1] != vcpu->arch.pgfault_hpte[1])
return RESUME_GUEST;
......@@ -575,15 +598,21 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
*/
if (psize < PAGE_SIZE)
psize = PAGE_SIZE;
r = (r & ~(HPTE_R_PP0 - psize)) | ((pfn << PAGE_SHIFT) & ~(psize - 1));
r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) |
((pfn << PAGE_SHIFT) & ~(psize - 1));
if (hpte_is_writable(r) && !write_ok)
r = hpte_make_readonly(r);
ret = RESUME_GUEST;
preempt_disable();
while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
cpu_relax();
if ((be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK) != hpte[0] ||
be64_to_cpu(hptep[1]) != hpte[1] ||
hnow_v = be64_to_cpu(hptep[0]);
hnow_r = be64_to_cpu(hptep[1]);
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
hnow_v = hpte_new_to_old_v(hnow_v, hnow_r);
hnow_r = hpte_new_to_old_r(hnow_r);
}
if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] ||
rev->guest_rpte != hpte[2])
/* HPTE has been changed under us; let the guest retry */
goto out_unlock;
......@@ -615,6 +644,10 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
}
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
r = hpte_old_to_new_r(hpte[0], r);
hpte[0] = hpte_old_to_new_v(hpte[0]);
}
hptep[1] = cpu_to_be64(r);
eieio();
__unlock_hpte(hptep, hpte[0]);
......@@ -758,6 +791,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
hpte_rpn(ptel, psize) == gfn) {
hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
kvmppc_invalidate_hpte(kvm, hptep, i);
hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
/* Harvest R and C */
rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
......@@ -1165,7 +1199,7 @@ static long record_hpte(unsigned long flags, __be64 *hptp,
unsigned long *hpte, struct revmap_entry *revp,
int want_valid, int first_pass)
{
unsigned long v, r;
unsigned long v, r, hr;
unsigned long rcbits_unset;
int ok = 1;
int valid, dirty;
......@@ -1192,6 +1226,11 @@ static long record_hpte(unsigned long flags, __be64 *hptp,
while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
cpu_relax();
v = be64_to_cpu(hptp[0]);
hr = be64_to_cpu(hptp[1]);
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
v = hpte_new_to_old_v(v, hr);
hr = hpte_new_to_old_r(hr);
}
/* re-evaluate valid and dirty from synchronized HPTE value */
valid = !!(v & HPTE_V_VALID);
......@@ -1199,8 +1238,8 @@ static long record_hpte(unsigned long flags, __be64 *hptp,
/* Harvest R and C into guest view if necessary */
rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
if (valid && (rcbits_unset & be64_to_cpu(hptp[1]))) {
revp->guest_rpte |= (be64_to_cpu(hptp[1]) &
if (valid && (rcbits_unset & hr)) {
revp->guest_rpte |= (hr &
(HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED;
dirty = 1;
}
......@@ -1608,7 +1647,7 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
return ret;
}
ssize_t debugfs_htab_write(struct file *file, const char __user *buf,
static ssize_t debugfs_htab_write(struct file *file, const char __user *buf,
size_t len, loff_t *ppos)
{
return -EACCES;
......
......@@ -39,7 +39,7 @@
#include <asm/udbg.h>
#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/iommu.h>
#include <asm/asm-prototypes.h>
#define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64))
......
......@@ -54,6 +54,9 @@
#include <asm/dbell.h>
#include <asm/hmi.h>
#include <asm/pnv-pci.h>
#include <asm/mmu.h>
#include <asm/opal.h>
#include <asm/xics.h>
#include <linux/gfp.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
......@@ -62,6 +65,7 @@
#include <linux/irqbypass.h>
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/of.h>
#include "book3s.h"
......@@ -104,23 +108,6 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
#endif
/* Maximum halt poll interval defaults to KVM_HALT_POLL_NS_DEFAULT */
static unsigned int halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_max_ns, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(halt_poll_max_ns, "Maximum halt poll time in ns");
/* Factor by which the vcore halt poll interval is grown, default is to double
*/
static unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, int, S_IRUGO);
MODULE_PARM_DESC(halt_poll_ns_grow, "Factor halt poll time is grown by");
/* Factor by which the vcore halt poll interval is shrunk, default is to reset
*/
static unsigned int halt_poll_ns_shrink;
module_param(halt_poll_ns_shrink, int, S_IRUGO);
MODULE_PARM_DESC(halt_poll_ns_shrink, "Factor halt poll time is shrunk by");
static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
......@@ -146,12 +133,21 @@ static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
static bool kvmppc_ipi_thread(int cpu)
{
unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
/* On POWER9 we can use msgsnd to IPI any cpu */
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
msg |= get_hard_smp_processor_id(cpu);
smp_mb();
__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
return true;
}
/* On POWER8 for IPIs to threads in the same core, use msgsnd */
if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
preempt_disable();
if (cpu_first_thread_sibling(cpu) ==
cpu_first_thread_sibling(smp_processor_id())) {
unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
msg |= cpu_thread_in_core(cpu);
smp_mb();
__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
......@@ -162,10 +158,14 @@ static bool kvmppc_ipi_thread(int cpu)
}
#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
if (cpu >= 0 && cpu < nr_cpu_ids && paca[cpu].kvm_hstate.xics_phys) {
if (cpu >= 0 && cpu < nr_cpu_ids) {
if (paca[cpu].kvm_hstate.xics_phys) {
xics_wake_cpu(cpu);
return true;
}
opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
return true;
}
#endif
return false;
......@@ -299,41 +299,54 @@ static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
vcpu->arch.pvr = pvr;
}
/* Dummy value used in computing PCR value below */
#define PCR_ARCH_300 (PCR_ARCH_207 << 1)
static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
{
unsigned long pcr = 0;
unsigned long host_pcr_bit = 0, guest_pcr_bit = 0;
struct kvmppc_vcore *vc = vcpu->arch.vcore;
/* We can (emulate) our own architecture version and anything older */
if (cpu_has_feature(CPU_FTR_ARCH_300))
host_pcr_bit = PCR_ARCH_300;
else if (cpu_has_feature(CPU_FTR_ARCH_207S))
host_pcr_bit = PCR_ARCH_207;
else if (cpu_has_feature(CPU_FTR_ARCH_206))
host_pcr_bit = PCR_ARCH_206;
else
host_pcr_bit = PCR_ARCH_205;
/* Determine lowest PCR bit needed to run guest in given PVR level */
guest_pcr_bit = host_pcr_bit;
if (arch_compat) {
switch (arch_compat) {
case PVR_ARCH_205:
/*
* If an arch bit is set in PCR, all the defined
* higher-order arch bits also have to be set.
*/
pcr = PCR_ARCH_206 | PCR_ARCH_205;
guest_pcr_bit = PCR_ARCH_205;
break;
case PVR_ARCH_206:
case PVR_ARCH_206p:
pcr = PCR_ARCH_206;
guest_pcr_bit = PCR_ARCH_206;
break;
case PVR_ARCH_207:
guest_pcr_bit = PCR_ARCH_207;
break;
case PVR_ARCH_300:
guest_pcr_bit = PCR_ARCH_300;
break;
default:
return -EINVAL;
}
}
if (!cpu_has_feature(CPU_FTR_ARCH_207S)) {
/* POWER7 can't emulate POWER8 */
if (!(pcr & PCR_ARCH_206))
/* Check requested PCR bits don't exceed our capabilities */
if (guest_pcr_bit > host_pcr_bit)
return -EINVAL;
pcr &= ~PCR_ARCH_206;
}
}
spin_lock(&vc->lock);
vc->arch_compat = arch_compat;
vc->pcr = pcr;
/* Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit */
vc->pcr = host_pcr_bit - guest_pcr_bit;
spin_unlock(&vc->lock);
return 0;
......@@ -945,6 +958,7 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
break;
case BOOK3S_INTERRUPT_EXTERNAL:
case BOOK3S_INTERRUPT_H_DOORBELL:
case BOOK3S_INTERRUPT_H_VIRT:
vcpu->stat.ext_intr_exits++;
r = RESUME_GUEST;
break;
......@@ -1229,6 +1243,12 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
case KVM_REG_PPC_WORT:
*val = get_reg_val(id, vcpu->arch.wort);
break;
case KVM_REG_PPC_TIDR:
*val = get_reg_val(id, vcpu->arch.tid);
break;
case KVM_REG_PPC_PSSCR:
*val = get_reg_val(id, vcpu->arch.psscr);
break;
case KVM_REG_PPC_VPA_ADDR:
spin_lock(&vcpu->arch.vpa_update_lock);
*val = get_reg_val(id, vcpu->arch.vpa.next_gpa);
......@@ -1288,6 +1308,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
case KVM_REG_PPC_TM_CR:
*val = get_reg_val(id, vcpu->arch.cr_tm);
break;
case KVM_REG_PPC_TM_XER:
*val = get_reg_val(id, vcpu->arch.xer_tm);
break;
case KVM_REG_PPC_TM_LR:
*val = get_reg_val(id, vcpu->arch.lr_tm);
break;
......@@ -1427,6 +1450,12 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
case KVM_REG_PPC_WORT:
vcpu->arch.wort = set_reg_val(id, *val);
break;
case KVM_REG_PPC_TIDR:
vcpu->arch.tid = set_reg_val(id, *val);
break;
case KVM_REG_PPC_PSSCR:
vcpu->arch.psscr = set_reg_val(id, *val) & PSSCR_GUEST_VIS;
break;
case KVM_REG_PPC_VPA_ADDR:
addr = set_reg_val(id, *val);
r = -EINVAL;
......@@ -1498,6 +1527,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
case KVM_REG_PPC_TM_CR:
vcpu->arch.cr_tm = set_reg_val(id, *val);
break;
case KVM_REG_PPC_TM_XER:
vcpu->arch.xer_tm = set_reg_val(id, *val);
break;
case KVM_REG_PPC_TM_LR:
vcpu->arch.lr_tm = set_reg_val(id, *val);
break;
......@@ -1540,6 +1572,20 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
return r;
}
/*
* On POWER9, threads are independent and can be in different partitions.
* Therefore we consider each thread to be a subcore.
* There is a restriction that all threads have to be in the same
* MMU mode (radix or HPT), unfortunately, but since we only support
* HPT guests on a HPT host so far, that isn't an impediment yet.
*/
static int threads_per_vcore(void)
{
if (cpu_has_feature(CPU_FTR_ARCH_300))
return 1;
return threads_per_subcore;
}
static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
{
struct kvmppc_vcore *vcore;
......@@ -1554,7 +1600,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
init_swait_queue_head(&vcore->wq);
vcore->preempt_tb = TB_NIL;
vcore->lpcr = kvm->arch.lpcr;
vcore->first_vcpuid = core * threads_per_subcore;
vcore->first_vcpuid = core * threads_per_vcore();
vcore->kvm = kvm;
INIT_LIST_HEAD(&vcore->preempt_list);
......@@ -1717,7 +1763,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
int core;
struct kvmppc_vcore *vcore;
core = id / threads_per_subcore;
core = id / threads_per_vcore();
if (core >= KVM_MAX_VCORES)
goto out;
......@@ -1935,7 +1981,10 @@ static void kvmppc_wait_for_nap(void)
{
int cpu = smp_processor_id();
int i, loops;
int n_threads = threads_per_vcore();
if (n_threads <= 1)
return;
for (loops = 0; loops < 1000000; ++loops) {
/*
* Check if all threads are finished.
......@@ -1943,17 +1992,17 @@ static void kvmppc_wait_for_nap(void)
* and the thread clears it when finished, so we look
* for any threads that still have a non-NULL vcore ptr.
*/
for (i = 1; i < threads_per_subcore; ++i)
for (i = 1; i < n_threads; ++i)
if (paca[cpu + i].kvm_hstate.kvm_vcore)
break;
if (i == threads_per_subcore) {
if (i == n_threads) {
HMT_medium();
return;
}
HMT_low();
}
HMT_medium();
for (i = 1; i < threads_per_subcore; ++i)
for (i = 1; i < n_threads; ++i)
if (paca[cpu + i].kvm_hstate.kvm_vcore)
pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
}
......@@ -2019,7 +2068,7 @@ static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc)
vc->vcore_state = VCORE_PREEMPT;
vc->pcpu = smp_processor_id();
if (vc->num_threads < threads_per_subcore) {
if (vc->num_threads < threads_per_vcore()) {
spin_lock(&lp->lock);
list_add_tail(&vc->preempt_list, &lp->list);
spin_unlock(&lp->lock);
......@@ -2123,8 +2172,7 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
cip->subcore_threads[sub] = vc->num_threads;
cip->subcore_vm[sub] = vc->kvm;
init_master_vcore(vc);
list_del(&vc->preempt_list);
list_add_tail(&vc->preempt_list, &cip->vcs[sub]);
list_move_tail(&vc->preempt_list, &cip->vcs[sub]);
return true;
}
......@@ -2307,6 +2355,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
unsigned long cmd_bit, stat_bit;
int pcpu, thr;
int target_threads;
int controlled_threads;
/*
* Remove from the list any threads that have a signal pending
......@@ -2324,12 +2373,19 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
init_master_vcore(vc);
vc->preempt_tb = TB_NIL;
/*
* Number of threads that we will be controlling: the same as
* the number of threads per subcore, except on POWER9,
* where it's 1 because the threads are (mostly) independent.
*/
controlled_threads = threads_per_vcore();
/*
* Make sure we are running on primary threads, and that secondary
* threads are offline. Also check if the number of threads in this
* guest are greater than the current system threads per guest.
*/
if ((threads_per_core > 1) &&
if ((controlled_threads > 1) &&
((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
for_each_runnable_thread(i, vcpu, vc) {
vcpu->arch.ret = -EBUSY;
......@@ -2345,7 +2401,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
*/
init_core_info(&core_info, vc);
pcpu = smp_processor_id();
target_threads = threads_per_subcore;
target_threads = controlled_threads;
if (target_smt_mode && target_smt_mode < target_threads)
target_threads = target_smt_mode;
if (vc->num_threads < target_threads)
......@@ -2381,7 +2437,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
smp_wmb();
}
pcpu = smp_processor_id();
for (thr = 0; thr < threads_per_subcore; ++thr)
for (thr = 0; thr < controlled_threads; ++thr)
paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
/* Initiate micro-threading (split-core) if required */
......@@ -2491,7 +2547,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
}
/* Let secondaries go back to the offline loop */
for (i = 0; i < threads_per_subcore; ++i) {
for (i = 0; i < controlled_threads; ++i) {
kvmppc_release_hwthread(pcpu + i);
if (sip && sip->napped[i])
kvmppc_ipi_thread(pcpu + i);
......@@ -2543,9 +2599,6 @@ static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
vc->halt_poll_ns = 10000;
else
vc->halt_poll_ns *= halt_poll_ns_grow;
if (vc->halt_poll_ns > halt_poll_max_ns)
vc->halt_poll_ns = halt_poll_max_ns;
}
static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
......@@ -2556,7 +2609,8 @@ static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
vc->halt_poll_ns /= halt_poll_ns_shrink;
}
/* Check to see if any of the runnable vcpus on the vcore have pending
/*
* Check to see if any of the runnable vcpus on the vcore have pending
* exceptions or are no longer ceded
*/
static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
......@@ -2655,16 +2709,18 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
}
/* Adjust poll time */
if (halt_poll_max_ns) {
if (halt_poll_ns) {
if (block_ns <= vc->halt_poll_ns)
;
/* We slept and blocked for longer than the max halt time */
else if (vc->halt_poll_ns && block_ns > halt_poll_max_ns)
else if (vc->halt_poll_ns && block_ns > halt_poll_ns)
shrink_halt_poll_ns(vc);
/* We slept and our poll time is too small */
else if (vc->halt_poll_ns < halt_poll_max_ns &&
block_ns < halt_poll_max_ns)
else if (vc->halt_poll_ns < halt_poll_ns &&
block_ns < halt_poll_ns)
grow_halt_poll_ns(vc);
if (vc->halt_poll_ns > halt_poll_ns)
vc->halt_poll_ns = halt_poll_ns;
} else
vc->halt_poll_ns = 0;
......@@ -2971,6 +3027,15 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
/*
* If we are making a new memslot, it might make
* some address that was previously cached as emulated
* MMIO be no longer emulated MMIO, so invalidate
* all the caches of emulated MMIO translations.
*/
if (npages)
atomic64_inc(&kvm->arch.mmio_update);
if (npages && old->npages) {
/*
* If modifying a memslot, reset all the rmap dirty bits.
......@@ -3015,6 +3080,22 @@ static void kvmppc_mmu_destroy_hv(struct kvm_vcpu *vcpu)
return;
}
static void kvmppc_setup_partition_table(struct kvm *kvm)
{
unsigned long dw0, dw1;
/* PS field - page size for VRMA */
dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) |
((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1);
/* HTABSIZE and HTABORG fields */
dw0 |= kvm->arch.sdr1;
/* Second dword has GR=0; other fields are unused since UPRT=0 */
dw1 = 0;
mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
}
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
{
int err = 0;
......@@ -3066,17 +3147,20 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
psize == 0x1000000))
goto out_srcu;
/* Update VRMASD field in the LPCR */
senc = slb_pgsize_encoding(psize);
kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
(VRMA_VSID << SLB_VSID_SHIFT_1T);
/* the -4 is to account for senc values starting at 0x10 */
lpcr = senc << (LPCR_VRMASD_SH - 4);
/* Create HPTEs in the hash page table for the VRMA */
kvmppc_map_vrma(vcpu, memslot, porder);
/* Update VRMASD field in the LPCR */
if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
/* the -4 is to account for senc values starting at 0x10 */
lpcr = senc << (LPCR_VRMASD_SH - 4);
kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
} else {
kvmppc_setup_partition_table(kvm);
}
/* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */
smp_wmb();
......@@ -3219,13 +3303,17 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
* Since we don't flush the TLB when tearing down a VM,
* and this lpid might have previously been used,
* make sure we flush on each core before running the new VM.
* On POWER9, the tlbie in mmu_partition_table_set_entry()
* does this flush for us.
*/
if (!cpu_has_feature(CPU_FTR_ARCH_300))
cpumask_setall(&kvm->arch.need_tlb_flush);
/* Start out with the default set of hcalls enabled */
memcpy(kvm->arch.enabled_hcalls, default_enabled_hcalls,
sizeof(kvm->arch.enabled_hcalls));
if (!cpu_has_feature(CPU_FTR_ARCH_300))
kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
/* Init LPCR for virtual RMA mode */
......@@ -3239,8 +3327,28 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
/* On POWER8 turn on online bit to enable PURR/SPURR */
if (cpu_has_feature(CPU_FTR_ARCH_207S))
lpcr |= LPCR_ONL;
/*
* On POWER9, VPM0 bit is reserved (VPM0=1 behaviour is assumed)
* Set HVICE bit to enable hypervisor virtualization interrupts.
*/
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
lpcr &= ~LPCR_VPM0;
lpcr |= LPCR_HVICE;
}
kvm->arch.lpcr = lpcr;
/*
* Work out how many sets the TLB has, for the use of
* the TLB invalidation loop in book3s_hv_rmhandlers.S.
*/
if (cpu_has_feature(CPU_FTR_ARCH_300))
kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH; /* 256 */
else if (cpu_has_feature(CPU_FTR_ARCH_207S))
kvm->arch.tlb_sets = POWER8_TLB_SETS; /* 512 */
else
kvm->arch.tlb_sets = POWER7_TLB_SETS; /* 128 */
/*
* Track that we now have a HV mode VM active. This blocks secondary
* CPU threads from coming online.
......@@ -3305,9 +3413,9 @@ static int kvmppc_core_check_processor_compat_hv(void)
!cpu_has_feature(CPU_FTR_ARCH_206))
return -EIO;
/*
* Disable KVM for Power9, untill the required bits merged.
* Disable KVM for Power9 in radix mode.
*/
if (cpu_has_feature(CPU_FTR_ARCH_300))
if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
return -EIO;
return 0;
......@@ -3661,6 +3769,23 @@ static int kvmppc_book3s_init_hv(void)
if (r)
return r;
/*
* We need a way of accessing the XICS interrupt controller,
* either directly, via paca[cpu].kvm_hstate.xics_phys, or
* indirectly, via OPAL.
*/
#ifdef CONFIG_SMP
if (!get_paca()->kvm_hstate.xics_phys) {
struct device_node *np;
np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
if (!np) {
pr_err("KVM-HV: Cannot determine method for accessing XICS\n");
return -ENODEV;
}
}
#endif
kvm_ops_hv.owner = THIS_MODULE;
kvmppc_hv_ops = &kvm_ops_hv;
......@@ -3683,3 +3808,4 @@ module_exit(kvmppc_book3s_exit_hv);
MODULE_LICENSE("GPL");
MODULE_ALIAS_MISCDEV(KVM_MINOR);
MODULE_ALIAS("devname:kvm");
......@@ -26,6 +26,9 @@
#include <asm/dbell.h>
#include <asm/cputhreads.h>
#include <asm/io.h>
#include <asm/asm-prototypes.h>
#include <asm/opal.h>
#include <asm/smp.h>
#define KVM_CMA_CHUNK_ORDER 18
......@@ -205,12 +208,18 @@ static inline void rm_writeb(unsigned long paddr, u8 val)
void kvmhv_rm_send_ipi(int cpu)
{
unsigned long xics_phys;
unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
/* On POWER8 for IPIs to threads in the same core, use msgsnd */
/* On POWER9 we can use msgsnd for any destination cpu. */
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
msg |= get_hard_smp_processor_id(cpu);
__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
return;
}
/* On POWER8 for IPIs to threads in the same core, use msgsnd. */
if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
cpu_first_thread_sibling(cpu) ==
cpu_first_thread_sibling(raw_smp_processor_id())) {
unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
msg |= cpu_thread_in_core(cpu);
__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
return;
......@@ -218,7 +227,11 @@ void kvmhv_rm_send_ipi(int cpu)
/* Else poke the target with an IPI */
xics_phys = paca[cpu].kvm_hstate.xics_phys;
if (xics_phys)
rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
else
opal_rm_int_set_mfrr(get_hard_smp_processor_id(cpu),
IPI_PRIORITY);
}
/*
......@@ -329,7 +342,7 @@ static struct kvmppc_irq_map *get_irqmap(struct kvmppc_passthru_irqmap *pimap,
* saved a copy of the XIRR in the PACA, it will be picked up by
* the host ICP driver.
*/
static int kvmppc_check_passthru(u32 xisr, __be32 xirr)
static int kvmppc_check_passthru(u32 xisr, __be32 xirr, bool *again)
{
struct kvmppc_passthru_irqmap *pimap;
struct kvmppc_irq_map *irq_map;
......@@ -348,11 +361,11 @@ static int kvmppc_check_passthru(u32 xisr, __be32 xirr)
/* We're handling this interrupt, generic code doesn't need to */
local_paca->kvm_hstate.saved_xirr = 0;
return kvmppc_deliver_irq_passthru(vcpu, xirr, irq_map, pimap);
return kvmppc_deliver_irq_passthru(vcpu, xirr, irq_map, pimap, again);
}
#else
static inline int kvmppc_check_passthru(u32 xisr, __be32 xirr)
static inline int kvmppc_check_passthru(u32 xisr, __be32 xirr, bool *again)
{
return 1;
}
......@@ -367,14 +380,31 @@ static inline int kvmppc_check_passthru(u32 xisr, __be32 xirr)
* -1 if there was a guest wakeup IPI (which has now been cleared)
* -2 if there is PCI passthrough external interrupt that was handled
*/
static long kvmppc_read_one_intr(bool *again);
long kvmppc_read_intr(void)
{
long ret = 0;
long rc;
bool again;
do {
again = false;
rc = kvmppc_read_one_intr(&again);
if (rc && (ret == 0 || rc > ret))
ret = rc;
} while (again);
return ret;
}
static long kvmppc_read_one_intr(bool *again)
{
unsigned long xics_phys;
u32 h_xirr;
__be32 xirr;
u32 xisr;
u8 host_ipi;
int64_t rc;
/* see if a host IPI is pending */
host_ipi = local_paca->kvm_hstate.host_ipi;
......@@ -383,8 +413,14 @@ long kvmppc_read_intr(void)
/* Now read the interrupt from the ICP */
xics_phys = local_paca->kvm_hstate.xics_phys;
if (unlikely(!xics_phys))
if (!xics_phys) {
/* Use OPAL to read the XIRR */
rc = opal_rm_int_get_xirr(&xirr, false);
if (rc < 0)
return 1;
} else {
xirr = _lwzcix(xics_phys + XICS_XIRR);
}
/*
* Save XIRR for later. Since we get control in reverse endian
......@@ -392,7 +428,6 @@ long kvmppc_read_intr(void)
* host endian. Note that xirr is the value read from the
* XIRR register, while h_xirr is the host endian version.
*/
xirr = _lwzcix(xics_phys + XICS_XIRR);
h_xirr = be32_to_cpu(xirr);
local_paca->kvm_hstate.saved_xirr = h_xirr;
xisr = h_xirr & 0xffffff;
......@@ -411,8 +446,16 @@ long kvmppc_read_intr(void)
* If it is an IPI, clear the MFRR and EOI it.
*/
if (xisr == XICS_IPI) {
if (xics_phys) {
_stbcix(xics_phys + XICS_MFRR, 0xff);
_stwcix(xics_phys + XICS_XIRR, xirr);
} else {
opal_rm_int_set_mfrr(hard_smp_processor_id(), 0xff);
rc = opal_rm_int_eoi(h_xirr);
/* If rc > 0, there is another interrupt pending */
*again = rc > 0;
}
/*
* Need to ensure side effects of above stores
* complete before proceeding.
......@@ -429,7 +472,11 @@ long kvmppc_read_intr(void)
/* We raced with the host,
* we need to resend that IPI, bummer
*/
if (xics_phys)
_stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY);
else
opal_rm_int_set_mfrr(hard_smp_processor_id(),
IPI_PRIORITY);
/* Let side effects complete */
smp_mb();
return 1;
......@@ -440,5 +487,5 @@ long kvmppc_read_intr(void)
return -1;
}
return kvmppc_check_passthru(xisr, xirr);
return kvmppc_check_passthru(xisr, xirr, again);
}
......@@ -16,6 +16,7 @@
#include <asm/machdep.h>
#include <asm/cputhreads.h>
#include <asm/hmi.h>
#include <asm/asm-prototypes.h>
/* SRR1 bits for machine check on POWER7 */
#define SRR1_MC_LDSTERR (1ul << (63-42))
......
......@@ -21,6 +21,7 @@
#include <asm/hvcall.h>
#include <asm/synch.h>
#include <asm/ppc-opcode.h>
#include <asm/asm-prototypes.h>
/* Translate address of a vmalloc'd thing to a linear map address */
static void *real_vmalloc_addr(void *x)
......@@ -264,8 +265,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
if (pa)
pteh |= HPTE_V_VALID;
else
else {
pteh |= HPTE_V_ABSENT;
ptel &= ~(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
}
/*If we had host pte mapping then Check WIMG */
if (ptep && !hpte_cache_flags_ok(ptel, is_ci)) {
......@@ -351,6 +354,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
/* inval in progress, write a non-present HPTE */
pteh |= HPTE_V_ABSENT;
pteh &= ~HPTE_V_VALID;
ptel &= ~(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
unlock_rmap(rmap);
} else {
kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index,
......@@ -361,6 +365,11 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
}
}
/* Convert to new format on P9 */
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
ptel = hpte_old_to_new_r(pteh, ptel);
pteh = hpte_old_to_new_v(pteh);
}
hpte[1] = cpu_to_be64(ptel);
/* Write the first HPTE dword, unlocking the HPTE and making it valid */
......@@ -386,6 +395,13 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
#define LOCK_TOKEN (*(u32 *)(&get_paca()->paca_index))
#endif
static inline int is_mmio_hpte(unsigned long v, unsigned long r)
{
return ((v & HPTE_V_ABSENT) &&
(r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) ==
(HPTE_R_KEY_HI | HPTE_R_KEY_LO));
}
static inline int try_lock_tlbie(unsigned int *lock)
{
unsigned int tmp, old;
......@@ -409,13 +425,18 @@ static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues,
{
long i;
/*
* We use the POWER9 5-operand versions of tlbie and tlbiel here.
* Since we are using RIC=0 PRS=0 R=0, and P7/P8 tlbiel ignores
* the RS field, this is backwards-compatible with P7 and P8.
*/
if (global) {
while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
cpu_relax();
if (need_sync)
asm volatile("ptesync" : : : "memory");
for (i = 0; i < npages; ++i)
asm volatile(PPC_TLBIE(%1,%0) : :
asm volatile(PPC_TLBIE_5(%0,%1,0,0,0) : :
"r" (rbvalues[i]), "r" (kvm->arch.lpid));
asm volatile("eieio; tlbsync; ptesync" : : : "memory");
kvm->arch.tlbie_lock = 0;
......@@ -423,7 +444,8 @@ static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues,
if (need_sync)
asm volatile("ptesync" : : : "memory");
for (i = 0; i < npages; ++i)
asm volatile("tlbiel %0" : : "r" (rbvalues[i]));
asm volatile(PPC_TLBIEL(%0,%1,0,0,0) : :
"r" (rbvalues[i]), "r" (0));
asm volatile("ptesync" : : : "memory");
}
}
......@@ -435,18 +457,23 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
__be64 *hpte;
unsigned long v, r, rb;
struct revmap_entry *rev;
u64 pte;
u64 pte, orig_pte, pte_r;
if (pte_index >= kvm->arch.hpt_npte)
return H_PARAMETER;
hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
cpu_relax();
pte = be64_to_cpu(hpte[0]);
pte = orig_pte = be64_to_cpu(hpte[0]);
pte_r = be64_to_cpu(hpte[1]);
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
pte = hpte_new_to_old_v(pte, pte_r);
pte_r = hpte_new_to_old_r(pte_r);
}
if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
((flags & H_AVPN) && (pte & ~0x7fUL) != avpn) ||
((flags & H_ANDCOND) && (pte & avpn) != 0)) {
__unlock_hpte(hpte, pte);
__unlock_hpte(hpte, orig_pte);
return H_NOT_FOUND;
}
......@@ -454,7 +481,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
v = pte & ~HPTE_V_HVLOCK;
if (v & HPTE_V_VALID) {
hpte[0] &= ~cpu_to_be64(HPTE_V_VALID);
rb = compute_tlbie_rb(v, be64_to_cpu(hpte[1]), pte_index);
rb = compute_tlbie_rb(v, pte_r, pte_index);
do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
/*
* The reference (R) and change (C) bits in a HPT
......@@ -472,6 +499,9 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
note_hpte_modification(kvm, rev);
unlock_hpte(hpte, 0);
if (is_mmio_hpte(v, pte_r))
atomic64_inc(&kvm->arch.mmio_update);
if (v & HPTE_V_ABSENT)
v = (v & ~HPTE_V_ABSENT) | HPTE_V_VALID;
hpret[0] = v;
......@@ -498,7 +528,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
int global;
long int ret = H_SUCCESS;
struct revmap_entry *rev, *revs[4];
u64 hp0;
u64 hp0, hp1;
global = global_invalidates(kvm, 0);
for (i = 0; i < 4 && ret == H_SUCCESS; ) {
......@@ -531,6 +561,11 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
}
found = 0;
hp0 = be64_to_cpu(hp[0]);
hp1 = be64_to_cpu(hp[1]);
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
hp0 = hpte_new_to_old_v(hp0, hp1);
hp1 = hpte_new_to_old_r(hp1);
}
if (hp0 & (HPTE_V_ABSENT | HPTE_V_VALID)) {
switch (flags & 3) {
case 0: /* absolute */
......@@ -561,13 +596,14 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
args[j] |= rcbits << (56 - 5);
hp[0] = 0;
if (is_mmio_hpte(hp0, hp1))
atomic64_inc(&kvm->arch.mmio_update);
continue;
}
/* leave it locked */
hp[0] &= ~cpu_to_be64(HPTE_V_VALID);
tlbrb[n] = compute_tlbie_rb(be64_to_cpu(hp[0]),
be64_to_cpu(hp[1]), pte_index);
tlbrb[n] = compute_tlbie_rb(hp0, hp1, pte_index);
indexes[n] = j;
hptes[n] = hp;
revs[n] = rev;
......@@ -605,7 +641,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
__be64 *hpte;
struct revmap_entry *rev;
unsigned long v, r, rb, mask, bits;
u64 pte;
u64 pte_v, pte_r;
if (pte_index >= kvm->arch.hpt_npte)
return H_PARAMETER;
......@@ -613,14 +649,16 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
cpu_relax();
pte = be64_to_cpu(hpte[0]);
if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
((flags & H_AVPN) && (pte & ~0x7fUL) != avpn)) {
__unlock_hpte(hpte, pte);
v = pte_v = be64_to_cpu(hpte[0]);
if (cpu_has_feature(CPU_FTR_ARCH_300))
v = hpte_new_to_old_v(v, be64_to_cpu(hpte[1]));
if ((v & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
((flags & H_AVPN) && (v & ~0x7fUL) != avpn)) {
__unlock_hpte(hpte, pte_v);
return H_NOT_FOUND;
}
v = pte;
pte_r = be64_to_cpu(hpte[1]);
bits = (flags << 55) & HPTE_R_PP0;
bits |= (flags << 48) & HPTE_R_KEY_HI;
bits |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
......@@ -642,22 +680,26 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
* readonly to writable. If it should be writable, we'll
* take a trap and let the page fault code sort it out.
*/
pte = be64_to_cpu(hpte[1]);
r = (pte & ~mask) | bits;
if (hpte_is_writable(r) && !hpte_is_writable(pte))
r = (pte_r & ~mask) | bits;
if (hpte_is_writable(r) && !hpte_is_writable(pte_r))
r = hpte_make_readonly(r);
/* If the PTE is changing, invalidate it first */
if (r != pte) {
if (r != pte_r) {
rb = compute_tlbie_rb(v, r, pte_index);
hpte[0] = cpu_to_be64((v & ~HPTE_V_VALID) |
hpte[0] = cpu_to_be64((pte_v & ~HPTE_V_VALID) |
HPTE_V_ABSENT);
do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags),
true);
/* Don't lose R/C bit updates done by hardware */
r |= be64_to_cpu(hpte[1]) & (HPTE_R_R | HPTE_R_C);
hpte[1] = cpu_to_be64(r);
}
}
unlock_hpte(hpte, v & ~HPTE_V_HVLOCK);
unlock_hpte(hpte, pte_v & ~HPTE_V_HVLOCK);
asm volatile("ptesync" : : : "memory");
if (is_mmio_hpte(v, pte_r))
atomic64_inc(&kvm->arch.mmio_update);
return H_SUCCESS;
}
......@@ -681,6 +723,10 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK;
r = be64_to_cpu(hpte[1]);
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
v = hpte_new_to_old_v(v, r);
r = hpte_new_to_old_r(r);
}
if (v & HPTE_V_ABSENT) {
v &= ~HPTE_V_ABSENT;
v |= HPTE_V_VALID;
......@@ -798,10 +844,16 @@ void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
unsigned long pte_index)
{
unsigned long rb;
u64 hp0, hp1;
hptep[0] &= ~cpu_to_be64(HPTE_V_VALID);
rb = compute_tlbie_rb(be64_to_cpu(hptep[0]), be64_to_cpu(hptep[1]),
pte_index);
hp0 = be64_to_cpu(hptep[0]);
hp1 = be64_to_cpu(hptep[1]);
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
hp0 = hpte_new_to_old_v(hp0, hp1);
hp1 = hpte_new_to_old_r(hp1);
}
rb = compute_tlbie_rb(hp0, hp1, pte_index);
do_tlbies(kvm, &rb, 1, 1, true);
}
EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte);
......@@ -811,9 +863,15 @@ void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep,
{
unsigned long rb;
unsigned char rbyte;
u64 hp0, hp1;
rb = compute_tlbie_rb(be64_to_cpu(hptep[0]), be64_to_cpu(hptep[1]),
pte_index);
hp0 = be64_to_cpu(hptep[0]);
hp1 = be64_to_cpu(hptep[1]);
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
hp0 = hpte_new_to_old_v(hp0, hp1);
hp1 = hpte_new_to_old_r(hp1);
}
rb = compute_tlbie_rb(hp0, hp1, pte_index);
rbyte = (be64_to_cpu(hptep[1]) & ~HPTE_R_R) >> 8;
/* modify only the second-last byte, which contains the ref bit */
*((char *)hptep + 14) = rbyte;
......@@ -828,6 +886,37 @@ static int slb_base_page_shift[4] = {
20, /* 1M, unsupported */
};
static struct mmio_hpte_cache_entry *mmio_cache_search(struct kvm_vcpu *vcpu,
unsigned long eaddr, unsigned long slb_v, long mmio_update)
{
struct mmio_hpte_cache_entry *entry = NULL;
unsigned int pshift;
unsigned int i;
for (i = 0; i < MMIO_HPTE_CACHE_SIZE; i++) {
entry = &vcpu->arch.mmio_cache.entry[i];
if (entry->mmio_update == mmio_update) {
pshift = entry->slb_base_pshift;
if ((entry->eaddr >> pshift) == (eaddr >> pshift) &&
entry->slb_v == slb_v)
return entry;
}
}
return NULL;
}
static struct mmio_hpte_cache_entry *
next_mmio_cache_entry(struct kvm_vcpu *vcpu)
{
unsigned int index = vcpu->arch.mmio_cache.index;
vcpu->arch.mmio_cache.index++;
if (vcpu->arch.mmio_cache.index == MMIO_HPTE_CACHE_SIZE)
vcpu->arch.mmio_cache.index = 0;
return &vcpu->arch.mmio_cache.entry[index];
}
/* When called from virtmode, this func should be protected by
* preempt_disable(), otherwise, the holding of HPTE_V_HVLOCK
* can trigger deadlock issue.
......@@ -842,7 +931,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
unsigned long avpn;
__be64 *hpte;
unsigned long mask, val;
unsigned long v, r;
unsigned long v, r, orig_v;
/* Get page shift, work out hash and AVPN etc. */
mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY;
......@@ -877,6 +966,8 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
for (i = 0; i < 16; i += 2) {
/* Read the PTE racily */
v = be64_to_cpu(hpte[i]) & ~HPTE_V_HVLOCK;
if (cpu_has_feature(CPU_FTR_ARCH_300))
v = hpte_new_to_old_v(v, be64_to_cpu(hpte[i+1]));
/* Check valid/absent, hash, segment size and AVPN */
if (!(v & valid) || (v & mask) != val)
......@@ -885,8 +976,12 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
/* Lock the PTE and read it under the lock */
while (!try_lock_hpte(&hpte[i], HPTE_V_HVLOCK))
cpu_relax();
v = be64_to_cpu(hpte[i]) & ~HPTE_V_HVLOCK;
v = orig_v = be64_to_cpu(hpte[i]) & ~HPTE_V_HVLOCK;
r = be64_to_cpu(hpte[i+1]);
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
v = hpte_new_to_old_v(v, r);
r = hpte_new_to_old_r(r);
}
/*
* Check the HPTE again, including base page size
......@@ -896,7 +991,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
/* Return with the HPTE still locked */
return (hash << 3) + (i >> 1);
__unlock_hpte(&hpte[i], v);
__unlock_hpte(&hpte[i], orig_v);
}
if (val & HPTE_V_SECONDARY)
......@@ -924,17 +1019,27 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
{
struct kvm *kvm = vcpu->kvm;
long int index;
unsigned long v, r, gr;
unsigned long v, r, gr, orig_v;
__be64 *hpte;
unsigned long valid;
struct revmap_entry *rev;
unsigned long pp, key;
struct mmio_hpte_cache_entry *cache_entry = NULL;
long mmio_update = 0;
/* For protection fault, expect to find a valid HPTE */
valid = HPTE_V_VALID;
if (status & DSISR_NOHPTE)
if (status & DSISR_NOHPTE) {
valid |= HPTE_V_ABSENT;
mmio_update = atomic64_read(&kvm->arch.mmio_update);
cache_entry = mmio_cache_search(vcpu, addr, slb_v, mmio_update);
}
if (cache_entry) {
index = cache_entry->pte_index;
v = cache_entry->hpte_v;
r = cache_entry->hpte_r;
gr = cache_entry->rpte;
} else {
index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid);
if (index < 0) {
if (status & DSISR_NOHPTE)
......@@ -942,12 +1047,17 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
return 0; /* for prot fault, HPTE disappeared */
}
hpte = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK;
v = orig_v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK;
r = be64_to_cpu(hpte[1]);
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
v = hpte_new_to_old_v(v, r);
r = hpte_new_to_old_r(r);
}
rev = real_vmalloc_addr(&kvm->arch.revmap[index]);
gr = rev->guest_rpte;
unlock_hpte(hpte, v);
unlock_hpte(hpte, orig_v);
}
/* For not found, if the HPTE is valid by now, retry the instruction */
if ((status & DSISR_NOHPTE) && (v & HPTE_V_VALID))
......@@ -985,12 +1095,32 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
vcpu->arch.pgfault_index = index;
vcpu->arch.pgfault_hpte[0] = v;
vcpu->arch.pgfault_hpte[1] = r;
vcpu->arch.pgfault_cache = cache_entry;
/* Check the storage key to see if it is possibly emulated MMIO */
if (data && (vcpu->arch.shregs.msr & MSR_IR) &&
(r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) ==
(HPTE_R_KEY_HI | HPTE_R_KEY_LO))
if ((r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) ==
(HPTE_R_KEY_HI | HPTE_R_KEY_LO)) {
if (!cache_entry) {
unsigned int pshift = 12;
unsigned int pshift_index;
if (slb_v & SLB_VSID_L) {
pshift_index = ((slb_v & SLB_VSID_LP) >> 4);
pshift = slb_base_page_shift[pshift_index];
}
cache_entry = next_mmio_cache_entry(vcpu);
cache_entry->eaddr = addr;
cache_entry->slb_base_pshift = pshift;
cache_entry->pte_index = index;
cache_entry->hpte_v = v;
cache_entry->hpte_r = r;
cache_entry->rpte = gr;
cache_entry->slb_v = slb_v;
cache_entry->mmio_update = mmio_update;
}
if (data && (vcpu->arch.shregs.msr & MSR_IR))
return -2; /* MMIO emulation - load instr word */
}
return -1; /* send fault up to host kernel mode */
}
......@@ -24,6 +24,7 @@
#include <asm/pnv-pci.h>
#include <asm/opal.h>
#include <asm/smp.h>
#include <asm/asm-prototypes.h>
#include "book3s_xics.h"
......@@ -70,7 +71,11 @@ static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu)
hcpu = hcore << threads_shift;
kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu;
smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION);
if (paca[hcpu].kvm_hstate.xics_phys)
icp_native_cause_ipi_rm(hcpu);
else
opal_rm_int_set_mfrr(get_hard_smp_processor_id(hcpu),
IPI_PRIORITY);
}
#else
static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) { }
......@@ -737,7 +742,7 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
unsigned long eoi_rc;
static void icp_eoi(struct irq_chip *c, u32 hwirq, u32 xirr)
static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
{
unsigned long xics_phys;
int64_t rc;
......@@ -751,7 +756,12 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, u32 xirr)
/* EOI it */
xics_phys = local_paca->kvm_hstate.xics_phys;
if (xics_phys) {
_stwcix(xics_phys + XICS_XIRR, xirr);
} else {
rc = opal_rm_int_eoi(be32_to_cpu(xirr));
*again = rc > 0;
}
}
static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu)
......@@ -809,9 +819,10 @@ static void kvmppc_rm_handle_irq_desc(struct irq_desc *desc)
}
long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
u32 xirr,
__be32 xirr,
struct kvmppc_irq_map *irq_map,
struct kvmppc_passthru_irqmap *pimap)
struct kvmppc_passthru_irqmap *pimap,
bool *again)
{
struct kvmppc_xics *xics;
struct kvmppc_icp *icp;
......@@ -825,7 +836,8 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
icp_rm_deliver_irq(xics, icp, irq);
/* EOI the interrupt */
icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr);
icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr,
again);
if (check_too_hard(xics, icp) == H_TOO_HARD)
return 2;
......
......@@ -501,17 +501,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
cmpwi r0, 0
beq 57f
li r3, (LPCR_PECEDH | LPCR_PECE0) >> 4
mfspr r4, SPRN_LPCR
rlwimi r4, r3, 4, (LPCR_PECEDP | LPCR_PECEDH | LPCR_PECE0 | LPCR_PECE1)
mtspr SPRN_LPCR, r4
isync
std r0, HSTATE_SCRATCH0(r13)
ptesync
ld r0, HSTATE_SCRATCH0(r13)
1: cmpd r0, r0
bne 1b
nap
b .
mfspr r5, SPRN_LPCR
rlwimi r5, r3, 4, (LPCR_PECEDP | LPCR_PECEDH | LPCR_PECE0 | LPCR_PECE1)
b kvm_nap_sequence
57: li r0, 0
stbx r0, r3, r4
......@@ -523,6 +515,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
* *
*****************************************************************************/
/* Stack frame offsets */
#define STACK_SLOT_TID (112-16)
#define STACK_SLOT_PSSCR (112-24)
.global kvmppc_hv_entry
kvmppc_hv_entry:
......@@ -581,12 +577,14 @@ kvmppc_hv_entry:
ld r9,VCORE_KVM(r5) /* pointer to struct kvm */
cmpwi r6,0
bne 10f
ld r6,KVM_SDR1(r9)
lwz r7,KVM_LPID(r9)
BEGIN_FTR_SECTION
ld r6,KVM_SDR1(r9)
li r0,LPID_RSVD /* switch to reserved LPID */
mtspr SPRN_LPID,r0
ptesync
mtspr SPRN_SDR1,r6 /* switch to partition page table */
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
mtspr SPRN_LPID,r7
isync
......@@ -607,12 +605,8 @@ kvmppc_hv_entry:
stdcx. r7,0,r6
bne 23b
/* Flush the TLB of any entries for this LPID */
/* use arch 2.07S as a proxy for POWER8 */
BEGIN_FTR_SECTION
li r6,512 /* POWER8 has 512 sets */
FTR_SECTION_ELSE
li r6,128 /* POWER7 has 128 sets */
ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S)
lwz r6,KVM_TLB_SETS(r9)
li r0,0 /* RS for P9 version of tlbiel */
mtctr r6
li r7,0x800 /* IS field = 0b10 */
ptesync
......@@ -698,6 +692,14 @@ kvmppc_got_guest:
mtspr SPRN_PURR,r7
mtspr SPRN_SPURR,r8
/* Save host values of some registers */
BEGIN_FTR_SECTION
mfspr r5, SPRN_TIDR
mfspr r6, SPRN_PSSCR
std r5, STACK_SLOT_TID(r1)
std r6, STACK_SLOT_PSSCR(r1)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
BEGIN_FTR_SECTION
/* Set partition DABR */
/* Do this before re-enabling PMU to avoid P7 DABR corruption bug */
......@@ -750,14 +752,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
BEGIN_FTR_SECTION
ld r5, VCPU_MMCR + 24(r4)
ld r6, VCPU_SIER(r4)
mtspr SPRN_MMCR2, r5
mtspr SPRN_SIER, r6
BEGIN_FTR_SECTION_NESTED(96)
lwz r7, VCPU_PMC + 24(r4)
lwz r8, VCPU_PMC + 28(r4)
ld r9, VCPU_MMCR + 32(r4)
mtspr SPRN_MMCR2, r5
mtspr SPRN_SIER, r6
mtspr SPRN_SPMC1, r7
mtspr SPRN_SPMC2, r8
mtspr SPRN_MMCRS, r9
END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mtspr SPRN_MMCR0, r3
isync
......@@ -813,20 +817,30 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
mtspr SPRN_EBBHR, r8
ld r5, VCPU_EBBRR(r4)
ld r6, VCPU_BESCR(r4)
ld r7, VCPU_CSIGR(r4)
ld r8, VCPU_TACR(r4)
lwz r7, VCPU_GUEST_PID(r4)
ld r8, VCPU_WORT(r4)
mtspr SPRN_EBBRR, r5
mtspr SPRN_BESCR, r6
mtspr SPRN_CSIGR, r7
mtspr SPRN_TACR, r8
mtspr SPRN_PID, r7
mtspr SPRN_WORT, r8
BEGIN_FTR_SECTION
/* POWER8-only registers */
ld r5, VCPU_TCSCR(r4)
ld r6, VCPU_ACOP(r4)
lwz r7, VCPU_GUEST_PID(r4)
ld r8, VCPU_WORT(r4)
ld r7, VCPU_CSIGR(r4)
ld r8, VCPU_TACR(r4)
mtspr SPRN_TCSCR, r5
mtspr SPRN_ACOP, r6
mtspr SPRN_PID, r7
mtspr SPRN_WORT, r8
mtspr SPRN_CSIGR, r7
mtspr SPRN_TACR, r8
FTR_SECTION_ELSE
/* POWER9-only registers */
ld r5, VCPU_TID(r4)
ld r6, VCPU_PSSCR(r4)
oris r6, r6, PSSCR_EC@h /* This makes stop trap to HV */
mtspr SPRN_TIDR, r5
mtspr SPRN_PSSCR, r6
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
8:
/*
......@@ -1341,20 +1355,29 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
std r8, VCPU_EBBHR(r9)
mfspr r5, SPRN_EBBRR
mfspr r6, SPRN_BESCR
mfspr r7, SPRN_CSIGR
mfspr r8, SPRN_TACR
mfspr r7, SPRN_PID
mfspr r8, SPRN_WORT
std r5, VCPU_EBBRR(r9)
std r6, VCPU_BESCR(r9)
std r7, VCPU_CSIGR(r9)
std r8, VCPU_TACR(r9)
stw r7, VCPU_GUEST_PID(r9)
std r8, VCPU_WORT(r9)
BEGIN_FTR_SECTION
mfspr r5, SPRN_TCSCR
mfspr r6, SPRN_ACOP
mfspr r7, SPRN_PID
mfspr r8, SPRN_WORT
mfspr r7, SPRN_CSIGR
mfspr r8, SPRN_TACR
std r5, VCPU_TCSCR(r9)
std r6, VCPU_ACOP(r9)
stw r7, VCPU_GUEST_PID(r9)
std r8, VCPU_WORT(r9)
std r7, VCPU_CSIGR(r9)
std r8, VCPU_TACR(r9)
FTR_SECTION_ELSE
mfspr r5, SPRN_TIDR
mfspr r6, SPRN_PSSCR
std r5, VCPU_TID(r9)
rldicl r6, r6, 4, 50 /* r6 &= PSSCR_GUEST_VIS */
rotldi r6, r6, 60
std r6, VCPU_PSSCR(r9)
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
/*
* Restore various registers to 0, where non-zero values
* set by the guest could disrupt the host.
......@@ -1363,12 +1386,14 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
mtspr SPRN_IAMR, r0
mtspr SPRN_CIABR, r0
mtspr SPRN_DAWRX, r0
mtspr SPRN_TCSCR, r0
mtspr SPRN_WORT, r0
BEGIN_FTR_SECTION
mtspr SPRN_TCSCR, r0
/* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */
li r0, 1
sldi r0, r0, 31
mtspr SPRN_MMCRS, r0
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
8:
/* Save and reset AMR and UAMOR before turning on the MMU */
......@@ -1502,15 +1527,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
stw r8, VCPU_PMC + 20(r9)
BEGIN_FTR_SECTION
mfspr r5, SPRN_SIER
std r5, VCPU_SIER(r9)
BEGIN_FTR_SECTION_NESTED(96)
mfspr r6, SPRN_SPMC1
mfspr r7, SPRN_SPMC2
mfspr r8, SPRN_MMCRS
std r5, VCPU_SIER(r9)
stw r6, VCPU_PMC + 24(r9)
stw r7, VCPU_PMC + 28(r9)
std r8, VCPU_MMCR + 32(r9)
lis r4, 0x8000
mtspr SPRN_MMCRS, r4
END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
22:
/* Clear out SLB */
......@@ -1519,6 +1546,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
slbia
ptesync
/* Restore host values of some registers */
BEGIN_FTR_SECTION
ld r5, STACK_SLOT_TID(r1)
ld r6, STACK_SLOT_PSSCR(r1)
mtspr SPRN_TIDR, r5
mtspr SPRN_PSSCR, r6
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
/*
* POWER7/POWER8 guest -> host partition switch code.
* We don't have to lock against tlbies but we do
......@@ -1552,12 +1587,14 @@ kvmhv_switch_to_host:
beq 19f
/* Primary thread switches back to host partition */
ld r6,KVM_HOST_SDR1(r4)
lwz r7,KVM_HOST_LPID(r4)
BEGIN_FTR_SECTION
ld r6,KVM_HOST_SDR1(r4)
li r8,LPID_RSVD /* switch to reserved LPID */
mtspr SPRN_LPID,r8
ptesync
mtspr SPRN_SDR1,r6 /* switch to partition page table */
mtspr SPRN_SDR1,r6 /* switch to host page table */
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
mtspr SPRN_LPID,r7
isync
......@@ -2211,6 +2248,21 @@ BEGIN_FTR_SECTION
ori r5, r5, LPCR_PECEDH
rlwimi r5, r3, 0, LPCR_PECEDP
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
kvm_nap_sequence: /* desired LPCR value in r5 */
BEGIN_FTR_SECTION
/*
* PSSCR bits: exit criterion = 1 (wakeup based on LPCR at sreset)
* enable state loss = 1 (allow SMT mode switch)
* requested level = 0 (just stop dispatching)
*/
lis r3, (PSSCR_EC | PSSCR_ESL)@h
mtspr SPRN_PSSCR, r3
/* Set LPCR_PECE_HVEE bit to enable wakeup by HV interrupts */
li r4, LPCR_PECE_HVEE@higher
sldi r4, r4, 32
or r5, r5, r4
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
mtspr SPRN_LPCR,r5
isync
li r0, 0
......@@ -2219,7 +2271,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
ld r0, HSTATE_SCRATCH0(r13)
1: cmpd r0, r0
bne 1b
BEGIN_FTR_SECTION
nap
FTR_SECTION_ELSE
PPC_STOP
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
b .
33: mr r4, r3
......@@ -2600,11 +2656,13 @@ kvmppc_save_tm:
mfctr r7
mfspr r8, SPRN_AMR
mfspr r10, SPRN_TAR
mfxer r11
std r5, VCPU_LR_TM(r9)
stw r6, VCPU_CR_TM(r9)
std r7, VCPU_CTR_TM(r9)
std r8, VCPU_AMR_TM(r9)
std r10, VCPU_TAR_TM(r9)
std r11, VCPU_XER_TM(r9)
/* Restore r12 as trap number. */
lwz r12, VCPU_TRAP(r9)
......@@ -2697,11 +2755,13 @@ kvmppc_restore_tm:
ld r7, VCPU_CTR_TM(r4)
ld r8, VCPU_AMR_TM(r4)
ld r9, VCPU_TAR_TM(r4)
ld r10, VCPU_XER_TM(r4)
mtlr r5
mtcr r6
mtctr r7
mtspr SPRN_AMR, r8
mtspr SPRN_TAR, r9
mtxer r10
/*
* Load up PPR and DSCR values but don't put them in the actual SPRs
......
......@@ -536,7 +536,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
#ifdef CONFIG_PPC_BOOK3S_64
case KVM_CAP_SPAPR_TCE:
case KVM_CAP_SPAPR_TCE_64:
case KVM_CAP_PPC_ALLOC_HTAB:
case KVM_CAP_PPC_RTAS:
case KVM_CAP_PPC_FIXUP_HCALL:
case KVM_CAP_PPC_ENABLE_HCALL:
......@@ -545,13 +544,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
#endif
r = 1;
break;
case KVM_CAP_PPC_ALLOC_HTAB:
r = hv_enabled;
break;
#endif /* CONFIG_PPC_BOOK3S_64 */
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
case KVM_CAP_PPC_SMT:
if (hv_enabled)
r = threads_per_subcore;
else
r = 0;
if (hv_enabled) {
if (cpu_has_feature(CPU_FTR_ARCH_300))
r = 1;
else
r = threads_per_subcore;
}
break;
case KVM_CAP_PPC_RMA:
r = 0;
......
......@@ -449,7 +449,7 @@ TRACE_EVENT(kvmppc_vcore_wakeup,
__entry->tgid = current->tgid;
),
TP_printk("%s time %lld ns, tgid=%d",
TP_printk("%s time %llu ns, tgid=%d",
__entry->waited ? "wait" : "poll",
__entry->ns, __entry->tgid)
);
......
......@@ -221,13 +221,18 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
return -1;
hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
hpte_r = hpte_encode_r(pa, psize, apsize, ssize) | rflags;
hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
if (!(vflags & HPTE_V_BOLTED)) {
DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n",
i, hpte_v, hpte_r);
}
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
hpte_r = hpte_old_to_new_r(hpte_v, hpte_r);
hpte_v = hpte_old_to_new_v(hpte_v);
}
hptep->r = cpu_to_be64(hpte_r);
/* Guarantee the second dword is visible before the valid bit */
eieio();
......@@ -295,6 +300,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
vpn, want_v & HPTE_V_AVPN, slot, newpp);
hpte_v = be64_to_cpu(hptep->v);
if (cpu_has_feature(CPU_FTR_ARCH_300))
hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
/*
* We need to invalidate the TLB always because hpte_remove doesn't do
* a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
......@@ -309,6 +316,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
native_lock_hpte(hptep);
/* recheck with locks held */
hpte_v = be64_to_cpu(hptep->v);
if (cpu_has_feature(CPU_FTR_ARCH_300))
hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
if (unlikely(!HPTE_V_COMPARE(hpte_v, want_v) ||
!(hpte_v & HPTE_V_VALID))) {
ret = -1;
......@@ -350,6 +359,8 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize)
for (i = 0; i < HPTES_PER_GROUP; i++) {
hptep = htab_address + slot;
hpte_v = be64_to_cpu(hptep->v);
if (cpu_has_feature(CPU_FTR_ARCH_300))
hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
/* HPTE matches */
......@@ -409,6 +420,8 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
want_v = hpte_encode_avpn(vpn, bpsize, ssize);
native_lock_hpte(hptep);
hpte_v = be64_to_cpu(hptep->v);
if (cpu_has_feature(CPU_FTR_ARCH_300))
hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
/*
* We need to invalidate the TLB always because hpte_remove doesn't do
......@@ -467,6 +480,8 @@ static void native_hugepage_invalidate(unsigned long vsid,
want_v = hpte_encode_avpn(vpn, psize, ssize);
native_lock_hpte(hptep);
hpte_v = be64_to_cpu(hptep->v);
if (cpu_has_feature(CPU_FTR_ARCH_300))
hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
/* Even if we miss, we need to invalidate the TLB */
if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
......@@ -504,6 +519,10 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
/* Look at the 8 bit LP value */
unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
hpte_v = hpte_new_to_old_v(hpte_v, hpte_r);
hpte_r = hpte_new_to_old_r(hpte_r);
}
if (!(hpte_v & HPTE_V_LARGE)) {
size = MMU_PAGE_4K;
a_size = MMU_PAGE_4K;
......@@ -512,11 +531,7 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
a_size = hpte_page_sizes[lp] >> 4;
}
/* This works for all page sizes, and for 256M and 1T segments */
if (cpu_has_feature(CPU_FTR_ARCH_300))
*ssize = hpte_r >> HPTE_R_3_0_SSIZE_SHIFT;
else
*ssize = hpte_v >> HPTE_V_SSIZE_SHIFT;
shift = mmu_psize_defs[size].shift;
avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm);
......@@ -639,6 +654,9 @@ static void native_flush_hash_range(unsigned long number, int local)
want_v = hpte_encode_avpn(vpn, psize, ssize);
native_lock_hpte(hptep);
hpte_v = be64_to_cpu(hptep->v);
if (cpu_has_feature(CPU_FTR_ARCH_300))
hpte_v = hpte_new_to_old_v(hpte_v,
be64_to_cpu(hptep->r));
if (!HPTE_V_COMPARE(hpte_v, want_v) ||
!(hpte_v & HPTE_V_VALID))
native_unlock_hpte(hptep);
......
......@@ -792,37 +792,17 @@ static void update_hid_for_hash(void)
static void __init hash_init_partition_table(phys_addr_t hash_table,
unsigned long htab_size)
{
unsigned long ps_field;
unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
mmu_partition_table_init();
/*
* slb llp encoding for the page size used in VPM real mode.
* We can ignore that for lpid 0
* PS field (VRMA page size) is not used for LPID 0, hence set to 0.
* For now, UPRT is 0 and we have no segment table.
*/
ps_field = 0;
htab_size = __ilog2(htab_size) - 18;
BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large.");
partition_tb = __va(memblock_alloc_base(patb_size, patb_size,
MEMBLOCK_ALLOC_ANYWHERE));
/* Initialize the Partition Table with no entries */
memset((void *)partition_tb, 0, patb_size);
partition_tb->patb0 = cpu_to_be64(ps_field | hash_table | htab_size);
/*
* FIXME!! This should be done via update_partition table
* For now UPRT is 0 for us.
*/
partition_tb->patb1 = 0;
mmu_partition_table_set_entry(0, hash_table | htab_size, 0);
pr_info("Partition table %p\n", partition_tb);
if (cpu_has_feature(CPU_FTR_POWER9_DD1))
update_hid_for_hash();
/*
* update partition table control register,
* 64 K size.
*/
mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
}
static void __init htab_initialize(void)
......
......@@ -177,23 +177,15 @@ static void __init radix_init_pgtable(void)
static void __init radix_init_partition_table(void)
{
unsigned long rts_field;
unsigned long rts_field, dw0;
mmu_partition_table_init();
rts_field = radix__get_tree_size();
dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
mmu_partition_table_set_entry(0, dw0, 0);
BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large.");
partition_tb = early_alloc_pgtable(1UL << PATB_SIZE_SHIFT);
partition_tb->patb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) |
RADIX_PGD_INDEX_SIZE | PATB_HR);
pr_info("Initializing Radix MMU\n");
pr_info("Partition table %p\n", partition_tb);
memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
/*
* update partition table control register,
* 64 K size.
*/
mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
}
void __init radix_init_native(void)
......@@ -378,6 +370,8 @@ void __init radix__early_init_mmu(void)
radix_init_partition_table();
}
memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
radix_init_pgtable();
}
......
......@@ -431,3 +431,37 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
}
}
#endif
#ifdef CONFIG_PPC_BOOK3S_64
void __init mmu_partition_table_init(void)
{
unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large.");
partition_tb = __va(memblock_alloc_base(patb_size, patb_size,
MEMBLOCK_ALLOC_ANYWHERE));
/* Initialize the Partition Table with no entries */
memset((void *)partition_tb, 0, patb_size);
/*
* update partition table control register,
* 64 K size.
*/
mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
}
void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
unsigned long dw1)
{
partition_tb[lpid].patb0 = cpu_to_be64(dw0);
partition_tb[lpid].patb1 = cpu_to_be64(dw1);
/* Global flush of TLBs and partition table caches for this lpid */
asm volatile("ptesync" : : : "memory");
asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
"r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
asm volatile("eieio; tlbsync; ptesync" : : : "memory");
}
EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);
#endif /* CONFIG_PPC_BOOK3S_64 */
......@@ -304,8 +304,11 @@ OPAL_CALL(opal_pci_get_presence_state, OPAL_PCI_GET_PRESENCE_STATE);
OPAL_CALL(opal_pci_get_power_state, OPAL_PCI_GET_POWER_STATE);
OPAL_CALL(opal_pci_set_power_state, OPAL_PCI_SET_POWER_STATE);
OPAL_CALL(opal_int_get_xirr, OPAL_INT_GET_XIRR);
OPAL_CALL_REAL(opal_rm_int_get_xirr, OPAL_INT_GET_XIRR);
OPAL_CALL(opal_int_set_cppr, OPAL_INT_SET_CPPR);
OPAL_CALL(opal_int_eoi, OPAL_INT_EOI);
OPAL_CALL_REAL(opal_rm_int_eoi, OPAL_INT_EOI);
OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR);
OPAL_CALL_REAL(opal_rm_int_set_mfrr, OPAL_INT_SET_MFRR);
OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL);
OPAL_CALL_REAL(opal_rm_pci_tce_kill, OPAL_PCI_TCE_KILL);
......@@ -896,3 +896,5 @@ EXPORT_SYMBOL_GPL(opal_leds_get_ind);
EXPORT_SYMBOL_GPL(opal_leds_set_ind);
/* Export this symbol for PowerNV Operator Panel class driver */
EXPORT_SYMBOL_GPL(opal_write_oppanel_async);
/* Export this for KVM */
EXPORT_SYMBOL_GPL(opal_int_set_mfrr);
......@@ -63,7 +63,7 @@ static long ps3_hpte_insert(unsigned long hpte_group, unsigned long vpn,
vflags &= ~HPTE_V_SECONDARY;
hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize, apsize, ssize) | rflags;
hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize, apsize) | rflags;
spin_lock_irqsave(&ps3_htab_lock, flags);
......
......@@ -145,7 +145,7 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
hpte_group, vpn, pa, rflags, vflags, psize);
hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
hpte_r = hpte_encode_r(pa, psize, apsize, ssize) | rflags;
hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
if (!(vflags & HPTE_V_BOLTED))
pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
......
......@@ -1113,6 +1113,10 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
extern bool kvm_rebooting;
extern unsigned int halt_poll_ns;
extern unsigned int halt_poll_ns_grow;
extern unsigned int halt_poll_ns_shrink;
struct kvm_device {
struct kvm_device_ops *ops;
struct kvm *kvm;
......
......@@ -651,6 +651,9 @@ struct kvm_enable_cap {
};
/* for KVM_PPC_GET_PVINFO */
#define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0)
struct kvm_ppc_pvinfo {
/* out */
__u32 flags;
......@@ -682,8 +685,6 @@ struct kvm_ppc_smmu_info {
struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ];
};
#define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0)
#define KVMIO 0xAE
/* machine type bits, to be used as argument to KVM_CREATE_VM */
......
......@@ -70,16 +70,19 @@ MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
/* Architectures should define their poll value according to the halt latency */
static unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);
EXPORT_SYMBOL_GPL(halt_poll_ns);
/* Default doubles per-vcpu halt_poll_ns. */
static unsigned int halt_poll_ns_grow = 2;
unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, uint, S_IRUGO | S_IWUSR);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
/* Default resets per-vcpu halt_poll_ns . */
static unsigned int halt_poll_ns_shrink;
unsigned int halt_poll_ns_shrink;
module_param(halt_poll_ns_shrink, uint, S_IRUGO | S_IWUSR);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
/*
* Ordering of locks:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment