Commit 5fdb2621 authored by Linus Torvalds

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
 "ARM:

   - A couple of fixes when handling an exception while a SError has
     been delivered

   - Workaround for Cortex-A510's single-step erratum

  RISC-V:

   - Make CY, TM, and IR counters accessible in VU mode

   - Fix SBI implementation version

  x86:

   - Report deprecation of x87 features in supported CPUID

   - Preparation for fixing an interrupt delivery race on AMD hardware

   - Sparse fix

  All except POWER and s390:

   - Rework guest entry code to correctly mark noinstr areas and fix
     vtime accounting (for x86, this was already mostly correct but not
     entirely; for ARM, MIPS and RISC-V it wasn't)"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: x86: Use ERR_PTR_USR() to return -EFAULT as a __user pointer
  KVM: x86: Report deprecated x87 features in supported CPUID
  KVM: arm64: Workaround Cortex-A510's single-step and PAC trap errata
  KVM: arm64: Stop handle_exit() from handling HVC twice when an SError occurs
  KVM: arm64: Avoid consuming a stale esr value when SError occur
  RISC-V: KVM: Fix SBI implementation version
  RISC-V: KVM: make CY, TM, and IR counters accessible in VU mode
  kvm/riscv: rework guest entry logic
  kvm/arm64: rework guest entry logic
  kvm/x86: rework guest entry logic
  kvm/mips: rework guest entry logic
  kvm: add guest_state_{enter,exit}_irqoff()
  KVM: x86: Move delivery of non-APICv interrupt into vendor code
  kvm: Move KVM_GET_XSAVE2 IOCTL definition at the end of kvm.h
parents fbc04bf0 7e6a6b40
@@ -100,6 +100,8 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A510 | #2051678 | ARM64_ERRATUM_2051678 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A510 | #2077057 | ARM64_ERRATUM_2077057 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A710 | #2119858 | ARM64_ERRATUM_2119858 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A710 | #2054223 | ARM64_ERRATUM_2054223 |
@@ -680,6 +680,22 @@ config ARM64_ERRATUM_2051678
If unsure, say Y.
config ARM64_ERRATUM_2077057
bool "Cortex-A510: 2077057: workaround software-step corrupting SPSR_EL2"
help
This option adds the workaround for ARM Cortex-A510 erratum 2077057.
Affected Cortex-A510 may corrupt SPSR_EL2 when a step exception is
expected, but a Pointer Authentication trap is taken instead. The
erratum causes SPSR_EL1 to be copied to SPSR_EL2, which could allow
EL1 to cause a return to EL2 with a guest controlled ELR_EL2.
This can only happen when EL2 is stepping EL1.
When these conditions occur, the SPSR_EL2 value is unchanged from the
previous guest entry, and can be restored from the in-memory copy.
If unsure, say Y.
config ARM64_ERRATUM_2119858
bool "Cortex-A710/X2: 2119858: workaround TRBE overwriting trace data in FILL mode"
default y
@@ -600,6 +600,14 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
CAP_MIDR_RANGE_LIST(trbe_write_out_of_range_cpus),
},
#endif
#ifdef CONFIG_ARM64_ERRATUM_2077057
{
.desc = "ARM erratum 2077057",
.capability = ARM64_WORKAROUND_2077057,
.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2),
},
#endif
#ifdef CONFIG_ARM64_ERRATUM_2064142
{
.desc = "ARM erratum 2064142",
@@ -797,6 +797,24 @@ static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret)
xfer_to_guest_mode_work_pending();
}
/*
* Actually run the vCPU, entering an RCU extended quiescent state (EQS) while
* the vCPU is running.
*
* This must be noinstr as instrumentation may make use of RCU, and this is not
* safe during the EQS.
*/
static int noinstr kvm_arm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
{
int ret;
guest_state_enter_irqoff();
ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu);
guest_state_exit_irqoff();
return ret;
}
/**
* kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
* @vcpu: The VCPU pointer
@@ -881,9 +899,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
* Enter the guest
*/
trace_kvm_entry(*vcpu_pc(vcpu));
guest_enter_irqoff();
guest_timing_enter_irqoff();
ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu);
ret = kvm_arm_vcpu_enter_exit(vcpu);
vcpu->mode = OUTSIDE_GUEST_MODE;
vcpu->stat.exits++;
@@ -918,26 +936,23 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
kvm_arch_vcpu_ctxsync_fp(vcpu);
/*
* We may have taken a host interrupt in HYP mode (ie
* while executing the guest). This interrupt is still
* pending, as we haven't serviced it yet!
* We must ensure that any pending interrupts are taken before
* we exit guest timing so that timer ticks are accounted as
* guest time. Transiently unmask interrupts so that any
* pending interrupts are taken.
*
* We're now back in SVC mode, with interrupts
* disabled. Enabling the interrupts now will have
* the effect of taking the interrupt again, in SVC
* mode this time.
* Per ARM DDI 0487G.b section D1.13.4, an ISB (or other
* context synchronization event) is necessary to ensure that
* pending interrupts are taken.
*/
local_irq_enable();
isb();
local_irq_disable();
guest_timing_exit_irqoff();
local_irq_enable();
/*
* We do local_irq_enable() before calling guest_exit() so
* that if a timer interrupt hits while running the guest we
* account that tick as being spent in the guest. We enable
* preemption after calling guest_exit() so that if we get
* preempted we make sure ticks after that is not counted as
* guest time.
*/
guest_exit();
trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
/* Exit types that need handling before we can be preempted */
@@ -228,6 +228,14 @@ int handle_exit(struct kvm_vcpu *vcpu, int exception_index)
{
struct kvm_run *run = vcpu->run;
if (ARM_SERROR_PENDING(exception_index)) {
/*
* The SError is handled by handle_exit_early(). If the guest
* survives it will re-execute the original instruction.
*/
return 1;
}
exception_index = ARM_EXCEPTION_CODE(exception_index);
switch (exception_index) {
@@ -402,6 +402,24 @@ static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
return false;
}
static inline void synchronize_vcpu_pstate(struct kvm_vcpu *vcpu, u64 *exit_code)
{
/*
* Check for the conditions of Cortex-A510's #2077057. When these occur
* SPSR_EL2 can't be trusted, but isn't needed either as it is
* unchanged from the value in vcpu_gp_regs(vcpu)->pstate.
* Are we single-stepping the guest, and took a PAC exception from the
* active-not-pending state?
*/
if (cpus_have_final_cap(ARM64_WORKAROUND_2077057) &&
vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
*vcpu_cpsr(vcpu) & DBG_SPSR_SS &&
ESR_ELx_EC(read_sysreg_el2(SYS_ESR)) == ESR_ELx_EC_PAC)
write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR);
vcpu->arch.ctxt.regs.pstate = read_sysreg_el2(SYS_SPSR);
}
/*
* Return true when we were able to fixup the guest exit and should return to
* the guest, false when we should restore the host state and return to the
@@ -413,7 +431,7 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
* Save PSTATE early so that we can evaluate the vcpu mode
* early on.
*/
vcpu->arch.ctxt.regs.pstate = read_sysreg_el2(SYS_SPSR);
synchronize_vcpu_pstate(vcpu, exit_code);
/*
* Check whether we want to repaint the state one way or
@@ -424,7 +442,8 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ)
vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR);
if (ARM_SERROR_PENDING(*exit_code)) {
if (ARM_SERROR_PENDING(*exit_code) &&
ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ) {
u8 esr_ec = kvm_vcpu_trap_get_class(vcpu);
/*
@@ -55,9 +55,10 @@ WORKAROUND_1418040
WORKAROUND_1463225
WORKAROUND_1508412
WORKAROUND_1542419
WORKAROUND_2064142
WORKAROUND_2038923
WORKAROUND_1902691
WORKAROUND_2038923
WORKAROUND_2064142
WORKAROUND_2077057
WORKAROUND_TRBE_OVERWRITE_FILL_MODE
WORKAROUND_TSB_FLUSH_FAILURE
WORKAROUND_TRBE_WRITE_OUT_OF_RANGE
@@ -414,6 +414,24 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
return -ENOIOCTLCMD;
}
/*
* Actually run the vCPU, entering an RCU extended quiescent state (EQS) while
* the vCPU is running.
*
* This must be noinstr as instrumentation may make use of RCU, and this is not
* safe during the EQS.
*/
static int noinstr kvm_mips_vcpu_enter_exit(struct kvm_vcpu *vcpu)
{
int ret;
guest_state_enter_irqoff();
ret = kvm_mips_callbacks->vcpu_run(vcpu);
guest_state_exit_irqoff();
return ret;
}
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
int r = -EINTR;
@@ -434,7 +452,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
lose_fpu(1);
local_irq_disable();
guest_enter_irqoff();
guest_timing_enter_irqoff();
trace_kvm_enter(vcpu);
/*
@@ -445,10 +463,23 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
*/
smp_store_mb(vcpu->mode, IN_GUEST_MODE);
r = kvm_mips_callbacks->vcpu_run(vcpu);
r = kvm_mips_vcpu_enter_exit(vcpu);
/*
* We must ensure that any pending interrupts are taken before
* we exit guest timing so that timer ticks are accounted as
* guest time. Transiently unmask interrupts so that any
* pending interrupts are taken.
*
* TODO: is there a barrier which ensures that pending interrupts are
* recognised? Currently this just hopes that the CPU takes any pending
* interrupts between the enable and disable.
*/
local_irq_enable();
local_irq_disable();
trace_kvm_out(vcpu);
guest_exit_irqoff();
guest_timing_exit_irqoff();
local_irq_enable();
out:
@@ -1168,7 +1199,7 @@ static void kvm_mips_set_c0_status(void)
/*
* Return value is in the form (errcode<<2 | RESUME_FLAG_HOST | RESUME_FLAG_NV)
*/
int kvm_mips_handle_exit(struct kvm_vcpu *vcpu)
static int __kvm_mips_handle_exit(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
u32 cause = vcpu->arch.host_cp0_cause;
@@ -1357,6 +1388,17 @@ int kvm_mips_handle_exit(struct kvm_vcpu *vcpu)
return ret;
}
int noinstr kvm_mips_handle_exit(struct kvm_vcpu *vcpu)
{
int ret;
guest_state_exit_irqoff();
ret = __kvm_mips_handle_exit(vcpu);
guest_state_enter_irqoff();
return ret;
}
/* Enable FPU for guest and restore context */
void kvm_own_fpu(struct kvm_vcpu *vcpu)
{
@@ -90,6 +90,7 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
{
struct kvm_cpu_context *cntx;
struct kvm_vcpu_csr *reset_csr = &vcpu->arch.guest_reset_csr;
/* Mark this VCPU never ran */
vcpu->arch.ran_atleast_once = false;
@@ -106,6 +107,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
cntx->hstatus |= HSTATUS_SPVP;
cntx->hstatus |= HSTATUS_SPV;
/* By default, make CY, TM, and IR counters accessible in VU mode */
reset_csr->scounteren = 0x7;
/* Setup VCPU timer */
kvm_riscv_vcpu_timer_init(vcpu);
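For reference, the 0x7 default decomposes per the RISC-V privileged spec's counter-enable bit layout, where bit n of scounteren gates VU-mode access to counter n. The macro names below are illustrative only; the kernel writes the raw value:

	/* scounteren low bits (RISC-V privileged spec); names illustrative */
	#define COUNTEREN_CY	(1UL << 0)	/* cycle counter */
	#define COUNTEREN_TM	(1UL << 1)	/* time */
	#define COUNTEREN_IR	(1UL << 2)	/* instructions retired */

	/* 0x7 == COUNTEREN_CY | COUNTEREN_TM | COUNTEREN_IR */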
@@ -699,6 +703,20 @@ static void kvm_riscv_update_hvip(struct kvm_vcpu *vcpu)
csr_write(CSR_HVIP, csr->hvip);
}
/*
* Actually run the vCPU, entering an RCU extended quiescent state (EQS) while
* the vCPU is running.
*
* This must be noinstr as instrumentation may make use of RCU, and this is not
* safe during the EQS.
*/
static void noinstr kvm_riscv_vcpu_enter_exit(struct kvm_vcpu *vcpu)
{
guest_state_enter_irqoff();
__kvm_riscv_switch_to(&vcpu->arch);
guest_state_exit_irqoff();
}
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
int ret;
@@ -790,9 +808,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
continue;
}
guest_enter_irqoff();
guest_timing_enter_irqoff();
__kvm_riscv_switch_to(&vcpu->arch);
kvm_riscv_vcpu_enter_exit(vcpu);
vcpu->mode = OUTSIDE_GUEST_MODE;
vcpu->stat.exits++;
@@ -812,25 +830,21 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
kvm_riscv_vcpu_sync_interrupts(vcpu);
/*
* We may have taken a host interrupt in VS/VU-mode (i.e.
* while executing the guest). This interrupt is still
* pending, as we haven't serviced it yet!
* We must ensure that any pending interrupts are taken before
* we exit guest timing so that timer ticks are accounted as
* guest time. Transiently unmask interrupts so that any
* pending interrupts are taken.
*
* We're now back in HS-mode with interrupts disabled
* so enabling the interrupts now will have the effect
* of taking the interrupt again, in HS-mode this time.
* There's no barrier which ensures that pending interrupts are
* recognised, so we just hope that the CPU takes any pending
* interrupts between the enable and disable.
*/
local_irq_enable();
local_irq_disable();
/*
* We do local_irq_enable() before calling guest_exit() so
* that if a timer interrupt hits while running the guest
* we account that tick as being spent in the guest. We
* enable preemption after calling guest_exit() so that if
* we get preempted we make sure ticks after that is not
* counted as guest time.
*/
guest_exit();
guest_timing_exit_irqoff();
local_irq_enable();
preempt_enable();
@@ -9,6 +9,7 @@
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/kvm_host.h>
#include <linux/version.h>
#include <asm/csr.h>
#include <asm/sbi.h>
#include <asm/kvm_vcpu_timer.h>
@@ -32,7 +33,7 @@ static int kvm_sbi_ext_base_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
*out_val = KVM_SBI_IMPID;
break;
case SBI_EXT_BASE_GET_IMP_VERSION:
*out_val = 0;
*out_val = LINUX_VERSION_CODE;
break;
case SBI_EXT_BASE_PROBE_EXT:
if ((cp->a0 >= SBI_EXT_EXPERIMENTAL_START &&
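LINUX_VERSION_CODE packs the host kernel version as KERNEL_VERSION(a,b,c) = (a << 16) + (b << 8) + c (recent kernels clamp c at 255), so a guest can recover the host version from this SBI call. A minimal guest-side sketch; the function name is illustrative, not part of this change:

	static void decode_sbi_impl_version(unsigned long v)
	{
		/* e.g. a v5.17.0 host reports 0x051100 -> "5.17.0" */
		pr_info("KVM SBI implementation: Linux %lu.%lu.%lu\n",
			(v >> 16) & 0xff, (v >> 8) & 0xff, v & 0xff);
	}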
@@ -82,7 +82,7 @@ KVM_X86_OP_NULL(guest_apic_has_interrupt)
KVM_X86_OP(load_eoi_exitmap)
KVM_X86_OP(set_virtual_apic_mode)
KVM_X86_OP_NULL(set_apic_access_page_addr)
KVM_X86_OP(deliver_posted_interrupt)
KVM_X86_OP(deliver_interrupt)
KVM_X86_OP_NULL(sync_pir_to_irr)
KVM_X86_OP(set_tss_addr)
KVM_X86_OP(set_identity_map_addr)
@@ -1410,7 +1410,8 @@ struct kvm_x86_ops {
void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu);
int (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
void (*deliver_interrupt)(struct kvm_lapic *apic, int delivery_mode,
int trig_mode, int vector);
int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
@@ -554,12 +554,13 @@ void kvm_set_cpu_caps(void)
);
kvm_cpu_cap_mask(CPUID_7_0_EBX,
F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
F(BMI2) | F(ERMS) | F(INVPCID) | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | 0 /*INTEL_PT*/
);
F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) |
F(FDP_EXCPTN_ONLY) | F(SMEP) | F(BMI2) | F(ERMS) | F(INVPCID) |
F(RTM) | F(ZERO_FCS_FDS) | 0 /*MPX*/ | F(AVX512F) |
F(AVX512DQ) | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) |
F(CLFLUSHOPT) | F(CLWB) | 0 /*INTEL_PT*/ | F(AVX512PF) |
F(AVX512ER) | F(AVX512CD) | F(SHA_NI) | F(AVX512BW) |
F(AVX512VL));
kvm_cpu_cap_mask(CPUID_7_ECX,
F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
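The two flags added above correspond to CPUID.(EAX=7,ECX=0):EBX bit 6 (FDP_EXCPTN_ONLY: the x87 FPU data pointer is updated only on exceptions) and bit 13 (ZERO_FCS_FDS: the FPU CS/DS values are deprecated and read as zero), per the Intel SDM. A userspace probe might look like this sketch, using GCC/Clang's <cpuid.h> helper:

	#include <cpuid.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
			return 1;
		printf("FDP_EXCPTN_ONLY: %u\n", !!(ebx & (1u << 6)));
		printf("ZERO_FCS_FDS:    %u\n", !!(ebx & (1u << 13)));
		return 0;
	}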
@@ -1096,14 +1096,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
apic->regs + APIC_TMR);
}
if (static_call(kvm_x86_deliver_posted_interrupt)(vcpu, vector)) {
kvm_lapic_set_irr(vector, apic);
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
} else {
trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
trig_mode, vector);
}
static_call(kvm_x86_deliver_interrupt)(apic, delivery_mode,
trig_mode, vector);
break;
case APIC_DM_REMRD:
@@ -3291,6 +3291,21 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
}
static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
int trig_mode, int vector)
{
struct kvm_vcpu *vcpu = apic->vcpu;
if (svm_deliver_avic_intr(vcpu, vector)) {
kvm_lapic_set_irr(vector, apic);
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
} else {
trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
trig_mode, vector);
}
}
static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3615,7 +3630,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long vmcb_pa = svm->current_vmcb->pa;
kvm_guest_enter_irqoff();
guest_state_enter_irqoff();
if (sev_es_guest(vcpu->kvm)) {
__svm_sev_es_vcpu_run(vmcb_pa);
@@ -3635,7 +3650,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
vmload(__sme_page_pa(sd->save_area));
}
kvm_guest_exit_irqoff();
guest_state_exit_irqoff();
}
static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
@@ -4545,7 +4560,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.pmu_ops = &amd_pmu_ops,
.nested_ops = &svm_nested_ops,
.deliver_posted_interrupt = svm_deliver_avic_intr,
.deliver_interrupt = svm_deliver_interrupt,
.dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt,
.update_pi_irte = svm_update_pi_irte,
.setup_mce = svm_setup_mce,
@@ -4041,6 +4041,21 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
return 0;
}
static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
int trig_mode, int vector)
{
struct kvm_vcpu *vcpu = apic->vcpu;
if (vmx_deliver_posted_interrupt(vcpu, vector)) {
kvm_lapic_set_irr(vector, apic);
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
} else {
trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
trig_mode, vector);
}
}
/*
* Set up the vmcs's constant host-state fields, i.e., host-state fields that
* will not change in the lifetime of the guest.
@@ -6754,7 +6769,7 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
struct vcpu_vmx *vmx)
{
kvm_guest_enter_irqoff();
guest_state_enter_irqoff();
/* L1D Flush includes CPU buffer clear to mitigate MDS */
if (static_branch_unlikely(&vmx_l1d_should_flush))
@@ -6770,7 +6785,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
vcpu->arch.cr2 = native_read_cr2();
kvm_guest_exit_irqoff();
guest_state_exit_irqoff();
}
static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
@@ -7768,7 +7783,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.hwapic_isr_update = vmx_hwapic_isr_update,
.guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
.sync_pir_to_irr = vmx_sync_pir_to_irr,
.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
.deliver_interrupt = vmx_deliver_interrupt,
.dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
.set_tss_addr = vmx_set_tss_addr,
@@ -90,6 +90,8 @@
u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e))
#define emul_to_vcpu(ctxt) \
((struct kvm_vcpu *)(ctxt)->vcpu)
@@ -4340,7 +4342,7 @@ static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr)
void __user *uaddr = (void __user*)(unsigned long)attr->addr;
if ((u64)(unsigned long)uaddr != attr->addr)
return ERR_PTR(-EFAULT);
return ERR_PTR_USR(-EFAULT);
return uaddr;
}
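This is the sparse fix from the pull message: ERR_PTR() yields a plain void *, so returning it from a function whose return type carries __user mixes address spaces; the ERR_PTR_USR() macro just forces the annotation. An illustrative caller shape (the __force casts are one sparse-clean idiom, not necessarily the exact upstream caller code):

	u64 __user *uaddr = kvm_get_attr_addr(attr);

	if (IS_ERR((void __force *)uaddr))
		return PTR_ERR((void __force *)uaddr);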
@@ -10041,6 +10043,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
set_debugreg(0, 7);
}
guest_timing_enter_irqoff();
for (;;) {
/*
* Assert that vCPU vs. VM APICv state is consistent. An APICv
@@ -10125,7 +10129,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
* of accounting via context tracking, but the loss of accuracy is
* acceptable for all known use cases.
*/
vtime_account_guest_exit();
guest_timing_exit_irqoff();
if (lapic_in_kernel(vcpu)) {
s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
@@ -11639,8 +11643,6 @@ void kvm_arch_sync_events(struct kvm *kvm)
kvm_free_pit(kvm);
}
#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e))
/**
* __x86_set_memory_region: Setup KVM internal memory slot
*
@@ -10,51 +10,6 @@
void kvm_spurious_fault(void);
static __always_inline void kvm_guest_enter_irqoff(void)
{
/*
* VMENTER enables interrupts (host state), but the kernel state is
* interrupts disabled when this is invoked. Also tell RCU about
* it. This is the same logic as for exit_to_user_mode().
*
* This ensures that e.g. latency analysis on the host observes
* guest mode as interrupt enabled.
*
* guest_enter_irqoff() informs context tracking about the
* transition to guest mode and if enabled adjusts RCU state
* accordingly.
*/
instrumentation_begin();
trace_hardirqs_on_prepare();
lockdep_hardirqs_on_prepare(CALLER_ADDR0);
instrumentation_end();
guest_enter_irqoff();
lockdep_hardirqs_on(CALLER_ADDR0);
}
static __always_inline void kvm_guest_exit_irqoff(void)
{
/*
* VMEXIT disables interrupts (host state), but tracing and lockdep
* have them in state 'on' as recorded before entering guest mode.
* Same as enter_from_user_mode().
*
* context_tracking_guest_exit() restores host context and reinstates
* RCU if enabled and required.
*
* This needs to be done immediately after VM-Exit, before any code
* that might contain tracepoints or call out to the greater world,
* e.g. before x86_spec_ctrl_restore_host().
*/
lockdep_hardirqs_off(CALLER_ADDR0);
context_tracking_guest_exit();
instrumentation_begin();
trace_hardirqs_off_finish();
instrumentation_end();
}
#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \
({ \
bool failed = (consistency_check); \
@@ -29,7 +29,9 @@
#include <linux/refcount.h>
#include <linux/nospec.h>
#include <linux/notifier.h>
#include <linux/ftrace.h>
#include <linux/hashtable.h>
#include <linux/instrumentation.h>
#include <linux/interval_tree.h>
#include <linux/rbtree.h>
#include <linux/xarray.h>
@@ -368,8 +370,11 @@ struct kvm_vcpu {
u64 last_used_slot_gen;
};
/* must be called with irqs disabled */
static __always_inline void guest_enter_irqoff(void)
/*
* Start accounting time towards a guest.
* Must be called before entering guest context.
*/
static __always_inline void guest_timing_enter_irqoff(void)
{
/*
* This is running in ioctl context so it's safe to assume that it's the
@@ -378,7 +383,18 @@ static __always_inline void guest_enter_irqoff(void)
instrumentation_begin();
vtime_account_guest_enter();
instrumentation_end();
}
/*
* Enter guest context and enter an RCU extended quiescent state.
*
* Between guest_context_enter_irqoff() and guest_context_exit_irqoff() it is
* unsafe to use any code which may directly or indirectly use RCU, tracing
* (including IRQ flag tracing), or lockdep. All code in this period must be
* non-instrumentable.
*/
static __always_inline void guest_context_enter_irqoff(void)
{
/*
* KVM does not hold any references to rcu protected data when it
* switches CPU into a guest mode. In fact switching to a guest mode
@@ -394,16 +410,79 @@ static __always_inline void guest_enter_irqoff(void)
}
}
static __always_inline void guest_exit_irqoff(void)
/*
* Deprecated. Architectures should move to guest_timing_enter_irqoff() and
* guest_state_enter_irqoff().
*/
static __always_inline void guest_enter_irqoff(void)
{
guest_timing_enter_irqoff();
guest_context_enter_irqoff();
}
/**
* guest_state_enter_irqoff - Fixup state when entering a guest
*
* Entry to a guest will enable interrupts, but the kernel state is interrupts
* disabled when this is invoked. Also tell RCU about it.
*
* 1) Trace interrupts on state
* 2) Invoke context tracking if enabled to adjust RCU state
* 3) Tell lockdep that interrupts are enabled
*
* Invoked from architecture specific code before entering a guest.
* Must be called with interrupts disabled and the caller must be
* non-instrumentable.
* The caller has to invoke guest_timing_enter_irqoff() before this.
*
* Note: this is analogous to exit_to_user_mode().
*/
static __always_inline void guest_state_enter_irqoff(void)
{
instrumentation_begin();
trace_hardirqs_on_prepare();
lockdep_hardirqs_on_prepare(CALLER_ADDR0);
instrumentation_end();
guest_context_enter_irqoff();
lockdep_hardirqs_on(CALLER_ADDR0);
}
/*
* Exit guest context and exit an RCU extended quiescent state.
*
* Between guest_context_enter_irqoff() and guest_context_exit_irqoff() it is
* unsafe to use any code which may directly or indirectly use RCU, tracing
* (including IRQ flag tracing), or lockdep. All code in this period must be
* non-instrumentable.
*/
static __always_inline void guest_context_exit_irqoff(void)
{
context_tracking_guest_exit();
}
/*
* Stop accounting time towards a guest.
* Must be called after exiting guest context.
*/
static __always_inline void guest_timing_exit_irqoff(void)
{
instrumentation_begin();
/* Flush the guest cputime we spent on the guest */
vtime_account_guest_exit();
instrumentation_end();
}
/*
* Deprecated. Architectures should move to guest_state_exit_irqoff() and
* guest_timing_exit_irqoff().
*/
static __always_inline void guest_exit_irqoff(void)
{
guest_context_exit_irqoff();
guest_timing_exit_irqoff();
}
static inline void guest_exit(void)
{
unsigned long flags;
@@ -413,6 +492,33 @@ static inline void guest_exit(void)
local_irq_restore(flags);
}
/**
* guest_state_exit_irqoff - Establish state when returning from guest mode
*
* Entry from a guest disables interrupts, but guest mode is traced as
* interrupts enabled. Also with NO_HZ_FULL RCU might be idle.
*
* 1) Tell lockdep that interrupts are disabled
* 2) Invoke context tracking if enabled to reactivate RCU
* 3) Trace interrupts off state
*
* Invoked from architecture specific code after exiting a guest.
* Must be invoked with interrupts disabled and the caller must be
* non-instrumentable.
* The caller has to invoke guest_timing_exit_irqoff() after this.
*
* Note: this is analogous to enter_from_user_mode().
*/
static __always_inline void guest_state_exit_irqoff(void)
{
lockdep_hardirqs_off(CALLER_ADDR0);
guest_context_exit_irqoff();
instrumentation_begin();
trace_hardirqs_off_finish();
instrumentation_end();
}
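Taken together, the intended per-architecture shape is the one the arm64, MIPS and RISC-V hunks above converge on. A condensed sketch of that sequence; arch_run_guest() stands in for the real world-switch and is not an actual API:

	static int noinstr arch_vcpu_enter_exit(struct kvm_vcpu *vcpu)
	{
		int ret;

		guest_state_enter_irqoff();	/* enter the EQS; no RCU/tracing/lockdep */
		ret = arch_run_guest(vcpu);	/* stand-in for the world switch */
		guest_state_exit_irqoff();	/* leave the EQS */

		return ret;
	}

	/* ...called from the arch's run loop with IRQs disabled: */
	guest_timing_enter_irqoff();		/* ticks now accrue to the guest */
	ret = arch_vcpu_enter_exit(vcpu);
	local_irq_enable();			/* take pending IRQs as guest time */
	local_irq_disable();
	guest_timing_exit_irqoff();		/* stop guest-time accounting */
	local_irq_enable();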
static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
{
/*
@@ -1624,9 +1624,6 @@ struct kvm_enc_region {
#define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3)
#define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4)
/* Available with KVM_CAP_XSAVE2 */
#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave)
struct kvm_s390_pv_sec_parm {
__u64 origin;
__u64 length;
@@ -2048,4 +2045,7 @@ struct kvm_stats_desc {
#define KVM_GET_STATS_FD _IO(KVMIO, 0xce)
/* Available with KVM_CAP_XSAVE2 */
#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave)
#endif /* __LINUX_KVM_H */