Commit 855fb038 authored by Paolo Bonzini

Merge remote-tracking branch 'kvm/master' into HEAD

Pick commit fdba608f ("KVM: VMX: Wake vCPU when delivering posted
IRQ even if vCPU == this vCPU").  In addition to fixing a bug, it
also aligns the non-nested and nested usage of triggering posted
interrupts, allowing for additional cleanups.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
parents 5a213b92 fdba608f
@@ -2413,8 +2413,12 @@
             Default is 1 (enabled)

     kvm-intel.emulate_invalid_guest_state=
-            [KVM,Intel] Enable emulation of invalid guest states
-            Default is 0 (disabled)
+            [KVM,Intel] Disable emulation of invalid guest state.
+            Ignored if kvm-intel.enable_unrestricted_guest=1, as
+            guest state is never invalid for unrestricted guests.
+            This param doesn't apply to nested guests (L2), as KVM
+            never emulates invalid L2 guest state.
+            Default is 1 (enabled)

     kvm-intel.flexpriority=
             [KVM,Intel] Disable FlexPriority feature (TPR shadow).
...
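For reference, the parameter documented above is consumed by the kvm_intel module, so the new default can still be overridden; a minimal illustration (values are an example, not part of this merge):

    kvm-intel.emulate_invalid_guest_state=0              # on the kernel command line
    modprobe kvm_intel emulate_invalid_guest_state=0     # when loading the module by hand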
@@ -47,6 +47,7 @@ KVM_X86_OP(set_dr7)
 KVM_X86_OP(cache_reg)
 KVM_X86_OP(get_rflags)
 KVM_X86_OP(set_rflags)
+KVM_X86_OP(get_if_flag)
 KVM_X86_OP(tlb_flush_all)
 KVM_X86_OP(tlb_flush_current)
 KVM_X86_OP_NULL(tlb_remote_flush)
...
@@ -97,7 +97,7 @@
     KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQ_TLB_FLUSH_CURRENT   KVM_ARCH_REQ(26)
 #define KVM_REQ_TLB_FLUSH_GUEST \
-    KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP)
+    KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQ_APF_READY           KVM_ARCH_REQ(28)
 #define KVM_REQ_MSR_FILTER_CHANGED  KVM_ARCH_REQ(29)
 #define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \
@@ -1354,6 +1354,7 @@ struct kvm_x86_ops {
     void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
     unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
     void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+    bool (*get_if_flag)(struct kvm_vcpu *vcpu);

     void (*tlb_flush_all)(struct kvm_vcpu *vcpu);
     void (*tlb_flush_current)(struct kvm_vcpu *vcpu);
...
@@ -1923,11 +1923,13 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool

         all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;

+        if (all_cpus)
+            goto check_and_send_ipi;
+
         if (!sparse_banks_len)
             goto ret_success;

-        if (!all_cpus &&
-            kvm_read_guest(kvm,
+        if (kvm_read_guest(kvm,
                            hc->ingpa + offsetof(struct hv_send_ipi_ex,
                                                 vp_set.bank_contents),
                            sparse_banks,
@@ -1935,6 +1937,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
             return HV_STATUS_INVALID_HYPERCALL_INPUT;
     }

+check_and_send_ipi:
     if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
         return HV_STATUS_INVALID_HYPERCALL_INPUT;
...
@@ -3971,7 +3971,21 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
                 struct kvm_page_fault *fault, int mmu_seq)
 {
-    if (is_obsolete_sp(vcpu->kvm, to_shadow_page(vcpu->arch.mmu->root_hpa)))
+    struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root_hpa);
+
+    /* Special roots, e.g. pae_root, are not backed by shadow pages. */
+    if (sp && is_obsolete_sp(vcpu->kvm, sp))
+        return true;
+
+    /*
+     * Roots without an associated shadow page are considered invalid if
+     * there is a pending request to free obsolete roots. The request is
+     * only a hint that the current root _may_ be obsolete and needs to be
+     * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
+     * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
+     * to reload even if no vCPU is actively using the root.
+     */
+    if (!sp && kvm_test_request(KVM_REQ_MMU_RELOAD, vcpu))
         return true;

     return fault->slot &&
...
@@ -26,6 +26,7 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level)
  */
 void tdp_iter_restart(struct tdp_iter *iter)
 {
+    iter->yielded = false;
     iter->yielded_gfn = iter->next_last_level_gfn;
     iter->level = iter->root_level;
@@ -160,6 +161,11 @@ static bool try_step_up(struct tdp_iter *iter)
  */
 void tdp_iter_next(struct tdp_iter *iter)
 {
+    if (iter->yielded) {
+        tdp_iter_restart(iter);
+        return;
+    }
+
     if (try_step_down(iter))
         return;
...
@@ -45,6 +45,12 @@ struct tdp_iter {
      * iterator walks off the end of the paging structure.
      */
     bool valid;
+    /*
+     * True if KVM dropped mmu_lock and yielded in the middle of a walk, in
+     * which case tdp_iter_next() needs to restart the walk at the root
+     * level instead of advancing to the next entry.
+     */
+    bool yielded;
 };

 /*
...
@@ -502,6 +502,8 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
                        struct tdp_iter *iter,
                        u64 new_spte)
 {
+    WARN_ON_ONCE(iter->yielded);
+
     lockdep_assert_held_read(&kvm->mmu_lock);

     /*
@@ -575,6 +577,8 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
                       u64 new_spte, bool record_acc_track,
                       bool record_dirty_log)
 {
+    WARN_ON_ONCE(iter->yielded);
+
     lockdep_assert_held_write(&kvm->mmu_lock);

     /*
@@ -640,18 +644,19 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
  * If this function should yield and flush is set, it will perform a remote
  * TLB flush before yielding.
  *
- * If this function yields, it will also reset the tdp_iter's walk over the
- * paging structure and the calling function should skip to the next
- * iteration to allow the iterator to continue its traversal from the
- * paging structure root.
+ * If this function yields, iter->yielded is set and the caller must skip to
+ * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
+ * over the paging structures to allow the iterator to continue its traversal
+ * from the paging structure root.
  *
- * Return true if this function yielded and the iterator's traversal was reset.
- * Return false if a yield was not needed.
+ * Returns true if this function yielded.
  */
-static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
-                         struct tdp_iter *iter, bool flush,
-                         bool shared)
+static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
+                              struct tdp_iter *iter,
+                              bool flush, bool shared)
 {
+    WARN_ON(iter->yielded);
+
     /* Ensure forward progress has been made before yielding. */
     if (iter->next_last_level_gfn == iter->yielded_gfn)
         return false;
@@ -671,12 +676,10 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
         WARN_ON(iter->gfn > iter->next_last_level_gfn);

-        tdp_iter_restart(iter);
-
-        return true;
+        iter->yielded = true;
     }

-    return false;
+    return iter->yielded;
 }

 /*
...
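The contract spelled out in the comment above is easiest to see from the caller's side. Below is a minimal, illustrative sketch of a TDP MMU walk loop honoring iter->yielded; the loop macro and local variables are simplified stand-ins, not code from this merge:

    /*
     * Illustrative only: if the resched helper yields, iter.yielded is set,
     * the caller simply continues, and tdp_iter_next() restarts the walk
     * from the root on the next iteration.
     */
    for_each_tdp_pte(iter, root, start, end) {
        if (can_yield &&
            tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
            flush = false;
            continue;
        }

        /* ... inspect iter.old_spte and/or install a new SPTE ... */
    }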
@@ -1594,6 +1594,15 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
     to_svm(vcpu)->vmcb->save.rflags = rflags;
 }

+static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
+{
+    struct vmcb *vmcb = to_svm(vcpu)->vmcb;
+
+    return sev_es_guest(vcpu->kvm)
+        ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
+        : kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
+}
+
 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
 {
     kvm_register_mark_available(vcpu, reg);
@@ -3583,14 +3592,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
     if (!gif_set(svm))
         return true;

-    if (sev_es_guest(vcpu->kvm)) {
-        /*
-         * SEV-ES guests to not expose RFLAGS. Use the VMCB interrupt mask
-         * bit to determine the state of the IF flag.
-         */
-        if (!(vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK))
-            return true;
-    } else if (is_guest_mode(vcpu)) {
+    if (is_guest_mode(vcpu)) {
         /* As long as interrupts are being delivered... */
         if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
             ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
@@ -3601,7 +3603,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
         if (nested_exit_on_intr(svm))
             return false;
     } else {
-        if (!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
+        if (!svm_get_if_flag(vcpu))
             return true;
     }
@@ -4634,6 +4636,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
     .cache_reg = svm_cache_reg,
     .get_rflags = svm_get_rflags,
     .set_rflags = svm_set_rflags,
+    .get_if_flag = svm_get_if_flag,

     .tlb_flush_all = svm_flush_tlb,
     .tlb_flush_current = svm_flush_tlb,
...
@@ -1372,6 +1372,11 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
         vmx->emulation_required = vmx_emulation_required(vcpu);
 }

+static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
+{
+    return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
+}
+
 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
 {
     u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
@@ -3995,8 +4000,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
      * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
      * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
      */
-    if (vcpu != kvm_get_running_vcpu() &&
-        !kvm_vcpu_trigger_posted_interrupt(vcpu, false))
+    if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
         kvm_vcpu_kick(vcpu);

     return 0;
@@ -5921,18 +5925,14 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
         vmx_flush_pml_buffer(vcpu);

     /*
-     * We should never reach this point with a pending nested VM-Enter, and
-     * more specifically emulation of L2 due to invalid guest state (see
-     * below) should never happen as that means we incorrectly allowed a
-     * nested VM-Enter with an invalid vmcs12.
+     * KVM should never reach this point with a pending nested VM-Enter.
+     * More specifically, short-circuiting VM-Entry to emulate L2 due to
+     * invalid guest state should never happen as that means KVM knowingly
+     * allowed a nested VM-Enter with an invalid vmcs12. More below.
      */
     if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
         return -EIO;

-    /* If guest state is invalid, start emulating */
-    if (vmx->emulation_required)
-        return handle_invalid_guest_state(vcpu);
-
     if (is_guest_mode(vcpu)) {
         /*
          * PML is never enabled when running L2, bail immediately if a
@@ -5954,10 +5954,30 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
          */
         nested_mark_vmcs12_pages_dirty(vcpu);

+        /*
+         * Synthesize a triple fault if L2 state is invalid. In normal
+         * operation, nested VM-Enter rejects any attempt to enter L2
+         * with invalid state. However, those checks are skipped if
+         * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If
+         * L2 state is invalid, it means either L1 modified SMRAM state
+         * or userspace provided bad state. Synthesize TRIPLE_FAULT as
+         * doing so is architecturally allowed in the RSM case, and is
+         * the least awful solution for the userspace case without
+         * risking false positives.
+         */
+        if (vmx->emulation_required) {
+            nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
+            return 1;
+        }
+
         if (nested_vmx_reflect_vmexit(vcpu))
             return 1;
     }

+    /* If guest state is invalid, start emulating. L2 is handled above. */
+    if (vmx->emulation_required)
+        return handle_invalid_guest_state(vcpu);
+
     if (exit_reason.failed_vmentry) {
         dump_vmcs(vcpu);
         vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -6652,9 +6672,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
      * consistency check VM-Exit due to invalid guest state and bail.
      */
     if (unlikely(vmx->emulation_required)) {
-
-        /* We don't emulate invalid state of a nested guest */
-        vmx->fail = is_guest_mode(vcpu);
+        vmx->fail = 0;

         vmx->exit_reason.full = EXIT_REASON_INVALID_STATE;
         vmx->exit_reason.failed_vmentry = 1;
@@ -7609,6 +7627,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
     .cache_reg = vmx_cache_reg,
     .get_rflags = vmx_get_rflags,
     .set_rflags = vmx_set_rflags,
+    .get_if_flag = vmx_get_if_flag,

     .tlb_flush_all = vmx_flush_tlb_all,
     .tlb_flush_current = vmx_flush_tlb_current,
...
@@ -906,7 +906,8 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
         !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
         return 1;

-    if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
+    if (!(cr0 & X86_CR0_PG) &&
+        (is_64_bit_mode(vcpu) || kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)))
         return 1;

     static_call(kvm_x86_set_cr0)(vcpu, cr0);
@@ -1343,7 +1344,7 @@ static const u32 msrs_to_save_all[] = {
     MSR_IA32_UMWAIT_CONTROL,

     MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
-    MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3,
+    MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
     MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
     MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
     MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
@@ -3424,7 +3425,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
         if (!msr_info->host_initiated)
             return 1;
-        if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent))
+        if (kvm_get_msr_feature(&msr_ent))
             return 1;
         if (data & ~msr_ent.data)
             return 1;
@@ -7144,7 +7145,13 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
                unsigned short port, void *val, unsigned int count)
 {
     if (vcpu->arch.pio.count) {
-        /* Complete previous iteration. */
+        /*
+         * Complete a previous iteration that required userspace I/O.
+         * Note, @count isn't guaranteed to match pio.count as userspace
+         * can modify ECX before rerunning the vCPU. Ignore any such
+         * shenanigans as KVM doesn't support modifying the rep count,
+         * and the emulator ensures @count doesn't overflow the buffer.
+         */
     } else {
         int r = __emulator_pio_in(vcpu, size, port, count);
         if (!r)
@@ -7153,7 +7160,6 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
         /* Results already available, fall through. */
     }

-    WARN_ON(count != vcpu->arch.pio.count);
     complete_emulator_pio_in(vcpu, val);
     return 1;
 }
@@ -9043,14 +9049,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
 {
     struct kvm_run *kvm_run = vcpu->run;

-    /*
-     * if_flag is obsolete and useless, so do not bother
-     * setting it for SEV-ES guests. Userspace can just
-     * use kvm_run->ready_for_interrupt_injection.
-     */
-    kvm_run->if_flag = !vcpu->arch.guest_state_protected
-        && (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
-
+    kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu);
     kvm_run->cr8 = kvm_get_cr8(vcpu);
     kvm_run->apic_base = kvm_get_apic_base(vcpu);
...
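As background for the post_kvm_run_save() change above: kvm_run->if_flag is part of the per-vCPU kvm_run structure that userspace maps and reads after KVM_RUN. A rough, self-contained sketch of that consumer side follows; it is illustrative only (no guest memory or register setup, error handling omitted) and is not part of this merge:

    /* Hedged illustration of a userspace consumer of kvm_run->if_flag. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <linux/kvm.h>

    int main(void)
    {
        int kvm = open("/dev/kvm", O_RDWR);
        int vm = ioctl(kvm, KVM_CREATE_VM, 0);
        int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);
        long mmap_size = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, 0);
        struct kvm_run *run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
                                   MAP_SHARED, vcpu, 0);

        /* A real VMM sets up memory and registers before this point. */
        ioctl(vcpu, KVM_RUN, 0);

        /* if_flag mirrors the guest's interrupt-enable state at the exit. */
        printf("exit_reason=%u if_flag=%u\n", run->exit_reason, run->if_flag);
        return 0;
    }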
@@ -30,10 +30,12 @@
 /x86_64/svm_int_ctl_test
 /x86_64/sync_regs_test
 /x86_64/tsc_msrs_test
+/x86_64/userspace_io_test
 /x86_64/userspace_msr_exit_test
 /x86_64/vmx_apic_access_test
 /x86_64/vmx_close_while_nested_test
 /x86_64/vmx_dirty_log_test
+/x86_64/vmx_invalid_nested_guest_state
 /x86_64/vmx_preemption_timer_test
 /x86_64/vmx_set_nested_state_test
 /x86_64/vmx_tsc_adjust_test
...
@@ -59,10 +59,12 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test
 TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test
 TEST_GEN_PROGS_x86_64 += x86_64/svm_int_ctl_test
 TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test
+TEST_GEN_PROGS_x86_64 += x86_64/userspace_io_test
 TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test
...
@@ -321,6 +321,7 @@ bool vm_is_unrestricted_guest(struct kvm_vm *vm);
 unsigned int vm_get_page_size(struct kvm_vm *vm);
 unsigned int vm_get_page_shift(struct kvm_vm *vm);
+unsigned long vm_compute_max_gfn(struct kvm_vm *vm);
 uint64_t vm_get_max_gfn(struct kvm_vm *vm);
 int vm_get_fd(struct kvm_vm *vm);
...
@@ -302,7 +302,7 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
                     (1ULL << (vm->va_bits - 1)) >> vm->page_shift);

     /* Limit physical addresses to PA-bits. */
-    vm->max_gfn = ((1ULL << vm->pa_bits) >> vm->page_shift) - 1;
+    vm->max_gfn = vm_compute_max_gfn(vm);

     /* Allocate and setup memory for guest. */
     vm->vpages_mapped = sparsebit_alloc();
@@ -2328,6 +2328,11 @@ unsigned int vm_get_page_shift(struct kvm_vm *vm)
     return vm->page_shift;
 }

+unsigned long __attribute__((weak)) vm_compute_max_gfn(struct kvm_vm *vm)
+{
+    return ((1ULL << vm->pa_bits) >> vm->page_shift) - 1;
+}
+
 uint64_t vm_get_max_gfn(struct kvm_vm *vm)
 {
     return vm->max_gfn;
...
@@ -1431,3 +1431,71 @@ struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpui

     return cpuid;
 }
+
+#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541
+#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163
+#define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65
+
+static inline unsigned x86_family(unsigned int eax)
+{
+    unsigned int x86;
+
+    x86 = (eax >> 8) & 0xf;
+
+    if (x86 == 0xf)
+        x86 += (eax >> 20) & 0xff;
+
+    return x86;
+}
+
+unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
+{
+    const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */
+    unsigned long ht_gfn, max_gfn, max_pfn;
+    uint32_t eax, ebx, ecx, edx, max_ext_leaf;
+
+    max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1;
+
+    /* Avoid reserved HyperTransport region on AMD processors. */
+    eax = ecx = 0;
+    cpuid(&eax, &ebx, &ecx, &edx);
+    if (ebx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx ||
+        ecx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx ||
+        edx != X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
+        return max_gfn;
+
+    /* On parts with <40 physical address bits, the area is fully hidden */
+    if (vm->pa_bits < 40)
+        return max_gfn;
+
+    /* Before family 17h, the HyperTransport area is just below 1T. */
+    ht_gfn = (1 << 28) - num_ht_pages;
+    eax = 1;
+    cpuid(&eax, &ebx, &ecx, &edx);
+    if (x86_family(eax) < 0x17)
+        goto done;
+
+    /*
+     * Otherwise it's at the top of the physical address space, possibly
+     * reduced due to SME by bits 11:6 of CPUID[0x8000001f].EBX. Use
+     * the old conservative value if MAXPHYADDR is not enumerated.
+     */
+    eax = 0x80000000;
+    cpuid(&eax, &ebx, &ecx, &edx);
+    max_ext_leaf = eax;
+    if (max_ext_leaf < 0x80000008)
+        goto done;
+
+    eax = 0x80000008;
+    cpuid(&eax, &ebx, &ecx, &edx);
+    max_pfn = (1ULL << ((eax & 0xff) - vm->page_shift)) - 1;
+    if (max_ext_leaf >= 0x8000001f) {
+        eax = 0x8000001f;
+        cpuid(&eax, &ebx, &ecx, &edx);
+        max_pfn >>= (ebx >> 6) & 0x3f;
+    }
+
+    ht_gfn = max_pfn - num_ht_pages;
+done:
+    return min(max_gfn, ht_gfn - 1);
+}
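To make the arithmetic above concrete, here is a worked example with assumed inputs (a family 17h or later AMD part, 48 physical address bits, 4 KiB pages, no SME reduction; the numbers are illustrative only, not from the patch):

    max_gfn      = (1ULL << (48 - 12)) - 1  = 0xFFFFFFFFF
    num_ht_pages = 12 << (30 - 12)          = 0x300000     (12 GiB of 4 KiB pages)
    max_pfn      = (1ULL << (48 - 12)) - 1  = 0xFFFFFFFFF  (CPUID 0x80000008.EAX[7:0] = 48)
    ht_gfn       = max_pfn - num_ht_pages   = 0xFFFCFFFFF
    returned     = min(max_gfn, ht_gfn - 1) = 0xFFFCFFFFE

i.e. the selftest library stops allocating guest memory slightly more than 12 GiB below the top of the guest physical address space.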
@@ -75,7 +75,7 @@ static void l1_guest_code(struct svm_test_data *svm)
     vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;

     /* No intercepts for real and virtual interrupts */
-    vmcb->control.intercept &= ~(1ULL << INTERCEPT_INTR | INTERCEPT_VINTR);
+    vmcb->control.intercept &= ~(BIT(INTERCEPT_INTR) | BIT(INTERCEPT_VINTR));

     /* Make a virtual interrupt VINTR_IRQ_NUMBER pending */
     vmcb->control.int_ctl |= V_IRQ_MASK | (0x1 << V_INTR_PRIO_SHIFT);
...
// SPDX-License-Identifier: GPL-2.0
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>

#include "test_util.h"

#include "kvm_util.h"
#include "processor.h"

#define VCPU_ID 1

static void guest_ins_port80(uint8_t *buffer, unsigned int count)
{
    unsigned long end;

    if (count == 2)
        end = (unsigned long)buffer + 1;
    else
        end = (unsigned long)buffer + 8192;

    asm volatile("cld; rep; insb" : "+D"(buffer), "+c"(count) : "d"(0x80) : "memory");

    GUEST_ASSERT_1(count == 0, count);
    GUEST_ASSERT_2((unsigned long)buffer == end, buffer, end);
}

static void guest_code(void)
{
    uint8_t buffer[8192];
    int i;

    /*
     * Special case tests. main() will adjust RCX 2 => 1 and 3 => 8192 to
     * test that KVM doesn't explode when userspace modifies the "count" on
     * a userspace I/O exit. KVM isn't required to play nice with the I/O
     * itself as KVM doesn't support manipulating the count, it just needs
     * to not explode or overflow a buffer.
     */
    guest_ins_port80(buffer, 2);
    guest_ins_port80(buffer, 3);

    /* Verify KVM fills the buffer correctly when not stuffing RCX. */
    memset(buffer, 0, sizeof(buffer));
    guest_ins_port80(buffer, 8192);
    for (i = 0; i < 8192; i++)
        GUEST_ASSERT_2(buffer[i] == 0xaa, i, buffer[i]);

    GUEST_DONE();
}

int main(int argc, char *argv[])
{
    struct kvm_regs regs;
    struct kvm_run *run;
    struct kvm_vm *vm;
    struct ucall uc;
    int rc;

    /* Tell stdout not to buffer its content */
    setbuf(stdout, NULL);

    /* Create VM */
    vm = vm_create_default(VCPU_ID, 0, guest_code);
    run = vcpu_state(vm, VCPU_ID);

    memset(&regs, 0, sizeof(regs));

    while (1) {
        rc = _vcpu_run(vm, VCPU_ID);

        TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc);
        TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
                    "Unexpected exit reason: %u (%s),\n",
                    run->exit_reason,
                    exit_reason_str(run->exit_reason));

        if (get_ucall(vm, VCPU_ID, &uc))
            break;

        TEST_ASSERT(run->io.port == 0x80,
                    "Expected I/O at port 0x80, got port 0x%x\n", run->io.port);

        /*
         * Modify the rep string count in RCX: 2 => 1 and 3 => 8192.
         * Note, this abuses KVM's batching of rep string I/O to avoid
         * getting stuck in an infinite loop. That behavior isn't in
         * scope from a testing perspective as it's not ABI in any way,
         * i.e. it really is abusing internal KVM knowledge.
         */
        vcpu_regs_get(vm, VCPU_ID, &regs);
        if (regs.rcx == 2)
            regs.rcx = 1;
        if (regs.rcx == 3)
            regs.rcx = 8192;
        memset((void *)run + run->io.data_offset, 0xaa, 4096);
        vcpu_regs_set(vm, VCPU_ID, &regs);
    }

    switch (uc.cmd) {
    case UCALL_DONE:
        break;
    case UCALL_ABORT:
        TEST_FAIL("%s at %s:%ld : argN+1 = 0x%lx, argN+2 = 0x%lx",
                  (const char *)uc.args[0], __FILE__, uc.args[1],
                  uc.args[2], uc.args[3]);
    default:
        TEST_FAIL("Unknown ucall %lu", uc.cmd);
    }

    kvm_vm_free(vm);
    return 0;
}
// SPDX-License-Identifier: GPL-2.0-only
#include "test_util.h"
#include "kvm_util.h"
#include "processor.h"
#include "vmx.h"

#include <string.h>
#include <sys/ioctl.h>

#include "kselftest.h"

#define VCPU_ID 0
#define ARBITRARY_IO_PORT 0x2000

static struct kvm_vm *vm;

static void l2_guest_code(void)
{
    /*
     * Generate an exit to L0 userspace, i.e. main(), via I/O to an
     * arbitrary port.
     */
    asm volatile("inb %%dx, %%al"
                 : : [port] "d" (ARBITRARY_IO_PORT) : "rax");
}

static void l1_guest_code(struct vmx_pages *vmx_pages)
{
#define L2_GUEST_STACK_SIZE 64
    unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];

    GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
    GUEST_ASSERT(load_vmcs(vmx_pages));

    /* Prepare the VMCS for L2 execution. */
    prepare_vmcs(vmx_pages, l2_guest_code,
                 &l2_guest_stack[L2_GUEST_STACK_SIZE]);

    /*
     * L2 must be run without unrestricted guest, verify that the selftests
     * library hasn't enabled it. Because KVM selftests jump directly to
     * 64-bit mode, unrestricted guest support isn't required.
     */
    GUEST_ASSERT(!(vmreadz(CPU_BASED_VM_EXEC_CONTROL) & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) ||
                 !(vmreadz(SECONDARY_VM_EXEC_CONTROL) & SECONDARY_EXEC_UNRESTRICTED_GUEST));

    GUEST_ASSERT(!vmlaunch());

    /* L2 should triple fault after main() stuffs invalid guest state. */
    GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_TRIPLE_FAULT);
    GUEST_DONE();
}

int main(int argc, char *argv[])
{
    vm_vaddr_t vmx_pages_gva;
    struct kvm_sregs sregs;
    struct kvm_run *run;
    struct ucall uc;

    nested_vmx_check_supported();

    vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code);

    /* Allocate VMX pages and shared descriptors (vmx_pages). */
    vcpu_alloc_vmx(vm, &vmx_pages_gva);
    vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);

    vcpu_run(vm, VCPU_ID);

    run = vcpu_state(vm, VCPU_ID);

    /*
     * The first exit to L0 userspace should be an I/O access from L2.
     * Running L1 should launch L2 without triggering an exit to userspace.
     */
    TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
                "Expected KVM_EXIT_IO, got: %u (%s)\n",
                run->exit_reason, exit_reason_str(run->exit_reason));

    TEST_ASSERT(run->io.port == ARBITRARY_IO_PORT,
                "Expected IN from port %d from L2, got port %d",
                ARBITRARY_IO_PORT, run->io.port);

    /*
     * Stuff invalid guest state for L2 by making TR unusable. The next
     * KVM_RUN should induce a TRIPLE_FAULT in L2 as KVM doesn't support
     * emulating invalid guest state for L2.
     */
    memset(&sregs, 0, sizeof(sregs));
    vcpu_sregs_get(vm, VCPU_ID, &sregs);
    sregs.tr.unusable = 1;
    vcpu_sregs_set(vm, VCPU_ID, &sregs);

    vcpu_run(vm, VCPU_ID);

    switch (get_ucall(vm, VCPU_ID, &uc)) {
    case UCALL_DONE:
        break;
    case UCALL_ABORT:
        TEST_FAIL("%s", (const char *)uc.args[0]);
    default:
        TEST_FAIL("Unexpected ucall: %lu", uc.cmd);
    }
}
@@ -110,22 +110,5 @@ int main(int argc, char *argv[])
     ret = _vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, PMU_CAP_LBR_FMT);
     TEST_ASSERT(ret == 0, "Bad PERF_CAPABILITIES didn't fail.");

-    /* testcase 4, set capabilities when we don't have PDCM bit */
-    entry_1_0->ecx &= ~X86_FEATURE_PDCM;
-    vcpu_set_cpuid(vm, VCPU_ID, cpuid);
-    ret = _vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities);
-    TEST_ASSERT(ret == 0, "Bad PERF_CAPABILITIES didn't fail.");
-
-    /* testcase 5, set capabilities when we don't have PMU version bits */
-    entry_1_0->ecx |= X86_FEATURE_PDCM;
-    eax.split.version_id = 0;
-    entry_1_0->ecx = eax.full;
-    vcpu_set_cpuid(vm, VCPU_ID, cpuid);
-    ret = _vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, PMU_CAP_FW_WRITES);
-    TEST_ASSERT(ret == 0, "Bad PERF_CAPABILITIES didn't fail.");
-
-    vcpu_set_msr(vm, 0, MSR_IA32_PERF_CAPABILITIES, 0);
-    ASSERT_EQ(vcpu_get_msr(vm, VCPU_ID, MSR_IA32_PERF_CAPABILITIES), 0);
-
     kvm_vm_free(vm);
 }