Commit bce58da1 authored by Linus Torvalds

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
 "x86:

   - Account for family 17h event renumberings in AMD PMU emulation

   - Remove CPUID leaf 0xA on AMD processors

   - Fix lockdep issue with locking all vCPUs

   - Fix loss of A/D bits in SPTEs

   - Fix syzkaller issue with invalid guest state"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: VMX: Exit to userspace if vCPU has injected exception and invalid state
  KVM: SEV: Mark nested locking of vcpu->lock
  kvm: x86/cpuid: Only provide CPUID leaf 0xA if host has architectural PMU
  KVM: x86/svm: Account for family 17h event renumberings in amd_pmc_perf_hw_id
  KVM: x86/mmu: Use atomic XCHG to write TDP MMU SPTEs with volatile bits
  KVM: x86/mmu: Move shadow-present check out of spte_has_volatile_bits()
  KVM: x86/mmu: Don't treat fully writable SPTEs as volatile (modulo A/D)
parents 497fe3bb 053d2290
...@@ -887,6 +887,11 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) ...@@ -887,6 +887,11 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
union cpuid10_eax eax; union cpuid10_eax eax;
union cpuid10_edx edx; union cpuid10_edx edx;
if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
break;
}
perf_get_x86_pmu_capability(&cap); perf_get_x86_pmu_capability(&cap);
/* /*
......
...@@ -473,30 +473,6 @@ static u64 __get_spte_lockless(u64 *sptep) ...@@ -473,30 +473,6 @@ static u64 __get_spte_lockless(u64 *sptep)
} }
#endif #endif
static bool spte_has_volatile_bits(u64 spte)
{
if (!is_shadow_present_pte(spte))
return false;
/*
* Always atomically update spte if it can be updated
* out of mmu-lock, it can ensure dirty bit is not lost,
* also, it can help us to get a stable is_writable_pte()
* to ensure tlb flush is not missed.
*/
if (spte_can_locklessly_be_made_writable(spte) ||
is_access_track_spte(spte))
return true;
if (spte_ad_enabled(spte)) {
if ((spte & shadow_accessed_mask) == 0 ||
(is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
return true;
}
return false;
}
/* Rules for using mmu_spte_set: /* Rules for using mmu_spte_set:
* Set the sptep from nonpresent to present. * Set the sptep from nonpresent to present.
* Note: the sptep being assigned *must* be either not present * Note: the sptep being assigned *must* be either not present
...@@ -557,7 +533,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) ...@@ -557,7 +533,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
* we always atomically update it, see the comments in * we always atomically update it, see the comments in
* spte_has_volatile_bits(). * spte_has_volatile_bits().
*/ */
if (spte_can_locklessly_be_made_writable(old_spte) && if (is_mmu_writable_spte(old_spte) &&
!is_writable_pte(new_spte)) !is_writable_pte(new_spte))
flush = true; flush = true;
...@@ -591,7 +567,8 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) ...@@ -591,7 +567,8 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
u64 old_spte = *sptep; u64 old_spte = *sptep;
int level = sptep_to_sp(sptep)->role.level; int level = sptep_to_sp(sptep)->role.level;
if (!spte_has_volatile_bits(old_spte)) if (!is_shadow_present_pte(old_spte) ||
!spte_has_volatile_bits(old_spte))
__update_clear_spte_fast(sptep, 0ull); __update_clear_spte_fast(sptep, 0ull);
else else
old_spte = __update_clear_spte_slow(sptep, 0ull); old_spte = __update_clear_spte_slow(sptep, 0ull);
...@@ -1187,7 +1164,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) ...@@ -1187,7 +1164,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
u64 spte = *sptep; u64 spte = *sptep;
if (!is_writable_pte(spte) && if (!is_writable_pte(spte) &&
!(pt_protect && spte_can_locklessly_be_made_writable(spte))) !(pt_protect && is_mmu_writable_spte(spte)))
return false; return false;
rmap_printk("spte %p %llx\n", sptep, *sptep); rmap_printk("spte %p %llx\n", sptep, *sptep);
...@@ -3196,8 +3173,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) ...@@ -3196,8 +3173,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
* be removed in the fast path only if the SPTE was * be removed in the fast path only if the SPTE was
* write-protected for dirty-logging or access tracking. * write-protected for dirty-logging or access tracking.
*/ */
if (fault->write && if (fault->write && is_mmu_writable_spte(spte)) {
spte_can_locklessly_be_made_writable(spte)) {
new_spte |= PT_WRITABLE_MASK; new_spte |= PT_WRITABLE_MASK;
/* /*
......
...@@ -90,6 +90,34 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) ...@@ -90,6 +90,34 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
E820_TYPE_RAM); E820_TYPE_RAM);
} }
/*
* Returns true if the SPTE has bits that may be set without holding mmu_lock.
* The caller is responsible for checking if the SPTE is shadow-present, and
* for determining whether or not the caller cares about non-leaf SPTEs.
*/
bool spte_has_volatile_bits(u64 spte)
{
/*
* Always atomically update spte if it can be updated
* out of mmu-lock, it can ensure dirty bit is not lost,
* also, it can help us to get a stable is_writable_pte()
* to ensure tlb flush is not missed.
*/
if (!is_writable_pte(spte) && is_mmu_writable_spte(spte))
return true;
if (is_access_track_spte(spte))
return true;
if (spte_ad_enabled(spte)) {
if (!(spte & shadow_accessed_mask) ||
(is_writable_pte(spte) && !(spte & shadow_dirty_mask)))
return true;
}
return false;
}
bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
const struct kvm_memory_slot *slot, const struct kvm_memory_slot *slot,
unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
......
...@@ -390,7 +390,7 @@ static inline void check_spte_writable_invariants(u64 spte) ...@@ -390,7 +390,7 @@ static inline void check_spte_writable_invariants(u64 spte)
"kvm: Writable SPTE is not MMU-writable: %llx", spte); "kvm: Writable SPTE is not MMU-writable: %llx", spte);
} }
static inline bool spte_can_locklessly_be_made_writable(u64 spte) static inline bool is_mmu_writable_spte(u64 spte)
{ {
return spte & shadow_mmu_writable_mask; return spte & shadow_mmu_writable_mask;
} }
...@@ -404,6 +404,8 @@ static inline u64 get_mmio_spte_generation(u64 spte) ...@@ -404,6 +404,8 @@ static inline u64 get_mmio_spte_generation(u64 spte)
return gen; return gen;
} }
bool spte_has_volatile_bits(u64 spte);
bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
const struct kvm_memory_slot *slot, const struct kvm_memory_slot *slot,
unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#include <linux/kvm_host.h> #include <linux/kvm_host.h>
#include "mmu.h" #include "mmu.h"
#include "spte.h"
/* /*
* TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs) * TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs)
...@@ -17,9 +18,38 @@ static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep) ...@@ -17,9 +18,38 @@ static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep)
{ {
return READ_ONCE(*rcu_dereference(sptep)); return READ_ONCE(*rcu_dereference(sptep));
} }
static inline void kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 val)
static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte)
{
return xchg(rcu_dereference(sptep), new_spte);
}
static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
{
WRITE_ONCE(*rcu_dereference(sptep), new_spte);
}
static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
u64 new_spte, int level)
{ {
WRITE_ONCE(*rcu_dereference(sptep), val); /*
* Atomically write the SPTE if it is a shadow-present, leaf SPTE with
* volatile bits, i.e. has bits that can be set outside of mmu_lock.
* The Writable bit can be set by KVM's fast page fault handler, and
* Accessed and Dirty bits can be set by the CPU.
*
* Note, non-leaf SPTEs do have Accessed bits and those bits are
* technically volatile, but KVM doesn't consume the Accessed bit of
* non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This
* logic needs to be reassessed if KVM were to use non-leaf Accessed
* bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
*/
if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) &&
spte_has_volatile_bits(old_spte))
return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);
__kvm_tdp_mmu_write_spte(sptep, new_spte);
return old_spte;
} }
/* /*
......
...@@ -426,9 +426,9 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) ...@@ -426,9 +426,9 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
tdp_mmu_unlink_sp(kvm, sp, shared); tdp_mmu_unlink_sp(kvm, sp, shared);
for (i = 0; i < PT64_ENT_PER_PAGE; i++) { for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
u64 *sptep = rcu_dereference(pt) + i; tdp_ptep_t sptep = pt + i;
gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
u64 old_child_spte; u64 old_spte;
if (shared) { if (shared) {
/* /*
...@@ -440,8 +440,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) ...@@ -440,8 +440,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
* value to the removed SPTE value. * value to the removed SPTE value.
*/ */
for (;;) { for (;;) {
old_child_spte = xchg(sptep, REMOVED_SPTE); old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
if (!is_removed_spte(old_child_spte)) if (!is_removed_spte(old_spte))
break; break;
cpu_relax(); cpu_relax();
} }
...@@ -455,23 +455,43 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) ...@@ -455,23 +455,43 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
* are guarded by the memslots generation, not by being * are guarded by the memslots generation, not by being
* unreachable. * unreachable.
*/ */
old_child_spte = READ_ONCE(*sptep); old_spte = kvm_tdp_mmu_read_spte(sptep);
if (!is_shadow_present_pte(old_child_spte)) if (!is_shadow_present_pte(old_spte))
continue; continue;
/* /*
* Marking the SPTE as a removed SPTE is not * Use the common helper instead of a raw WRITE_ONCE as
* strictly necessary here as the MMU lock will * the SPTE needs to be updated atomically if it can be
* stop other threads from concurrently modifying * modified by a different vCPU outside of mmu_lock.
* this SPTE. Using the removed SPTE value keeps * Even though the parent SPTE is !PRESENT, the TLB
* the two branches consistent and simplifies * hasn't yet been flushed, and both Intel and AMD
* the function. * document that A/D assists can use upper-level PxE
*/ * entries that are cached in the TLB, i.e. the CPU can
WRITE_ONCE(*sptep, REMOVED_SPTE); * still access the page and mark it dirty.
*
* No retry is needed in the atomic update path as the
* sole concern is dropping a Dirty bit, i.e. no other
* task can zap/remove the SPTE as mmu_lock is held for
* write. Marking the SPTE as a removed SPTE is not
* strictly necessary for the same reason, but using
* the removed SPTE value keeps the shared/exclusive
* paths consistent and allows the handle_changed_spte()
* call below to hardcode the new value to REMOVED_SPTE.
*
* Note, even though dropping a Dirty bit is the only
* scenario where a non-atomic update could result in a
* functional bug, simply checking the Dirty bit isn't
* sufficient as a fast page fault could read the upper
* level SPTE before it is zapped, and then make this
* target SPTE writable, resume the guest, and set the
* Dirty bit between reading the SPTE above and writing
* it here.
*/
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
REMOVED_SPTE, level);
} }
handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
old_child_spte, REMOVED_SPTE, level, old_spte, REMOVED_SPTE, level, shared);
shared);
} }
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
...@@ -667,14 +687,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, ...@@ -667,14 +687,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
KVM_PAGES_PER_HPAGE(iter->level)); KVM_PAGES_PER_HPAGE(iter->level));
/* /*
* No other thread can overwrite the removed SPTE as they * No other thread can overwrite the removed SPTE as they must either
* must either wait on the MMU lock or use * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
* tdp_mmu_set_spte_atomic which will not overwrite the * overwrite the special removed SPTE value. No bookkeeping is needed
* special removed SPTE value. No bookkeeping is needed * here since the SPTE is going from non-present to non-present. Use
* here since the SPTE is going from non-present * the raw write helper to avoid an unnecessary check on volatile bits.
* to non-present.
*/ */
kvm_tdp_mmu_write_spte(iter->sptep, 0); __kvm_tdp_mmu_write_spte(iter->sptep, 0);
return 0; return 0;
} }
...@@ -699,8 +718,11 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, ...@@ -699,8 +718,11 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* unless performing certain dirty logging operations. * unless performing certain dirty logging operations.
* Leaving record_dirty_log unset in that case prevents page * Leaving record_dirty_log unset in that case prevents page
* writes from being double counted. * writes from being double counted.
*
* Returns the old SPTE value, which _may_ be different than @old_spte if the
* SPTE had volatile bits.
*/ */
static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
u64 old_spte, u64 new_spte, gfn_t gfn, int level, u64 old_spte, u64 new_spte, gfn_t gfn, int level,
bool record_acc_track, bool record_dirty_log) bool record_acc_track, bool record_dirty_log)
{ {
...@@ -715,7 +737,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, ...@@ -715,7 +737,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
*/ */
WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte)); WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
kvm_tdp_mmu_write_spte(sptep, new_spte); old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false); __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
...@@ -724,6 +746,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, ...@@ -724,6 +746,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
if (record_dirty_log) if (record_dirty_log)
handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
new_spte, level); new_spte, level);
return old_spte;
} }
static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
...@@ -732,8 +755,9 @@ static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, ...@@ -732,8 +755,9 @@ static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
{ {
WARN_ON_ONCE(iter->yielded); WARN_ON_ONCE(iter->yielded);
__tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte, iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
new_spte, iter->gfn, iter->level, iter->old_spte, new_spte,
iter->gfn, iter->level,
record_acc_track, record_dirty_log); record_acc_track, record_dirty_log);
} }
......
...@@ -45,6 +45,22 @@ static struct kvm_event_hw_type_mapping amd_event_mapping[] = { ...@@ -45,6 +45,22 @@ static struct kvm_event_hw_type_mapping amd_event_mapping[] = {
[7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
}; };
/* duplicated from amd_f17h_perfmon_event_map. */
static struct kvm_event_hw_type_mapping amd_f17h_event_mapping[] = {
[0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES },
[1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
[2] = { 0x60, 0xff, PERF_COUNT_HW_CACHE_REFERENCES },
[3] = { 0x64, 0x09, PERF_COUNT_HW_CACHE_MISSES },
[4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
[5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
[6] = { 0x87, 0x02, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
[7] = { 0x87, 0x01, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
};
/* amd_pmc_perf_hw_id depends on these being the same size */
static_assert(ARRAY_SIZE(amd_event_mapping) ==
ARRAY_SIZE(amd_f17h_event_mapping));
static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type) static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type)
{ {
struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
...@@ -140,6 +156,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, ...@@ -140,6 +156,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc) static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc)
{ {
struct kvm_event_hw_type_mapping *event_mapping;
u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT; u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
int i; int i;
...@@ -148,15 +165,20 @@ static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc) ...@@ -148,15 +165,20 @@ static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc)
if (WARN_ON(pmc_is_fixed(pmc))) if (WARN_ON(pmc_is_fixed(pmc)))
return PERF_COUNT_HW_MAX; return PERF_COUNT_HW_MAX;
if (guest_cpuid_family(pmc->vcpu) >= 0x17)
event_mapping = amd_f17h_event_mapping;
else
event_mapping = amd_event_mapping;
for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++) for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++)
if (amd_event_mapping[i].eventsel == event_select if (event_mapping[i].eventsel == event_select
&& amd_event_mapping[i].unit_mask == unit_mask) && event_mapping[i].unit_mask == unit_mask)
break; break;
if (i == ARRAY_SIZE(amd_event_mapping)) if (i == ARRAY_SIZE(amd_event_mapping))
return PERF_COUNT_HW_MAX; return PERF_COUNT_HW_MAX;
return amd_event_mapping[i].event_type; return event_mapping[i].event_type;
} }
/* check if a PMC is enabled by comparing it against global_ctrl bits. Because /* check if a PMC is enabled by comparing it against global_ctrl bits. Because
......
...@@ -1594,24 +1594,51 @@ static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) ...@@ -1594,24 +1594,51 @@ static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
atomic_set_release(&src_sev->migration_in_progress, 0); atomic_set_release(&src_sev->migration_in_progress, 0);
} }
/* vCPU mutex subclasses. */
enum sev_migration_role {
SEV_MIGRATION_SOURCE = 0,
SEV_MIGRATION_TARGET,
SEV_NR_MIGRATION_ROLES,
};
static int sev_lock_vcpus_for_migration(struct kvm *kvm) static int sev_lock_vcpus_for_migration(struct kvm *kvm,
enum sev_migration_role role)
{ {
struct kvm_vcpu *vcpu; struct kvm_vcpu *vcpu;
unsigned long i, j; unsigned long i, j;
bool first = true;
kvm_for_each_vcpu(i, vcpu, kvm) { kvm_for_each_vcpu(i, vcpu, kvm) {
if (mutex_lock_killable(&vcpu->mutex)) if (mutex_lock_killable_nested(&vcpu->mutex, role))
goto out_unlock; goto out_unlock;
if (first) {
/*
* Reset the role to one that avoids colliding with
* the role used for the first vcpu mutex.
*/
role = SEV_NR_MIGRATION_ROLES;
first = false;
} else {
mutex_release(&vcpu->mutex.dep_map, _THIS_IP_);
}
} }
return 0; return 0;
out_unlock: out_unlock:
first = true;
kvm_for_each_vcpu(j, vcpu, kvm) { kvm_for_each_vcpu(j, vcpu, kvm) {
if (i == j) if (i == j)
break; break;
if (first)
first = false;
else
mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_);
mutex_unlock(&vcpu->mutex); mutex_unlock(&vcpu->mutex);
} }
return -EINTR; return -EINTR;
...@@ -1621,8 +1648,15 @@ static void sev_unlock_vcpus_for_migration(struct kvm *kvm) ...@@ -1621,8 +1648,15 @@ static void sev_unlock_vcpus_for_migration(struct kvm *kvm)
{ {
struct kvm_vcpu *vcpu; struct kvm_vcpu *vcpu;
unsigned long i; unsigned long i;
bool first = true;
kvm_for_each_vcpu(i, vcpu, kvm) { kvm_for_each_vcpu(i, vcpu, kvm) {
if (first)
first = false;
else
mutex_acquire(&vcpu->mutex.dep_map,
SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_);
mutex_unlock(&vcpu->mutex); mutex_unlock(&vcpu->mutex);
} }
} }
...@@ -1748,10 +1782,10 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) ...@@ -1748,10 +1782,10 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
charged = true; charged = true;
} }
ret = sev_lock_vcpus_for_migration(kvm); ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE);
if (ret) if (ret)
goto out_dst_cgroup; goto out_dst_cgroup;
ret = sev_lock_vcpus_for_migration(source_kvm); ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET);
if (ret) if (ret)
goto out_dst_vcpu; goto out_dst_vcpu;
......
...@@ -5472,7 +5472,7 @@ static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) ...@@ -5472,7 +5472,7 @@ static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu);
return vmx->emulation_required && !vmx->rmode.vm86_active && return vmx->emulation_required && !vmx->rmode.vm86_active &&
vcpu->arch.exception.pending; (vcpu->arch.exception.pending || vcpu->arch.exception.injected);
} }
static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment