Commit e649b3f0 authored by Eiichi Tsukata's avatar Eiichi Tsukata Committed by Paolo Bonzini

KVM: x86: Fix APIC page invalidation race

Commit b1394e74 ("KVM: x86: fix APIC page invalidation") tried
to fix inappropriate APIC page invalidation by re-introducing arch
specific kvm_arch_mmu_notifier_invalidate_range() and calling it from
kvm_mmu_notifier_invalidate_range_start. However, the patch left a
possible race where the VMCS APIC address cache is updated *before*
it is unmapped:

  (Invalidator) kvm_mmu_notifier_invalidate_range_start()
  (Invalidator) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD)
  (KVM VCPU) vcpu_enter_guest()
  (KVM VCPU) kvm_vcpu_reload_apic_access_page()
  (Invalidator) actually unmap page

Because of the above race, there can be a mismatch between the
host physical address stored in the APIC_ACCESS_PAGE VMCS field and
the host physical address stored in the EPT entry for the APIC GPA
(0xfee0000).  When this happens, the processor will not trap APIC
accesses, and will instead show the raw contents of the APIC-access page.
Because Windows OS periodically checks for unexpected modifications to
the LAPIC register, this will show up as a BSOD crash with BugCheck
CRITICAL_STRUCTURE_CORRUPTION (109) we are currently seeing in
https://bugzilla.redhat.com/show_bug.cgi?id=1751017.

The root cause of the issue is that kvm_arch_mmu_notifier_invalidate_range()
cannot guarantee that no additional references are taken to the pages in
the range before kvm_mmu_notifier_invalidate_range_end().  Fortunately,
this case is supported by the MMU notifier API, as documented in
include/linux/mmu_notifier.h:

	 * If the subsystem
         * can't guarantee that no additional references are taken to
         * the pages in the range, it has to implement the
         * invalidate_range() notifier to remove any references taken
         * after invalidate_range_start().

The fix therefore is to reload the APIC-access page field in the VMCS
from kvm_mmu_notifier_invalidate_range() instead of ..._range_start().

Cc: stable@vger.kernel.org
Fixes: b1394e74 ("KVM: x86: fix APIC page invalidation")
Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=197951Signed-off-by: default avatarEiichi Tsukata <eiichi.tsukata@nutanix.com>
Message-Id: <20200606042627.61070-1-eiichi.tsukata@nutanix.com>
Signed-off-by: default avatarPaolo Bonzini <pbonzini@redhat.com>
parent fb7333df
...@@ -8270,9 +8270,8 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) ...@@ -8270,9 +8270,8 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap); kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap);
} }
int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
unsigned long start, unsigned long end, unsigned long start, unsigned long end)
bool blockable)
{ {
unsigned long apic_address; unsigned long apic_address;
...@@ -8283,8 +8282,6 @@ int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, ...@@ -8283,8 +8282,6 @@ int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
if (start <= apic_address && apic_address < end) if (start <= apic_address && apic_address < end)
kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
return 0;
} }
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
......
...@@ -1420,8 +1420,8 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, ...@@ -1420,8 +1420,8 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp,
} }
#endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */
int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
unsigned long start, unsigned long end, bool blockable); unsigned long start, unsigned long end);
#ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE
int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu);
......
...@@ -155,10 +155,9 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); ...@@ -155,10 +155,9 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
static unsigned long long kvm_createvm_count; static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms; static unsigned long long kvm_active_vms;
__weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
unsigned long start, unsigned long end, bool blockable) unsigned long start, unsigned long end)
{ {
return 0;
} }
bool kvm_is_zone_device_pfn(kvm_pfn_t pfn) bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
...@@ -384,6 +383,18 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) ...@@ -384,6 +383,18 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
return container_of(mn, struct kvm, mmu_notifier); return container_of(mn, struct kvm, mmu_notifier);
} }
static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start, unsigned long end)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
int idx;
idx = srcu_read_lock(&kvm->srcu);
kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
srcu_read_unlock(&kvm->srcu, idx);
}
static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
struct mm_struct *mm, struct mm_struct *mm,
unsigned long address, unsigned long address,
...@@ -408,7 +419,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, ...@@ -408,7 +419,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
{ {
struct kvm *kvm = mmu_notifier_to_kvm(mn); struct kvm *kvm = mmu_notifier_to_kvm(mn);
int need_tlb_flush = 0, idx; int need_tlb_flush = 0, idx;
int ret;
idx = srcu_read_lock(&kvm->srcu); idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock); spin_lock(&kvm->mmu_lock);
...@@ -425,14 +435,9 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, ...@@ -425,14 +435,9 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
kvm_flush_remote_tlbs(kvm); kvm_flush_remote_tlbs(kvm);
spin_unlock(&kvm->mmu_lock); spin_unlock(&kvm->mmu_lock);
ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start,
range->end,
mmu_notifier_range_blockable(range));
srcu_read_unlock(&kvm->srcu, idx); srcu_read_unlock(&kvm->srcu, idx);
return ret; return 0;
} }
static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
...@@ -538,6 +543,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, ...@@ -538,6 +543,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
} }
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
.invalidate_range = kvm_mmu_notifier_invalidate_range,
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
.clear_flush_young = kvm_mmu_notifier_clear_flush_young, .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment