KVM: x86/mmu: Move calls to thp_adjust() down a level

Move the calls to thp_adjust() down a level from the page fault handlers to the map/fetch helpers and remove the page count shuffling done in thp_adjust().

Despite holding a reference to the underlying page while processing a page fault, the page fault flows don't actually rely on holding a reference to the page when thp_adjust() is called. At that point, the fault handlers hold mmu_lock, which prevents mmu_notifier from completing any invalidations, and have verified no invalidations from mmu_notifier have occurred since the page reference was acquired (which is done prior to taking mmu_lock).

The kvm_release_pfn_clean()/kvm_get_pfn() dance in thp_adjust() is a quirk that is necessitated because thp_adjust() modifies the pfn that is consumed by its caller. Because the page fault handlers call kvm_release_pfn_clean() on said pfn, thp_adjust() needs to transfer the reference to the correct pfn purely for correctness when the pfn is released.

Calling thp_adjust() from __direct_map() and FNAME(fetch) means the pfn adjustment doesn't change the pfn as seen by the page fault handlers, i.e. the pfn released by the page fault handlers is the same pfn that was returned by gfn_to_pfn().

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

KVM: x86/mmu: Move calls to thp_adjust() down a level
Move the calls to thp_adjust() down a level from the page fault handlers to the map/fetch helpers and remove the page count shuffling done in thp_adjust().

Despite holding a reference to the underlying page while processing a page fault, the page fault flows don't actually rely on holding a reference to the page when thp_adjust() is called. At that point, the fault handlers hold mmu_lock, which prevents mmu_notifier from completing any invalidations, and have verified no invalidations from mmu_notifier have occurred since the page reference was acquired (which is done prior to taking mmu_lock).

The kvm_release_pfn_clean()/kvm_get_pfn() dance in thp_adjust() is a quirk that is necessitated because thp_adjust() modifies the pfn that is consumed by its caller. Because the page fault handlers call kvm_release_pfn_clean() on said pfn, thp_adjust() needs to transfer the reference to the correct pfn purely for correctness when the pfn is released.

Calling thp_adjust() from __direct_map() and FNAME(fetch) means the pfn adjustment doesn't change the pfn as seen by the page fault handlers, i.e. the pfn released by the page fault handlers is the same pfn that was returned by gfn_to_pfn().

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
4cd071d1 · Sean Christopherson · Paolo Bonzini · 0885904d · 4cd071d1 · 4cd071d1
Commit 4cd071d1 authored Dec 06, 2019 by Sean Christopherson Committed by Paolo Bonzini Jan 08, 2020
Show whitespace changes
Inline Side-by-side

Showing with 18 additions and 24 deletions

arch/x86/kvm/mmu/mmu.c arch/x86/kvm/mmu/mmu.c +12 -19

arch/x86/kvm/mmu/paging_tmpl.h arch/x86/kvm/mmu/paging_tmpl.h +6 -5

No files found.
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3345,24 +3345,15 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 	    !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
 	    PageTransCompoundMap(pfn_to_page(pfn))) {
 		unsigned long mask;
+
 		/*
-		 * mmu_notifier_retry was successful and we hold the
-		 * mmu_lock here, so the pmd can't become splitting
-		 * from under us, and in turn
-		 * __split_huge_page_refcount() can't run from under
-		 * us and we can safely transfer the refcount from
-		 * PG_tail to PG_head as we switch the pfn to tail to
-		 * head.
+		 * mmu_notifier_retry() was successful and mmu_lock is held, so
+		 * the pmd can't be split from under us.
 		 */
 		*levelp = level = PT_DIRECTORY_LEVEL;
 		mask = KVM_PAGES_PER_HPAGE(level) - 1;
 		VM_BUG_ON((gfn & mask) != (pfn & mask));
-		if (pfn & mask) {
-			kvm_release_pfn_clean(pfn);
-			pfn &= ~mask;
-			kvm_get_pfn(pfn);
-			*pfnp = pfn;
-		}
+		*pfnp = pfn & ~mask;
 	}
 }

@@ -3390,8 +3381,9 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
 }

 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
-			int map_writable, int level, kvm_pfn_t pfn,
-			bool prefault, bool account_disallowed_nx_lpage)
+			int map_writable, int level, int max_level,
+			kvm_pfn_t pfn, bool prefault,
+			bool account_disallowed_nx_lpage)
 {
 	struct kvm_shadow_walk_iterator it;
 	struct kvm_mmu_page *sp;
@@ -3402,6 +3394,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
 	if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
 		return RET_PF_RETRY;

+	if (likely(max_level > PT_PAGE_TABLE_LEVEL))
+		transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
+
 	trace_kvm_mmu_spte_requested(gpa, level, pfn);
 	for_each_shadow_entry(vcpu, gpa, it) {
 		/*
@@ -4220,10 +4215,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 		goto out_unlock;
 	if (make_mmu_pages_available(vcpu) < 0)
 		goto out_unlock;
-	if (likely(max_level > PT_PAGE_TABLE_LEVEL))
-		transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
-	r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault,
-			 is_tdp && lpage_disallowed);
+	r = __direct_map(vcpu, gpa, write, map_writable, level, max_level, pfn,
+			 prefault, is_tdp && lpage_disallowed);

 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);

--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -613,7 +613,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 */
 static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
 			 struct guest_walker *gw,
-			 int write_fault, int hlevel,
+			 int write_fault, int hlevel, int max_level,
 			 kvm_pfn_t pfn, bool map_writable, bool prefault,
 			 bool lpage_disallowed)
 {
@@ -673,6 +673,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
 	gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT);
 	base_gfn = gfn;

+	if (max_level > PT_PAGE_TABLE_LEVEL)
+		transparent_hugepage_adjust(vcpu, gw->gfn, &pfn, &hlevel);
+
 	trace_kvm_mmu_spte_requested(addr, gw->level, pfn);

 	for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
@@ -865,10 +868,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
 	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
 	if (make_mmu_pages_available(vcpu) < 0)
 		goto out_unlock;
-	if (max_level > PT_PAGE_TABLE_LEVEL)
-		transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level);
-	r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
-			 level, pfn, map_writable, prefault, lpage_disallowed);
+	r = FNAME(fetch)(vcpu, addr, &walker, write_fault, level, max_level,
+			 pfn, map_writable, prefault, lpage_disallowed);
 	kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);

 out_unlock: