Commit 6103bc07, authored by Ben Gardon, committed by Paolo Bonzini

KVM: x86/mmu: Allow zap gfn range to operate under the mmu read lock

To reduce lock contention and interference with page fault handlers,
allow the TDP MMU function to zap a GFN range to operate under the MMU
read lock.
Signed-off-by: Ben Gardon <bgardon@google.com>
Message-Id: <20210401233736.638171-10-bgardon@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
parent c0e64238
...@@ -3121,7 +3121,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, ...@@ -3121,7 +3121,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
if (is_tdp_mmu_page(sp)) if (is_tdp_mmu_page(sp))
kvm_tdp_mmu_put_root(kvm, sp); kvm_tdp_mmu_put_root(kvm, sp, false);
else if (!--sp->root_count && sp->role.invalid) else if (!--sp->root_count && sp->role.invalid)
kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
...@@ -5496,16 +5496,24 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) ...@@ -5496,16 +5496,24 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
} }
} }
if (is_tdp_mmu_enabled(kvm)) {
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
gfn_end, flush);
}
if (flush) if (flush)
kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
write_unlock(&kvm->mmu_lock); write_unlock(&kvm->mmu_lock);
if (is_tdp_mmu_enabled(kvm)) {
flush = false;
read_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
gfn_end, flush, true);
if (flush)
kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
gfn_end);
read_unlock(&kvm->mmu_lock);
}
} }
static bool slot_rmap_write_protect(struct kvm *kvm, static bool slot_rmap_write_protect(struct kvm *kvm,
......
...@@ -27,6 +27,15 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm) ...@@ -27,6 +27,15 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
} }
static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
bool shared)
{
if (shared)
lockdep_assert_held_read(&kvm->mmu_lock);
else
lockdep_assert_held_write(&kvm->mmu_lock);
}
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{ {
if (!kvm->arch.tdp_mmu_enabled) if (!kvm->arch.tdp_mmu_enabled)
...@@ -42,7 +51,8 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) ...@@ -42,7 +51,8 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
} }
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end, bool can_yield, bool flush); gfn_t start, gfn_t end, bool can_yield, bool flush,
bool shared);
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{ {
...@@ -66,11 +76,12 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) ...@@ -66,11 +76,12 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
tdp_mmu_free_sp(sp); tdp_mmu_free_sp(sp);
} }
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
bool shared)
{ {
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
lockdep_assert_held_write(&kvm->mmu_lock); kvm_lockdep_assert_mmu_lock_held(kvm, shared);
if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
return; return;
...@@ -81,7 +92,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) ...@@ -81,7 +92,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
list_del_rcu(&root->link); list_del_rcu(&root->link);
spin_unlock(&kvm->arch.tdp_mmu_pages_lock); spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
zap_gfn_range(kvm, root, 0, max_gfn, false, false); zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);
call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback); call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
} }
...@@ -94,12 +105,11 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) ...@@ -94,12 +105,11 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
* function will return NULL. * function will return NULL.
*/ */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
struct kvm_mmu_page *prev_root) struct kvm_mmu_page *prev_root,
bool shared)
{ {
struct kvm_mmu_page *next_root; struct kvm_mmu_page *next_root;
lockdep_assert_held_write(&kvm->mmu_lock);
rcu_read_lock(); rcu_read_lock();
if (prev_root) if (prev_root)
...@@ -117,7 +127,7 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, ...@@ -117,7 +127,7 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
rcu_read_unlock(); rcu_read_unlock();
if (prev_root) if (prev_root)
kvm_tdp_mmu_put_root(kvm, prev_root); kvm_tdp_mmu_put_root(kvm, prev_root, shared);
return next_root; return next_root;
} }
...@@ -127,12 +137,16 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, ...@@ -127,12 +137,16 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
* This makes it safe to release the MMU lock and yield within the loop, but * This makes it safe to release the MMU lock and yield within the loop, but
* if exiting the loop early, the caller must drop the reference to the most * if exiting the loop early, the caller must drop the reference to the most
* recent root. (Unless keeping a live reference is desirable.) * recent root. (Unless keeping a live reference is desirable.)
*
* If shared is set, this function is operating under the MMU lock in read
* mode. In the unlikely event that this thread must free a root, the lock
* will be temporarily dropped and reacquired in write mode.
*/ */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
for (_root = tdp_mmu_next_root(_kvm, NULL); \ for (_root = tdp_mmu_next_root(_kvm, NULL, _shared); \
_root; \ _root; \
_root = tdp_mmu_next_root(_kvm, _root)) \ _root = tdp_mmu_next_root(_kvm, _root, _shared)) \
if (kvm_mmu_page_as_id(_root) != _as_id) { \ if (kvm_mmu_page_as_id(_root) != _as_id) { \
} else } else
#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
...@@ -636,7 +650,8 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, ...@@ -636,7 +650,8 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
* Return false if a yield was not needed. * Return false if a yield was not needed.
*/ */
static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
struct tdp_iter *iter, bool flush) struct tdp_iter *iter, bool flush,
bool shared)
{ {
/* Ensure forward progress has been made before yielding. */ /* Ensure forward progress has been made before yielding. */
if (iter->next_last_level_gfn == iter->yielded_gfn) if (iter->next_last_level_gfn == iter->yielded_gfn)
...@@ -648,7 +663,11 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, ...@@ -648,7 +663,11 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
if (flush) if (flush)
kvm_flush_remote_tlbs(kvm); kvm_flush_remote_tlbs(kvm);
cond_resched_rwlock_write(&kvm->mmu_lock); if (shared)
cond_resched_rwlock_read(&kvm->mmu_lock);
else
cond_resched_rwlock_write(&kvm->mmu_lock);
rcu_read_lock(); rcu_read_lock();
WARN_ON(iter->gfn > iter->next_last_level_gfn); WARN_ON(iter->gfn > iter->next_last_level_gfn);
...@@ -666,24 +685,32 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, ...@@ -666,24 +685,32 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
* non-root pages mapping GFNs strictly within that range. Returns true if * non-root pages mapping GFNs strictly within that range. Returns true if
* SPTEs have been cleared and a TLB flush is needed before releasing the * SPTEs have been cleared and a TLB flush is needed before releasing the
* MMU lock. * MMU lock.
*
* If can_yield is true, will release the MMU lock and reschedule if the * If can_yield is true, will release the MMU lock and reschedule if the
* scheduler needs the CPU or there is contention on the MMU lock. If this * scheduler needs the CPU or there is contention on the MMU lock. If this
* function cannot yield, it will not release the MMU lock or reschedule and * function cannot yield, it will not release the MMU lock or reschedule and
* the caller must ensure it does not supply too large a GFN range, or the * the caller must ensure it does not supply too large a GFN range, or the
* operation can cause a soft lockup. Note, in some use cases a flush may be * operation can cause a soft lockup.
* required by prior actions. Ensure the pending flush is performed prior to *
* yielding. * If shared is true, this thread holds the MMU lock in read mode and must
* account for the possibility that other threads are modifying the paging
* structures concurrently. If shared is false, this thread should hold the
* MMU lock in write mode.
*/ */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end, bool can_yield, bool flush) gfn_t start, gfn_t end, bool can_yield, bool flush,
bool shared)
{ {
struct tdp_iter iter; struct tdp_iter iter;
kvm_lockdep_assert_mmu_lock_held(kvm, shared);
rcu_read_lock(); rcu_read_lock();
tdp_root_for_each_pte(iter, root, start, end) { tdp_root_for_each_pte(iter, root, start, end) {
retry:
if (can_yield && if (can_yield &&
tdp_mmu_iter_cond_resched(kvm, &iter, flush)) { tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
flush = false; flush = false;
continue; continue;
} }
...@@ -701,8 +728,17 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, ...@@ -701,8 +728,17 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
!is_last_spte(iter.old_spte, iter.level)) !is_last_spte(iter.old_spte, iter.level))
continue; continue;
tdp_mmu_set_spte(kvm, &iter, 0); if (!shared) {
flush = true; tdp_mmu_set_spte(kvm, &iter, 0);
flush = true;
} else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
/*
* The iter must explicitly re-read the SPTE because
* the atomic cmpxchg failed.
*/
iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
goto retry;
}
} }
rcu_read_unlock(); rcu_read_unlock();
...@@ -714,14 +750,21 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, ...@@ -714,14 +750,21 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
* non-root pages mapping GFNs strictly within that range. Returns true if * non-root pages mapping GFNs strictly within that range. Returns true if
* SPTEs have been cleared and a TLB flush is needed before releasing the * SPTEs have been cleared and a TLB flush is needed before releasing the
* MMU lock. * MMU lock.
*
* If shared is true, this thread holds the MMU lock in read mode and must
* account for the possibility that other threads are modifying the paging
* structures concurrently. If shared is false, this thread should hold the
* MMU lock in write mode.
*/ */
bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
gfn_t end, bool can_yield, bool flush) gfn_t end, bool can_yield, bool flush,
bool shared)
{ {
struct kvm_mmu_page *root; struct kvm_mmu_page *root;
for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
flush = zap_gfn_range(kvm, root, start, end, can_yield, flush); flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
shared);
return flush; return flush;
} }
...@@ -733,7 +776,8 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm) ...@@ -733,7 +776,8 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
int i; int i;
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn, flush); flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
flush, false);
if (flush) if (flush)
kvm_flush_remote_tlbs(kvm); kvm_flush_remote_tlbs(kvm);
...@@ -892,7 +936,7 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, ...@@ -892,7 +936,7 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
for_each_tdp_mmu_root(kvm, root, range->slot->as_id) for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
flush |= zap_gfn_range(kvm, root, range->start, range->end, flush |= zap_gfn_range(kvm, root, range->start, range->end,
range->may_block, flush); range->may_block, flush, false);
return flush; return flush;
} }
...@@ -1038,7 +1082,7 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, ...@@ -1038,7 +1082,7 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
for_each_tdp_pte_min_level(iter, root->spt, root->role.level, for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
min_level, start, end) { min_level, start, end) {
if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) if (tdp_mmu_iter_cond_resched(kvm, &iter, false, false))
continue; continue;
if (!is_shadow_present_pte(iter.old_spte) || if (!is_shadow_present_pte(iter.old_spte) ||
...@@ -1067,7 +1111,7 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, ...@@ -1067,7 +1111,7 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
struct kvm_mmu_page *root; struct kvm_mmu_page *root;
bool spte_set = false; bool spte_set = false;
for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, false)
spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages, min_level); slot->base_gfn + slot->npages, min_level);
...@@ -1091,7 +1135,7 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, ...@@ -1091,7 +1135,7 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock(); rcu_read_lock();
tdp_root_for_each_leaf_pte(iter, root, start, end) { tdp_root_for_each_leaf_pte(iter, root, start, end) {
if (tdp_mmu_iter_cond_resched(kvm, &iter, false)) if (tdp_mmu_iter_cond_resched(kvm, &iter, false, false))
continue; continue;
if (spte_ad_need_write_protect(iter.old_spte)) { if (spte_ad_need_write_protect(iter.old_spte)) {
...@@ -1126,7 +1170,7 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot) ...@@ -1126,7 +1170,7 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
struct kvm_mmu_page *root; struct kvm_mmu_page *root;
bool spte_set = false; bool spte_set = false;
for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, false)
spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages); slot->base_gfn + slot->npages);
...@@ -1213,7 +1257,7 @@ static bool zap_collapsible_spte_range(struct kvm *kvm, ...@@ -1213,7 +1257,7 @@ static bool zap_collapsible_spte_range(struct kvm *kvm,
rcu_read_lock(); rcu_read_lock();
tdp_root_for_each_pte(iter, root, start, end) { tdp_root_for_each_pte(iter, root, start, end) {
if (tdp_mmu_iter_cond_resched(kvm, &iter, flush)) { if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
flush = false; flush = false;
continue; continue;
} }
...@@ -1248,7 +1292,7 @@ bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, ...@@ -1248,7 +1292,7 @@ bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
{ {
struct kvm_mmu_page *root; struct kvm_mmu_page *root;
for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, false)
flush = zap_collapsible_spte_range(kvm, root, slot, flush); flush = zap_collapsible_spte_range(kvm, root, slot, flush);
return flush; return flush;
......
...@@ -13,14 +13,18 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm, ...@@ -13,14 +13,18 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm,
return refcount_inc_not_zero(&root->tdp_mmu_root_count); return refcount_inc_not_zero(&root->tdp_mmu_root_count);
} }
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root); void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
bool shared);
bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
gfn_t end, bool can_yield, bool flush); gfn_t end, bool can_yield, bool flush,
bool shared);
static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id,
gfn_t start, gfn_t end, bool flush) gfn_t start, gfn_t end, bool flush,
bool shared)
{ {
return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush); return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush,
shared);
} }
static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{ {
...@@ -37,7 +41,7 @@ static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) ...@@ -37,7 +41,7 @@ static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
*/ */
lockdep_assert_held_write(&kvm->mmu_lock); lockdep_assert_held_write(&kvm->mmu_lock);
return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp), return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp),
sp->gfn, end, false, false); sp->gfn, end, false, false, false);
} }
void kvm_tdp_mmu_zap_all(struct kvm *kvm); void kvm_tdp_mmu_zap_all(struct kvm *kvm);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment