Commit c142786c, authored by Avi Kivity, committed by Marcelo Tosatti

KVM: MMU: Don't use RCU for lockless shadow walking

Using RCU for lockless shadow walking can increase the amount of memory
in use by the system, since RCU grace periods are unpredictable.  We also
have an unconditional write to a shared variable (reader_counter), which
isn't good for scaling.

Replace that with a scheme similar to x86's get_user_pages_fast(): disable
interrupts during lockless shadow walk to force the freer
(kvm_mmu_commit_zap_page()) to wait for the TLB flush IPI to find the
processor with interrupts enabled.

We also add a new vcpu->mode, READING_SHADOW_PAGE_TABLES, to prevent
kvm_flush_remote_tlbs() from avoiding the IPI.
Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
parent b2da15ac
......@@ -240,8 +240,6 @@ struct kvm_mmu_page {
#endif
int write_flooding_count;
struct rcu_head rcu;
};
struct kvm_pio_request {
......@@ -540,8 +538,6 @@ struct kvm_arch {
u64 hv_guest_os_id;
u64 hv_hypercall;
atomic_t reader_counter;
#ifdef CONFIG_KVM_MMU_AUDIT
int audit_point;
#endif
......
......@@ -551,19 +551,29 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
/*
 * Enter a lockless shadow page table walk on behalf of @vcpu.
 *
 * NOTE(review): this listing is a diff whose +/- markers were lost, so two
 * alternative entry sequences appear back to back. Going by the commit
 * description, the rcu_read_lock()/reader_counter statements are presumably
 * the removed side and the local_irq_disable()/vcpu->mode statements the
 * added side -- confirm against the original patch before reading this as
 * a single function body.
 */
static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
rcu_read_lock();
atomic_inc(&vcpu->kvm->arch.reader_counter);
/* Increase the counter before walking shadow page table */
smp_mb__after_atomic_inc();
/*
* Prevent page table teardown by making any free-er wait during
* kvm_flush_remote_tlbs() IPI to all active vcpus.
*/
local_irq_disable();
vcpu->mode = READING_SHADOW_PAGE_TABLES;
/*
* Make sure a following spte read is not reordered ahead of the write
* to vcpu->mode.
*/
smp_mb();
}
/*
 * Leave a lockless shadow page table walk on behalf of @vcpu.
 *
 * NOTE(review): this listing is a diff whose +/- markers were lost, so two
 * alternative exit sequences appear back to back. Going by the commit
 * description, the reader_counter decrement and rcu_read_unlock() are
 * presumably the removed side, and the smp_mb()/vcpu->mode/
 * local_irq_enable() statements the added side -- confirm against the
 * original patch.
 */
static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
/* Decrease the counter after walking shadow page table finished */
smp_mb__before_atomic_dec();
atomic_dec(&vcpu->kvm->arch.reader_counter);
rcu_read_unlock();
/*
* Make sure the write to vcpu->mode is not reordered in front of
* reads to sptes. If it does, kvm_commit_zap_page() can see us
* OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
*/
smp_mb();
vcpu->mode = OUTSIDE_GUEST_MODE;
local_irq_enable();
}
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
......@@ -1989,30 +1999,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
return ret;
}
/*
 * Call kvm_mmu_isolate_page() on every shadow page linked on @invalid_list.
 *
 * NOTE(review): the enclosing hunk shrinks from 30 lines to 6, so this
 * helper is presumably deleted by the commit -- confirm against the
 * original patch.
 */
static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
{
struct kvm_mmu_page *sp;
list_for_each_entry(sp, invalid_list, link)
kvm_mmu_isolate_page(sp);
}
/*
 * RCU callback: recover the kvm_mmu_page that embeds @head, then free it
 * and every page still reachable through its ->link list.
 *
 * NOTE(review): the enclosing hunk shrinks from 30 lines to 6, so this
 * callback is presumably deleted by the commit -- confirm against the
 * original patch.
 */
static void free_pages_rcu(struct rcu_head *head)
{
struct kvm_mmu_page *next, *sp;
sp = container_of(head, struct kvm_mmu_page, rcu);
while (sp) {
/* Pick the successor before sp is freed; an empty link ends the walk. */
if (!list_empty(&sp->link))
next = list_first_entry(&sp->link,
struct kvm_mmu_page, link);
else
next = NULL;
/* Presumably also unlinks sp from the list -- TODO confirm in kvm_mmu_free_page(). */
kvm_mmu_free_page(sp);
sp = next;
}
}
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list)
{
......@@ -2021,17 +2007,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
if (list_empty(invalid_list))
return;
kvm_flush_remote_tlbs(kvm);
if (atomic_read(&kvm->arch.reader_counter)) {
kvm_mmu_isolate_pages(invalid_list);
sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
list_del_init(invalid_list);
/*
* wmb: make sure everyone sees our modifications to the page tables
* rmb: make sure we see changes to vcpu->mode
*/
smp_mb();
trace_kvm_mmu_delay_free_pages(sp);
call_rcu(&sp->rcu, free_pages_rcu);
return;
}
/*
* Wait for all vcpus to exit guest mode and/or lockless shadow
* page table walks.
*/
kvm_flush_remote_tlbs(kvm);
do {
sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
......@@ -2039,7 +2025,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
kvm_mmu_isolate_page(sp);
kvm_mmu_free_page(sp);
} while (!list_empty(invalid_list));
}
/*
......
......@@ -128,7 +128,8 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
/*
 * Execution modes; the shadow-walk helpers in this commit store
 * READING_SHADOW_PAGE_TABLES and OUTSIDE_GUEST_MODE in vcpu->mode.
 *
 * NOTE(review): EXITING_GUEST_MODE is listed twice because the diff's +/-
 * markers were lost. The line without a trailing comma is presumably the
 * removed side; the line with the comma, together with
 * READING_SHADOW_PAGE_TABLES, is presumably the added side -- confirm
 * against the original patch.
 */
enum {
OUTSIDE_GUEST_MODE,
IN_GUEST_MODE,
EXITING_GUEST_MODE
EXITING_GUEST_MODE,
READING_SHADOW_PAGE_TABLES,
};
/*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment