Commit 1d7715c6 authored by Vladimir Davydov, committed by Linus Torvalds

mmu-notifier: add clear_young callback

In the scope of the idle memory tracking feature, which is introduced by
the following patch, we need to clear the referenced/accessed bit not only
in primary, but also in secondary ptes.  The latter is required in order
to estimate the working set size (WSS) of KVM VMs.  At the same time, we
want to avoid flushing the TLB, because the flush is quite expensive and
skipping it does not really affect the final result.

Currently, there is no function for clearing the pte young bit that would
meet our requirements, so this patch introduces one.  To achieve that, we
have to add a new mmu-notifier callback, clear_young, since there is no
method for testing-and-clearing a secondary pte without flushing the TLB.
The new callback is not mandatory and is currently only implemented by KVM.
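
For illustration, a minimal caller-side sketch (the helper below is
hypothetical; the real consumer is the idle memory tracking code added by
the following patch) of how the new ptep_clear_young_notify() wrapper
combines the primary and secondary young bits without a TLB flush:

	#include <linux/mm.h>
	#include <linux/mmu_notifier.h>

	/*
	 * Hypothetical example: test and clear the accessed bit for one
	 * mapped page in the primary pte and in any secondary (e.g. KVM)
	 * ptes, without flushing either TLB.  ptep_clear_young_notify()
	 * ORs ptep_test_and_clear_young() with mmu_notifier_clear_young()
	 * over the range [addr, addr + PAGE_SIZE).
	 */
	static bool page_was_accessed(struct vm_area_struct *vma,
				      unsigned long addr, pte_t *ptep)
	{
		return ptep_clear_young_notify(vma, addr, ptep);
	}

Compared to ptep_clear_flush_young_notify(), this variant trades exactness
for speed: a stale young bit may survive in a TLB until the next regular
flush, which is acceptable for WSS estimation.
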
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Reviewed-by: Andres Lagar-Cavilla <andreslc@google.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 80ae2fdc
@@ -65,6 +65,16 @@ struct mmu_notifier_ops {
			     unsigned long start,
			     unsigned long end);

	/*
	 * clear_young is a lightweight version of clear_flush_young. Like the
	 * latter, it is supposed to test-and-clear the young/accessed bitflag
	 * in the secondary pte, but it may omit flushing the secondary tlb.
	 */
	int (*clear_young)(struct mmu_notifier *mn,
			   struct mm_struct *mm,
			   unsigned long start,
			   unsigned long end);

	/*
	 * test_young is called to check the young/accessed bitflag in
	 * the secondary pte. This is used to know if the page is
@@ -203,6 +213,9 @@ extern void __mmu_notifier_release(struct mm_struct *mm);
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
					    unsigned long start,
					    unsigned long end);
extern int __mmu_notifier_clear_young(struct mm_struct *mm,
				      unsigned long start,
				      unsigned long end);
extern int __mmu_notifier_test_young(struct mm_struct *mm,
				     unsigned long address);
extern void __mmu_notifier_change_pte(struct mm_struct *mm,
@@ -231,6 +244,15 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
	return 0;
}

static inline int mmu_notifier_clear_young(struct mm_struct *mm,
					   unsigned long start,
					   unsigned long end)
{
	if (mm_has_notifiers(mm))
		return __mmu_notifier_clear_young(mm, start, end);
	return 0;
}

static inline int mmu_notifier_test_young(struct mm_struct *mm,
					  unsigned long address)
{
@@ -311,6 +333,28 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
	__young;							\
})

#define ptep_clear_young_notify(__vma, __address, __ptep)		\
({									\
	int __young;							\
	struct vm_area_struct *___vma = __vma;				\
	unsigned long ___address = __address;				\
	__young = ptep_test_and_clear_young(___vma, ___address, __ptep);\
	__young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,	\
					    ___address + PAGE_SIZE);	\
	__young;							\
})

#define pmdp_clear_young_notify(__vma, __address, __pmdp)		\
({									\
	int __young;							\
	struct vm_area_struct *___vma = __vma;				\
	unsigned long ___address = __address;				\
	__young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\
	__young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,	\
					    ___address + PMD_SIZE);	\
	__young;							\
})

#define ptep_clear_flush_notify(__vma, __address, __ptep)		\
({									\
	unsigned long ___addr = __address & PAGE_MASK;			\
...
@@ -123,6 +123,23 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
	return young;
}

int __mmu_notifier_clear_young(struct mm_struct *mm,
			       unsigned long start,
			       unsigned long end)
{
	struct mmu_notifier *mn;
	int young = 0, id;

	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
		if (mn->ops->clear_young)
			young |= mn->ops->clear_young(mn, mm, start, end);
	}
	srcu_read_unlock(&srcu, id);

	return young;
}

int __mmu_notifier_test_young(struct mm_struct *mm,
			      unsigned long address)
{
...
@@ -387,6 +387,36 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
	return young;
}

static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long start,
					unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	/*
	 * Even though we do not flush TLB, this will still adversely
	 * affect performance on pre-Haswell Intel EPT, where there is
	 * no EPT Access Bit to clear so that we have to tear down EPT
	 * tables instead. If we find this unacceptable, we can always
	 * add a parameter to kvm_age_hva so that it effectively doesn't
	 * do anything on clear_young.
	 *
	 * Also note that currently we never issue secondary TLB flushes
	 * from clear_young, leaving this job up to the regular system
	 * cadence. If we find this inaccurate, we might come up with a
	 * more sophisticated heuristic later.
	 */
	young = kvm_age_hva(kvm, start, end);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return young;
}

static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long address)
@@ -419,6 +449,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
	.clear_young		= kvm_mmu_notifier_clear_young,
	.test_young		= kvm_mmu_notifier_test_young,
	.change_pte		= kvm_mmu_notifier_change_pte,
	.release		= kvm_mmu_notifier_release,
...
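
As a note for implementers, here is a hedged sketch of how a secondary-MMU
driver other than KVM might hook the new optional callback; the driver and
its my_driver_age_range() helper are hypothetical, and in this patch KVM is
the only implementer:

	#include <linux/mmu_notifier.h>

	/* Hypothetical helper: age the driver's own page tables. */
	static int my_driver_age_range(struct mmu_notifier *mn,
				       unsigned long start, unsigned long end);

	static int my_mmu_clear_young(struct mmu_notifier *mn,
				      struct mm_struct *mm,
				      unsigned long start,
				      unsigned long end)
	{
		/*
		 * Test and clear the accessed bits in the driver's page
		 * tables for [start, end) without flushing its TLB; return
		 * non-zero if any entry in the range was young.
		 */
		return my_driver_age_range(mn, start, end);
	}

	static const struct mmu_notifier_ops my_mmu_notifier_ops = {
		/*
		 * clear_young is optional: __mmu_notifier_clear_young()
		 * skips notifiers that leave it NULL.
		 */
		.clear_young	= my_mmu_clear_young,
		/* other callbacks (.release, .clear_flush_young, ...) omitted */
	};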