KVM: X86: Optimize pte_list_desc with per-array counter

Add a counter field into pte_list_desc, so as to simplify the add/remove/loop logic. E.g., we don't need to loop over the array any more for most reasons. This will make more sense after we've switched the array size to be larger otherwise the counter will be a waste. Initially I wanted to store a tail pointer at the head of the array list so we don't need to traverse the list at least for pushing new ones (if without the counter we traverse both the list and the array). However that'll need slightly more change without a huge lot benefit, e.g., after we grow entry numbers per array the list traversing is not so expensive. So let's be simple but still try to get as much benefit as we can with just these extra few lines of changes (not to mention the code looks easier too without looping over arrays). I used the same a test case to fork 500 child and recycle them ("./rmap_fork 500" [1]), this patch further speeds up the total fork time of about 4%, which is a total of 33% of vanilla kernel: Vanilla: 473.90 (+-5.93%) 3->15 slots: 366.10 (+-4.94%) Add counter: 351.00 (+-3.70%) [1] https://github.com/xzpeter/clibs/commit/825436f825453de2ea5aaee4bdb1c92281efe5b3Signed-off-by: Peter Xu <peterx@redhat.com> Message-Id: <20210730220602.26327-1-peterx@redhat.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

KVM: X86: Optimize pte_list_desc with per-array counter
Add a counter field into pte_list_desc, so as to simplify the add/remove/loop logic. E.g., we don't need to loop over the array any more for most reasons. This will make more sense after we've switched the array size to be larger otherwise the counter will be a waste. Initially I wanted to store a tail pointer at the head of the array list so we don't need to traverse the list at least for pushing new ones (if without the counter we traverse both the list and the array). However that'll need slightly more change without a huge lot benefit, e.g., after we grow entry numbers per array the list traversing is not so expensive. So let's be simple but still try to get as much benefit as we can with just these extra few lines of changes (not to mention the code looks easier too without looping over arrays). I used the same a test case to fork 500 child and recycle them ("./rmap_fork 500" [1]), this patch further speeds up the total fork time of about 4%, which is a total of 33% of vanilla kernel: Vanilla: 473.90 (+-5.93%) 3->15 slots: 366.10 (+-4.94%) Add counter: 351.00 (+-3.70%) [1] https://github.com/xzpeter/clibs/commit/825436f825453de2ea5aaee4bdb1c92281efe5b3Signed-off-by: Peter Xu <peterx@redhat.com> Message-Id: <20210730220602.26327-1-peterx@redhat.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
13236e25 · Peter Xu · Paolo Bonzini · dc1cff96 · 13236e25
Commit 13236e25 authored Jul 30, 2021 by Peter Xu Committed by Paolo Bonzini Aug 04, 2021
Hide whitespace changes
Inline Side-by-side

Showing with 22 additions and 13 deletions

arch/x86/kvm/mmu/mmu.c arch/x86/kvm/mmu/mmu.c +22 -13

No files found.
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -138,11 +138,21 @@ module_param(dbg, bool, 0644);
 #include <trace/events/kvm.h>

 /* make pte_list_desc fit well in cache lines */
-#define PTE_LIST_EXT 15
+#define PTE_LIST_EXT 14

+/*
+ * Slight optimization of cacheline layout, by putting `more' and `spte_count'
+ * at the start; then accessing it will only use one single cacheline for
+ * either full (entries==PTE_LIST_EXT) case or entries<=6.
+ */
 struct pte_list_desc {
-	u64 *sptes[PTE_LIST_EXT];
 	struct pte_list_desc *more;
+	/*
+	 * Stores number of entries stored in the pte_list_desc.  No need to be
+	 * u64 but just for easier alignment.  When PTE_LIST_EXT, means full.
+	 */
+	u64 spte_count;
+	u64 *sptes[PTE_LIST_EXT];
 };

 struct kvm_shadow_walk_iterator {
@@ -901,7 +911,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
 			struct kvm_rmap_head *rmap_head)
 {
 	struct pte_list_desc *desc;
-	int i, count = 0;
+	int count = 0;

 	if (!rmap_head->val) {
 		rmap_printk("%p %llx 0->1\n", spte, *spte);
@@ -911,24 +921,24 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
 		desc = mmu_alloc_pte_list_desc(vcpu);
 		desc->sptes[0] = (u64 *)rmap_head->val;
 		desc->sptes[1] = spte;
+		desc->spte_count = 2;
 		rmap_head->val = (unsigned long)desc | 1;
 		++count;
 	} else {
 		rmap_printk("%p %llx many->many\n", spte, *spte);
 		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
-		while (desc->sptes[PTE_LIST_EXT-1]) {
+		while (desc->spte_count == PTE_LIST_EXT) {
 			count += PTE_LIST_EXT;
-
 			if (!desc->more) {
 				desc->more = mmu_alloc_pte_list_desc(vcpu);
 				desc = desc->more;
+				desc->spte_count = 0;
 				break;
 			}
 			desc = desc->more;
 		}
-		for (i = 0; desc->sptes[i]; ++i)
-			++count;
-		desc->sptes[i] = spte;
+		count += desc->spte_count;
+		desc->sptes[desc->spte_count++] = spte;
 	}
 	return count;
 }
@@ -938,13 +948,12 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
 			   struct pte_list_desc *desc, int i,
 			   struct pte_list_desc *prev_desc)
 {
-	int j;
+	int j = desc->spte_count - 1;

-	for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
-		;
 	desc->sptes[i] = desc->sptes[j];
 	desc->sptes[j] = NULL;
-	if (j != 0)
+	desc->spte_count--;
+	if (desc->spte_count)
 		return;
 	if (!prev_desc && !desc->more)
 		rmap_head->val = 0;
@@ -977,7 +986,7 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
 		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
 		prev_desc = NULL;
 		while (desc) {
-			for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
+			for (i = 0; i < desc->spte_count; ++i) {
 				if (desc->sptes[i] == spte) {
 					pte_list_desc_remove_entry(rmap_head,
 							desc, i, prev_desc);