Commit de0f6195 authored by Like Xu, committed by Paolo Bonzini

KVM: x86/pmu: Defer counter emulated overflow via pmc->prev_counter

Defer reprogramming counters and handling overflow via KVM_REQ_PMU
when incrementing counters.  KVM skips emulated WRMSR in the VM-Exit
fastpath, the fastpath runs with IRQs disabled, skipping instructions
can increment and reprogram counters, reprogramming counters can
sleep, and sleeping is disallowed while IRQs are disabled.

 [*] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:580
 [*] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2981888, name: CPU 15/KVM
 [*] preempt_count: 1, expected: 0
 [*] RCU nest depth: 0, expected: 0
 [*] INFO: lockdep is turned off.
 [*] irq event stamp: 0
 [*] hardirqs last  enabled at (0): [<0000000000000000>] 0x0
 [*] hardirqs last disabled at (0): [<ffffffff8121222a>] copy_process+0x146a/0x62d0
 [*] softirqs last  enabled at (0): [<ffffffff81212269>] copy_process+0x14a9/0x62d0
 [*] softirqs last disabled at (0): [<0000000000000000>] 0x0
 [*] Preemption disabled at:
 [*] [<ffffffffc2063fc1>] vcpu_enter_guest+0x1001/0x3dc0 [kvm]
 [*] CPU: 17 PID: 2981888 Comm: CPU 15/KVM Kdump: 5.19.0-rc1-g239111db364c-dirty #2
 [*] Call Trace:
 [*]  <TASK>
 [*]  dump_stack_lvl+0x6c/0x9b
 [*]  __might_resched.cold+0x22e/0x297
 [*]  __mutex_lock+0xc0/0x23b0
 [*]  perf_event_ctx_lock_nested+0x18f/0x340
 [*]  perf_event_pause+0x1a/0x110
 [*]  reprogram_counter+0x2af/0x1490 [kvm]
 [*]  kvm_pmu_trigger_event+0x429/0x950 [kvm]
 [*]  kvm_skip_emulated_instruction+0x48/0x90 [kvm]
 [*]  handle_fastpath_set_msr_irqoff+0x349/0x3b0 [kvm]
 [*]  vmx_vcpu_run+0x268e/0x3b80 [kvm_intel]
 [*]  vcpu_enter_guest+0x1d22/0x3dc0 [kvm]

Add a field to kvm_pmc to track the previous counter value in order
to defer overflow detection to kvm_pmu_handle_event() (the counter must
be paused before handling overflow, and that may increment the counter).

Opportunistically shrink sizeof(struct kvm_pmc) a bit.
Suggested-by: Wanpeng Li <wanpengli@tencent.com>
Fixes: 9cd803d4 ("KVM: x86: Update vPMCs when retiring instructions")
Signed-off-by: Like Xu <likexu@tencent.com>
Link: https://lore.kernel.org/r/20220831085328.45489-6-likexu@tencent.com
[sean: avoid re-triggering KVM_REQ_PMU on overflow, tweak changelog]
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220923001355.3741194-5-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
parent 68fb4757
@@ -491,7 +491,10 @@ enum pmc_type {
 struct kvm_pmc {
 	enum pmc_type type;
 	u8 idx;
+	bool is_paused;
+	bool intr;
 	u64 counter;
+	u64 prev_counter;
 	u64 eventsel;
 	struct perf_event *perf_event;
 	struct kvm_vcpu *vcpu;
@@ -501,8 +504,6 @@ struct kvm_pmc {
 	 * ctrl value for fixed counters.
 	 */
 	u64 current_config;
-	bool is_paused;
-	bool intr;
 };
 /* More counters may conflict with other existing Architectural MSRs */
......
@@ -101,14 +101,6 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
 	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
 	bool skip_pmi = false;
 
-	/*
-	 * Ignore overflow events for counters that are scheduled to be
-	 * reprogrammed, e.g. if a PMI for the previous event races with KVM's
-	 * handling of a related guest WRMSR.
-	 */
-	if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi))
-		return;
-
 	if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
 		if (!in_pmi) {
 			/*
@@ -126,7 +118,6 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
 	} else {
 		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
 	}
-	kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
 
 	if (!pmc->intr || skip_pmi)
 		return;
@@ -151,7 +142,17 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
 {
 	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
 
+	/*
+	 * Ignore overflow events for counters that are scheduled to be
+	 * reprogrammed, e.g. if a PMI for the previous event races with KVM's
+	 * handling of a related guest WRMSR.
+	 */
+	if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi))
+		return;
+
 	__kvm_perf_overflow(pmc, true);
+
+	kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
 }
 
 static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
@@ -311,6 +312,9 @@ static void reprogram_counter(struct kvm_pmc *pmc)
 	if (!check_pmu_event_filter(pmc))
 		goto reprogram_complete;
 
+	if (pmc->counter < pmc->prev_counter)
+		__kvm_perf_overflow(pmc, false);
+
 	if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
 		printk_once("kvm pmu: pin control bit is ignored\n");
@@ -348,6 +352,7 @@ static void reprogram_counter(struct kvm_pmc *pmc)
 
 reprogram_complete:
 	clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
+	pmc->prev_counter = 0;
 }
 
 void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
@@ -536,14 +541,9 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
 
 static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
 {
-	u64 prev_count;
-
-	prev_count = pmc->counter;
+	pmc->prev_counter = pmc->counter;
 	pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);
-
-	reprogram_counter(pmc);
-	if (pmc->counter < prev_count)
-		__kvm_perf_overflow(pmc, false);
+	kvm_pmu_request_counter_reprogam(pmc);
 }
 
 static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
......
@@ -212,7 +212,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu)
 		struct kvm_pmc *pmc = &pmu->gp_counters[i];
 
 		pmc_stop_counter(pmc);
-		pmc->counter = pmc->eventsel = 0;
+		pmc->counter = pmc->prev_counter = pmc->eventsel = 0;
 	}
 }
......
@@ -646,14 +646,14 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
 		pmc = &pmu->gp_counters[i];
 
 		pmc_stop_counter(pmc);
-		pmc->counter = pmc->eventsel = 0;
+		pmc->counter = pmc->prev_counter = pmc->eventsel = 0;
 	}
 
 	for (i = 0; i < KVM_PMC_MAX_FIXED; i++) {
 		pmc = &pmu->fixed_counters[i];
 
 		pmc_stop_counter(pmc);
-		pmc->counter = 0;
+		pmc->counter = pmc->prev_counter = 0;
 	}
 
 	pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment