Commit 1a3ea611 authored by Paul E. McKenney

x86/nmi: Accumulate NMI-progress evidence in exc_nmi()

CPUs ignoring NMIs is often a sign of those CPUs going bad, but there
are quite a few other reasons why a CPU might ignore NMIs.  Therefore,
accumulate evidence within exc_nmi() as to what might be preventing a
given CPU from responding to an NMI.

[ paulmck: Apply Peter Zijlstra feedback. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <x86@kernel.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
parent 1b929c02
...@@ -69,6 +69,15 @@ struct nmi_stats { ...@@ -69,6 +69,15 @@ struct nmi_stats {
unsigned int unknown; unsigned int unknown;
unsigned int external; unsigned int external;
unsigned int swallow; unsigned int swallow;
unsigned long recv_jiffies;
unsigned long idt_seq;
unsigned long idt_nmi_seq;
unsigned long idt_ignored;
atomic_long_t idt_calls;
unsigned long idt_seq_snap;
unsigned long idt_nmi_seq_snap;
unsigned long idt_ignored_snap;
long idt_calls_snap;
}; };
static DEFINE_PER_CPU(struct nmi_stats, nmi_stats); static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
...@@ -479,12 +488,15 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7); ...@@ -479,12 +488,15 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7);
DEFINE_IDTENTRY_RAW(exc_nmi) DEFINE_IDTENTRY_RAW(exc_nmi)
{ {
irqentry_state_t irq_state; irqentry_state_t irq_state;
struct nmi_stats *nsp = this_cpu_ptr(&nmi_stats);
/* /*
* Re-enable NMIs right here when running as an SEV-ES guest. This might * Re-enable NMIs right here when running as an SEV-ES guest. This might
* cause nested NMIs, but those can be handled safely. * cause nested NMIs, but those can be handled safely.
*/ */
sev_es_nmi_complete(); sev_es_nmi_complete();
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU))
arch_atomic_long_inc(&nsp->idt_calls);
if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id())) if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))
return; return;
...@@ -495,6 +507,11 @@ DEFINE_IDTENTRY_RAW(exc_nmi) ...@@ -495,6 +507,11 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
} }
this_cpu_write(nmi_state, NMI_EXECUTING); this_cpu_write(nmi_state, NMI_EXECUTING);
this_cpu_write(nmi_cr2, read_cr2()); this_cpu_write(nmi_cr2, read_cr2());
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
WARN_ON_ONCE(!(nsp->idt_seq & 0x1));
WRITE_ONCE(nsp->recv_jiffies, jiffies);
}
nmi_restart: nmi_restart:
/* /*
...@@ -509,8 +526,19 @@ DEFINE_IDTENTRY_RAW(exc_nmi) ...@@ -509,8 +526,19 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
inc_irq_stat(__nmi_count); inc_irq_stat(__nmi_count);
if (!ignore_nmis) if (IS_ENABLED(CONFIG_NMI_CHECK_CPU) && ignore_nmis) {
WRITE_ONCE(nsp->idt_ignored, nsp->idt_ignored + 1);
} else if (!ignore_nmis) {
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1);
WARN_ON_ONCE(!(nsp->idt_nmi_seq & 0x1));
}
default_do_nmi(regs); default_do_nmi(regs);
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1);
WARN_ON_ONCE(nsp->idt_nmi_seq & 0x1);
}
}
irqentry_nmi_exit(regs, irq_state); irqentry_nmi_exit(regs, irq_state);
...@@ -525,6 +553,11 @@ DEFINE_IDTENTRY_RAW(exc_nmi) ...@@ -525,6 +553,11 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
if (user_mode(regs)) if (user_mode(regs))
mds_user_clear_cpu_buffers(); mds_user_clear_cpu_buffers();
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
WARN_ON_ONCE(nsp->idt_seq & 0x1);
WRITE_ONCE(nsp->recv_jiffies, jiffies);
}
} }
#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL) #if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
......
...@@ -1552,6 +1552,17 @@ config TRACE_IRQFLAGS_NMI ...@@ -1552,6 +1552,17 @@ config TRACE_IRQFLAGS_NMI
depends on TRACE_IRQFLAGS depends on TRACE_IRQFLAGS
depends on TRACE_IRQFLAGS_NMI_SUPPORT depends on TRACE_IRQFLAGS_NMI_SUPPORT
config NMI_CHECK_CPU
bool "Debugging for CPUs failing to respond to backtrace requests"
depends on DEBUG_KERNEL
depends on X86
default n
help
Enables debug prints when a CPU fails to respond to a given
backtrace NMI. These prints provide some reasons why a CPU
might legitimately be failing to respond, for example, if it
is offline or if ignore_nmis is set.
config DEBUG_IRQFLAGS config DEBUG_IRQFLAGS
bool "Debug IRQ flag manipulation" bool "Debug IRQ flag manipulation"
help help
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment