Commit 2a66bc5a authored by Andi Kleen, committed by Linus Torvalds

[PATCH] i386/x86-64: Fix SMP NMI watchdog race

Fix SMP race in NMI watchdog on i386/x86-64

Fix a long-standing SMP setup race in the NMI watchdog.  The watchdog would
start ticking very early and check whether all CPUs were increasing their
timer interrupt counts.  For that it would check the cpu_online_map.  If a CPU
took too long to boot, the watchdog would trigger prematurely because that CPU
had not increased its timer count yet.

The fix is to check cpu_callin_map instead of cpu_online_map, because the
former is only set once a CPU has started its timer interrupt.

I fixed it on i386 and x86-64.

Description of the problem from Manpreet Singh. Thanks.

Cc: <manpreet@fabric7.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 421f4e4a
...@@ -117,8 +117,12 @@ int __init check_nmi_watchdog (void) ...@@ -117,8 +117,12 @@ int __init check_nmi_watchdog (void)
/* FIXME: Only boot CPU is online at this stage. Check CPUs /* FIXME: Only boot CPU is online at this stage. Check CPUs
as they come up. */ as they come up. */
for (cpu = 0; cpu < NR_CPUS; cpu++) { for (cpu = 0; cpu < NR_CPUS; cpu++) {
if (!cpu_online(cpu)) #ifdef CONFIG_SMP
/* Check cpu_callin_map here because that is set
after the timer is started. */
if (!cpu_isset(cpu, cpu_callin_map))
continue; continue;
#endif
if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
printk("CPU#%d: NMI appears to be stuck!\n", cpu); printk("CPU#%d: NMI appears to be stuck!\n", cpu);
nmi_active = 0; nmi_active = 0;
......
...@@ -66,7 +66,7 @@ EXPORT_SYMBOL(phys_proc_id); ...@@ -66,7 +66,7 @@ EXPORT_SYMBOL(phys_proc_id);
/* bitmap of online cpus */ /* bitmap of online cpus */
cpumask_t cpu_online_map; cpumask_t cpu_online_map;
static cpumask_t cpu_callin_map; cpumask_t cpu_callin_map;
cpumask_t cpu_callout_map; cpumask_t cpu_callout_map;
static cpumask_t smp_commenced_mask; static cpumask_t smp_commenced_mask;
......
...@@ -130,8 +130,12 @@ int __init check_nmi_watchdog (void) ...@@ -130,8 +130,12 @@ int __init check_nmi_watchdog (void)
mdelay((10*1000)/nmi_hz); // wait 10 ticks mdelay((10*1000)/nmi_hz); // wait 10 ticks
for (cpu = 0; cpu < NR_CPUS; cpu++) { for (cpu = 0; cpu < NR_CPUS; cpu++) {
if (!cpu_online(cpu)) #ifdef CONFIG_SMP
/* Check cpu_callin_map here because that is set
after the timer is started. */
if (!cpu_isset(cpu, cpu_callin_map))
continue; continue;
#endif
if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) { if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) {
printk("CPU#%d: NMI appears to be stuck (%d)!\n", printk("CPU#%d: NMI appears to be stuck (%d)!\n",
cpu, cpu,
......
...@@ -63,7 +63,7 @@ EXPORT_SYMBOL(phys_proc_id); ...@@ -63,7 +63,7 @@ EXPORT_SYMBOL(phys_proc_id);
/* Bitmask of currently online CPUs */ /* Bitmask of currently online CPUs */
cpumask_t cpu_online_map; cpumask_t cpu_online_map;
static cpumask_t cpu_callin_map; cpumask_t cpu_callin_map;
cpumask_t cpu_callout_map; cpumask_t cpu_callout_map;
static cpumask_t smp_commenced_mask; static cpumask_t smp_commenced_mask;
......
...@@ -53,6 +53,7 @@ extern u8 x86_cpu_to_apicid[]; ...@@ -53,6 +53,7 @@ extern u8 x86_cpu_to_apicid[];
#define __smp_processor_id() (current_thread_info()->cpu) #define __smp_processor_id() (current_thread_info()->cpu)
extern cpumask_t cpu_callout_map; extern cpumask_t cpu_callout_map;
extern cpumask_t cpu_callin_map;
#define cpu_possible_map cpu_callout_map #define cpu_possible_map cpu_callout_map
/* We don't mark CPUs online until __cpu_up(), so we need another measure */ /* We don't mark CPUs online until __cpu_up(), so we need another measure */
......
...@@ -59,6 +59,7 @@ extern u8 phys_proc_id[NR_CPUS]; ...@@ -59,6 +59,7 @@ extern u8 phys_proc_id[NR_CPUS];
*/ */
extern cpumask_t cpu_callout_map; extern cpumask_t cpu_callout_map;
extern cpumask_t cpu_callin_map;
#define cpu_possible_map cpu_callout_map #define cpu_possible_map cpu_callout_map
static inline int num_booting_cpus(void) static inline int num_booting_cpus(void)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment