Commit cffc09b3 authored by Dave Jones's avatar Dave Jones

[PATCH] revamped machine check exception support.

- Split out from bluesmoke.c into per-vendor files (Me)
  (If we were that way inclined, we could even make the
   per-vendor bits CONFIG_ options, but that's probably overkill)
- Fixes Kconfig markup. (Roman Zippel)
- P4 can use non-fatal background checker too. (Venkatesh Pallipadi)
- Don't clear MCA status info in the non-recoverable case: if the
  OS has failed to log it, the BIOS can still have a look at
  that info. (Venkatesh)
- We can init bank 0 on P4 (Zwane Mwaikambo)
- Compile away to nothing if CONFIG_X86_MCE=n
- Various other cleaning (Me)
parent 2836fb50
...@@ -337,7 +337,6 @@ config PREEMPT ...@@ -337,7 +337,6 @@ config PREEMPT
config X86_UP_APIC config X86_UP_APIC
bool "Local APIC support on uniprocessors" if !SMP bool "Local APIC support on uniprocessors" if !SMP
default y if SMP
---help--- ---help---
A local APIC (Advanced Programmable Interrupt Controller) is an A local APIC (Advanced Programmable Interrupt Controller) is an
integrated interrupt controller in the CPU. If you have a single-CPU integrated interrupt controller in the CPU. If you have a single-CPU
...@@ -447,7 +446,7 @@ config X86_MCE ...@@ -447,7 +446,7 @@ config X86_MCE
the 386 and 486, so nearly everyone can say Y here. the 386 and 486, so nearly everyone can say Y here.
config X86_MCE_NONFATAL config X86_MCE_NONFATAL
bool "Check for non-fatal errors on Athlon/Duron" bool "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
depends on X86_MCE depends on X86_MCE
help help
Enabling this feature starts a timer that triggers every 5 seconds which Enabling this feature starts a timer that triggers every 5 seconds which
...@@ -456,12 +455,12 @@ config X86_MCE_NONFATAL ...@@ -456,12 +455,12 @@ config X86_MCE_NONFATAL
Disable this if you don't want to see these messages. Disable this if you don't want to see these messages.
Seeing the messages this option prints out may be indicative of dying hardware, Seeing the messages this option prints out may be indicative of dying hardware,
or out-of-spec (ie, overclocked) hardware. or out-of-spec (ie, overclocked) hardware.
This option only does something on hardware with Intel P6 style MCE. This option only does something on certain CPUs.
(Pentium Pro and above, AMD Athlon/Duron) (AMD Athlon/Duron and Intel Pentium 4)
config X86_MCE_P4THERMAL config X86_MCE_P4THERMAL
bool "check for P4 thermal throttling interrupt." bool "check for P4 thermal throttling interrupt."
depends on X86_MCE && X86_UP_APIC depends on X86_MCE && (X86_UP_APIC || SMP)
help help
Enabling this feature will cause a message to be printed when the P4 Enabling this feature will cause a message to be printed when the P4
enters thermal throttling. enters thermal throttling.
......
...@@ -8,8 +8,7 @@ export-objs := mca.o i386_ksyms.o time.o ...@@ -8,8 +8,7 @@ export-objs := mca.o i386_ksyms.o time.o
obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \ obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \
ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_i386.o \ ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_i386.o \
pci-dma.o i386_ksyms.o i387.o bluesmoke.o dmi_scan.o \ pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o
bootflag.o
obj-y += cpu/ obj-y += cpu/
obj-y += timers/ obj-y += timers/
......
...@@ -13,7 +13,10 @@ obj-y += rise.o ...@@ -13,7 +13,10 @@ obj-y += rise.o
obj-y += nexgen.o obj-y += nexgen.o
obj-y += umc.o obj-y += umc.o
obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_MTRR) += mtrr/
obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_CPU_FREQ) += cpufreq/
include $(TOPDIR)/Rules.make include $(TOPDIR)/Rules.make
...@@ -358,7 +358,9 @@ void __init identify_cpu(struct cpuinfo_x86 *c) ...@@ -358,7 +358,9 @@ void __init identify_cpu(struct cpuinfo_x86 *c)
boot_cpu_data.x86_capability[3]); boot_cpu_data.x86_capability[3]);
/* Init Machine Check Exception if available. */ /* Init Machine Check Exception if available. */
#ifdef CONFIG_X86_MCE
mcheck_init(c); mcheck_init(c);
#endif
} }
/* /*
* Perform early boot up checks for a valid TSC. See arch/i386/kernel/time.c * Perform early boot up checks for a valid TSC. See arch/i386/kernel/time.c
......
obj-y = mce.o k7.o p4.o p5.o p6.o winchip.o
obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
include $(TOPDIR)/Rules.make
/*
* Athlon specific Machine Check Exception Reporting
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/config.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
/*
 * Machine check exception handler for AMD Athlon/Duron.
 *
 * Reads the global MCG_STATUS register, then walks every bank below
 * nr_mce_banks.  A bank whose status high word has bit 31 set is printed
 * (plus its MISC register when bit 27 is set and its ADDR register when
 * bit 26 is set) and is then cleared straight away.
 *
 * The handler panics when bit 0 of MCG_STATUS is clear, or when any logged
 * bank has bit 29 or bit 25 set in its status high word.  Otherwise it
 * clears bit 2 of MCG_STATUS and returns.
 *
 * regs and error_code are not used.
 */
static void k7_machine_check(struct pt_regs * regs, long error_code)
{
	u32 mcg_lo, mcg_hi;
	u32 st_lo, st_hi;
	u32 aux_lo, aux_hi;
	int fatal = 1;		/* bit 0: cannot continue, bit 1: context corrupt */
	int bank;

	rdmsr (MSR_IA32_MCG_STATUS, mcg_lo, mcg_hi);
	if (mcg_lo & (1<<0))	/* Recoverable ? */
		fatal = 0;

	printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
		smp_processor_id(), mcg_hi, mcg_lo);

	for (bank = 0; bank < nr_mce_banks; bank++) {
		rdmsr (MSR_IA32_MC0_STATUS + bank*4, st_lo, st_hi);

		/* Nothing logged in this bank. */
		if (!(st_hi & (1<<31)))
			continue;

		if (st_hi & (1<<29))
			fatal |= 1;
		if (st_hi & (1<<25))
			fatal |= 2;

		printk (KERN_EMERG "Bank %d: %08x%08x", bank, st_hi, st_lo);

		if (st_hi & (1<<27)) {
			rdmsr (MSR_IA32_MC0_MISC + bank*4, aux_lo, aux_hi);
			printk ("[%08x%08x]", aux_hi, aux_lo);
		}
		if (st_hi & (1<<26)) {
			rdmsr (MSR_IA32_MC0_ADDR + bank*4, aux_lo, aux_hi);
			printk (" at %08x%08x", aux_hi, aux_lo);
		}
		printk ("\n");

		/* Clear the bank, then serialize. */
		wrmsr (MSR_IA32_MC0_STATUS + bank*4, 0UL, 0UL);
		wmb();
	}

	if (fatal & 2)
		panic ("CPU context corrupt");
	if (fatal & 1)
		panic ("Unable to continue");

	printk (KERN_EMERG "Attempting to continue.\n");
	mcg_lo &= ~(1<<2);
	wrmsr (MSR_IA32_MCG_STATUS, mcg_lo, mcg_hi);
}
/*
 * Set up machine check reporting on AMD CPUs (the AMD K7 machine check is
 * Intel like).
 *
 * Bails out without touching any MSR unless the CPU advertises both the
 * MCE and MCA features; the pre-split code performed the same checks for
 * AMD parts and intel_p6_mcheck_init still does.  Otherwise it installs
 * k7_machine_check as the handler, enables the global control register
 * when MCG_CAP bit 8 says it exists, records the bank count from the low
 * byte of MCG_CAP in nr_mce_banks, enables and clears every bank, and
 * finally sets CR4.MCE.  With CONFIG_X86_MCE_NONFATAL it also starts the
 * non-fatal polling timer.
 *
 * c: CPU description used for the feature checks.
 */
void __init amd_mcheck_init(struct cpuinfo_x86 *c)
{
	u32 l, h;
	int i;

	/* Check for MCE support */
	if (!cpu_has(c, X86_FEATURE_MCE))
		return;

	/* Check for PPro style MCA */
	if (!cpu_has(c, X86_FEATURE_MCA))
		return;

	/* Install the handler before the exception can be delivered. */
	machine_check_vector = k7_machine_check;
	wmb();

	printk (KERN_INFO "Intel machine check architecture supported.\n");
	rdmsr (MSR_IA32_MCG_CAP, l, h);
	if (l & (1<<8))	/* Control register present ? */
		wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
	nr_mce_banks = l & 0xff;

	for (i=0; i<nr_mce_banks; i++) {
		wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
		wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
	}

	set_in_cr4 (X86_CR4_MCE);
	printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
		smp_processor_id());

#ifdef CONFIG_X86_MCE_NONFATAL
	init_nonfatal_mce_checker();
#endif
}
/*
* mce.c - x86 Machine Check Exception Reporting
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/config.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/thread_info.h>
#include "mce.h"
/*
 * Boot-time machine check policy: 0 by default, set to 1 by the "nomce"
 * option and to -1 by the "mce" option.
 */
int mce_disabled __initdata = 0;
/* Number of machine check banks; written by the per-vendor init routines. */
int nr_mce_banks;
/*
 * Default machine check handler: reports an int18 that arrived while no
 * vendor-specific handler was installed (should never happen).
 * regs and error_code are not used.
 */
static void unexpected_machine_check(struct pt_regs * regs, long error_code)
{
	int cpu = smp_processor_id();

	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", cpu);
}
/*
 * The installed machine check handler for this CPU setup.
 * Starts out as unexpected_machine_check; the per-vendor *_mcheck_init
 * routines overwrite it with their own handler.
 */
void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;
asmlinkage void do_machine_check(struct pt_regs * regs, long error_code)
{
machine_check_vector(regs, error_code);
}
/*
 * Select and run the vendor/family specific machine check setup.
 * This has to be run for each processor.
 *
 * Does nothing when mce_disabled is 1.  Dispatch table:
 *   AMD      family 6 or 15 -> amd_mcheck_init
 *   Intel    family 5       -> intel_p5_mcheck_init
 *   Intel    family 6       -> intel_p6_mcheck_init
 *   Intel    family 15      -> intel_p4_mcheck_init
 *   Centaur  family 5       -> winchip_mcheck_init
 * Any other vendor/family combination is left alone.
 */
void __init mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled == 1)
		return;

	switch (c->x86_vendor) {
	case X86_VENDOR_AMD:
		switch (c->x86) {
		case 6:
		case 15:
			amd_mcheck_init(c);
			break;
		default:
			break;
		}
		break;

	case X86_VENDOR_INTEL:
		switch (c->x86) {
		case 5:
			intel_p5_mcheck_init(c);
			break;
		case 6:
			intel_p6_mcheck_init(c);
			break;
		case 15:
			intel_p4_mcheck_init(c);
			break;
		default:
			break;
		}
		break;

	case X86_VENDOR_CENTAUR:
		switch (c->x86) {
		case 5:
			winchip_mcheck_init(c);
			break;
		default:
			break;
		}
		break;

	default:
		break;
	}
}
/*
 * Boot option handler that turns machine check setup off by setting
 * mce_disabled to 1.
 *
 * str: remainder of the command line option (unused).
 * Returns 1 so the option is reported as handled; a __setup handler that
 * returns 0 tells the boot code the option was not consumed.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
/*
 * Boot option handler that force-enables machine check setup by setting
 * mce_disabled to -1.
 *
 * str: remainder of the command line option (unused).
 * Returns 1 so the option is reported as handled; a __setup handler that
 * returns 0 tells the boot code the option was not consumed.
 */
static int __init mcheck_enable(char *str)
{
	mce_disabled = -1;
	return 1;
}
/*
 * Kernel command line hooks: "nomce" runs mcheck_disable and "mce" runs
 * mcheck_enable.
 */
__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);
#ifndef MCHECK_MCE_H
#define MCHECK_MCE_H

#include <linux/init.h>

/*
 * Forward declarations so the prototypes below are valid regardless of
 * which headers the including file pulled in first.
 */
struct cpuinfo_x86;
struct pt_regs;

/* Per-vendor machine check setup routines, dispatched from mcheck_init(). */
void amd_mcheck_init(struct cpuinfo_x86 *c);
void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
void winchip_mcheck_init(struct cpuinfo_x86 *c);

/* Starts the periodic poll for non-fatal machine check events. */
void init_nonfatal_mce_checker(void);

/* Call the installed machine check handler for this CPU setup. */
extern void (*machine_check_vector)(struct pt_regs *, long error_code);

/* Boot-time policy flag and number of machine check banks. */
extern int mce_disabled __initdata;
extern int nr_mce_banks;

#endif /* MCHECK_MCE_H */
/*
 * Non-fatal Machine Check Exception Reporting
 */
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/config.h>
#include <linux/irq.h>
#include <linux/workqueue.h>
#include <linux/interrupt.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
/* Timer that periodically triggers the non-fatal machine check poll. */
static struct timer_list mce_timer;
/* Set to 1 once init_nonfatal_mce_checker() has armed mce_timer. */
static int timerset;
/* Parenthesized so the macro expands as one value inside any expression. */
#define MCE_RATE (15*HZ) /* timer rate is 15s */
/*
 * Poll the machine check banks of the CPU this runs on.
 *
 * For every bank below nr_mce_banks whose status high word has bit 31 set,
 * the event is logged and the status register is written back as zero so
 * the same event is not reported again on the next poll.  Preemption is
 * disabled for the duration of the scan.
 *
 * info is unused; the signature is the one smp_call_function() is given.
 */
static void mce_checkregs (void *info)
{
	u32 st_lo, st_hi;
	int bank;

	preempt_disable();

	for (bank = 0; bank < nr_mce_banks; bank++) {
		const u32 status_msr = MSR_IA32_MC0_STATUS + bank*4;

		rdmsr (status_msr, st_lo, st_hi);
		if (!(st_hi & (1<<31)))
			continue;

		printk (KERN_EMERG "MCE: The hardware reports a non fatal, correctable incident occured on CPU %d.\n",
			smp_processor_id());
		printk (KERN_EMERG "Bank %d: %08x%08x\n", bank, st_hi, st_lo);

		/* Scrub the error so we don't pick it up in MCE_RATE seconds time. */
		wrmsr (status_msr, 0UL, 0UL);

		/* Serialize */
		wmb();
	}

	preempt_enable();
}
/*
 * Work handler behind mce_work: polls the machine check banks on the CPU
 * running the work item, then hands mce_checkregs to smp_call_function().
 * data is unused.
 */
static void do_mce_timer(void *data)
{
/* Check the local CPU directly. */
mce_checkregs (NULL);
/*
 * NOTE(review): smp_call_function() presumably runs mce_checkregs on the
 * other online CPUs, and the trailing 1, 1 look like its retry/wait
 * arguments - confirm against the prototype.
 */
smp_call_function (mce_checkregs, NULL, 1, 1);
}
/* Work item scheduled by mce_timerfunc; runs do_mce_timer with a NULL argument. */
static DECLARE_WORK(mce_work, do_mce_timer, NULL);
/*
 * Timer callback for the non-fatal machine check poll.
 *
 * On SMP kernels with more than one CPU online, the poll is handed to the
 * mce_work work item.  In every other case (a UP kernel, or an SMP kernel
 * with a single CPU online) the banks are checked right here; previously
 * the single-CPU SMP case fell through without polling anything.
 *
 * The timer is then re-armed to fire again after MCE_RATE jiffies.
 * data is unused.
 */
static void mce_timerfunc (unsigned long data)
{
#ifdef CONFIG_SMP
	if (num_online_cpus() > 1)
		schedule_work (&mce_work);
	else
		mce_checkregs (NULL);
#else
	mce_checkregs (NULL);
#endif

	mce_timer.expires = jiffies + MCE_RATE;
	add_timer (&mce_timer);
}
/*
 * Start the periodic check for non-fatal machine check errors.
 *
 * Only the first call arms the timer; it is guarded by timerset, so
 * calling this again later is a no-op.  The timer first fires MCE_RATE
 * jiffies from now and runs mce_timerfunc.
 */
void init_nonfatal_mce_checker()
{
	/* Already armed by an earlier call. */
	if (timerset != 0)
		return;

	init_timer (&mce_timer);
	mce_timer.function = &mce_timerfunc;
	mce_timer.data = 0;
	mce_timer.expires = jiffies + MCE_RATE;
	add_timer (&mce_timer);
	timerset = 1;

	printk(KERN_INFO "Machine check exception polling timer started.\n");
}
/* /*
* arch/i386/kernel/bluesmoke.c - x86 Machine Check Exception Reporting * P4 specific Machine Check Exception Reporting
*/ */
#include <linux/init.h> #include <linux/init.h>
#include <linux/types.h> #include <linux/types.h>
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/smp.h>
#include <linux/config.h> #include <linux/config.h>
#include <linux/irq.h> #include <linux/irq.h>
#include <linux/workqueue.h>
#include <linux/interrupt.h> #include <linux/interrupt.h>
#include <linux/smp.h>
#include <asm/processor.h> #include <asm/processor.h>
#include <asm/system.h> #include <asm/system.h>
#include <asm/msr.h> #include <asm/msr.h>
#include <asm/apic.h> #include <asm/apic.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/hardirq.h>
#ifdef CONFIG_X86_MCE #include "mce.h"
/* as supported by the P4/Xeon family */ /* as supported by the P4/Xeon family */
struct intel_mce_extended_msrs { struct intel_mce_extended_msrs {
...@@ -37,17 +32,16 @@ struct intel_mce_extended_msrs { ...@@ -37,17 +32,16 @@ struct intel_mce_extended_msrs {
/* u32 *reserved[]; */ /* u32 *reserved[]; */
}; };
static int mce_disabled __initdata = 0;
static int mce_num_extended_msrs = 0; static int mce_num_extended_msrs = 0;
static int banks;
#ifdef CONFIG_X86_MCE_P4THERMAL #ifdef CONFIG_X86_MCE_P4THERMAL
/* static void unexpected_thermal_interrupt(struct pt_regs *regs)
* P4/Xeon Thermal transition interrupt handler {
*/ printk(KERN_ERR "CPU#%d: Unexpected LVT TMR interrupt!\n", smp_processor_id());
}
/* P4/Xeon Thermal transition interrupt handler */
static void intel_thermal_interrupt(struct pt_regs *regs) static void intel_thermal_interrupt(struct pt_regs *regs)
{ {
u32 l, h; u32 l, h;
...@@ -55,7 +49,7 @@ static void intel_thermal_interrupt(struct pt_regs *regs) ...@@ -55,7 +49,7 @@ static void intel_thermal_interrupt(struct pt_regs *regs)
ack_APIC_irq(); ack_APIC_irq();
rdmsr(MSR_IA32_THERM_STATUS, l, h); rdmsr (MSR_IA32_THERM_STATUS, l, h);
if (l & 1) { if (l & 1) {
printk(KERN_EMERG "CPU#%d: Temperature above threshold\n", cpu); printk(KERN_EMERG "CPU#%d: Temperature above threshold\n", cpu);
printk(KERN_EMERG "CPU#%d: Running in modulated clock mode\n", cpu); printk(KERN_EMERG "CPU#%d: Running in modulated clock mode\n", cpu);
...@@ -64,15 +58,7 @@ static void intel_thermal_interrupt(struct pt_regs *regs) ...@@ -64,15 +58,7 @@ static void intel_thermal_interrupt(struct pt_regs *regs)
} }
} }
static void unexpected_thermal_interrupt(struct pt_regs *regs) /* Thermal interrupt handler for this CPU setup */
{
printk(KERN_ERR "CPU#%d: Unexpected LVT TMR interrupt!\n", smp_processor_id());
}
/*
* Thermal interrupt handler for this CPU setup
*/
static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt;
asmlinkage void smp_thermal_interrupt(struct pt_regs regs) asmlinkage void smp_thermal_interrupt(struct pt_regs regs)
...@@ -83,7 +69,6 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs regs) ...@@ -83,7 +69,6 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs regs)
} }
/* P4/Xeon Thermal regulation detect and init */ /* P4/Xeon Thermal regulation detect and init */
static void __init intel_init_thermal(struct cpuinfo_x86 *c) static void __init intel_init_thermal(struct cpuinfo_x86 *c)
{ {
u32 l, h; u32 l, h;
...@@ -101,7 +86,7 @@ static void __init intel_init_thermal(struct cpuinfo_x86 *c) ...@@ -101,7 +86,7 @@ static void __init intel_init_thermal(struct cpuinfo_x86 *c)
* be some SMM goo which handles it, so we can't even put a handler * be some SMM goo which handles it, so we can't even put a handler
* since it might be delivered via SMI already -zwanem. * since it might be delivered via SMI already -zwanem.
*/ */
rdmsr(MSR_IA32_MISC_ENABLE, l, h); rdmsr (MSR_IA32_MISC_ENABLE, l, h);
h = apic_read(APIC_LVTTHMR); h = apic_read(APIC_LVTTHMR);
if ((l & (1<<3)) && (h & APIC_DM_SMI)) { if ((l & (1<<3)) && (h & APIC_DM_SMI)) {
printk(KERN_DEBUG "CPU#%d: Thermal monitoring handled by SMI\n", cpu); printk(KERN_DEBUG "CPU#%d: Thermal monitoring handled by SMI\n", cpu);
...@@ -120,25 +105,24 @@ static void __init intel_init_thermal(struct cpuinfo_x86 *c) ...@@ -120,25 +105,24 @@ static void __init intel_init_thermal(struct cpuinfo_x86 *c)
h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
apic_write_around(APIC_LVTTHMR, h); apic_write_around(APIC_LVTTHMR, h);
rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); rdmsr (MSR_IA32_THERM_INTERRUPT, l, h);
wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
/* ok we're good to go... */ /* ok we're good to go... */
vendor_thermal_interrupt = intel_thermal_interrupt; vendor_thermal_interrupt = intel_thermal_interrupt;
rdmsr(MSR_IA32_MISC_ENABLE, l, h); rdmsr (MSR_IA32_MISC_ENABLE, l, h);
wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h); wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h);
l = apic_read(APIC_LVTTHMR); l = apic_read (APIC_LVTTHMR);
apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
printk(KERN_INFO "CPU#%d: Thermal monitoring enabled\n", cpu); printk (KERN_INFO "CPU#%d: Thermal monitoring enabled\n", cpu);
return; return;
} }
#endif /* CONFIG_X86_MCE_P4THERMAL */ #endif /* CONFIG_X86_MCE_P4THERMAL */
/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
static int inline intel_get_extended_msrs(struct intel_mce_extended_msrs *r) static int inline intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
{ {
u32 h; u32 h;
...@@ -146,16 +130,16 @@ static int inline intel_get_extended_msrs(struct intel_mce_extended_msrs *r) ...@@ -146,16 +130,16 @@ static int inline intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
if (mce_num_extended_msrs == 0) if (mce_num_extended_msrs == 0)
goto done; goto done;
rdmsr(MSR_IA32_MCG_EAX, r->eax, h); rdmsr (MSR_IA32_MCG_EAX, r->eax, h);
rdmsr(MSR_IA32_MCG_EBX, r->ebx, h); rdmsr (MSR_IA32_MCG_EBX, r->ebx, h);
rdmsr(MSR_IA32_MCG_ECX, r->ecx, h); rdmsr (MSR_IA32_MCG_ECX, r->ecx, h);
rdmsr(MSR_IA32_MCG_EDX, r->edx, h); rdmsr (MSR_IA32_MCG_EDX, r->edx, h);
rdmsr(MSR_IA32_MCG_ESI, r->esi, h); rdmsr (MSR_IA32_MCG_ESI, r->esi, h);
rdmsr(MSR_IA32_MCG_EDI, r->edi, h); rdmsr (MSR_IA32_MCG_EDI, r->edi, h);
rdmsr(MSR_IA32_MCG_EBP, r->ebp, h); rdmsr (MSR_IA32_MCG_EBP, r->ebp, h);
rdmsr(MSR_IA32_MCG_ESP, r->esp, h); rdmsr (MSR_IA32_MCG_ESP, r->esp, h);
rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h); rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h);
rdmsr(MSR_IA32_MCG_EIP, r->eip, h); rdmsr (MSR_IA32_MCG_EIP, r->eip, h);
/* can we rely on kmalloc to do a dynamic /* can we rely on kmalloc to do a dynamic
* allocation for the reserved registers? * allocation for the reserved registers?
...@@ -164,10 +148,6 @@ static int inline intel_get_extended_msrs(struct intel_mce_extended_msrs *r) ...@@ -164,10 +148,6 @@ static int inline intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
return mce_num_extended_msrs; return mce_num_extended_msrs;
} }
/*
* Machine Check Handler For PII/PIII
*/
static void intel_machine_check(struct pt_regs * regs, long error_code) static void intel_machine_check(struct pt_regs * regs, long error_code)
{ {
int recover=1; int recover=1;
...@@ -176,329 +156,106 @@ static void intel_machine_check(struct pt_regs * regs, long error_code) ...@@ -176,329 +156,106 @@ static void intel_machine_check(struct pt_regs * regs, long error_code)
int i; int i;
struct intel_mce_extended_msrs dbg; struct intel_mce_extended_msrs dbg;
rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
if(mcgstl&(1<<0)) /* Recoverable ? */ if (mcgstl & (1<<0)) /* Recoverable ? */
recover=0; recover=0;
printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", smp_processor_id(), mcgsth, mcgstl); printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
smp_processor_id(), mcgsth, mcgstl);
if (intel_get_extended_msrs(&dbg)) { if (intel_get_extended_msrs(&dbg)) {
printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n", printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n",
smp_processor_id(), dbg.eip, dbg.eflags); smp_processor_id(), dbg.eip, dbg.eflags);
printk(KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n", printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n",
dbg.eax, dbg.ebx, dbg.ecx, dbg.edx); dbg.eax, dbg.ebx, dbg.ecx, dbg.edx);
printk(KERN_DEBUG "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", printk (KERN_DEBUG "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
dbg.esi, dbg.edi, dbg.ebp, dbg.esp); dbg.esi, dbg.edi, dbg.ebp, dbg.esp);
} }
for (i=0;i<banks;i++) { for (i=0; i<nr_mce_banks; i++) {
rdmsr(MSR_IA32_MC0_STATUS+i*4,low, high); rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
if(high&(1<<31)) { if (high & (1<<31)) {
if(high&(1<<29)) if (high & (1<<29))
recover|=1; recover |= 1;
if(high&(1<<25)) if (high & (1<<25))
recover|=2; recover |= 2;
printk(KERN_EMERG "Bank %d: %08x%08x", i, high, low); printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
high&=~(1<<31); high &= ~(1<<31);
if(high&(1<<27)) { if (high & (1<<27)) {
rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
printk("[%08x%08x]", ahigh, alow); printk ("[%08x%08x]", ahigh, alow);
} }
if(high&(1<<26)) { if (high & (1<<26)) {
rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
printk(" at %08x%08x", ahigh, alow); printk (" at %08x%08x", ahigh, alow);
} }
printk("\n"); printk ("\n");
/* Clear it */
wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
/* Serialize */
wmb();
} }
} }
if(recover&2) if (recover & 2)
panic("CPU context corrupt"); panic ("CPU context corrupt");
if(recover&1) if (recover & 1)
panic("Unable to continue"); panic ("Unable to continue");
printk(KERN_EMERG "Attempting to continue.\n");
mcgstl&=~(1<<2);
wrmsr(MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
}
/*
* Machine check handler for Pentium class Intel
*/
static void pentium_machine_check(struct pt_regs * regs, long error_code)
{
u32 loaddr, hi, lotype;
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype);
if(lotype&(1<<5))
printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id());
}
/*
* Machine check handler for WinChip C6
*/
static void winchip_machine_check(struct pt_regs * regs, long error_code)
{
printk(KERN_EMERG "CPU#%d: Machine Check Exception.\n", smp_processor_id());
}
/*
* Handle unconfigured int18 (should never happen)
*/
static void unexpected_machine_check(struct pt_regs * regs, long error_code)
{
printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
}
/*
* Call the installed machine check handler for this CPU setup.
*/
static void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;
asmlinkage void do_machine_check(struct pt_regs * regs, long error_code)
{
machine_check_vector(regs, error_code);
}
#ifdef CONFIG_X86_MCE_NONFATAL
static struct timer_list mce_timer;
static int timerset = 0;
#define MCE_RATE 15*HZ /* timer rate is 15s */
static void mce_checkregs (void *info)
{
u32 low, high;
int i;
preempt_disable();
for (i=0; i<banks; i++) {
rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
if ((low | high) != 0) {
printk (KERN_EMERG "MCE: The hardware reports a non fatal, correctable incident occured on CPU %d.\n", smp_processor_id());
printk (KERN_EMERG "Bank %d: %08x%08x\n", i, high, low);
/* Scrub the error so we don't pick it up in MCE_RATE seconds time. */
wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
printk(KERN_EMERG "Attempting to continue.\n");
/*
* Do not clear the MSR_IA32_MCi_STATUS if the error is not
* recoverable/continuable.This will allow BIOS to look at the MSRs
* for errors if the OS could not log the error.
*/
for (i=0; i<nr_mce_banks; i++) {
u32 msr;
msr = MSR_IA32_MC0_STATUS+i*4;
rdmsr (msr, low, high);
if (high&(1<<31)) {
/* Clear it */
wrmsr(msr, 0UL, 0UL);
/* Serialize */ /* Serialize */
wmb(); wmb();
} }
} }
preempt_enable(); mcgstl &= ~(1<<2);
wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
} }
static void do_mce_timer(void *data)
{
smp_call_function (mce_checkregs, NULL, 1, 1);
}
static DECLARE_WORK(mce_work, do_mce_timer, NULL); void __init intel_p4_mcheck_init(struct cpuinfo_x86 *c)
static void mce_timerfunc (unsigned long data)
{
#ifdef CONFIG_SMP
if (num_online_cpus() > 1)
schedule_work(&mce_work);
#else
mce_checkregs(NULL);
#endif
mce_timer.expires = jiffies + MCE_RATE;
add_timer (&mce_timer);
}
#endif
/*
* Set up machine check reporting for processors with Intel style MCE
*/
static void __init intel_mcheck_init(struct cpuinfo_x86 *c)
{ {
u32 l, h; u32 l, h;
int i; int i;
static int done;
/*
* Check for MCE support
*/
if( !cpu_has(c, X86_FEATURE_MCE) )
return;
/*
* Pentium machine check
*/
if(c->x86 == 5)
{
/* Default P5 to off as its often misconnected */
if(mce_disabled != -1)
return;
machine_check_vector = pentium_machine_check;
wmb();
/* Read registers before enabling */
rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
if(done==0)
printk(KERN_INFO "Intel old style machine check architecture supported.\n");
/* Enable MCE */
set_in_cr4(X86_CR4_MCE);
printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id());
return;
}
/*
* Check for PPro style MCA
*/
if( !cpu_has(c, X86_FEATURE_MCA) )
return;
/* Ok machine check is available */
machine_check_vector = intel_machine_check; machine_check_vector = intel_machine_check;
wmb(); wmb();
if(done==0) printk (KERN_INFO "Intel machine check architecture supported.\n");
printk(KERN_INFO "Intel machine check architecture supported.\n"); rdmsr (MSR_IA32_MCG_CAP, l, h);
rdmsr(MSR_IA32_MCG_CAP, l, h); if (l & (1<<8)) /* Control register present ? */
if(l&(1<<8)) /* Control register present ? */ wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); nr_mce_banks = l & 0xff;
banks = l&0xff;
for (i=0; i<nr_mce_banks; i++) {
/* Don't enable bank 0 on intel P6 cores, it goes bang quickly. */ wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
if (c->x86_vendor == X86_VENDOR_INTEL && c->x86 == 6) { wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
for(i=1; i<banks; i++)
wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
} else {
for(i=0; i<banks; i++)
wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
} }
for(i=0; i<banks; i++) set_in_cr4 (X86_CR4_MCE);
wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
smp_processor_id());
set_in_cr4(X86_CR4_MCE); /* Check for P4/Xeon extended MCE MSRs */
printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", smp_processor_id()); rdmsr (MSR_IA32_MCG_CAP, l, h);
if (l & (1<<9)) {/* MCG_EXT_P */
mce_num_extended_msrs = (l >> 16) & 0xff;
printk (KERN_INFO "CPU#%d: Intel P4/Xeon Extended MCE MSRs (%d) available\n",
smp_processor_id(), mce_num_extended_msrs);
/*
* Check for P4/Xeon specific MCE extensions
*/
if (c->x86_vendor == X86_VENDOR_INTEL && c->x86 == 15) {
/* Check for P4/Xeon extended MCE MSRs */
rdmsr(MSR_IA32_MCG_CAP, l, h);
if (l & (1<<9)) {/* MCG_EXT_P */
mce_num_extended_msrs = (l >> 16) & 0xff;
printk(KERN_INFO "CPU#%d: Intel P4/Xeon Extended MCE MSRs (%d) available\n",
smp_processor_id(), mce_num_extended_msrs);
}
#ifdef CONFIG_X86_MCE_P4THERMAL #ifdef CONFIG_X86_MCE_P4THERMAL
/* Check for P4/Xeon Thermal monitor */ /* Check for P4/Xeon Thermal monitor */
intel_init_thermal(c); intel_init_thermal(c);
#endif #endif
} }
done=1;
}
/*
* Set up machine check reporting on the Winchip C6 series
*/
static void __init winchip_mcheck_init(struct cpuinfo_x86 *c)
{
u32 lo, hi;
/* Not supported on C3 */
if(c->x86 != 5)
return;
/* Winchip C6 */
machine_check_vector = winchip_machine_check;
wmb();
rdmsr(MSR_IDT_FCR1, lo, hi);
lo|= (1<<2); /* Enable EIERRINT (int 18 MCE) */
lo&= ~(1<<4); /* Enable MCE */
wrmsr(MSR_IDT_FCR1, lo, hi);
set_in_cr4(X86_CR4_MCE);
printk(KERN_INFO "Winchip machine check reporting enabled on CPU#%d.\n", smp_processor_id());
}
/*
* This has to be run for each processor
*/
void __init mcheck_init(struct cpuinfo_x86 *c)
{
if(mce_disabled==1)
return;
switch(c->x86_vendor)
{
case X86_VENDOR_AMD:
/* AMD K7 machine check is Intel like */
if(c->x86 == 6 || c->x86 == 15) {
intel_mcheck_init(c);
#ifdef CONFIG_X86_MCE_NONFATAL #ifdef CONFIG_X86_MCE_NONFATAL
if (timerset == 0) { init_nonfatal_mce_checker();
/* Set the timer to check for non-fatal
errors every MCE_RATE seconds */
init_timer (&mce_timer);
mce_timer.expires = jiffies + MCE_RATE;
mce_timer.data = 0;
mce_timer.function = &mce_timerfunc;
add_timer (&mce_timer);
timerset = 1;
printk(KERN_INFO "Machine check exception polling timer started.\n");
}
#endif #endif
}
break;
case X86_VENDOR_INTEL:
intel_mcheck_init(c);
break;
case X86_VENDOR_CENTAUR:
winchip_mcheck_init(c);
break;
default:
break;
}
} }
static int __init mcheck_disable(char *str)
{
mce_disabled = 1;
return 0;
}
static int __init mcheck_enable(char *str)
{
mce_disabled = -1;
return 0;
}
__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);
#else
asmlinkage void do_machine_check(struct pt_regs * regs, long error_code) {}
asmlinkage void smp_thermal_interrupt(struct pt_regs regs) {}
void __init mcheck_init(struct cpuinfo_x86 *c) {}
#endif
/*
* P5 specific Machine Check Exception Reporting
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
/*
 * Machine check handler for Pentium class Intel CPUs.
 *
 * Reads the P5 machine check address and type MSRs and logs their low
 * words.  When bit 5 of the type word is set, a second line warning about
 * a possible thermal failure is printed.  regs and error_code are unused.
 */
static void pentium_machine_check(struct pt_regs * regs, long error_code)
{
	u32 mc_addr, mc_type, unused_hi;

	rdmsr(MSR_IA32_P5_MC_ADDR, mc_addr, unused_hi);
	rdmsr(MSR_IA32_P5_MC_TYPE, mc_type, unused_hi);

	printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), mc_addr, mc_type);

	if (!(mc_type & (1<<5)))
		return;

	printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id());
}
/*
 * Set up old style (Pentium) machine check reporting.
 *
 * Does nothing unless the CPU advertises the MCE feature and the user
 * explicitly asked for it (mce_disabled == -1): P5 defaults to off as it
 * is often misconnected.  Otherwise pentium_machine_check is installed,
 * both P5 machine check MSRs are read once, and CR4.MCE is set.
 *
 * c: CPU description used for the feature check.
 */
void __init intel_p5_mcheck_init(struct cpuinfo_x86 *c)
{
	u32 scratch_lo, scratch_hi;

	/* Need MCE support, and an explicit opt-in from the command line. */
	if (!cpu_has(c, X86_FEATURE_MCE) || mce_disabled != -1)
		return;

	machine_check_vector = pentium_machine_check;
	wmb();

	/* Read registers before enabling */
	rdmsr(MSR_IA32_P5_MC_ADDR, scratch_lo, scratch_hi);
	rdmsr(MSR_IA32_P5_MC_TYPE, scratch_lo, scratch_hi);
	printk(KERN_INFO "Intel old style machine check architecture supported.\n");

	/* Enable MCE */
	set_in_cr4(X86_CR4_MCE);
	printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id());
}
/*
* P6 specific Machine Check Exception Reporting
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
/*
 * Machine check handler for PII/PIII.
 *
 * Pass 1 reads MCG_STATUS and prints every bank below nr_mce_banks whose
 * status high word has bit 31 set (plus MISC when bit 27 is set and ADDR
 * when bit 26 is set).  The handler then panics when bit 0 of MCG_STATUS
 * is clear, or when any logged bank has bit 29 or bit 25 set.
 *
 * Only if execution continues does pass 2 clear the logged banks, so the
 * registers stay intact for the BIOS when the error is not recoverable.
 * Finally bit 2 of MCG_STATUS is cleared.
 *
 * regs and error_code are not used.
 */
static void intel_machine_check(struct pt_regs * regs, long error_code)
{
	u32 mcg_lo, mcg_hi;
	u32 st_lo, st_hi;
	u32 aux_lo, aux_hi;
	int fatal = 1;		/* bit 0: cannot continue, bit 1: context corrupt */
	int bank;

	rdmsr (MSR_IA32_MCG_STATUS, mcg_lo, mcg_hi);
	if (mcg_lo & (1<<0))	/* Recoverable ? */
		fatal = 0;

	printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
		smp_processor_id(), mcg_hi, mcg_lo);

	/* Pass 1: report what each bank logged. */
	for (bank = 0; bank < nr_mce_banks; bank++) {
		rdmsr (MSR_IA32_MC0_STATUS + bank*4, st_lo, st_hi);
		if (!(st_hi & (1<<31)))
			continue;

		if (st_hi & (1<<29))
			fatal |= 1;
		if (st_hi & (1<<25))
			fatal |= 2;

		printk (KERN_EMERG "Bank %d: %08x%08x", bank, st_hi, st_lo);

		if (st_hi & (1<<27)) {
			rdmsr (MSR_IA32_MC0_MISC + bank*4, aux_lo, aux_hi);
			printk ("[%08x%08x]", aux_hi, aux_lo);
		}
		if (st_hi & (1<<26)) {
			rdmsr (MSR_IA32_MC0_ADDR + bank*4, aux_lo, aux_hi);
			printk (" at %08x%08x", aux_hi, aux_lo);
		}
		printk ("\n");
	}

	if (fatal & 2)
		panic ("CPU context corrupt");
	if (fatal & 1)
		panic ("Unable to continue");

	printk (KERN_EMERG "Attempting to continue.\n");

	/*
	 * Pass 2: do not clear the MSR_IA32_MCi_STATUS if the error is not
	 * recoverable/continuable (we would have panicked above).  This will
	 * allow BIOS to look at the MSRs for errors if the OS could not log
	 * the error.
	 */
	for (bank = 0; bank < nr_mce_banks; bank++) {
		unsigned int status_msr = MSR_IA32_MC0_STATUS + bank*4;

		rdmsr (status_msr, st_lo, st_hi);
		if (st_hi & (1<<31)) {
			/* Clear it */
			wrmsr (status_msr, 0UL, 0UL);
			/* Serialize */
			wmb();
		}
	}

	mcg_lo &= ~(1<<2);
	wrmsr (MSR_IA32_MCG_STATUS, mcg_lo, mcg_hi);
}
/*
 * Set up machine check reporting for Intel P6 family processors.
 *
 * Bails out unless the CPU advertises both MCE and PPro style MCA.
 * Otherwise it installs intel_machine_check, enables the global control
 * register when MCG_CAP bit 8 says it exists, records the bank count from
 * the low byte of MCG_CAP in nr_mce_banks, enables reporting on banks 1
 * and up, clears the status of every bank, and sets CR4.MCE.
 *
 * c: CPU description used for the feature checks.
 */
void __init intel_p6_mcheck_init(struct cpuinfo_x86 *c)
{
	u32 l, h;
	int i;

	/* Check for MCE support */
	if (!cpu_has(c, X86_FEATURE_MCE))
		return;

	/* Check for PPro style MCA */
	if (!cpu_has(c, X86_FEATURE_MCA))
		return;

	/* Ok machine check is available */
	machine_check_vector = intel_machine_check;
	wmb();

	printk (KERN_INFO "Intel machine check architecture supported.\n");
	rdmsr (MSR_IA32_MCG_CAP, l, h);
	if (l & (1<<8))	/* Control register present ? */
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
	nr_mce_banks = l & 0xff;

	/* Don't enable bank 0 on intel P6 cores, it goes bang quickly. */
	for (i=1; i<nr_mce_banks; i++)
		wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);

	/*
	 * Clear the status of every bank, bank 0 included: the handler scans
	 * from bank 0, so a stale bank 0 status left over from before boot
	 * would otherwise be reported as a fresh error.
	 */
	for (i=0; i<nr_mce_banks; i++)
		wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);

	set_in_cr4 (X86_CR4_MCE);
	printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
		smp_processor_id());
}
/*
* IDT Winchip specific Machine Check Exception Reporting
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
/*
 * Machine check handler for WinChip C6.
 * Only logs that an exception occurred; the CPU number in the message is
 * hard-coded as 0.  regs and error_code are not used.
 */
static void winchip_machine_check(struct pt_regs * regs, long error_code)
{
printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
}
/*
 * Set up machine check reporting on the Winchip C6 series.
 *
 * Installs winchip_machine_check, then updates the low word of
 * MSR_IDT_FCR1 in one read-modify-write: bit 2 is set (enable EIERRINT,
 * int 18 MCE) and bit 4 is cleared (enable MCE).  Finally CR4.MCE is set.
 *
 * c is not used.
 */
void __init winchip_mcheck_init(struct cpuinfo_x86 *c)
{
	u32 fcr_lo, fcr_hi;

	machine_check_vector = winchip_machine_check;
	wmb();

	rdmsr(MSR_IDT_FCR1, fcr_lo, fcr_hi);
	fcr_lo = (fcr_lo | (1<<2)) & ~(1<<4);
	wrmsr(MSR_IDT_FCR1, fcr_lo, fcr_hi);

	set_in_cr4(X86_CR4_MCE);
	printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n");
}
...@@ -471,10 +471,12 @@ ENTRY(page_fault) ...@@ -471,10 +471,12 @@ ENTRY(page_fault)
pushl $do_page_fault pushl $do_page_fault
jmp error_code jmp error_code
#ifdef CONFIG_X86_MCE
ENTRY(machine_check) ENTRY(machine_check)
pushl $0 pushl $0
pushl $do_machine_check pushl $do_machine_check
jmp error_code jmp error_code
#endif
ENTRY(spurious_interrupt_bug) ENTRY(spurious_interrupt_bug)
pushl $0 pushl $0
......
...@@ -906,7 +906,9 @@ void __init trap_init(void) ...@@ -906,7 +906,9 @@ void __init trap_init(void)
set_trap_gate(15,&spurious_interrupt_bug); set_trap_gate(15,&spurious_interrupt_bug);
set_trap_gate(16,&coprocessor_error); set_trap_gate(16,&coprocessor_error);
set_trap_gate(17,&alignment_check); set_trap_gate(17,&alignment_check);
#ifdef CONFIG_X86_MCE
set_trap_gate(18,&machine_check); set_trap_gate(18,&machine_check);
#endif
set_trap_gate(19,&simd_coprocessor_error); set_trap_gate(19,&simd_coprocessor_error);
set_system_gate(SYSCALL_VECTOR,&system_call); set_system_gate(SYSCALL_VECTOR,&system_call);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment