I've a lot of pending patches to bluesmoke (splitting it all up
into per-vendor files). This patch will make that a real PITA to
rework. Can you take a look at the work in 2.5-ac and diff against
that instead ?
                Dave
--- linux-2.5.44-ac/arch/i386/kernel/cpu/mcheck/p4.c.org	2002-11-01
15:57:09.000000000 -0800
+++ linux-2.5.44-ac/arch/i386/kernel/cpu/mcheck/p4.c	2002-11-01
19:45:34.000000000 -0800
@@ -5,8 +5,10 @@
 #include <linux/init.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
+#include <linux/jiffies.h>
 #include <linux/config.h>
 #include <linux/irq.h>
+#include <linux/workqueue.h>
 #include <linux/interrupt.h>
 #include <linux/smp.h>
 
@@ -190,10 +192,6 @@
 				printk(" at %08x%08x", ahigh, alow);
 			}
 			printk("\n");
-			/* Clear it */
-			wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
-			/* Serialize */
-			wmb();
 		}
 	}
 
@@ -201,11 +199,78 @@
 		panic("CPU context corrupt");
 	if(recover&1)
 		panic("Unable to continue");
+
 	printk(KERN_EMERG "Attempting to continue.\n");
+	/* 
+	 * Do not clear the MSR_IA32_MCi_STATUS if the error is not 
+	 * recoverable/continuable.This will allow BIOS to look at the MSRs
+	 * for errors if the OS could not log the error.
+	 */
+	for (i=0;i<banks;i++) {
+		unsigned int msr;
+		msr = MSR_IA32_MC0_STATUS+i*4;
+		rdmsr(msr,low, high);
+		if(high&(1<<31)) {
+			/* Clear it */
+			wrmsr(msr, 0UL, 0UL);
+			/* Serialize */
+			wmb();
+		}
+	}
 	mcgstl&=~(1<<2);
 	wrmsr(MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
 }
 
+#ifdef CONFIG_X86_MCE_NONFATAL
+static struct timer_list mce_timer;
+static int timerset = 0;
+
+#define MCE_RATE	15*HZ	/* timer rate is 15s */
+
+static void mce_checkregs (void *info)
+{
+	u32 low, high;
+	int i;
+
+	preempt_disable(); 
+	for (i=0; i<banks; i++) {
+		rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
+
+		if (high&(1<<31)) {
+			printk (KERN_EMERG "MCE: The hardware reports a non
fatal, correctable incident occured on CPU %d.\n", smp_processor_id());
+			printk (KERN_EMERG "Bank %d: %08x%08x\n", i, high,
low);
+
+			/* Scrub the error so we don't pick it up in
MCE_RATE seconds time. */
+			wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
+
+			/* Serialize */
+			wmb();
+		}
+	}
+	preempt_enable();
+}
+
+static void do_mce_timer(void *data)
+{ 
+	mce_checkregs(NULL);
+	smp_call_function (mce_checkregs, NULL, 1, 1);
+} 
+
+static DECLARE_WORK(mce_work, do_mce_timer, NULL);
+
+static void mce_timerfunc (unsigned long data)
+{
+#ifdef CONFIG_SMP
+	if (num_online_cpus() > 1) 
+		schedule_work(&mce_work); 
+#else
+	mce_checkregs(NULL);
+#endif	/* SMP */
+	mce_timer.expires = jiffies + MCE_RATE;
+	add_timer (&mce_timer);
+}	
+#endif	/* NON_FATAL */
+
 void __init intel_p4_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
@@ -241,4 +306,17 @@
 		intel_init_thermal(c);
 #endif
 	}
+#ifdef CONFIG_X86_MCE_NONFATAL
+	if (timerset == 0) {
+		/* Set the timer to check for non-fatal
+		   errors every MCE_RATE seconds */
+		init_timer (&mce_timer);
+		mce_timer.expires = jiffies + MCE_RATE;
+		mce_timer.data = 0;
+		mce_timer.function = &mce_timerfunc;
+		add_timer (&mce_timer);
+		timerset = 1;
+		printk(KERN_INFO "Machine check exception polling timer
started.\n");
+	}
+#endif
 }
--- linux-2.5.44-ac/arch/i386/kernel/cpu/mcheck/p6.c.org	2002-11-01
15:57:28.000000000 -0800
+++ linux-2.5.44-ac/arch/i386/kernel/cpu/mcheck/p6.c	2002-11-01
19:24:07.000000000 -0800
@@ -49,10 +49,6 @@
 				printk(" at %08x%08x", ahigh, alow);
 			}
 			printk("\n");
-			/* Clear it */
-			wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
-			/* Serialize */
-			wmb();
 		}
 	}
 
@@ -60,7 +56,24 @@
 		panic("CPU context corrupt");
 	if(recover&1)
 		panic("Unable to continue");
+
 	printk(KERN_EMERG "Attempting to continue.\n");
+	/* 
+	 * Do not clear the MSR_IA32_MCi_STATUS if the error is not 
+	 * recoverable/continuable.This will allow BIOS to look at the MSRs
+	 * for errors if the OS could not log the error.
+	 */
+	for (i=0;i<banks;i++) {
+		unsigned int msr;
+		msr = MSR_IA32_MC0_STATUS+i*4;
+		rdmsr(msr,low, high);
+		if(high&(1<<31)) {
+			/* Clear it */
+			wrmsr(msr, 0UL, 0UL);
+			/* Serialize */
+			wmb();
+		}
+	}
 	mcgstl&=~(1<<2);
 	wrmsr(MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
 }
--- linux-2.5.44-ac/arch/i386/config.in.org	2002-11-01
19:06:54.000000000 -0800
+++ linux-2.5.44-ac/arch/i386/config.in	2002-11-01 18:38:49.000000000 -0800
@@ -212,7 +212,7 @@
 fi
 
 bool 'Machine Check Exception' CONFIG_X86_MCE
-dep_bool '  Check for non-fatal errors on Athlon/Duron'
CONFIG_X86_MCE_NONFATAL $CONFIG_X86_MCE
+dep_bool '  Check for non-fatal errors on Athlon/Duron/Pentium-4'
CONFIG_X86_MCE_NONFATAL $CONFIG_X86_MCE
 if [ "$CONFIG_SMP" = "y" ]; then
     dep_bool '  Check for P4 thermal throttling interrupt.'
CONFIG_X86_MCE_P4THERMAL $CONFIG_X86_MCE
 else
 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/