From: "Yu, Luming" Lockless MCE (machine check exception) is ported from X86-64 to i386. The first patch deletes some files and moves some files to other directories but doesn't change the content of any file. Signed-off-by: Guo, Racing Cc: Andi Kleen Signed-off-by: Andrew Morton --- dev/null | 1323 -------------------------------- arch/i386/kernel/cpu/mcheck/init.c | 77 + arch/i386/kernel/cpu/mcheck/mce.c | 553 ++++++++++++- arch/i386/kernel/cpu/mcheck/mce.h | 86 +- arch/i386/kernel/cpu/mcheck/mce_intel.c | 99 ++ 5 files changed, 764 insertions(+), 1374 deletions(-) diff -puN /dev/null arch/i386/kernel/cpu/mcheck/init.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/arch/i386/kernel/cpu/mcheck/init.c 2005-04-26 20:37:51.175449840 -0700 @@ -0,0 +1,77 @@ +/* + * mce.c - x86 Machine Check Exception Reporting + * (c) 2002 Alan Cox , Dave Jones + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "mce.h" + +int mce_disabled __initdata = 0; +int nr_mce_banks; + +EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ + +/* Handle unconfigured int18 (should never happen) */ +static fastcall void unexpected_machine_check(struct pt_regs * regs, long error_code) +{ + printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. 
*/ +void fastcall (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; + +/* This has to be run for each processor */ +void __init mcheck_init(struct cpuinfo_x86 *c) +{ + if (mce_disabled==1) + return; + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + if (c->x86==6 || c->x86==15) + amd_mcheck_init(c); + break; + + case X86_VENDOR_INTEL: + if (c->x86==5) + intel_p5_mcheck_init(c); + if (c->x86==6) + intel_p6_mcheck_init(c); + if (c->x86==15) + intel_p4_mcheck_init(c); + break; + + case X86_VENDOR_CENTAUR: + if (c->x86==5) + winchip_mcheck_init(c); + break; + + default: + break; + } +} + +static int __init mcheck_disable(char *str) +{ + mce_disabled = 1; + return 0; +} + +static int __init mcheck_enable(char *str) +{ + mce_disabled = -1; + return 0; +} + +__setup("nomce", mcheck_disable); +__setup("mce", mcheck_enable); diff -L arch/i386/kernel/cpu/mcheck/k7.c -puN arch/i386/kernel/cpu/mcheck/k7.c~x86-port-lockless-mce-preparation /dev/null --- 25/arch/i386/kernel/cpu/mcheck/k7.c +++ /dev/null 2003-09-15 06:40:47.000000000 -0700 @@ -1,97 +0,0 @@ -/* - * Athlon/Hammer specific Machine Check Exception Reporting - * (C) Copyright 2002 Dave Jones - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "mce.h" - -/* Machine Check Handler For AMD Athlon/Duron */ -static fastcall void k7_machine_check(struct pt_regs * regs, long error_code) -{ - int recover=1; - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int i; - - rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? */ - recover=0; - - printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - for (i=1; i, Dave Jones + * Machine check handler. + * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Rest from unknown author(s). + * 2004 Andi Kleen. Rewrote most of it. 
*/ #include #include #include -#include -#include -#include -#include - +#include +#include +#include +#include +#include +#include +#include #include -#include +#include +#include +#include +#include + +#define MISC_MCELOG_MINOR 227 +#define NR_BANKS 5 + +static int mce_dont_init; + +/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic, + 3: never panic or exit (for testing only) */ +static int tolerant = 1; +static int banks; +static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; +static unsigned long console_logged; +static int notify_user; -#include "mce.h" +/* + * Lockless MCE logging infrastructure. + * This avoids deadlocks on printk locks without having to break locks. Also + * separate MCEs from kernel messages to avoid bogus bug reports. + */ -int mce_disabled __initdata = 0; -int nr_mce_banks; +struct mce_log mcelog = { + MCE_LOG_SIGNATURE, + MCE_LOG_LEN, +}; -EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ +void mce_log(struct mce *mce) +{ + unsigned next, entry; + mce->finished = 0; + smp_wmb(); + for (;;) { + entry = rcu_dereference(mcelog.next); + /* When the buffer fills up discard new entries. Assume + that the earlier errors are the more interesting. */ + if (entry >= MCE_LOG_LEN) { + set_bit(MCE_OVERFLOW, &mcelog.flags); + return; + } + /* Old left over entry. Skip. */ + if (mcelog.entry[entry].finished) + continue; + smp_rmb(); + next = entry + 1; + if (cmpxchg(&mcelog.next, entry, next) == entry) + break; + } + memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); + smp_wmb(); + mcelog.entry[entry].finished = 1; + smp_wmb(); -/* Handle unconfigured int18 (should never happen) */ -static fastcall void unexpected_machine_check(struct pt_regs * regs, long error_code) -{ - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id()); + if (!test_and_set_bit(0, &console_logged)) + notify_user = 1; } -/* Call the installed machine check handler for this CPU setup. 
*/ -void fastcall (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; +static void print_mce(struct mce *m) +{ + printk(KERN_EMERG "\n" + KERN_EMERG + "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", + m->cpu, m->mcgstatus, m->bank, m->status); + if (m->rip) { + printk(KERN_EMERG + "RIP%s %02x:<%016Lx> ", + !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", + m->cs, m->rip); + if (m->cs == __KERNEL_CS) + print_symbol("{%s}", m->rip); + printk("\n"); + } + printk(KERN_EMERG "TSC %Lx ", m->tsc); + if (m->addr) + printk("ADDR %Lx ", m->addr); + if (m->misc) + printk("MISC %Lx ", m->misc); + printk("\n"); +} -/* This has to be run for each processor */ -void __init mcheck_init(struct cpuinfo_x86 *c) +static void mce_panic(char *msg, struct mce *backup, unsigned long start) { - if (mce_disabled==1) + int i; + oops_begin(); + for (i = 0; i < MCE_LOG_LEN; i++) { + unsigned long tsc = mcelog.entry[i].tsc; + if (time_before(tsc, start)) + continue; + print_mce(&mcelog.entry[i]); + if (backup && mcelog.entry[i].tsc == backup->tsc) + backup = NULL; + } + if (backup) + print_mce(backup); + if (tolerant >= 3) + printk("Fake panic: %s\n", msg); + else + panic(msg); +} + +static int mce_available(struct cpuinfo_x86 *c) +{ + return test_bit(X86_FEATURE_MCE, &c->x86_capability) && + test_bit(X86_FEATURE_MCA, &c->x86_capability); +} + +/* + * The actual machine check handler + */ + +void do_machine_check(struct pt_regs * regs, long error_code) +{ + struct mce m, panicm; + int nowayout = (tolerant < 1); + int kill_it = 0; + u64 mcestart = 0; + int i; + int panicm_found = 0; + + if (regs) + notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL); + if (!banks) return; + memset(&m, 0, sizeof(struct mce)); + m.cpu = hard_smp_processor_id(); + rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + if (!(m.mcgstatus & MCG_STATUS_RIPV)) + kill_it = 1; + + rdtscll(mcestart); + barrier(); + + for (i = 0; i < banks; i++) { + if (!bank[i]) + 
continue; + + m.misc = 0; + m.addr = 0; + m.bank = i; + m.tsc = 0; + + rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); + if ((m.status & MCI_STATUS_VAL) == 0) + continue; + + if (m.status & MCI_STATUS_EN) { + /* In theory _OVER could be a nowayout too, but + assume any overflowed errors were no fatal. */ + nowayout |= !!(m.status & MCI_STATUS_PCC); + kill_it |= !!(m.status & MCI_STATUS_UC); + } + + if (m.status & MCI_STATUS_MISCV) + rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); + if (m.status & MCI_STATUS_ADDRV) + rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); + + if (regs && (m.mcgstatus & MCG_STATUS_RIPV)) { + m.rip = regs->rip; + m.cs = regs->cs; + } else { + m.rip = 0; + m.cs = 0; + } + + if (error_code != -1) + rdtscll(m.tsc); + wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0); + mce_log(&m); + + /* Did this bank cause the exception? */ + /* Assume that the bank with uncorrectable errors did it, + and that there is only a single one. */ + if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) { + panicm = m; + panicm_found = 1; + } + + tainted |= TAINT_MACHINE_CHECK; + } + + /* Never do anything final in the polling timer */ + if (!regs) + goto out; + + /* If we didn't find an uncorrectable error, pick + the last one (shouldn't happen, just being safe). */ + if (!panicm_found) + panicm = m; + if (nowayout) + mce_panic("Machine check", &panicm, mcestart); + if (kill_it) { + int user_space = 0; + + if (m.mcgstatus & MCG_STATUS_RIPV) + user_space = panicm.rip && (panicm.cs & 3); + + /* When the machine was in user space and the CPU didn't get + confused it's normally not necessary to panic, unless you + are paranoid (tolerant == 0) + + RED-PEN could be more tolerant for MCEs in idle, + but most likely they occur at boot anyways, where + it is best to just halt the machine. 
*/ + if ((!user_space && (panic_on_oops || tolerant < 2)) || + (unsigned)current->pid <= 1) + mce_panic("Uncorrected machine check", &panicm, mcestart); + + /* do_exit takes an awful lot of locks and has as + slight risk of deadlocking. If you don't want that + don't set tolerant >= 2 */ + if (tolerant < 3) + do_exit(SIGBUS); + } + + out: + /* Last thing done in the machine check exception to clear state. */ + wrmsrl(MSR_IA32_MCG_STATUS, 0); +} + +/* + * Periodic polling timer for "silent" machine check errors. + */ + +static int check_interval = 5 * 60; /* 5 minutes */ +static void mcheck_timer(void *data); +static DECLARE_WORK(mcheck_work, mcheck_timer, NULL); + +static void mcheck_check_cpu(void *info) +{ + if (mce_available(&current_cpu_data)) + do_machine_check(NULL, 0); +} + +static void mcheck_timer(void *data) +{ + on_each_cpu(mcheck_check_cpu, NULL, 1, 1); + schedule_delayed_work(&mcheck_work, check_interval * HZ); + + /* + * It's ok to read stale data here for notify_user and + * console_logged as we'll simply get the updated versions + * on the next mcheck_timer execution and atomic operations + * on console_logged act as synchronization for notify_user + * writes. + */ + if (notify_user && console_logged) { + notify_user = 0; + clear_bit(0, &console_logged); + printk(KERN_INFO "Machine check events logged\n"); + } +} + + +static __init int periodic_mcheck_init(void) +{ + if (check_interval) + schedule_delayed_work(&mcheck_work, check_interval*HZ); + return 0; +} +__initcall(periodic_mcheck_init); + + +/* + * Initialize Machine Checks for a CPU. + */ +static void mce_init(void *dummy) +{ + u64 cap; + int i; + + rdmsrl(MSR_IA32_MCG_CAP, cap); + banks = cap & 0xff; + if (banks > NR_BANKS) { + printk(KERN_INFO "MCE: warning: using only %d banks\n", banks); + banks = NR_BANKS; + } + + /* Log the machine checks left over from the previous reset. 
+ This also clears all registers */ + do_machine_check(NULL, -1); + + set_in_cr4(X86_CR4_MCE); + + if (cap & MCG_CTL_P) + wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); + + for (i = 0; i < banks; i++) { + wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } +} + +/* Add per CPU specific workarounds here */ +static void __init mce_cpu_quirks(struct cpuinfo_x86 *c) +{ + /* This should be disabled by the BIOS, but isn't always */ + if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) { + /* disable GART TBL walk error reporting, which trips off + incorrectly with the IOMMU & 3ware & Cerberus. */ + clear_bit(10, &bank[4]); + } +} + +static void __init mce_cpu_features(struct cpuinfo_x86 *c) +{ switch (c->x86_vendor) { - case X86_VENDOR_AMD: - if (c->x86==6 || c->x86==15) - amd_mcheck_init(c); - break; + case X86_VENDOR_INTEL: + mce_intel_feature_init(c); + break; + default: + break; + } +} - case X86_VENDOR_INTEL: - if (c->x86==5) - intel_p5_mcheck_init(c); - if (c->x86==6) - intel_p6_mcheck_init(c); - if (c->x86==15) - intel_p4_mcheck_init(c); - break; +/* + * Called for each booted CPU to set up machine checks. + * Must be called with preempt off. + */ +void __init mcheck_init(struct cpuinfo_x86 *c) +{ + static cpumask_t mce_cpus __initdata = CPU_MASK_NONE; - case X86_VENDOR_CENTAUR: - if (c->x86==5) - winchip_mcheck_init(c); - break; + mce_cpu_quirks(c); - default: - break; + if (mce_dont_init || + cpu_test_and_set(smp_processor_id(), mce_cpus) || + !mce_available(c)) + return; + + mce_init(NULL); + mce_cpu_features(c); +} + +/* + * Character device to read and clear the MCE log. 
+ */ + +static void collect_tscs(void *data) +{ + unsigned long *cpu_tsc = (unsigned long *)data; + rdtscll(cpu_tsc[smp_processor_id()]); +} + +static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) +{ + unsigned long cpu_tsc[NR_CPUS]; + static DECLARE_MUTEX(mce_read_sem); + unsigned next; + char __user *buf = ubuf; + int i, err; + + down(&mce_read_sem); + next = rcu_dereference(mcelog.next); + + /* Only supports full reads right now */ + if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { + up(&mce_read_sem); + return -EINVAL; + } + + err = 0; + for (i = 0; i < next; i++) { + if (!mcelog.entry[i].finished) + continue; + smp_rmb(); + err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); + buf += sizeof(struct mce); + } + + memset(mcelog.entry, 0, next * sizeof(struct mce)); + mcelog.next = 0; + + synchronize_kernel(); + + /* Collect entries that were still getting written before the synchronize. */ + + on_each_cpu(collect_tscs, cpu_tsc, 1, 1); + for (i = next; i < MCE_LOG_LEN; i++) { + if (mcelog.entry[i].finished && + mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { + err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce)); + smp_rmb(); + buf += sizeof(struct mce); + memset(&mcelog.entry[i], 0, sizeof(struct mce)); + } + } + up(&mce_read_sem); + return err ? 
-EFAULT : buf - ubuf; +} + +static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg) +{ + int __user *p = (int __user *)arg; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + switch (cmd) { + case MCE_GET_RECORD_LEN: + return put_user(sizeof(struct mce), p); + case MCE_GET_LOG_LEN: + return put_user(MCE_LOG_LEN, p); + case MCE_GETCLEAR_FLAGS: { + unsigned flags; + do { + flags = mcelog.flags; + } while (cmpxchg(&mcelog.flags, flags, 0) != flags); + return put_user(flags, p); + } + default: + return -ENOTTY; } } +static struct file_operations mce_chrdev_ops = { + .read = mce_read, + .ioctl = mce_ioctl, +}; + +static struct miscdevice mce_log_device = { + MISC_MCELOG_MINOR, + "mcelog", + &mce_chrdev_ops, +}; + +/* + * Old style boot options parsing. Only for compatibility. + */ + static int __init mcheck_disable(char *str) { - mce_disabled = 1; + mce_dont_init = 1; return 0; } +/* mce=off disables machine check. Note you can reenable it later + using sysfs */ static int __init mcheck_enable(char *str) { - mce_disabled = -1; + if (!strcmp(str, "off")) + mce_dont_init = 1; + else + printk("mce= argument %s ignored. Please use /sys", str); return 0; } __setup("nomce", mcheck_disable); __setup("mce", mcheck_enable); + +/* + * Sysfs support + */ + +/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. 
*/ +static int mce_resume(struct sys_device *dev) +{ + on_each_cpu(mce_init, NULL, 1, 1); + return 0; +} + +/* Reinit MCEs after user configuration changes */ +static void mce_restart(void) +{ + if (check_interval) + cancel_delayed_work(&mcheck_work); + /* Timer race is harmless here */ + on_each_cpu(mce_init, NULL, 1, 1); + if (check_interval) + schedule_delayed_work(&mcheck_work, check_interval*HZ); +} + +static struct sysdev_class mce_sysclass = { + .resume = mce_resume, + set_kset_name("machinecheck"), +}; + +static struct sys_device device_mce = { + .id = 0, + .cls = &mce_sysclass, +}; + +/* Why are there no generic functions for this? */ +#define ACCESSOR(name, var, start) \ + static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ + return sprintf(buf, "%lx\n", (unsigned long)var); \ + } \ + static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ + char *end; \ + unsigned long new = simple_strtoul(buf, &end, 0); \ + if (end == buf) return -EINVAL; \ + var = new; \ + start; \ + return end-buf; \ + } \ + static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); + +ACCESSOR(bank0ctl,bank[0],mce_restart()) +ACCESSOR(bank1ctl,bank[1],mce_restart()) +ACCESSOR(bank2ctl,bank[2],mce_restart()) +ACCESSOR(bank3ctl,bank[3],mce_restart()) +ACCESSOR(bank4ctl,bank[4],mce_restart()) +ACCESSOR(tolerant,tolerant,) +ACCESSOR(check_interval,check_interval,mce_restart()) + +static __init int mce_init_device(void) +{ + int err; + if (!mce_available(&boot_cpu_data)) + return -EIO; + err = sysdev_class_register(&mce_sysclass); + if (!err) + err = sysdev_register(&device_mce); + if (!err) { + /* could create per CPU objects, but it is not worth it. 
*/ + sysdev_create_file(&device_mce, &attr_bank0ctl); + sysdev_create_file(&device_mce, &attr_bank1ctl); + sysdev_create_file(&device_mce, &attr_bank2ctl); + sysdev_create_file(&device_mce, &attr_bank3ctl); + sysdev_create_file(&device_mce, &attr_bank4ctl); + sysdev_create_file(&device_mce, &attr_tolerant); + sysdev_create_file(&device_mce, &attr_check_interval); + } + + misc_register(&mce_log_device); + return err; + +} +device_initcall(mce_init_device); diff -puN arch/i386/kernel/cpu/mcheck/mce.h~x86-port-lockless-mce-preparation arch/i386/kernel/cpu/mcheck/mce.h --- 25/arch/i386/kernel/cpu/mcheck/mce.h~x86-port-lockless-mce-preparation 2005-04-26 20:37:51.163451664 -0700 +++ 25-akpm/arch/i386/kernel/cpu/mcheck/mce.h 2005-04-26 20:37:51.179449232 -0700 @@ -1,14 +1,80 @@ -#include +#ifndef _ASM_MCE_H +#define _ASM_MCE_H 1 -void amd_mcheck_init(struct cpuinfo_x86 *c); -void intel_p4_mcheck_init(struct cpuinfo_x86 *c); -void intel_p5_mcheck_init(struct cpuinfo_x86 *c); -void intel_p6_mcheck_init(struct cpuinfo_x86 *c); -void winchip_mcheck_init(struct cpuinfo_x86 *c); +#include +#include -/* Call the installed machine check handler for this CPU setup. */ -extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code); +/* + * Machine Check support for x86 + */ -extern int mce_disabled __initdata; -extern int nr_mce_banks; +#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ +#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ +#define MCG_STATUS_EIPV (1UL<<1) /* eip points to correct instruction */ +#define MCG_STATUS_MCIP (1UL<<2) /* machine check in progress */ + +#define MCI_STATUS_VAL (1UL<<63) /* valid error */ +#define MCI_STATUS_OVER (1UL<<62) /* previous errors lost */ +#define MCI_STATUS_UC (1UL<<61) /* uncorrected error */ +#define MCI_STATUS_EN (1UL<<60) /* error enabled */ +#define MCI_STATUS_MISCV (1UL<<59) /* misc error reg. valid */ +#define MCI_STATUS_ADDRV (1UL<<58) /* addr reg. 
valid */ +#define MCI_STATUS_PCC (1UL<<57) /* processor context corrupt */ + +/* Fields are zero when not available */ +struct mce { + __u64 status; + __u64 misc; + __u64 addr; + __u64 mcgstatus; + __u64 rip; + __u64 tsc; /* cpu time stamp counter */ + __u64 res1; /* for future extension */ + __u64 res2; /* dito. */ + __u8 cs; /* code segment */ + __u8 bank; /* machine check bank */ + __u8 cpu; /* cpu that raised the error */ + __u8 finished; /* entry is valid */ + __u32 pad; +}; + +/* + * This structure contains all data related to the MCE log. + * Also carries a signature to make it easier to find from external debugging tools. + * Each entry is only valid when its finished flag is set. + */ + +#define MCE_LOG_LEN 32 + +struct mce_log { + char signature[12]; /* "MACHINECHECK" */ + unsigned len; /* = MCE_LOG_LEN */ + unsigned next; + unsigned flags; + unsigned pad0; + struct mce entry[MCE_LOG_LEN]; +}; + +#define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */ + +#define MCE_LOG_SIGNATURE "MACHINECHECK" + +#define MCE_GET_RECORD_LEN _IOR('M', 1, int) +#define MCE_GET_LOG_LEN _IOR('M', 2, int) +#define MCE_GETCLEAR_FLAGS _IOR('M', 3, int) + +/* Software defined banks */ +#define MCE_EXTENDED_BANK 128 +#define MCE_THERMAL_BANK MCE_EXTENDED_BANK + 0 + +void mce_log(struct mce *m); +#ifdef CONFIG_X86_MCE_INTEL +void mce_intel_feature_init(struct cpuinfo_x86 *c); +#else +static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) +{ +} +#endif + +#endif diff -puN /dev/null arch/i386/kernel/cpu/mcheck/mce_intel.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/arch/i386/kernel/cpu/mcheck/mce_intel.c 2005-04-26 20:37:51.180449080 -0700 @@ -0,0 +1,99 @@ +/* + * Intel specific MCE features. 
+ * Copyright 2004 Zwane Mwaikambo + */ + +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_PER_CPU(unsigned long, next_check); + +asmlinkage void smp_thermal_interrupt(void) +{ + struct mce m; + + ack_APIC_irq(); + + irq_enter(); + if (time_before(jiffies, __get_cpu_var(next_check))) + goto done; + + __get_cpu_var(next_check) = jiffies + HZ*300; + memset(&m, 0, sizeof(m)); + m.cpu = smp_processor_id(); + m.bank = MCE_THERMAL_BANK; + rdtscll(m.tsc); + rdmsrl(MSR_IA32_THERM_STATUS, m.status); + if (m.status & 0x1) { + printk(KERN_EMERG + "CPU%d: Temperature above threshold, cpu clock throttled\n", m.cpu); + add_taint(TAINT_MACHINE_CHECK); + } else { + printk(KERN_EMERG "CPU%d: Temperature/speed normal\n", m.cpu); + } + + mce_log(&m); +done: + irq_exit(); +} + +static void __init intel_init_thermal(struct cpuinfo_x86 *c) +{ + u32 l, h; + int tm2 = 0; + unsigned int cpu = smp_processor_id(); + + if (!cpu_has(c, X86_FEATURE_ACPI)) + return; + + if (!cpu_has(c, X86_FEATURE_ACC)) + return; + + /* first check if TM1 is already enabled by the BIOS, in which + * case there might be some SMM goo which handles it, so we can't even + * put a handler since it might be delivered via SMI already. 
+ */ + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + h = apic_read(APIC_LVTTHMR); + if ((l & (1 << 3)) && (h & APIC_DM_SMI)) { + printk(KERN_DEBUG + "CPU%d: Thermal monitoring handled by SMI\n", cpu); + return; + } + + if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13))) + tm2 = 1; + + if (h & APIC_VECTOR_MASK) { + printk(KERN_DEBUG + "CPU%d: Thermal LVT vector (%#x) already " + "installed\n", cpu, (h & APIC_VECTOR_MASK)); + return; + } + + h = THERMAL_APIC_VECTOR; + h |= (APIC_DM_FIXED | APIC_LVT_MASKED); + apic_write_around(APIC_LVTTHMR, h); + + rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); + wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); + + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h); + + l = apic_read(APIC_LVTTHMR); + apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", + cpu, tm2 ? "TM2" : "TM1"); + return; +} + +void __init mce_intel_feature_init(struct cpuinfo_x86 *c) +{ + intel_init_thermal(c); +} diff -L arch/i386/kernel/cpu/mcheck/non-fatal.c -puN arch/i386/kernel/cpu/mcheck/non-fatal.c~x86-port-lockless-mce-preparation /dev/null --- 25/arch/i386/kernel/cpu/mcheck/non-fatal.c +++ /dev/null 2003-09-15 06:40:47.000000000 -0700 @@ -1,93 +0,0 @@ -/* - * Non Fatal Machine Check Exception Reporting - * - * (C) Copyright 2002 Dave Jones. 
- * - * This file contains routines to check for non-fatal MCEs every 15s - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "mce.h" - -static int firstbank; - -#define MCE_RATE 15*HZ /* timer rate is 15s */ - -static void mce_checkregs (void *info) -{ - u32 low, high; - int i; - - for (i=firstbank; i -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "mce.h" - -/* as supported by the P4/Xeon family */ -struct intel_mce_extended_msrs { - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 esi; - u32 edi; - u32 ebp; - u32 esp; - u32 eflags; - u32 eip; - /* u32 *reserved[]; */ -}; - -static int mce_num_extended_msrs = 0; - - -#ifdef CONFIG_X86_MCE_P4THERMAL -static void unexpected_thermal_interrupt(struct pt_regs *regs) -{ - printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", - smp_processor_id()); - add_taint(TAINT_MACHINE_CHECK); -} - -/* P4/Xeon Thermal transition interrupt handler */ -static void intel_thermal_interrupt(struct pt_regs *regs) -{ - u32 l, h; - unsigned int cpu = smp_processor_id(); - static unsigned long next[NR_CPUS]; - - ack_APIC_irq(); - - if (time_after(next[cpu], jiffies)) - return; - - next[cpu] = jiffies + HZ*5; - rdmsr(MSR_IA32_THERM_STATUS, l, h); - if (l & 0x1) { - printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu); - printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n", - cpu); - add_taint(TAINT_MACHINE_CHECK); - } else { - printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); - } -} - -/* Thermal interrupt handler for this CPU setup */ -static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; - -fastcall void smp_thermal_interrupt(struct pt_regs *regs) -{ - irq_enter(); - vendor_thermal_interrupt(regs); - irq_exit(); -} - -/* P4/Xeon Thermal regulation detect and init */ -static void __init 
intel_init_thermal(struct cpuinfo_x86 *c) -{ - u32 l, h; - unsigned int cpu = smp_processor_id(); - - /* Thermal monitoring */ - if (!cpu_has(c, X86_FEATURE_ACPI)) - return; /* -ENODEV */ - - /* Clock modulation */ - if (!cpu_has(c, X86_FEATURE_ACC)) - return; /* -ENODEV */ - - /* first check if its enabled already, in which case there might - * be some SMM goo which handles it, so we can't even put a handler - * since it might be delivered via SMI already -zwanem. - */ - rdmsr (MSR_IA32_MISC_ENABLE, l, h); - h = apic_read(APIC_LVTTHMR); - if ((l & (1<<3)) && (h & APIC_DM_SMI)) { - printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", - cpu); - return; /* -EBUSY */ - } - - /* check whether a vector already exists, temporarily masked? */ - if (h & APIC_VECTOR_MASK) { - printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already " - "installed\n", - cpu, (h & APIC_VECTOR_MASK)); - return; /* -EBUSY */ - } - - /* The temperature transition interrupt handler setup */ - h = THERMAL_APIC_VECTOR; /* our delivery vector */ - h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ - apic_write_around(APIC_LVTTHMR, h); - - rdmsr (MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); - - /* ok we're good to go... 
*/ - vendor_thermal_interrupt = intel_thermal_interrupt; - - rdmsr (MSR_IA32_MISC_ENABLE, l, h); - wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h); - - l = apic_read (APIC_LVTTHMR); - apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); - return; -} -#endif /* CONFIG_X86_MCE_P4THERMAL */ - - -/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ -static inline int intel_get_extended_msrs(struct intel_mce_extended_msrs *r) -{ - u32 h; - - if (mce_num_extended_msrs == 0) - goto done; - - rdmsr (MSR_IA32_MCG_EAX, r->eax, h); - rdmsr (MSR_IA32_MCG_EBX, r->ebx, h); - rdmsr (MSR_IA32_MCG_ECX, r->ecx, h); - rdmsr (MSR_IA32_MCG_EDX, r->edx, h); - rdmsr (MSR_IA32_MCG_ESI, r->esi, h); - rdmsr (MSR_IA32_MCG_EDI, r->edi, h); - rdmsr (MSR_IA32_MCG_EBP, r->ebp, h); - rdmsr (MSR_IA32_MCG_ESP, r->esp, h); - rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h); - rdmsr (MSR_IA32_MCG_EIP, r->eip, h); - - /* can we rely on kmalloc to do a dynamic - * allocation for the reserved registers? - */ -done: - return mce_num_extended_msrs; -} - -static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) -{ - int recover=1; - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int i; - struct intel_mce_extended_msrs dbg; - - rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? 
*/ - recover=0; - - printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - if (intel_get_extended_msrs(&dbg)) { - printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n", - smp_processor_id(), dbg.eip, dbg.eflags); - printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n", - dbg.eax, dbg.ebx, dbg.ecx, dbg.edx); - printk (KERN_DEBUG "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", - dbg.esi, dbg.edi, dbg.ebp, dbg.esp); - } - - for (i=0; i> 16) & 0xff; - printk (KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)" - " available\n", - smp_processor_id(), mce_num_extended_msrs); - -#ifdef CONFIG_X86_MCE_P4THERMAL - /* Check for P4/Xeon Thermal monitor */ - intel_init_thermal(c); -#endif - } -} diff -L arch/i386/kernel/cpu/mcheck/p6.c -puN arch/i386/kernel/cpu/mcheck/p6.c~x86-port-lockless-mce-preparation /dev/null --- 25/arch/i386/kernel/cpu/mcheck/p6.c +++ /dev/null 2003-09-15 06:40:47.000000000 -0700 @@ -1,115 +0,0 @@ -/* - * P6 specific Machine Check Exception Reporting - * (C) Copyright 2002 Alan Cox - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "mce.h" - -/* Machine Check Handler For PII/PIII */ -static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) -{ - int recover=1; - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int i; - - rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? 
*/ - recover=0; - - printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - for (i=0; i -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define MISC_MCELOG_MINOR 227 -#define NR_BANKS 5 - -static int mce_dont_init; - -/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic, - 3: never panic or exit (for testing only) */ -static int tolerant = 1; -static int banks; -static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; -static unsigned long console_logged; -static int notify_user; -static int rip_msr; - -/* - * Lockless MCE logging infrastructure. - * This avoids deadlocks on printk locks without having to break locks. Also - * separate MCEs from kernel messages to avoid bogus bug reports. - */ - -struct mce_log mcelog = { - MCE_LOG_SIGNATURE, - MCE_LOG_LEN, -}; - -void mce_log(struct mce *mce) -{ - unsigned next, entry; - mce->finished = 0; - smp_wmb(); - for (;;) { - entry = rcu_dereference(mcelog.next); - /* When the buffer fills up discard new entries. Assume - that the earlier errors are the more interesting. */ - if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, &mcelog.flags); - return; - } - /* Old left over entry. Skip. */ - if (mcelog.entry[entry].finished) - continue; - smp_rmb(); - next = entry + 1; - if (cmpxchg(&mcelog.next, entry, next) == entry) - break; - } - memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); - smp_wmb(); - mcelog.entry[entry].finished = 1; - smp_wmb(); - - if (!test_and_set_bit(0, &console_logged)) - notify_user = 1; -} - -static void print_mce(struct mce *m) -{ - printk(KERN_EMERG "\n" - KERN_EMERG - "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", - m->cpu, m->mcgstatus, m->bank, m->status); - if (m->rip) { - printk(KERN_EMERG - "RIP%s %02x:<%016Lx> ", - !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" 
: "", - m->cs, m->rip); - if (m->cs == __KERNEL_CS) - print_symbol("{%s}", m->rip); - printk("\n"); - } - printk(KERN_EMERG "TSC %Lx ", m->tsc); - if (m->addr) - printk("ADDR %Lx ", m->addr); - if (m->misc) - printk("MISC %Lx ", m->misc); - printk("\n"); -} - -static void mce_panic(char *msg, struct mce *backup, unsigned long start) -{ - int i; - oops_begin(); - for (i = 0; i < MCE_LOG_LEN; i++) { - unsigned long tsc = mcelog.entry[i].tsc; - if (time_before(tsc, start)) - continue; - print_mce(&mcelog.entry[i]); - if (backup && mcelog.entry[i].tsc == backup->tsc) - backup = NULL; - } - if (backup) - print_mce(backup); - if (tolerant >= 3) - printk("Fake panic: %s\n", msg); - else - panic(msg); -} - -static int mce_available(struct cpuinfo_x86 *c) -{ - return test_bit(X86_FEATURE_MCE, &c->x86_capability) && - test_bit(X86_FEATURE_MCA, &c->x86_capability); -} - -static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) -{ - if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { - m->rip = regs->rip; - m->cs = regs->cs; - } else { - m->rip = 0; - m->cs = 0; - } - if (rip_msr) { - /* Assume the RIP in the MSR is exact. Is this true? 
*/ - m->mcgstatus |= MCG_STATUS_EIPV; - rdmsrl(rip_msr, m->rip); - m->cs = 0; - } -} - -/* - * The actual machine check handler - */ - -void do_machine_check(struct pt_regs * regs, long error_code) -{ - struct mce m, panicm; - int nowayout = (tolerant < 1); - int kill_it = 0; - u64 mcestart = 0; - int i; - int panicm_found = 0; - - if (regs) - notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL); - if (!banks) - return; - - memset(&m, 0, sizeof(struct mce)); - m.cpu = hard_smp_processor_id(); - rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); - if (!(m.mcgstatus & MCG_STATUS_RIPV)) - kill_it = 1; - - rdtscll(mcestart); - barrier(); - - for (i = 0; i < banks; i++) { - if (!bank[i]) - continue; - - m.misc = 0; - m.addr = 0; - m.bank = i; - m.tsc = 0; - - rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); - if ((m.status & MCI_STATUS_VAL) == 0) - continue; - - if (m.status & MCI_STATUS_EN) { - /* In theory _OVER could be a nowayout too, but - assume any overflowed errors were no fatal. */ - nowayout |= !!(m.status & MCI_STATUS_PCC); - kill_it |= !!(m.status & MCI_STATUS_UC); - } - - if (m.status & MCI_STATUS_MISCV) - rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); - if (m.status & MCI_STATUS_ADDRV) - rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); - - mce_get_rip(&m, regs); - if (error_code != -1) - rdtscll(m.tsc); - wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0); - mce_log(&m); - - /* Did this bank cause the exception? */ - /* Assume that the bank with uncorrectable errors did it, - and that there is only a single one. */ - if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) { - panicm = m; - panicm_found = 1; - } - - tainted |= TAINT_MACHINE_CHECK; - } - - /* Never do anything final in the polling timer */ - if (!regs) - goto out; - - /* If we didn't find an uncorrectable error, pick - the last one (shouldn't happen, just being safe). 
*/ - if (!panicm_found) - panicm = m; - if (nowayout) - mce_panic("Machine check", &panicm, mcestart); - if (kill_it) { - int user_space = 0; - - if (m.mcgstatus & MCG_STATUS_RIPV) - user_space = panicm.rip && (panicm.cs & 3); - - /* When the machine was in user space and the CPU didn't get - confused it's normally not necessary to panic, unless you - are paranoid (tolerant == 0) - - RED-PEN could be more tolerant for MCEs in idle, - but most likely they occur at boot anyways, where - it is best to just halt the machine. */ - if ((!user_space && (panic_on_oops || tolerant < 2)) || - (unsigned)current->pid <= 1) - mce_panic("Uncorrected machine check", &panicm, mcestart); - - /* do_exit takes an awful lot of locks and has as - slight risk of deadlocking. If you don't want that - don't set tolerant >= 2 */ - if (tolerant < 3) - do_exit(SIGBUS); - } - - out: - /* Last thing done in the machine check exception to clear state. */ - wrmsrl(MSR_IA32_MCG_STATUS, 0); -} - -/* - * Periodic polling timer for "silent" machine check errors. - */ - -static int check_interval = 5 * 60; /* 5 minutes */ -static void mcheck_timer(void *data); -static DECLARE_WORK(mcheck_work, mcheck_timer, NULL); - -static void mcheck_check_cpu(void *info) -{ - if (mce_available(¤t_cpu_data)) - do_machine_check(NULL, 0); -} - -static void mcheck_timer(void *data) -{ - on_each_cpu(mcheck_check_cpu, NULL, 1, 1); - schedule_delayed_work(&mcheck_work, check_interval * HZ); - - /* - * It's ok to read stale data here for notify_user and - * console_logged as we'll simply get the updated versions - * on the next mcheck_timer execution and atomic operations - * on console_logged act as synchronization for notify_user - * writes. 
- */ - if (notify_user && console_logged) { - notify_user = 0; - clear_bit(0, &console_logged); - printk(KERN_INFO "Machine check events logged\n"); - } -} - - -static __init int periodic_mcheck_init(void) -{ - if (check_interval) - schedule_delayed_work(&mcheck_work, check_interval*HZ); - return 0; -} -__initcall(periodic_mcheck_init); - - -/* - * Initialize Machine Checks for a CPU. - */ -static void mce_init(void *dummy) -{ - u64 cap; - int i; - - rdmsrl(MSR_IA32_MCG_CAP, cap); - banks = cap & 0xff; - if (banks > NR_BANKS) { - printk(KERN_INFO "MCE: warning: using only %d banks\n", banks); - banks = NR_BANKS; - } - /* Use accurate RIP reporting if available. */ - if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) - rip_msr = MSR_IA32_MCG_EIP; - - /* Log the machine checks left over from the previous reset. - This also clears all registers */ - do_machine_check(NULL, -1); - - set_in_cr4(X86_CR4_MCE); - - if (cap & MCG_CTL_P) - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - - for (i = 0; i < banks; i++) { - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); - } -} - -/* Add per CPU specific workarounds here */ -static void __init mce_cpu_quirks(struct cpuinfo_x86 *c) -{ - /* This should be disabled by the BIOS, but isn't always */ - if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) { - /* disable GART TBL walk error reporting, which trips off - incorrectly with the IOMMU & 3ware & Cerberus. */ - clear_bit(10, &bank[4]); - } -} - -static void __init mce_cpu_features(struct cpuinfo_x86 *c) -{ - switch (c->x86_vendor) { - case X86_VENDOR_INTEL: - mce_intel_feature_init(c); - break; - default: - break; - } -} - -/* - * Called for each booted CPU to set up machine checks. - * Must be called with preempt off. 
- */ -void __init mcheck_init(struct cpuinfo_x86 *c) -{ - static cpumask_t mce_cpus __initdata = CPU_MASK_NONE; - - mce_cpu_quirks(c); - - if (mce_dont_init || - cpu_test_and_set(smp_processor_id(), mce_cpus) || - !mce_available(c)) - return; - - mce_init(NULL); - mce_cpu_features(c); -} - -/* - * Character device to read and clear the MCE log. - */ - -static void collect_tscs(void *data) -{ - unsigned long *cpu_tsc = (unsigned long *)data; - rdtscll(cpu_tsc[smp_processor_id()]); -} - -static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) -{ - unsigned long *cpu_tsc; - static DECLARE_MUTEX(mce_read_sem); - unsigned next; - char __user *buf = ubuf; - int i, err; - - cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL); - if (!cpu_tsc) - return -ENOMEM; - - down(&mce_read_sem); - next = rcu_dereference(mcelog.next); - - /* Only supports full reads right now */ - if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { - up(&mce_read_sem); - kfree(cpu_tsc); - return -EINVAL; - } - - err = 0; - for (i = 0; i < next; i++) { - if (!mcelog.entry[i].finished) - continue; - smp_rmb(); - err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); - buf += sizeof(struct mce); - } - - memset(mcelog.entry, 0, next * sizeof(struct mce)); - mcelog.next = 0; - - synchronize_kernel(); - - /* Collect entries that were still getting written before the synchronize. */ - - on_each_cpu(collect_tscs, cpu_tsc, 1, 1); - for (i = next; i < MCE_LOG_LEN; i++) { - if (mcelog.entry[i].finished && - mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { - err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce)); - smp_rmb(); - buf += sizeof(struct mce); - memset(&mcelog.entry[i], 0, sizeof(struct mce)); - } - } - up(&mce_read_sem); - kfree(cpu_tsc); - return err ? 
-EFAULT : buf - ubuf; -} - -static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg) -{ - int __user *p = (int __user *)arg; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - switch (cmd) { - case MCE_GET_RECORD_LEN: - return put_user(sizeof(struct mce), p); - case MCE_GET_LOG_LEN: - return put_user(MCE_LOG_LEN, p); - case MCE_GETCLEAR_FLAGS: { - unsigned flags; - do { - flags = mcelog.flags; - } while (cmpxchg(&mcelog.flags, flags, 0) != flags); - return put_user(flags, p); - } - default: - return -ENOTTY; - } -} - -static struct file_operations mce_chrdev_ops = { - .read = mce_read, - .ioctl = mce_ioctl, -}; - -static struct miscdevice mce_log_device = { - MISC_MCELOG_MINOR, - "mcelog", - &mce_chrdev_ops, -}; - -/* - * Old style boot options parsing. Only for compatibility. - */ - -static int __init mcheck_disable(char *str) -{ - mce_dont_init = 1; - return 0; -} - -/* mce=off disables machine check. Note you can reenable it later - using sysfs */ -static int __init mcheck_enable(char *str) -{ - if (!strcmp(str, "off")) - mce_dont_init = 1; - else - printk("mce= argument %s ignored. Please use /sys", str); - return 0; -} - -__setup("nomce", mcheck_disable); -__setup("mce", mcheck_enable); - -/* - * Sysfs support - */ - -/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. 
*/ -static int mce_resume(struct sys_device *dev) -{ - on_each_cpu(mce_init, NULL, 1, 1); - return 0; -} - -/* Reinit MCEs after user configuration changes */ -static void mce_restart(void) -{ - if (check_interval) - cancel_delayed_work(&mcheck_work); - /* Timer race is harmless here */ - on_each_cpu(mce_init, NULL, 1, 1); - if (check_interval) - schedule_delayed_work(&mcheck_work, check_interval*HZ); -} - -static struct sysdev_class mce_sysclass = { - .resume = mce_resume, - set_kset_name("machinecheck"), -}; - -static struct sys_device device_mce = { - .id = 0, - .cls = &mce_sysclass, -}; - -/* Why are there no generic functions for this? */ -#define ACCESSOR(name, var, start) \ - static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ - return sprintf(buf, "%lx\n", (unsigned long)var); \ - } \ - static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ - char *end; \ - unsigned long new = simple_strtoul(buf, &end, 0); \ - if (end == buf) return -EINVAL; \ - var = new; \ - start; \ - return end-buf; \ - } \ - static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); - -ACCESSOR(bank0ctl,bank[0],mce_restart()) -ACCESSOR(bank1ctl,bank[1],mce_restart()) -ACCESSOR(bank2ctl,bank[2],mce_restart()) -ACCESSOR(bank3ctl,bank[3],mce_restart()) -ACCESSOR(bank4ctl,bank[4],mce_restart()) -ACCESSOR(tolerant,tolerant,) -ACCESSOR(check_interval,check_interval,mce_restart()) - -static __init int mce_init_device(void) -{ - int err; - if (!mce_available(&boot_cpu_data)) - return -EIO; - err = sysdev_class_register(&mce_sysclass); - if (!err) - err = sysdev_register(&device_mce); - if (!err) { - /* could create per CPU objects, but it is not worth it. 
*/ - sysdev_create_file(&device_mce, &attr_bank0ctl); - sysdev_create_file(&device_mce, &attr_bank1ctl); - sysdev_create_file(&device_mce, &attr_bank2ctl); - sysdev_create_file(&device_mce, &attr_bank3ctl); - sysdev_create_file(&device_mce, &attr_bank4ctl); - sysdev_create_file(&device_mce, &attr_tolerant); - sysdev_create_file(&device_mce, &attr_check_interval); - } - - misc_register(&mce_log_device); - return err; - -} -device_initcall(mce_init_device); diff -L arch/x86_64/kernel/mce_intel.c -puN arch/x86_64/kernel/mce_intel.c~x86-port-lockless-mce-preparation /dev/null --- 25/arch/x86_64/kernel/mce_intel.c +++ /dev/null 2003-09-15 06:40:47.000000000 -0700 @@ -1,99 +0,0 @@ -/* - * Intel specific MCE features. - * Copyright 2004 Zwane Mwaikambo - */ - -#include -#include -#include -#include -#include -#include -#include - -static DEFINE_PER_CPU(unsigned long, next_check); - -asmlinkage void smp_thermal_interrupt(void) -{ - struct mce m; - - ack_APIC_irq(); - - irq_enter(); - if (time_before(jiffies, __get_cpu_var(next_check))) - goto done; - - __get_cpu_var(next_check) = jiffies + HZ*300; - memset(&m, 0, sizeof(m)); - m.cpu = smp_processor_id(); - m.bank = MCE_THERMAL_BANK; - rdtscll(m.tsc); - rdmsrl(MSR_IA32_THERM_STATUS, m.status); - if (m.status & 0x1) { - printk(KERN_EMERG - "CPU%d: Temperature above threshold, cpu clock throttled\n", m.cpu); - add_taint(TAINT_MACHINE_CHECK); - } else { - printk(KERN_EMERG "CPU%d: Temperature/speed normal\n", m.cpu); - } - - mce_log(&m); -done: - irq_exit(); -} - -static void __init intel_init_thermal(struct cpuinfo_x86 *c) -{ - u32 l, h; - int tm2 = 0; - unsigned int cpu = smp_processor_id(); - - if (!cpu_has(c, X86_FEATURE_ACPI)) - return; - - if (!cpu_has(c, X86_FEATURE_ACC)) - return; - - /* first check if TM1 is already enabled by the BIOS, in which - * case there might be some SMM goo which handles it, so we can't even - * put a handler since it might be delivered via SMI already. 
- */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - h = apic_read(APIC_LVTTHMR); - if ((l & (1 << 3)) && (h & APIC_DM_SMI)) { - printk(KERN_DEBUG - "CPU%d: Thermal monitoring handled by SMI\n", cpu); - return; - } - - if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13))) - tm2 = 1; - - if (h & APIC_VECTOR_MASK) { - printk(KERN_DEBUG - "CPU%d: Thermal LVT vector (%#x) already " - "installed\n", cpu, (h & APIC_VECTOR_MASK)); - return; - } - - h = THERMAL_APIC_VECTOR; - h |= (APIC_DM_FIXED | APIC_LVT_MASKED); - apic_write_around(APIC_LVTTHMR, h); - - rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); - - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h); - - l = apic_read(APIC_LVTTHMR); - apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", - cpu, tm2 ? "TM2" : "TM1"); - return; -} - -void __init mce_intel_feature_init(struct cpuinfo_x86 *c) -{ - intel_init_thermal(c); -} diff -L include/asm-x86_64/mce.h -puN include/asm-x86_64/mce.h~x86-port-lockless-mce-preparation /dev/null --- 25/include/asm-x86_64/mce.h +++ /dev/null 2003-09-15 06:40:47.000000000 -0700 @@ -1,80 +0,0 @@ -#ifndef _ASM_MCE_H -#define _ASM_MCE_H 1 - -#include -#include - -/* - * Machine Check support for x86 - */ - -#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ - -#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ -#define MCG_STATUS_EIPV (1UL<<1) /* eip points to correct instruction */ -#define MCG_STATUS_MCIP (1UL<<2) /* machine check in progress */ - -#define MCI_STATUS_VAL (1UL<<63) /* valid error */ -#define MCI_STATUS_OVER (1UL<<62) /* previous errors lost */ -#define MCI_STATUS_UC (1UL<<61) /* uncorrected error */ -#define MCI_STATUS_EN (1UL<<60) /* error enabled */ -#define MCI_STATUS_MISCV (1UL<<59) /* misc error reg. valid */ -#define MCI_STATUS_ADDRV (1UL<<58) /* addr reg. 
valid */ -#define MCI_STATUS_PCC (1UL<<57) /* processor context corrupt */ - -/* Fields are zero when not available */ -struct mce { - __u64 status; - __u64 misc; - __u64 addr; - __u64 mcgstatus; - __u64 rip; - __u64 tsc; /* cpu time stamp counter */ - __u64 res1; /* for future extension */ - __u64 res2; /* dito. */ - __u8 cs; /* code segment */ - __u8 bank; /* machine check bank */ - __u8 cpu; /* cpu that raised the error */ - __u8 finished; /* entry is valid */ - __u32 pad; -}; - -/* - * This structure contains all data related to the MCE log. - * Also carries a signature to make it easier to find from external debugging tools. - * Each entry is only valid when its finished flag is set. - */ - -#define MCE_LOG_LEN 32 - -struct mce_log { - char signature[12]; /* "MACHINECHECK" */ - unsigned len; /* = MCE_LOG_LEN */ - unsigned next; - unsigned flags; - unsigned pad0; - struct mce entry[MCE_LOG_LEN]; -}; - -#define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */ - -#define MCE_LOG_SIGNATURE "MACHINECHECK" - -#define MCE_GET_RECORD_LEN _IOR('M', 1, int) -#define MCE_GET_LOG_LEN _IOR('M', 2, int) -#define MCE_GETCLEAR_FLAGS _IOR('M', 3, int) - -/* Software defined banks */ -#define MCE_EXTENDED_BANK 128 -#define MCE_THERMAL_BANK MCE_EXTENDED_BANK + 0 - -void mce_log(struct mce *m); -#ifdef CONFIG_X86_MCE_INTEL -void mce_intel_feature_init(struct cpuinfo_x86 *c); -#else -static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) -{ -} -#endif - -#endif _