Newer
Older
// SPDX-License-Identifier: GPL-2.0-only
* Rest from unknown author(s).
* 2004 Andi Kleen. Rewrote most of it.
* Copyright 2008 Intel Corporation
* Author: Andi Kleen
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/device.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/irq_work.h>
#include <linux/export.h>
#include <linux/set_memory.h>
#include <asm/intel-family.h>
#include <asm/traps.h>
#include <asm/reboot.h>
/* sysfs synchronization */
static DEFINE_MUTEX(mce_sysfs_mutex);
#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>
#define SPINUNIT 100 /* 100ns */
DEFINE_PER_CPU(unsigned, mce_exception_count);
DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
#define ATTR_LEN 16
/* One object for each MCE bank, shared by all CPUs */
struct mce_bank_dev {
struct device_attribute attr; /* device attribute */
char attrname[ATTR_LEN]; /* attribute name */
static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS];
struct mce_vendor_flags mce_flags __read_mostly;
struct mca_config mca_cfg __read_mostly = {
.bootlog = -1,
.monarch_timeout = -1
static DEFINE_PER_CPU(struct mce, mces_seen);
static unsigned long mce_need_notify;

Naveen N. Rao
committed
/*
* MCA banks polled by the period polling timer for corrected events.
* With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
*/
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};
/*
* MCA banks controlled through firmware first for corrected errors.
* This is a global list of banks for which we won't enable CMCI and we
* won't poll. Firmware controls these banks and is responsible for
* reporting corrected errors through GHES. Uncorrected/recoverable
* errors are still notified through a machine check.
*/
mce_banks_t mce_banks_ce_disabled;
static struct work_struct mce_work;
static struct irq_work mce_irq_work;
/*
* CPU/chipset specific EDAC code can register a notifier call here to print
* MCE errors in a human-readable form.
*/
BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
memset(m, 0, sizeof(struct mce));
m->cpu = m->extcpu = smp_processor_id();
/* need the internal __ version to avoid deadlocks */
m->time = __ktime_get_real_seconds();
m->cpuvendor = boot_cpu_data.x86_vendor;
m->cpuid = cpuid_eax(1);
m->socketid = cpu_data(m->extcpu).phys_proc_id;
m->apicid = cpu_data(m->extcpu).initial_apicid;
m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
m->ppin = cpu_data(m->extcpu).ppin;
m->microcode = boot_cpu_data.microcode;
DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);
irq_work_queue(&mce_irq_work);
EXPORT_SYMBOL_GPL(mce_log);
void mce_register_decode_chain(struct notifier_block *nb)
{
if (WARN_ON(nb->priority < MCE_PRIO_LOWEST ||
nb->priority > MCE_PRIO_HIGHEST))
blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_register_decode_chain);
void mce_unregister_decode_chain(struct notifier_block *nb)
{
blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
static void __print_mce(struct mce *m)
pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
m->extcpu,
(m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
m->mcgstatus, m->bank, m->status);
if (m->ip) {
pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
pr_cont("{%pS}", (void *)(unsigned long)m->ip);
pr_cont("ADDR %llx ", m->addr);
pr_cont("MISC %llx ", m->misc);
if (m->ppin)
pr_cont("PPIN %llx ", m->ppin);

Yazen Ghannam
committed
if (mce_flags.smca) {
if (m->synd)
pr_cont("SYND %llx ", m->synd);
if (m->ipid)
pr_cont("IPID %llx ", m->ipid);
}
/*
* Note this output is parsed by external tools and old fields
* should not be changed.
*/
pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
}
static void print_mce(struct mce *m)
{
__print_mce(m);
if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
#define PANIC_TIMEOUT 5 /* 5 seconds */
static atomic_t mce_fake_panicked;
/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
preempt_disable();
local_irq_enable();
while (timeout-- > 0)
udelay(1);
if (panic_timeout == 0)
panic_timeout = mca_cfg.panic_timeout;
panic("Panicing machine check CPU died");
}
static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
struct llist_node *pending;
struct mce_evt_llist *l;
int apei_err = 0;
/*
* Allow instrumentation around external facilities usage. Not that it
* matters a whole lot since the machine is going to panic anyway.
*/
instrumentation_begin();
if (!fake_panic) {
/*
* Make sure only one CPU runs in machine check panic
*/
if (atomic_inc_return(&mce_panicked) > 1)
wait_for_panic();
barrier();
bust_spinlocks(1);
console_verbose();
} else {
/* Don't log too much for fake panic */
if (atomic_inc_return(&mce_fake_panicked) > 1)
pending = mce_gen_pool_prepare_records();
/* First print corrected ones that are still unlogged */
llist_for_each_entry(l, pending, llnode) {
struct mce *m = &l->mce;
if (!(m->status & MCI_STATUS_UC)) {
if (!apei_err)
apei_err = apei_write_mce(m);
}
}
/* Now print uncorrected but with the final one last */
llist_for_each_entry(l, pending, llnode) {
struct mce *m = &l->mce;
if (!(m->status & MCI_STATUS_UC))
continue;
if (!final || mce_cmp(m, final)) {
if (!apei_err)
apei_err = apei_write_mce(m);
}
if (!apei_err)
apei_err = apei_write_mce(final);
}
pr_emerg(HW_ERR "Machine check: %s\n", exp);
if (!fake_panic) {
if (panic_timeout == 0)
panic_timeout = mca_cfg.panic_timeout;
pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
/* Support code for software error injection */
static int msr_to_offset(u32 msr)
{
unsigned bank = __this_cpu_read(injectm.bank);
if (msr == mca_cfg.rip_msr)
return offsetof(struct mce, ip);
return offsetof(struct mce, status);
return offsetof(struct mce, addr);
return offsetof(struct mce, misc);
if (msr == MSR_IA32_MCG_STATUS)
return offsetof(struct mce, mcgstatus);
return -1;
}
void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr)
if (wrmsr) {
pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
(unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax,
regs->ip, (void *)regs->ip);
} else {
pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
(unsigned int)regs->cx, regs->ip, (void *)regs->ip);
}
show_stack_regs(regs);
panic("MCA architectural violation!\n");
while (true)
cpu_relax();
/* MSR access wrappers used for error injection */
noinstr u64 mce_rdmsrl(u32 msr)
DECLARE_ARGS(val, low, high);
if (__this_cpu_read(injectm.finished)) {
int offset;
u64 ret;
instrumentation_begin();
offset = msr_to_offset(msr);
ret = 0;
else
ret = *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
instrumentation_end();
return ret;
/*
* RDMSR on MCA MSRs should not fault. If they do, this is very much an
* architectural violation and needs to be reported to hw vendor. Panic
* the box to not allow any further progress.
*/
asm volatile("1: rdmsr\n"
"2:\n"
_ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR_IN_MCE)
: EAX_EDX_RET(val, low, high) : "c" (msr));
return EAX_EDX_VAL(val, low, high);
}
static noinstr void mce_wrmsrl(u32 msr, u64 v)
u32 low, high;
if (__this_cpu_read(injectm.finished)) {
instrumentation_begin();
offset = msr_to_offset(msr);
*(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
instrumentation_end();
low = (u32)v;
high = (u32)(v >> 32);
/* See comment in mce_rdmsrl() */
asm volatile("1: wrmsr\n"
"2:\n"
_ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE)
: : "c" (msr), "a"(low), "d" (high) : "memory");
/*
* Collect all global (w.r.t. this processor) status about this machine
* check into our "mce" struct so that we can use it later to assess
* the severity of the problem as we read per-bank specific details.
*/
static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs)
/*
* Enable instrumentation around mce_setup() which calls external
* facilities.
*/
instrumentation_begin();
instrumentation_end();
m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
if (regs) {
/*
* Get the address of the instruction at the time of
* the machine check error.
*/
if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
m->ip = regs->ip;
m->cs = regs->cs;
/*
* When in VM86 mode make the cs look like ring 3
* always. This is a lie, but it's better than passing
* the additional vm86 bit around everywhere.
*/
if (v8086_mode(regs))
m->cs |= 3;
}
/* Use accurate RIP reporting if available. */
if (mca_cfg.rip_msr)
m->ip = mce_rdmsrl(mca_cfg.rip_msr);

Akinobu Mita
committed
return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
static void mce_schedule_work(void)
{
static void mce_irq_work_cb(struct irq_work *entry)
/*
* Check if the address reported by the CPU is in a format we can parse.
* It would be possible to add code for most other cases, but all would
* be somewhat complicated (e.g. segment offset would require an instruction
* parser). So only support physical addresses up to page granularity for now.
int mce_usable_address(struct mce *m)
if (!(m->status & MCI_STATUS_ADDRV))
/* Checks after this one are Intel/Zhaoxin-specific: */
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
if (!(m->status & MCI_STATUS_MISCV))
return 0;
if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
return 0;
if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
return 0;
EXPORT_SYMBOL_GPL(mce_usable_address);
bool mce_is_memory_error(struct mce *m)
switch (m->cpuvendor) {
case X86_VENDOR_AMD:
case X86_VENDOR_HYGON:
return amd_mce_is_memory_error(m);
case X86_VENDOR_INTEL:
case X86_VENDOR_ZHAOXIN:
/*
* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
*
* Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
* indicating a memory error. Bit 8 is used for indicating a
* cache hierarchy error. The combination of bit 2 and bit 3
* is used for indicating a `generic' cache hierarchy error
* But we can't just blindly check the above bits, because if
* bit 11 is set, then it is a bus/interconnect error - and
* either way the above bits just gives more detail on what
* bus/interconnect error happened. Note that bit 12 can be
* ignored, as it's the "filter" bit.
*/
return (m->status & 0xef80) == BIT(7) ||
(m->status & 0xef00) == BIT(8) ||
(m->status & 0xeffc) == 0xc;
EXPORT_SYMBOL_GPL(mce_is_memory_error);
static bool whole_page(struct mce *m)
{
if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
return true;
return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
}
bool mce_is_correctable(struct mce *m)
{
if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
return false;
if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED)
return false;
if (m->status & MCI_STATUS_UC)
return false;
return true;
}
EXPORT_SYMBOL_GPL(mce_is_correctable);
static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct mce *m = (struct mce *)data;
if (!m)
return NOTIFY_DONE;
/* Emit the trace record: */
trace_mce_record(m);
set_bit(0, &mce_need_notify);
mce_notify_irq();
return NOTIFY_DONE;
}
static struct notifier_block early_nb = {
.notifier_call = mce_early_notifier,
.priority = MCE_PRIO_EARLY,
static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct mce *mce = (struct mce *)data;
unsigned long pfn;
if (!mce || !mce_usable_address(mce))
return NOTIFY_DONE;
if (mce->severity != MCE_AO_SEVERITY &&
mce->severity != MCE_DEFERRED_SEVERITY)
return NOTIFY_DONE;
pfn = (mce->addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
if (!memory_failure(pfn, 0)) {
mce->kflags |= MCE_HANDLED_UC;
}
return NOTIFY_OK;
static struct notifier_block mce_uc_nb = {
.notifier_call = uc_decode_notifier,
.priority = MCE_PRIO_UC,
static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct mce *m = (struct mce *)data;
if (!m)
return NOTIFY_DONE;
return NOTIFY_DONE;
}
static struct notifier_block mce_default_nb = {
.notifier_call = mce_default_notifier,
/* lowest prio, we want it to run last. */
.priority = MCE_PRIO_LOWEST,
/*
* Read ADDR and MISC registers.
*/
static noinstr void mce_read_aux(struct mce *m, int i)
{
if (m->status & MCI_STATUS_MISCV)
m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC));
if (m->status & MCI_STATUS_ADDRV) {
m->addr = mce_rdmsrl(mca_msr_reg(i, MCA_ADDR));
/*
* Mask the reported address by the reported granularity.
*/
if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
u8 shift = MCI_MISC_ADDR_LSB(m->misc);
m->addr >>= shift;
m->addr <<= shift;
}
smca_extract_err_addr(m);
if (mce_flags.smca) {
m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
if (m->status & MCI_STATUS_SYNDV)
m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
}
DEFINE_PER_CPU(unsigned, mce_poll_count);
* Poll for corrected events or events that happened before reset.
* Those are just logged through /dev/mcelog.
*
* This is executed in standard interrupt context.
*
* Note: spec recommends to panic for fatal unsignalled
* errors here. However this would be quite problematic --
* we would need to reimplement the Monarch handling and
* it would mess up the exclusion between exception handler
* and poll handler -- * so we skip this for now.
* These cases should not happen anyways, or only when the CPU
* is already totally * confused. In this case it's likely it will
* not fully execute the machine check handler either.
*/
bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
bool error_seen = false;
struct mce m;
int i;
this_cpu_inc(mce_poll_count);
if (flags & MCP_TIMESTAMP)
m.tsc = rdtsc();
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
if (!mce_banks[i].ctl || !test_bit(i, *b))
continue;
m.misc = 0;
m.addr = 0;
m.bank = i;
barrier();
m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
/* If this entry is not valid, ignore it */
if (!(m.status & MCI_STATUS_VAL))
continue;
/*
* If we are logging everything (at CPU online) or this
* is a corrected error, then we must log it.
*/
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC))
goto log_it;
/*
* Newer Intel systems that support software error
* recovery need to make additional checks. Other
* CPUs should skip over uncorrected errors, but log
* everything else.
*/
if (!mca_cfg.ser) {
if (m.status & MCI_STATUS_UC)
continue;
goto log_it;
}
/* Log "not enabled" (speculative) errors */
if (!(m.status & MCI_STATUS_EN))
goto log_it;
/*
* Log UCNA (SDM: 15.6.3 "UCR Error Classification")
* UC == 1 && PCC == 0 && S == 0
*/
if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S))
goto log_it;
/*
* Skip anything else. Presumption is that our read of this
* bank is racing with a machine check. Leave the log alone
* for do_machine_check() to deal with it.
*/
continue;
error_seen = true;
if (flags & MCP_DONTLOG)
goto clear_it;
m.severity = mce_severity(&m, NULL, NULL, false);
/*
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
*/
if (mca_cfg.dont_log_ce && !mce_usable_address(&m))
goto clear_it;
if (flags & MCP_QUEUE_LOG)
mce_gen_pool_add(&m);
else
mce_log(&m);
/*
* Clear state for this bank.
*/
mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
}
/*
* Don't clear MCG_STATUS here because it's only defined for
* exceptions.
*/
return error_seen;
}
EXPORT_SYMBOL_GPL(machine_check_poll);
/*
* During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
* EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
* Vol 3B Table 15-20). But this confuses both the code that determines
* whether the machine check occurred in kernel or user mode, and also
* the severity assessment code. Pretend that EIPV was set, and take the
* ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
*/
static __always_inline void
quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
{
if (bank != 0)
return;
if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
return;
if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
MCACOD)) !=
(MCI_STATUS_UC|MCI_STATUS_EN|
MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
MCI_STATUS_AR|MCACOD_INSTR))
return;
m->mcgstatus |= MCG_STATUS_EIPV;
m->ip = regs->ip;
m->cs = regs->cs;
}
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
/*
* Disable fast string copy and return from the MCE handler upon the first SRAR
* MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake
* CPUs.
* The fast string copy instructions ("REP; MOVS*") could consume an
* uncorrectable memory error in the cache line _right after_ the desired region
* to copy and raise an MCE with RIP pointing to the instruction _after_ the
* "REP; MOVS*".
* This mitigation addresses the issue completely with the caveat of performance
* degradation on the CPU affected. This is still better than the OS crashing on
* MCEs raised on an irrelevant process due to "REP; MOVS*" accesses from a
* kernel context (e.g., copy_page).
*
* Returns true when fast string copy on CPU has been disabled.
*/
static noinstr bool quirk_skylake_repmov(void)
{
u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE);
u64 mc1_status;
/*
* Apply the quirk only to local machine checks, i.e., no broadcast
* sync is needed.
*/
if (!(mcgstatus & MCG_STATUS_LMCES) ||
!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING))
return false;
mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
/* Check for a software-recoverable data fetch error. */
if ((mc1_status &
(MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
MCI_STATUS_AR | MCI_STATUS_S)) ==
(MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
MCI_STATUS_AR | MCI_STATUS_S)) {
misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
mce_wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);
instrumentation_begin();
pr_err_once("Erratum detected, disable fast string copy instructions.\n");
instrumentation_end();
return true;
}
return false;
}
/*
* Do a quick check if any of the events requires a panic.
* This decides if we keep the events around or clear them.
*/
static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
struct pt_regs *regs)
char *tmp = *msg;
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
if (!(m->status & MCI_STATUS_VAL))
continue;
if (mce_flags.snb_ifu_quirk)
quirk_sandybridge_ifu(i, m, regs);
m->bank = i;
if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) {
mce_read_aux(m, i);
/*
* Variable to establish order between CPUs while scanning.
* Each CPU spins initially until executing is equal its number.
*/
static atomic_t mce_executing;
/*
* Defines order of CPUs on entry. First CPU becomes Monarch.
*/
static atomic_t mce_callin;
/*
* Track which CPUs entered the MCA broadcast synchronization and which not in
* order to print holdouts.
*/
static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
/*
* Check if a timeout waiting for other CPUs happened.
*/
static noinstr int mce_timed_out(u64 *t, const char *msg)
int ret = 0;
/* Enable instrumentation around calls to external facilities */
instrumentation_begin();
/*
* The others already did panic for some reason.
* Bail out like in a timeout.
* rmb() to tell the compiler that system_state
* might have been modified by someone else.
*/
rmb();
wait_for_panic();
if (!mca_cfg.monarch_timeout)
goto out;
if ((s64)*t < SPINUNIT) {
if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
cpumask_pr_args(&mce_missing_cpus));
mce_panic(msg, NULL, NULL);
}
*t -= SPINUNIT;
out:
touch_nmi_watchdog();
instrumentation_end();
return ret;
}
/*
* The Monarch's reign. The Monarch is the CPU who entered
* the machine check handler first. It waits for the others to
* raise the exception too and then grades them. When any
* error is fatal panic. Only then let the others continue.
*
* The other CPUs entering the MCE handler will be controlled by the
* Monarch. They are called Subjects.
*
* This way we prevent any potential data corruption in a unrecoverable case
* and also makes sure always all CPU's errors are examined.
*
* Also this detects the case of a machine check event coming from outer
* space (not detected by any CPUs) In this case some external agent wants
* us to shut down, so panic too.
*
* The other CPUs might still decide to panic if the handler happens
* in a unrecoverable place, but in this case the system is in a semi-stable
* state and won't corrupt anything by itself. It's ok to let the others
* continue for a bit first.
*
* All the spin loops have timeouts; when a timeout happens a CPU
* typically elects itself to be Monarch.
*/
static void mce_reign(void)
{
int cpu;
struct mce *m = NULL;
int global_worst = 0;
char *msg = NULL;
/*
* This CPU is the Monarch and the other CPUs have run
* through their handlers.
* Grade the severity of the errors of all the CPUs.
*/
for_each_possible_cpu(cpu) {
struct mce *mtmp = &per_cpu(mces_seen, cpu);
if (mtmp->severity > global_worst) {
global_worst = mtmp->severity;
m = &per_cpu(mces_seen, cpu);
}
}
/*
* Cannot recover? Panic here then.
* This dumps all the mces in the log buffer and stops the
* other CPUs.
*/
if (m && global_worst >= MCE_PANIC_SEVERITY) {
/* call mce_severity() to get "msg" for panic */
mce_severity(m, NULL, &msg, true);
mce_panic("Fatal machine check", m, msg);
/*
* For UC somewhere we let the CPU who detects it handle it.
* Also must let continue the others, otherwise the handling
* CPU could deadlock on a lock.
*/
/*
* No machine check event found. Must be some external
* source or one CPU is hung. Panic.
*/
if (global_worst <= MCE_KEEP_SEVERITY)
mce_panic("Fatal machine check from unknown source", NULL, NULL);