core.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */

#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/device.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/ras.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/export.h>
#include <linux/set_memory.h>
#include <linux/sync_core.h>
#include <linux/task_work.h>
#include <linux/hardirq.h>

#include <asm/intel-family.h>
#include <asm/processor.h>
#include <asm/traps.h>
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/reboot.h>

#include "internal.h"

/* sysfs synchronization */
static DEFINE_MUTEX(mce_sysfs_mutex);

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>

#define SPINUNIT		100	/* 100ns */

DEFINE_PER_CPU(unsigned, mce_exception_count);

DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);

DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);

#define ATTR_LEN               16
/* One object for each MCE bank, shared by all CPUs */
struct mce_bank_dev {
	struct device_attribute	attr;			/* device attribute */
	char			attrname[ATTR_LEN];	/* attribute name */
	u8			bank;			/* bank number */
};
static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS];

struct mce_vendor_flags mce_flags __read_mostly;

struct mca_config mca_cfg __read_mostly = {
	.bootlog  = -1,
	.monarch_timeout = -1
};

static DEFINE_PER_CPU(struct mce, mces_seen);
static unsigned long mce_need_notify;

/*
 * MCA banks polled by the period polling timer for corrected events.
 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
 */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

/*
 * MCA banks controlled through firmware first for corrected errors.
 * This is a global list of banks for which we won't enable CMCI and we
 * won't poll. Firmware controls these banks and is responsible for
 * reporting corrected errors through GHES. Uncorrected/recoverable
 * errors are still notified through a machine check.
 */
mce_banks_t mce_banks_ce_disabled;

static struct work_struct mce_work;
static struct irq_work mce_irq_work;

/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
 */
BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	/* need the internal __ version to avoid deadlocks */
	m->time = __ktime_get_real_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
	m->ppin = cpu_data(m->extcpu).ppin;
	m->microcode = boot_cpu_data.microcode;
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

void mce_log(struct mce *m)
{
	if (!mce_gen_pool_add(m))
		irq_work_queue(&mce_irq_work);
}
EXPORT_SYMBOL_GPL(mce_log);

void mce_register_decode_chain(struct notifier_block *nb)
{
	if (WARN_ON(nb->priority < MCE_PRIO_LOWEST ||
		    nb->priority > MCE_PRIO_HIGHEST))
		return;

	blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_register_decode_chain);

void mce_unregister_decode_chain(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);

static void __print_mce(struct mce *m)
{
	pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
		 m->extcpu,
		 (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
		 m->mcgstatus, m->bank, m->status);

	if (m->ip) {
		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
			m->cs, m->ip);

		if (m->cs == __KERNEL_CS)
			pr_cont("{%pS}", (void *)(unsigned long)m->ip);
		pr_cont("\n");
	}

	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
	if (m->addr)
		pr_cont("ADDR %llx ", m->addr);
	if (m->misc)
		pr_cont("MISC %llx ", m->misc);
	if (m->ppin)
		pr_cont("PPIN %llx ", m->ppin);

	if (mce_flags.smca) {
		if (m->synd)
			pr_cont("SYND %llx ", m->synd);
		if (m->ipid)
			pr_cont("IPID %llx ", m->ipid);
	}

	pr_cont("\n");

	/*
	 * Note this output is parsed by external tools and old fields
	 * should not be changed.
	 */
	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
		m->microcode);
}

static void print_mce(struct mce *m)
{
	__print_mce(m);

	if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
		pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_panicked;

static int fake_panic;
static atomic_t mce_fake_panicked;

/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;

	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	if (panic_timeout == 0)
		panic_timeout = mca_cfg.panic_timeout;
	panic("Panicing machine check CPU died");
}

static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
{
	struct llist_node *pending;
	struct mce_evt_llist *l;
	int apei_err = 0;

	/*
	 * Allow instrumentation around external facilities usage. Not that it
	 * matters a whole lot since the machine is going to panic anyway.
	 */
	instrumentation_begin();

	if (!fake_panic) {
		/*
		 * Make sure only one CPU runs in machine check panic
		 */
		if (atomic_inc_return(&mce_panicked) > 1)
			wait_for_panic();
		barrier();

		bust_spinlocks(1);
		console_verbose();
	} else {
		/* Don't log too much for fake panic */
		if (atomic_inc_return(&mce_fake_panicked) > 1)
			goto out;
	}
	pending = mce_gen_pool_prepare_records();
	/* First print corrected ones that are still unlogged */
	llist_for_each_entry(l, pending, llnode) {
		struct mce *m = &l->mce;
		if (!(m->status & MCI_STATUS_UC)) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	/* Now print uncorrected but with the final one last */
	llist_for_each_entry(l, pending, llnode) {
		struct mce *m = &l->mce;
		if (!(m->status & MCI_STATUS_UC))
			continue;
		if (!final || mce_cmp(m, final)) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	if (final) {
		print_mce(final);
		if (!apei_err)
			apei_err = apei_write_mce(final);
	}
	if (exp)
		pr_emerg(HW_ERR "Machine check: %s\n", exp);
	if (!fake_panic) {
		if (panic_timeout == 0)
			panic_timeout = mca_cfg.panic_timeout;
		panic(msg);
	} else
		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);

out:
	instrumentation_end();
}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	unsigned bank = __this_cpu_read(injectm.bank);

	if (msr == mca_cfg.rip_msr)
		return offsetof(struct mce, ip);
	if (msr == mca_msr_reg(bank, MCA_STATUS))
		return offsetof(struct mce, status);
	if (msr == mca_msr_reg(bank, MCA_ADDR))
		return offsetof(struct mce, addr);
	if (msr == mca_msr_reg(bank, MCA_MISC))
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr)
{
	if (wrmsr) {
		pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
			 (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax,
			 regs->ip, (void *)regs->ip);
	} else {
		pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
			 (unsigned int)regs->cx, regs->ip, (void *)regs->ip);
	}

	show_stack_regs(regs);

	panic("MCA architectural violation!\n");

	while (true)
		cpu_relax();
}

/* MSR access wrappers used for error injection */
noinstr u64 mce_rdmsrl(u32 msr)
{
	DECLARE_ARGS(val, low, high);

	if (__this_cpu_read(injectm.finished)) {
		int offset;
		u64 ret;

		instrumentation_begin();

		offset = msr_to_offset(msr);
		if (offset < 0)
			ret = 0;
		else
			ret = *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);

		instrumentation_end();

		return ret;
	}

	/*
	 * RDMSR on MCA MSRs should not fault. If they do, this is very much an
	 * architectural violation and needs to be reported to hw vendor. Panic
	 * the box to not allow any further progress.
	 */
	asm volatile("1: rdmsr\n"
		     "2:\n"
		     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR_IN_MCE)
		     : EAX_EDX_RET(val, low, high) : "c" (msr));


	return EAX_EDX_VAL(val, low, high);
}

static noinstr void mce_wrmsrl(u32 msr, u64 v)
{
	u32 low, high;

	if (__this_cpu_read(injectm.finished)) {
		int offset;

		instrumentation_begin();

		offset = msr_to_offset(msr);
		if (offset >= 0)
			*(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;

		instrumentation_end();

		return;
	}

	low  = (u32)v;
	high = (u32)(v >> 32);

	/* See comment in mce_rdmsrl() */
	asm volatile("1: wrmsr\n"
		     "2:\n"
		     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE)
		     : : "c" (msr), "a"(low), "d" (high) : "memory");
}

/*
 * Collect all global (w.r.t. this processor) status about this machine
 * check into our "mce" struct so that we can use it later to assess
 * the severity of the problem as we read per-bank specific details.
 */
static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs)
{
	/*
	 * Enable instrumentation around mce_setup() which calls external
	 * facilities.
	 */
	instrumentation_begin();
	mce_setup(m);
	instrumentation_end();

	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	if (regs) {
		/*
		 * Get the address of the instruction at the time of
		 * the machine check error.
		 */
		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
			m->ip = regs->ip;
			m->cs = regs->cs;

			/*
			 * When in VM86 mode make the cs look like ring 3
			 * always. This is a lie, but it's better than passing
			 * the additional vm86 bit around everywhere.
			 */
			if (v8086_mode(regs))
				m->cs |= 3;
		}
		/* Use accurate RIP reporting if available. */
		if (mca_cfg.rip_msr)
			m->ip = mce_rdmsrl(mca_cfg.rip_msr);
	}
}

int mce_available(struct cpuinfo_x86 *c)
{
	if (mca_cfg.disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static void mce_schedule_work(void)
{
	if (!mce_gen_pool_empty())
		schedule_work(&mce_work);
}

static void mce_irq_work_cb(struct irq_work *entry)
{
	mce_schedule_work();
}

/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granularity for now.
 */
int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_ADDRV))
		return 0;

	/* Checks after this one are Intel/Zhaoxin-specific: */
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
	    boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
		return 1;

	if (!(m->status & MCI_STATUS_MISCV))
		return 0;

	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
		return 0;

	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
		return 0;

	return 1;
}
EXPORT_SYMBOL_GPL(mce_usable_address);

bool mce_is_memory_error(struct mce *m)
{
	switch (m->cpuvendor) {
	case X86_VENDOR_AMD:
	case X86_VENDOR_HYGON:
		return amd_mce_is_memory_error(m);

	case X86_VENDOR_INTEL:
	case X86_VENDOR_ZHAOXIN:
		/*
		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
		 *
		 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
		 * indicating a memory error. Bit 8 is used for indicating a
		 * cache hierarchy error. The combination of bit 2 and bit 3
		 * is used for indicating a `generic' cache hierarchy error
		 * But we can't just blindly check the above bits, because if
		 * bit 11 is set, then it is a bus/interconnect error - and
		 * either way the above bits just gives more detail on what
		 * bus/interconnect error happened. Note that bit 12 can be
		 * ignored, as it's the "filter" bit.
		 */
		return (m->status & 0xef80) == BIT(7) ||
		       (m->status & 0xef00) == BIT(8) ||
		       (m->status & 0xeffc) == 0xc;

	default:
		return false;
	}
}
EXPORT_SYMBOL_GPL(mce_is_memory_error);

static bool whole_page(struct mce *m)
{
	if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
		return true;

	return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
}

bool mce_is_correctable(struct mce *m)
{
	if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
		return false;

	if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED)
		return false;

	if (m->status & MCI_STATUS_UC)
		return false;

	return true;
}
EXPORT_SYMBOL_GPL(mce_is_correctable);

static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
			      void *data)
{
	struct mce *m = (struct mce *)data;

	if (!m)
		return NOTIFY_DONE;

	/* Emit the trace record: */
	trace_mce_record(m);

	set_bit(0, &mce_need_notify);

	mce_notify_irq();

	return NOTIFY_DONE;
}

static struct notifier_block early_nb = {
	.notifier_call	= mce_early_notifier,
	.priority	= MCE_PRIO_EARLY,
};

static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
			      void *data)
{
	struct mce *mce = (struct mce *)data;
	unsigned long pfn;

	if (!mce || !mce_usable_address(mce))
		return NOTIFY_DONE;

	if (mce->severity != MCE_AO_SEVERITY &&
	    mce->severity != MCE_DEFERRED_SEVERITY)
		return NOTIFY_DONE;

	pfn = (mce->addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
	if (!memory_failure(pfn, 0)) {
		set_mce_nospec(pfn);
		mce->kflags |= MCE_HANDLED_UC;
	}

	return NOTIFY_OK;
}

static struct notifier_block mce_uc_nb = {
	.notifier_call	= uc_decode_notifier,
	.priority	= MCE_PRIO_UC,
};

static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
				void *data)
{
	struct mce *m = (struct mce *)data;

	if (!m)
		return NOTIFY_DONE;

	if (mca_cfg.print_all || !m->kflags)
		__print_mce(m);

	return NOTIFY_DONE;
}

static struct notifier_block mce_default_nb = {
	.notifier_call	= mce_default_notifier,
	/* lowest prio, we want it to run last. */
	.priority	= MCE_PRIO_LOWEST,
};

/*
 * Read ADDR and MISC registers.
 */
static noinstr void mce_read_aux(struct mce *m, int i)
{
	if (m->status & MCI_STATUS_MISCV)
		m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC));

	if (m->status & MCI_STATUS_ADDRV) {
		m->addr = mce_rdmsrl(mca_msr_reg(i, MCA_ADDR));

		/*
		 * Mask the reported address by the reported granularity.
		 */
		if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
			m->addr >>= shift;
			m->addr <<= shift;
		}

		smca_extract_err_addr(m);
	}

	if (mce_flags.smca) {
		m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));

		if (m->status & MCI_STATUS_SYNDV)
			m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
	}
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: spec recommends to panic for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll handler -- * so we skip this for now.
 * These cases should not happen anyways, or only when the CPU
 * is already totally * confused. In this case it's likely it will
 * not fully execute the machine check handler either.
 */
bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
	bool error_seen = false;
	struct mce m;
	int i;

	this_cpu_inc(mce_poll_count);

	mce_gather_info(&m, NULL);

	if (flags & MCP_TIMESTAMP)
		m.tsc = rdtsc();

	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		barrier();
		m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));

		/* If this entry is not valid, ignore it */
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * If we are logging everything (at CPU online) or this
		 * is a corrected error, then we must log it.
		 */
		if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC))
			goto log_it;

		/*
		 * Newer Intel systems that support software error
		 * recovery need to make additional checks. Other
		 * CPUs should skip over uncorrected errors, but log
		 * everything else.
		 */
		if (!mca_cfg.ser) {
			if (m.status & MCI_STATUS_UC)
				continue;
			goto log_it;
		}

		/* Log "not enabled" (speculative) errors */
		if (!(m.status & MCI_STATUS_EN))
			goto log_it;

		/*
		 * Log UCNA (SDM: 15.6.3 "UCR Error Classification")
		 * UC == 1 && PCC == 0 && S == 0
		 */
		if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S))
			goto log_it;

		/*
		 * Skip anything else. Presumption is that our read of this
		 * bank is racing with a machine check. Leave the log alone
		 * for do_machine_check() to deal with it.
		 */
		continue;

log_it:
		error_seen = true;

		if (flags & MCP_DONTLOG)
			goto clear_it;

		mce_read_aux(&m, i);
		m.severity = mce_severity(&m, NULL, NULL, false);
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */

		if (mca_cfg.dont_log_ce && !mce_usable_address(&m))
			goto clear_it;

		if (flags & MCP_QUEUE_LOG)
			mce_gen_pool_add(&m);
		else
			mce_log(&m);

clear_it:
		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();

	return error_seen;
}
EXPORT_SYMBOL_GPL(machine_check_poll);

/*
 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
 * Vol 3B Table 15-20). But this confuses both the code that determines
 * whether the machine check occurred in kernel or user mode, and also
 * the severity assessment code. Pretend that EIPV was set, and take the
 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
 */
static __always_inline void
quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
{
	if (bank != 0)
		return;
	if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
		return;
	if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
		          MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
			  MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
			  MCACOD)) !=
			 (MCI_STATUS_UC|MCI_STATUS_EN|
			  MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
			  MCI_STATUS_AR|MCACOD_INSTR))
		return;

	m->mcgstatus |= MCG_STATUS_EIPV;
	m->ip = regs->ip;
	m->cs = regs->cs;
}

/*
 * Disable fast string copy and return from the MCE handler upon the first SRAR
 * MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake
 * CPUs.
 * The fast string copy instructions ("REP; MOVS*") could consume an
 * uncorrectable memory error in the cache line _right after_ the desired region
 * to copy and raise an MCE with RIP pointing to the instruction _after_ the
 * "REP; MOVS*".
 * This mitigation addresses the issue completely with the caveat of performance
 * degradation on the CPU affected. This is still better than the OS crashing on
 * MCEs raised on an irrelevant process due to "REP; MOVS*" accesses from a
 * kernel context (e.g., copy_page).
 *
 * Returns true when fast string copy on CPU has been disabled.
 */
static noinstr bool quirk_skylake_repmov(void)
{
	u64 mcgstatus   = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE);
	u64 mc1_status;

	/*
	 * Apply the quirk only to local machine checks, i.e., no broadcast
	 * sync is needed.
	 */
	if (!(mcgstatus & MCG_STATUS_LMCES) ||
	    !(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING))
		return false;

	mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));

	/* Check for a software-recoverable data fetch error. */
	if ((mc1_status &
	     (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
	      MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
	      MCI_STATUS_AR | MCI_STATUS_S)) ==
	     (MCI_STATUS_VAL |                   MCI_STATUS_UC | MCI_STATUS_EN |
	      MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
	      MCI_STATUS_AR | MCI_STATUS_S)) {
		misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
		mce_wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
		mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);

		instrumentation_begin();
		pr_err_once("Erratum detected, disable fast string copy instructions.\n");
		instrumentation_end();

		return true;
	}

	return false;
}

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
					  struct pt_regs *regs)
{
	char *tmp = *msg;
	int i;

	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
		m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
		if (!(m->status & MCI_STATUS_VAL))
			continue;

		arch___set_bit(i, validp);
		if (mce_flags.snb_ifu_quirk)
			quirk_sandybridge_ifu(i, m, regs);

		m->bank = i;
		if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) {
			mce_read_aux(m, i);
			*msg = tmp;
			return 1;
		}
	}
	return 0;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing is equal its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Track which CPUs entered the MCA broadcast synchronization and which not in
 * order to print holdouts.
 */
static cpumask_t mce_missing_cpus = CPU_MASK_ALL;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static noinstr int mce_timed_out(u64 *t, const char *msg)
{
	int ret = 0;

	/* Enable instrumentation around calls to external facilities */
	instrumentation_begin();

	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_panicked))
		wait_for_panic();
	if (!mca_cfg.monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
			pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
				 cpumask_pr_args(&mce_missing_cpus));
		mce_panic(msg, NULL, NULL);

		ret = 1;
		goto out;
	}
	*t -= SPINUNIT;

out:
	touch_nmi_watchdog();

	instrumentation_end();

	return ret;
}

/*
 * The Monarch's reign.  The Monarch is the CPU who entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in a unrecoverable case
 * and also makes sure always all CPU's errors are examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPUs) In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in a unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		struct mce *mtmp = &per_cpu(mces_seen, cpu);

		if (mtmp->severity > global_worst) {
			global_worst = mtmp->severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY) {
		/* call mce_severity() to get "msg" for panic */
		mce_severity(m, NULL, &msg, true);
		mce_panic("Fatal machine check", m, msg);
	}

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * Also must let continue the others, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (global_worst <= MCE_KEEP_SEVERITY)
		mce_panic("Fatal machine check from unknown source", NULL, NULL);

	/*