From 72c0098d92cedb11c7e0151e84918840a4e96b31 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 6 Sep 2017 19:54:53 -0700
Subject: [PATCH] x86/mm: Reinitialize TLB state on hotplug and resume

When Linux brings a CPU down and back up, it switches to init_mm and then
loads swapper_pg_dir into CR3.  With PCID enabled, this has the side effect
of masking off the ASID bits in CR3.

This can result in some confusion in the TLB handling code.  If we
bring a CPU down and back up with any ASID other than 0, we end up
with the wrong ASID active on the CPU after resume.  This could
cause our internal state to become corrupt, although major
corruption is unlikely because init_mm doesn't have any user pages.
More obviously, if CONFIG_DEBUG_VM=y, we'll trip over an assertion
in the next context switch.  The result of *that* is a failure to
resume from suspend with probability 1 - 1/6^(cpus-1).

Fix it by reinitializing cpu_tlbstate on resume and CPU bringup.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Reported-by: Jiri Kosina <jikos@kernel.org>
Fixes: 10af6235e0d3 ("x86/mm: Implement PCID based optimization: try to preserve old TLB entries using PCID")
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/tlbflush.h |  2 ++
 arch/x86/kernel/cpu/common.c    |  2 ++
 arch/x86/mm/tlb.c               | 44 +++++++++++++++++++++++++++++++++
 arch/x86/power/cpu.c            |  1 +
 4 files changed, 49 insertions(+)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index d23e61dc0640e..4893abf7f74f9 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -198,6 +198,8 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
 	cr4_set_bits(mask);
 }
 
+extern void initialize_tlbstate_and_flush(void);
+
 static inline void __native_flush_tlb(void)
 {
 	/*
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index efba8e3da3e28..40cb4d0a59820 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1583,6 +1583,7 @@ void cpu_init(void)
 	mmgrab(&init_mm);
 	me->active_mm = &init_mm;
 	BUG_ON(me->mm);
+	initialize_tlbstate_and_flush();
 	enter_lazy_tlb(&init_mm, me);
 
 	load_sp0(t, &current->thread);
@@ -1637,6 +1638,7 @@ void cpu_init(void)
 	mmgrab(&init_mm);
 	curr->active_mm = &init_mm;
 	BUG_ON(curr->mm);
+	initialize_tlbstate_and_flush();
 	enter_lazy_tlb(&init_mm, curr);
 
 	load_sp0(t, thread);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ce104b962a170..dbbcfd59726ac 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -213,6 +213,50 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	switch_ldt(real_prev, next);
 }
 
+/*
+ * Call this when reinitializing a CPU.  It fixes the following potential
+ * problems:
+ *
+ * - The ASID changed from what cpu_tlbstate thinks it is (most likely
+ *   because the CPU was taken down and came back up with CR3's PCID
+ *   bits clear.  CPU hotplug can do this.
+ *
+ * - The TLB contains junk in slots corresponding to inactive ASIDs.
+ *
+ * - The CPU went so far out to lunch that it may have missed a TLB
+ *   flush.
+ */
+void initialize_tlbstate_and_flush(void)
+{
+	int i;
+	struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+	u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
+	unsigned long cr3 = __read_cr3();
+
+	/* Assert that CR3 already references the right mm. */
+	WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
+
+	/*
+	 * Assert that CR4.PCIDE is set if needed.  (CR4.PCIDE initialization
+	 * doesn't work like other CR4 bits because it can only be set from
+	 * long mode.)
+	 */
+	WARN_ON(boot_cpu_has(X86_CR4_PCIDE) &&
+		!(cr4_read_shadow() & X86_CR4_PCIDE));
+
+	/* Force ASID 0 and force a TLB flush. */
+	write_cr3(cr3 & ~CR3_PCID_MASK);
+
+	/* Reinitialize tlbstate. */
+	this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
+	this_cpu_write(cpu_tlbstate.next_asid, 1);
+	this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
+	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
+
+	for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
+		this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
+}
+
 /*
  * flush_tlb_func_common()'s memory ordering requirement is that any
  * TLB fills that happen after we flush the TLB are ordered after we
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 78459a6d455a3..4d68d59f457d5 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -181,6 +181,7 @@ static void fix_processor_context(void)
 #endif
 	load_TR_desc();				/* This does ltr */
 	load_mm_ldt(current->active_mm);	/* This does lldt */
+	initialize_tlbstate_and_flush();
 
 	fpu__resume_cpu();
 
-- 
GitLab