From fd468043d4d87da49d717d7747dba9f21bf13ed7 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 23 Feb 2018 11:35:10 -0800
Subject: [PATCH] x86: avoid per-cpu system call trampoline

The per-cpu system call trampoline was a clever trick: it allowed us to
reach percpu data even before swapgs is done, just by using %rip-relative
addressing.  And that was important, because 'syscall' doesn't give us a
kernel stack, so we needed that percpu data very very early, just to get
a temporary register to switch the page tables around.

However, it turns out to be unnecessary, because we actually have a
temporary register we can use: %r11 is destroyed by the 'syscall'
instruction anyway.

Ok, technically it contains the user-mode flags register, but we *have*
that information anyway: it's still in %rflags; we've just masked off a
few unimportant bits.  We'll destroy the rest too when we do the "and"
of the CR3 value, but who cares? It's a system call.

Btw, there are a couple of bits in eflags that might matter to user
space: DF and AC.  Right now this patch clears them across the system
call, but that is fixable by just changing the MSR_SYSCALL_MASK value to
not include them, and clearing them by hand the way we do for all other
kernel entry points anyway.
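
Roughly - and this is just a sketch, not something this patch does - the
mask setup in syscall_init() would become something like

        /* sketch: drop DF and AC from the mask, clear them at entry */
        wrmsrl(MSR_SYSCALL_MASK,
               X86_EFLAGS_TF | X86_EFLAGS_IF |
               X86_EFLAGS_IOPL | X86_EFLAGS_NT);

with a 'cld' (and ASM_CLAC) added early in entry_SYSCALL_64 to clear DF
and AC by hand, the way the other entry points already do it.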

So the only _real_ flags we'd destroy are IF and the arithmetic flags,
which get trampled by the arithmetic instructions in the %cr3 reload
logic.

However, if we really end up caring, we can save off even those: we'd
take advantage of the fact that %rcx - which contains the return IP of
the system call - also has 8 bits free.

Why 8? Even with 5-level paging, we only have 57 bits of virtual address
space, and the high address space is for the kernel (and vsyscall, but
we'd just disable native vsyscall).  So the %rip value saved in %rcx can
have only 56 valid bits, which means that we have 8 bits free.

So *if* we care about IF and the arithmetic flags being saved over a
system call, we'd do:

        shlq $8,%rcx            # free up the low byte of the return RIP
        movb %r11b,%cl          # stash the low byte of the user flags there
        shrl $8,%r11d           # bring the next flags byte down
        andl $8,%r11d           # keep only OF (bit 11 originally, now bit 3)
        orb %r11b,%cl           # park OF in the reserved-zero bit 3 of %cl

to save those bits off before we then use %r11 as a temporary register
(we'd obviously need to then undo that as we save the user space state
on the stack).
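
For completeness, the undo would then start out roughly like

        movzbl %cl,%r11d        # recover the saved flags byte
        shrq $8,%rcx            # %rcx is the user return RIP again

(again just a sketch - OF would still have to be moved back up from bit 3
to bit 11 before those flags get merged into pt_regs->flags).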

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/entry/entry_64.S             | 81 ++++++---------------------
 arch/x86/include/asm/cpu_entry_area.h |  2 -
 arch/x86/kernel/asm-offsets.c         |  1 -
 arch/x86/kernel/cpu/common.c          | 11 +---
 arch/x86/kernel/vmlinux.lds.S         |  8 ---
 arch/x86/mm/cpu_entry_area.c          |  5 --
 6 files changed, 17 insertions(+), 91 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index d5c7f18f79ac..81d1a9f04e40 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -142,67 +142,16 @@ END(native_usergs_sysret64)
  * with them due to bugs in both AMD and Intel CPUs.
  */
 
-	.pushsection .entry_trampoline, "ax"
-
 /*
- * The code in here gets remapped into cpu_entry_area's trampoline.  This means
- * that the assembler and linker have the wrong idea as to where this code
- * lives (and, in fact, it's mapped more than once, so it's not even at a
- * fixed address).  So we can't reference any symbols outside the entry
- * trampoline and expect it to work.
+ * The 'syscall' instruction will have cleared the MSR_SYSCALL_MASK
+ * bits in eflags. Currently that is:
+ *
+ *             X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
+ *             X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT
  *
- * Instead, we carefully abuse %rip-relative addressing.
- * _entry_trampoline(%rip) refers to the start of the remapped) entry
- * trampoline.  We can thus find cpu_entry_area with this macro:
+ * and we don't care about any of them. So %r11 is a fine scratch
+ * register.
  */
-
-#define CPU_ENTRY_AREA \
-	_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
-
-/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
-#define RSP_SCRATCH	CPU_ENTRY_AREA_entry_stack + \
-			SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA
-
-ENTRY(entry_SYSCALL_64_trampoline)
-	UNWIND_HINT_EMPTY
-	swapgs
-
-	/* Stash the user RSP. */
-	movq	%rsp, RSP_SCRATCH
-
-	/* Note: using %rsp as a scratch reg. */
-	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
-
-	/* Load the top of the task stack into RSP */
-	movq	CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
-
-	/* Start building the simulated IRET frame. */
-	pushq	$__USER_DS			/* pt_regs->ss */
-	pushq	RSP_SCRATCH			/* pt_regs->sp */
-	pushq	%r11				/* pt_regs->flags */
-	pushq	$__USER_CS			/* pt_regs->cs */
-	pushq	%rcx				/* pt_regs->ip */
-
-	/*
-	 * x86 lacks a near absolute jump, and we can't jump to the real
-	 * entry text with a relative jump.  We could push the target
-	 * address and then use retq, but this destroys the pipeline on
-	 * many CPUs (wasting over 20 cycles on Sandy Bridge).  Instead,
-	 * spill RDI and restore it in a second-stage trampoline.
-	 */
-	pushq	%rdi
-	movq	$entry_SYSCALL_64_stage2, %rdi
-	JMP_NOSPEC %rdi
-END(entry_SYSCALL_64_trampoline)
-
-	.popsection
-
-ENTRY(entry_SYSCALL_64_stage2)
-	UNWIND_HINT_EMPTY
-	popq	%rdi
-	jmp	entry_SYSCALL_64_after_hwframe
-END(entry_SYSCALL_64_stage2)
-
 ENTRY(entry_SYSCALL_64)
 	UNWIND_HINT_EMPTY
 	/*
@@ -212,17 +161,19 @@ ENTRY(entry_SYSCALL_64)
 	 */
 
 	swapgs
-	/*
-	 * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it
-	 * is not required to switch CR3.
-	 */
-	movq	%rsp, PER_CPU_VAR(rsp_scratch)
+
+	/* %r11 holds the user eflags, so use it as the scratch reg */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%r11
+
+	movq	%rsp, %r11
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 	/* Construct struct pt_regs on stack */
 	pushq	$__USER_DS			/* pt_regs->ss */
-	pushq	PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
-	pushq	%r11				/* pt_regs->flags */
+	pushq	%r11				/* pt_regs->sp */
+	pushfq					/* pt_regs->flags */
+	orq	$X86_EFLAGS_IF, (%rsp)		/* We'll always return with interrupts enabled */
+	movq	(%rsp), %r11			/* "restore" %r11 to match pt_regs->flags */
 	pushq	$__USER_CS			/* pt_regs->cs */
 	pushq	%rcx				/* pt_regs->ip */
 GLOBAL(entry_SYSCALL_64_after_hwframe)
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index 4a7884b8dca5..29c706415443 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -30,8 +30,6 @@ struct cpu_entry_area {
 	 */
 	struct tss_struct tss;
 
-	char entry_trampoline[PAGE_SIZE];
-
 #ifdef CONFIG_X86_64
 	/*
 	 * Exception stacks used for IST entries.
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 76417a9aab73..13c07c7dd5e0 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -100,7 +100,6 @@ void common(void) {
 
 	/* Layout info for cpu_entry_area */
 	OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
-	OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
 	OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
 	DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
 }
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 348cf4821240..293f0e2a3bed 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1418,19 +1418,10 @@ EXPORT_PER_CPU_SYMBOL(__preempt_count);
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
-	extern char _entry_trampoline[];
-	extern char entry_SYSCALL_64_trampoline[];
-
 	int cpu = smp_processor_id();
-	unsigned long SYSCALL64_entry_trampoline =
-		(unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
-		(entry_SYSCALL_64_trampoline - _entry_trampoline);
 
 	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
-	if (static_cpu_has(X86_FEATURE_PTI))
-		wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
-	else
-		wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
 
 #ifdef CONFIG_IA32_EMULATION
 	wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 9b138a06c1a4..21ae8fd3c9a8 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -116,14 +116,6 @@ SECTIONS
 		*(.fixup)
 		*(.gnu.warning)
 
-#ifdef CONFIG_X86_64
-		. = ALIGN(PAGE_SIZE);
-		_entry_trampoline = .;
-		*(.entry_trampoline)
-		. = ALIGN(PAGE_SIZE);
-		ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
-#endif
-
 #ifdef CONFIG_RETPOLINE
 		__indirect_thunk_start = .;
 		*(.text.__x86.indirect_thunk)
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index b9283cc27622..ae5c715bc9dc 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -68,8 +68,6 @@ static void percpu_setup_debug_store(int cpu)
 static void __init setup_cpu_entry_area(int cpu)
 {
 #ifdef CONFIG_X86_64
-	extern char _entry_trampoline[];
-
 	/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
 	pgprot_t gdt_prot = PAGE_KERNEL_RO;
 	pgprot_t tss_prot = PAGE_KERNEL_RO;
@@ -131,9 +129,6 @@ static void __init setup_cpu_entry_area(int cpu)
 	cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks,
 			     &per_cpu(exception_stacks, cpu),
 			     sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL);
-
-	cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
-		     __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
 #endif
 	percpu_setup_debug_store(cpu);
 }
-- 
GitLab