Commit fd468043 authored by Linus Torvalds

x86: avoid per-cpu system call trampoline


The per-cpu system call trampoline was a clever trick: it allowed us to
have percpu data even before swapgs is done, just by using %rip-relative
addressing.  And that was important, because syscall doesn't switch to a
kernel stack, so we needed that percpu data very very early, just to get
a temporary register to switch the page tables around.

However, it turns out to be unnecessary.  Because we actually have a
temporary register that we can use: %r11 is destroyed by the 'syscall'
instruction anyway.

Ok, technically it contains the user mode flags register, but we *have*
that information anyway: it's still in %rflags, we've just masked off a
few unimportant bits.  We'll destroy the rest too when we do the "and"
of the CR3 value, but who cares? It's a system call.

Btw, there are a few bits in eflags that might matter to user space: DF
and AC.  Right now this clears them, but that is fixable by just
changing the MSR_SYSCALL_MASK value to not include them, and clearing
them by hand the way we do for all other kernel entry points anyway.
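
As a concrete illustration (not part of this patch), here is a small
stand-alone C sketch that prints the bits MSR_SYSCALL_MASK clears today and
the variant with DF and AC dropped.  The FL_* constants are defined locally
and simply mirror the X86_EFLAGS_* values:

        #include <stdio.h>
        #include <stdint.h>

        /* Local copies of the relevant eflags bits (mirroring X86_EFLAGS_*). */
        #define FL_TF   (1u << 8)       /* trap flag */
        #define FL_IF   (1u << 9)       /* interrupt enable */
        #define FL_DF   (1u << 10)      /* direction flag */
        #define FL_IOPL (3u << 12)      /* I/O privilege level */
        #define FL_NT   (1u << 14)      /* nested task */
        #define FL_AC   (1u << 18)      /* alignment check */

        int main(void)
        {
                /* The bits MSR_SYSCALL_MASK clears on syscall entry today. */
                uint32_t cur = FL_TF | FL_DF | FL_IF | FL_IOPL | FL_AC | FL_NT;

                /* The variant suggested above: leave DF and AC alone in the
                 * MSR and clear them by hand on entry (cld / clac), the way
                 * the other kernel entry points already do. */
                uint32_t proposed = cur & ~(FL_DF | FL_AC);

                printf("current  mask: %#x\n", cur);
                printf("proposed mask: %#x\n", proposed);
                return 0;
        }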

So the only _real_ flags we'd destroy are IF and the arithmetic flags
that get trampled on by the arithmetic instructions that are part of the
%cr3 reload logic.

However, if we really end up caring, we can save off even those: we'd
take advantage of the fact that %rcx - which contains the returning IP
of the system call - also has 8 bits free.

Why 8? Even with 5-level paging, we only have 57 bits of virtual address
space, and the high address space is for the kernel (and vsyscall, but
we'd just disable native vsyscall).  So the %rip value saved in %rcx can
have only 56 valid bits, which means that we have 8 bits free.
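
As a sanity check on that arithmetic, here is a tiny stand-alone C sketch
(user-space only, nothing kernel-specific) showing that a byte shifted into
the low end of a user %rip value round-trips losslessly, precisely because
the top 8 bits of a 57-bit-VA user address are zero:

        #include <assert.h>
        #include <stdint.h>

        int main(void)
        {
                /* Highest possible user address with 5-level paging:
                 * the user half of a 57-bit address space. */
                uint64_t max_user_rip = (1ULL << 56) - 1;
                uint8_t saved = 0xa5;           /* byte we want to stash */

                /* Pack: shift the RIP up by 8 and drop the byte in the hole. */
                uint64_t packed = (max_user_rip << 8) | saved;

                /* Unpack: both pieces come back unchanged. */
                assert(packed >> 8 == max_user_rip);
                assert((uint8_t)packed == saved);
                return 0;
        }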

So *if* we care about IF and the arithmetic flags being saved over a
system call, we'd do:

        shlq $8,%rcx            # %rip fits in 56 bits, so make room in the low byte
        movb %r11b,%cl          # stash the low flags byte (CF/PF/AF/ZF/SF)
        shrl $8,%r11d           # bring the high flag bits down: OF ends up at bit 3
        andl $8,%r11d           # keep only OF (bit 3 is a reserved-zero flag bit)
        orb %r11b,%cl           # fold OF into the spare bit of the saved byte

to save those bits off before we then use %r11 as a temporary register
(we'd obviously need to then undo that as we save the user space state
on the stack).
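
For completeness, here is a stand-alone C sketch of both directions, the
pack above and the undo step it would need, using plain integers rather
than registers (the pack()/unpack() helpers and FL_* constants are purely
illustrative, not kernel identifiers):

        #include <assert.h>
        #include <stdint.h>

        #define FL_CF   (1u << 0)
        #define FL_PF   (1u << 2)
        #define FL_AF   (1u << 4)
        #define FL_ZF   (1u << 6)
        #define FL_SF   (1u << 7)
        #define FL_OF   (1u << 11)

        /* Mirror of the asm above: stash the arithmetic flags in the byte
         * that 'shlq $8,%rcx' frees up.  OF is parked in bit 3, a
         * reserved-zero position in the low flags byte. */
        static uint64_t pack(uint64_t rcx, uint64_t r11)
        {
                uint64_t packed = rcx << 8;     /* shlq $8,%rcx   */
                packed |= r11 & 0xff;           /* movb %r11b,%cl */
                packed |= (r11 >> 8) & 8;       /* shrl/andl/orb  */
                return packed;
        }

        /* The undo step: recover the return RIP and the arithmetic flags. */
        static void unpack(uint64_t packed, uint64_t *rip, uint64_t *flags)
        {
                uint64_t b = packed & 0xff;

                *rip = packed >> 8;
                *flags = (b & ~8ull) | ((b & 8) << 8);  /* OF back to bit 11 */
        }

        int main(void)
        {
                uint64_t rip = 0x00007f1234567890ull;   /* user RIP, top byte clear */
                uint64_t flags = FL_CF | FL_ZF | FL_SF | FL_OF;
                uint64_t out_rip, out_flags;

                unpack(pack(rip, flags), &out_rip, &out_flags);
                assert(out_rip == rip);
                assert(out_flags == flags);     /* arithmetic flags survive */
                return 0;
        }

In real eflags the fixed bit 1 would ride along in the saved byte as well,
which is harmless.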

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 6f70eb2b
@@ -142,67 +142,16 @@ END(native_usergs_sysret64)
  * with them due to bugs in both AMD and Intel CPUs.
  */
 
-	.pushsection .entry_trampoline, "ax"
-
-/*
- * The code in here gets remapped into cpu_entry_area's trampoline. This means
- * that the assembler and linker have the wrong idea as to where this code
- * lives (and, in fact, it's mapped more than once, so it's not even at a
- * fixed address). So we can't reference any symbols outside the entry
- * trampoline and expect it to work.
- *
- * Instead, we carefully abuse %rip-relative addressing.
- * _entry_trampoline(%rip) refers to the start of the remapped) entry
- * trampoline. We can thus find cpu_entry_area with this macro:
- */
-
-#define CPU_ENTRY_AREA \
-	_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
-
-/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
-#define RSP_SCRATCH	CPU_ENTRY_AREA_entry_stack + \
-			SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA
-
-ENTRY(entry_SYSCALL_64_trampoline)
-	UNWIND_HINT_EMPTY
-	swapgs
-
-	/* Stash the user RSP. */
-	movq	%rsp, RSP_SCRATCH
-
-	/* Note: using %rsp as a scratch reg. */
-	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
-
-	/* Load the top of the task stack into RSP */
-	movq	CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
-
-	/* Start building the simulated IRET frame. */
-	pushq	$__USER_DS			/* pt_regs->ss */
-	pushq	RSP_SCRATCH			/* pt_regs->sp */
-	pushq	%r11				/* pt_regs->flags */
-	pushq	$__USER_CS			/* pt_regs->cs */
-	pushq	%rcx				/* pt_regs->ip */
-
-	/*
-	 * x86 lacks a near absolute jump, and we can't jump to the real
-	 * entry text with a relative jump. We could push the target
-	 * address and then use retq, but this destroys the pipeline on
-	 * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead,
-	 * spill RDI and restore it in a second-stage trampoline.
-	 */
-	pushq	%rdi
-	movq	$entry_SYSCALL_64_stage2, %rdi
-	JMP_NOSPEC %rdi
-END(entry_SYSCALL_64_trampoline)
-
-	.popsection
-
-ENTRY(entry_SYSCALL_64_stage2)
-	UNWIND_HINT_EMPTY
-	popq	%rdi
-	jmp	entry_SYSCALL_64_after_hwframe
-END(entry_SYSCALL_64_stage2)
+/*
+ * The 'syscall' instruction will have cleared the MSR_SYSCALL_MASK
+ * bits in eflags. Currently that is:
+ *
+ *	X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
+ *	X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT
+ *
+ * and we don't care about any of them. So %r11 is a fine scratch
+ * register.
+ */
 
 ENTRY(entry_SYSCALL_64)
 	UNWIND_HINT_EMPTY
 	/*
@@ -212,17 +161,19 @@ ENTRY(entry_SYSCALL_64)
 	 */
 	swapgs
-	/*
-	 * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it
-	 * is not required to switch CR3.
-	 */
-	movq	%rsp, PER_CPU_VAR(rsp_scratch)
+	/* Note: using %r11 as a scratch reg - user eflags */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%r11
+	movq	%rsp, %r11
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 	/* Construct struct pt_regs on stack */
 	pushq	$__USER_DS			/* pt_regs->ss */
-	pushq	PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
-	pushq	%r11				/* pt_regs->flags */
+	pushq	%r11				/* pt_regs->sp */
+	pushfq					/* pt_regs->flags */
+	orq	$X86_EFLAGS_IF,(%rsp)		/* We'll always return with interrupts enabled */
+	movq	(%rsp),%r11			/* We "restore" %r11 */
 	pushq	$__USER_CS			/* pt_regs->cs */
 	pushq	%rcx				/* pt_regs->ip */
 GLOBAL(entry_SYSCALL_64_after_hwframe)
...
@@ -30,8 +30,6 @@ struct cpu_entry_area {
 	 */
 	struct tss_struct tss;
 
-	char entry_trampoline[PAGE_SIZE];
-
 #ifdef CONFIG_X86_64
 	/*
 	 * Exception stacks used for IST entries.
...
@@ -100,7 +100,6 @@ void common(void) {
 	/* Layout info for cpu_entry_area */
 	OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
-	OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
 	OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
 	DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
 }
@@ -1418,19 +1418,10 @@ EXPORT_PER_CPU_SYMBOL(__preempt_count);
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
-	extern char _entry_trampoline[];
-	extern char entry_SYSCALL_64_trampoline[];
-
 	int cpu = smp_processor_id();
-	unsigned long SYSCALL64_entry_trampoline =
-		(unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
-		(entry_SYSCALL_64_trampoline - _entry_trampoline);
 
 	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
-	if (static_cpu_has(X86_FEATURE_PTI))
-		wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
-	else
-		wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
 
 #ifdef CONFIG_IA32_EMULATION
 	wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
...
@@ -116,14 +116,6 @@ SECTIONS
 		*(.fixup)
 		*(.gnu.warning)
 
-#ifdef CONFIG_X86_64
-		. = ALIGN(PAGE_SIZE);
-		_entry_trampoline = .;
-		*(.entry_trampoline)
-		. = ALIGN(PAGE_SIZE);
-		ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
-#endif
-
 #ifdef CONFIG_RETPOLINE
 		__indirect_thunk_start = .;
 		*(.text.__x86.indirect_thunk)
...
@@ -68,8 +68,6 @@ static void percpu_setup_debug_store(int cpu)
 static void __init setup_cpu_entry_area(int cpu)
 {
 #ifdef CONFIG_X86_64
-	extern char _entry_trampoline[];
-
 	/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
 	pgprot_t gdt_prot = PAGE_KERNEL_RO;
 	pgprot_t tss_prot = PAGE_KERNEL_RO;
@@ -131,9 +129,6 @@ static void __init setup_cpu_entry_area(int cpu)
 	cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks,
 			     &per_cpu(exception_stacks, cpu),
 			     sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL);
-
-	cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
-		    __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
 #endif
 	percpu_setup_debug_store(cpu);
 }
...