Commit 1fe3f29e authored by Ingo Molnar

Merge branches 'x86/fpu', 'x86/mm' and 'x86/asm' into x86/pkeys


Provide a stable basis for the pkeys patches, which touch various
x86 details.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Showing changes with 294 additions and 313 deletions
@@ -666,7 +666,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 	clearcpuid=BITNUM [X86]
 			Disable CPUID feature X for the kernel. See
-			arch/x86/include/asm/cpufeature.h for the valid bit
+			arch/x86/include/asm/cpufeatures.h for the valid bit
 			numbers. Note the Linux specific bits are not necessarily
 			stable over kernel options, but the vendor specific
 			ones should be.
@@ -2566,6 +2566,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 	nointroute	[IA-64]
 
+	noinvpcid	[X86] Disable the INVPCID cpu feature.
+
 	nojitter	[IA-64] Disables jitter checking for ITC timers.
 
 	no-kvmclock	[X86,KVM] Disable paravirtualized KVM clock driver
......
@@ -350,16 +350,6 @@ config DEBUG_IMR_SELFTEST
 	  If unsure say N here.
 
-config X86_DEBUG_STATIC_CPU_HAS
-	bool "Debug alternatives"
-	depends on DEBUG_KERNEL
-	---help---
-	  This option causes additional code to be generated which
-	  fails if static_cpu_has() is used before alternatives have
-	  run.
-
-	  If unsure, say N.
-
 config X86_DEBUG_FPU
 	bool "Debug the x86 FPU code"
 	depends on DEBUG_KERNEL
......
 #ifndef BOOT_CPUFLAGS_H
 #define BOOT_CPUFLAGS_H
 
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
 #include <asm/processor-flags.h>
 
 struct cpu_features {
......
@@ -17,7 +17,7 @@
 #include "../include/asm/required-features.h"
 #include "../include/asm/disabled-features.h"
-#include "../include/asm/cpufeature.h"
+#include "../include/asm/cpufeatures.h"
 #include "../kernel/cpu/capflags.c"
 
 int main(void)
......
@@ -33,7 +33,7 @@
 #include <linux/crc32.h>
 #include <crypto/internal/hash.h>
 
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
 #include <asm/cpu_device_id.h>
 #include <asm/fpu/api.h>
......
@@ -30,7 +30,7 @@
 #include <linux/kernel.h>
 #include <crypto/internal/hash.h>
 
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
 #include <asm/cpu_device_id.h>
 #include <asm/fpu/internal.h>
......
@@ -30,7 +30,7 @@
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <asm/fpu/api.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
 #include <asm/cpu_device_id.h>
 
 asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf,
......
@@ -201,37 +201,6 @@ For 32-bit we have the following conventions - kernel is built with
 	.byte 0xf1
 .endm
 
-#else /* CONFIG_X86_64 */
-
-/*
- * For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. These
- * are different from the entry_32.S versions in not changing the segment
- * registers. So only suitable for in kernel use, not when transitioning
- * from or to user space. The resulting stack frame is not a standard
- * pt_regs frame. The main use case is calling C code from assembler
- * when all the registers need to be preserved.
- */
-
-	.macro SAVE_ALL
-	pushl %eax
-	pushl %ebp
-	pushl %edi
-	pushl %esi
-	pushl %edx
-	pushl %ecx
-	pushl %ebx
-	.endm
-
-	.macro RESTORE_ALL
-	popl %ebx
-	popl %ecx
-	popl %edx
-	popl %esi
-	popl %edi
-	popl %ebp
-	popl %eax
-	.endm
-
 #endif /* CONFIG_X86_64 */
 
 /*
......
@@ -26,6 +26,7 @@
 #include <asm/traps.h>
 #include <asm/vdso.h>
 #include <asm/uaccess.h>
+#include <asm/cpufeature.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/syscalls.h>
@@ -344,6 +345,32 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs)
 	prepare_exit_to_usermode(regs);
 }
 
+#ifdef CONFIG_X86_64
+__visible void do_syscall_64(struct pt_regs *regs)
+{
+	struct thread_info *ti = pt_regs_to_thread_info(regs);
+	unsigned long nr = regs->orig_ax;
+
+	local_irq_enable();
+
+	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
+		nr = syscall_trace_enter(regs);
+
+	/*
+	 * NB: Native and x32 syscalls are dispatched from the same
+	 * table.  The only functional difference is the x32 bit in
+	 * regs->orig_ax, which changes the behavior of some syscalls.
+	 */
+	if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) {
+		regs->ax = sys_call_table[nr & __SYSCALL_MASK](
+			regs->di, regs->si, regs->dx,
+			regs->r10, regs->r8, regs->r9);
+	}
+
+	syscall_return_slowpath(regs);
+}
+#endif
+
 #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
 /*
  * Does a 32-bit syscall.  Called with IRQs on and does all entry and
......
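Editor's note: for readers unfamiliar with the dispatch convention that do_syscall_64() relies on, here is a small standalone sketch. It is my own illustration, not code from this commit: the entry and its return value are fakes, and the typedef name is only assumed to match the kernel's. What it mirrors from the hunk above is that every table entry takes six unsigned long arguments and that the x32 bit is masked off the number before both the bounds check and the table lookup.

#include <errno.h>
#include <stdio.h>

/* Illustrative stand-ins; the kernel gets these from its own headers. */
#define __X32_SYSCALL_BIT	0x40000000UL
#define __SYSCALL_MASK		(~__X32_SYSCALL_BIT)

typedef long (*sys_call_ptr_t)(unsigned long, unsigned long, unsigned long,
			       unsigned long, unsigned long, unsigned long);

/* Fake syscall body standing in for a real sys_* implementation. */
static long demo_getpid(unsigned long a1, unsigned long a2, unsigned long a3,
			unsigned long a4, unsigned long a5, unsigned long a6)
{
	return 1234;
}

static const sys_call_ptr_t sys_call_table[] = { demo_getpid };
#define NR_syscalls (sizeof(sys_call_table) / sizeof(sys_call_table[0]))

static long dispatch(unsigned long nr, unsigned long a1, unsigned long a2,
		     unsigned long a3, unsigned long a4, unsigned long a5,
		     unsigned long a6)
{
	/* Mask off the x32 bit, bounds-check, then index the single table. */
	if ((nr & __SYSCALL_MASK) >= NR_syscalls)
		return -ENOSYS;
	return sys_call_table[nr & __SYSCALL_MASK](a1, a2, a3, a4, a5, a6);
}

int main(void)
{
	/* A native number and its x32-flagged twin hit the same table slot. */
	printf("%ld %ld\n", dispatch(0, 0, 0, 0, 0, 0, 0),
	       dispatch(0 | __X32_SYSCALL_BIT, 0, 0, 0, 0, 0, 0));
	return 0;
}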
@@ -40,7 +40,7 @@
 #include <asm/processor-flags.h>
 #include <asm/ftrace.h>
 #include <asm/irq_vectors.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
 #include <asm/alternative-asm.h>
 #include <asm/asm.h>
 #include <asm/smap.h>
......
@@ -145,17 +145,11 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
 	movq	%rsp, PER_CPU_VAR(rsp_scratch)
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
+	TRACE_IRQS_OFF
+
 	/* Construct struct pt_regs on stack */
 	pushq	$__USER_DS			/* pt_regs->ss */
 	pushq	PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
-	/*
-	 * Re-enable interrupts.
-	 * We use 'rsp_scratch' as a scratch space, hence irq-off block above
-	 * must execute atomically in the face of possible interrupt-driven
-	 * task preemption. We must enable interrupts only after we're done
-	 * with using rsp_scratch:
-	 */
-	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushq	%r11				/* pt_regs->flags */
 	pushq	$__USER_CS			/* pt_regs->cs */
 	pushq	%rcx				/* pt_regs->ip */
@@ -171,9 +165,21 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
 	pushq	%r11				/* pt_regs->r11 */
 	sub	$(6*8), %rsp			/* pt_regs->bp, bx, r12-15 not saved */
 
-	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	jnz	tracesys
+	/*
+	 * If we need to do entry work or if we guess we'll need to do
+	 * exit work, go straight to the slow path.
+	 */
+	testl	$_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	jnz	entry_SYSCALL64_slow_path
+
 entry_SYSCALL_64_fastpath:
+	/*
+	 * Easy case: enable interrupts and issue the syscall.  If the syscall
+	 * needs pt_regs, we'll call a stub that disables interrupts again
+	 * and jumps to the slow path.
+	 */
+	TRACE_IRQS_ON
+	ENABLE_INTERRUPTS(CLBR_NONE)
 #if	__SYSCALL_MASK == ~0
 	cmpq	$__NR_syscall_max, %rax
 #else
@@ -182,103 +188,56 @@ entry_SYSCALL_64_fastpath:
 #endif
 	ja	1f				/* return -ENOSYS (already in pt_regs->ax) */
 	movq	%r10, %rcx
+
+	/*
+	 * This call instruction is handled specially in stub_ptregs_64.
+	 * It might end up jumping to the slow path.  If it jumps, RAX
+	 * and all argument registers are clobbered.
+	 */
 	call	*sys_call_table(, %rax, 8)
+.Lentry_SYSCALL_64_after_fastpath_call:
+
 	movq	%rax, RAX(%rsp)
 1:
-/*
- * Syscall return path ending with SYSRET (fast path).
- * Has incompletely filled pt_regs.
- */
-	LOCKDEP_SYS_EXIT
-	/*
-	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
-	 * it is too small to ever cause noticeable irq latency.
-	 */
-	DISABLE_INTERRUPTS(CLBR_NONE)
 
 	/*
-	 * We must check ti flags with interrupts (or at least preemption)
-	 * off because we must *never* return to userspace without
-	 * processing exit work that is enqueued if we're preempted here.
-	 * In particular, returning to userspace with any of the one-shot
-	 * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
-	 * very bad.
+	 * If we get here, then we know that pt_regs is clean for SYSRET64.
+	 * If we see that no exit work is required (which we are required
+	 * to check with IRQs off), then we can go straight to SYSRET64.
 	 */
+	DISABLE_INTERRUPTS(CLBR_NONE)
+	TRACE_IRQS_OFF
 	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	jnz	int_ret_from_sys_call_irqs_off	/* Go to the slow path */
+	jnz	1f
 
-	RESTORE_C_REGS_EXCEPT_RCX_R11
+	LOCKDEP_SYS_EXIT
+	TRACE_IRQS_ON		/* user mode is traced as IRQs on */
 	movq	RIP(%rsp), %rcx
 	movq	EFLAGS(%rsp), %r11
+	RESTORE_C_REGS_EXCEPT_RCX_R11
 	movq	RSP(%rsp), %rsp
-	/*
-	 * 64-bit SYSRET restores rip from rcx,
-	 * rflags from r11 (but RF and VM bits are forced to 0),
-	 * cs and ss are loaded from MSRs.
-	 * Restoration of rflags re-enables interrupts.
-	 *
-	 * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
-	 * descriptor is not reinitialized. This means that we should
-	 * avoid SYSRET with SS == NULL, which could happen if we schedule,
-	 * exit the kernel, and re-enter using an interrupt vector. (All
-	 * interrupt entries on x86_64 set SS to NULL.) We prevent that
-	 * from happening by reloading SS in __switch_to. (Actually
-	 * detecting the failure in 64-bit userspace is tricky but can be
-	 * done.)
-	 */
 	USERGS_SYSRET64
 
-GLOBAL(int_ret_from_sys_call_irqs_off)
+1:
+	/*
+	 * The fast path looked good when we started, but something changed
+	 * along the way and we need to switch to the slow path.  Calling
+	 * raise(3) will trigger this, for example.  IRQs are off.
+	 */
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	jmp	int_ret_from_sys_call
-
-	/* Do syscall entry tracing */
-tracesys:
-	movq	%rsp, %rdi
-	movl	$AUDIT_ARCH_X86_64, %esi
-	call	syscall_trace_enter_phase1
-	test	%rax, %rax
-	jnz	tracesys_phase2			/* if needed, run the slow path */
-	RESTORE_C_REGS_EXCEPT_RAX		/* else restore clobbered regs */
-	movq	ORIG_RAX(%rsp), %rax
-	jmp	entry_SYSCALL_64_fastpath	/* and return to the fast path */
-
-tracesys_phase2:
 	SAVE_EXTRA_REGS
 	movq	%rsp, %rdi
-	movl	$AUDIT_ARCH_X86_64, %esi
-	movq	%rax, %rdx
-	call	syscall_trace_enter_phase2
-
-	/*
-	 * Reload registers from stack in case ptrace changed them.
-	 * We don't reload %rax because syscall_trace_entry_phase2() returned
-	 * the value it wants us to use in the table lookup.
-	 */
-	RESTORE_C_REGS_EXCEPT_RAX
-	RESTORE_EXTRA_REGS
-#if	__SYSCALL_MASK == ~0
-	cmpq	$__NR_syscall_max, %rax
-#else
-	andl	$__SYSCALL_MASK, %eax
-	cmpl	$__NR_syscall_max, %eax
-#endif
-	ja	1f				/* return -ENOSYS (already in pt_regs->ax) */
-	movq	%r10, %rcx			/* fixup for C */
-	call	*sys_call_table(, %rax, 8)
-	movq	%rax, RAX(%rsp)
-1:
-	/* Use IRET because user could have changed pt_regs->foo */
+	call	syscall_return_slowpath	/* returns with IRQs disabled */
+	jmp	return_from_SYSCALL_64
 
-/*
- * Syscall return path ending with IRET.
- * Has correct iret frame.
- */
-GLOBAL(int_ret_from_sys_call)
+entry_SYSCALL64_slow_path:
+	/* IRQs are off. */
 	SAVE_EXTRA_REGS
 	movq	%rsp, %rdi
-	call	syscall_return_slowpath	/* returns with IRQs disabled */
+	call	do_syscall_64		/* returns with IRQs disabled */
+
+return_from_SYSCALL_64:
 	RESTORE_EXTRA_REGS
 	TRACE_IRQS_IRETQ		/* we're about to change IF */
@@ -355,83 +314,45 @@ opportunistic_sysret_failed:
 	jmp	restore_c_regs_and_iret
 END(entry_SYSCALL_64)
 
-	.macro FORK_LIKE func
-ENTRY(stub_\func)
-	SAVE_EXTRA_REGS 8
-	jmp	sys_\func
-END(stub_\func)
-	.endm
-
-	FORK_LIKE  clone
-	FORK_LIKE  fork
-	FORK_LIKE  vfork
-
-ENTRY(stub_execve)
-	call	sys_execve
-return_from_execve:
-	testl	%eax, %eax
-	jz	1f
-	/* exec failed, can use fast SYSRET code path in this case */
-	ret
-1:
-	/* must use IRET code path (pt_regs->cs may have changed) */
-	addq	$8, %rsp
-	ZERO_EXTRA_REGS
-	movq	%rax, RAX(%rsp)
-	jmp	int_ret_from_sys_call
-END(stub_execve)
-
+ENTRY(stub_ptregs_64)
 	/*
-	 * Remaining execve stubs are only 7 bytes long.
-	 * ENTRY() often aligns to 16 bytes, which in this case has no benefits.
+	 * Syscalls marked as needing ptregs land here.
+	 * If we are on the fast path, we need to save the extra regs,
+	 * which we achieve by trying again on the slow path.  If we are on
+	 * the slow path, the extra regs are already saved.
+	 *
+	 * RAX stores a pointer to the C function implementing the syscall.
+	 * IRQs are on.
 	 */
-	.align	8
-GLOBAL(stub_execveat)
-	call	sys_execveat
-	jmp	return_from_execve
-END(stub_execveat)
-
-#if defined(CONFIG_X86_X32_ABI)
-	.align	8
-GLOBAL(stub_x32_execve)
-	call	compat_sys_execve
-	jmp	return_from_execve
-END(stub_x32_execve)
-	.align	8
-GLOBAL(stub_x32_execveat)
-	call	compat_sys_execveat
-	jmp	return_from_execve
-END(stub_x32_execveat)
-#endif
+	cmpq	$.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
+	jne	1f
 
 	/*
-	 * sigreturn is special because it needs to restore all registers on return.
-	 * This cannot be done with SYSRET, so use the IRET return path instead.
+	 * Called from fast path -- disable IRQs again, pop return address
+	 * and jump to slow path
 	 */
-ENTRY(stub_rt_sigreturn)
-	/*
-	 * SAVE_EXTRA_REGS result is not normally needed:
-	 * sigreturn overwrites all pt_regs->GPREGS.
-	 * But sigreturn can fail (!), and there is no easy way to detect that.
-	 * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error,
-	 * we SAVE_EXTRA_REGS here.
-	 */
-	SAVE_EXTRA_REGS 8
-	call	sys_rt_sigreturn
-return_from_stub:
-	addq	$8, %rsp
-	RESTORE_EXTRA_REGS
-	movq	%rax, RAX(%rsp)
-	jmp	int_ret_from_sys_call
-END(stub_rt_sigreturn)
+	DISABLE_INTERRUPTS(CLBR_NONE)
+	TRACE_IRQS_OFF
+	popq	%rax
+	jmp	entry_SYSCALL64_slow_path
 
-#ifdef CONFIG_X86_X32_ABI
-ENTRY(stub_x32_rt_sigreturn)
-	SAVE_EXTRA_REGS 8
-	call	sys32_x32_rt_sigreturn
-	jmp	return_from_stub
-END(stub_x32_rt_sigreturn)
-#endif
+1:
+	/* Called from C */
+	jmp	*%rax				/* called from C */
+END(stub_ptregs_64)
+
+.macro ptregs_stub func
+ENTRY(ptregs_\func)
+	leaq	\func(%rip), %rax
+	jmp	stub_ptregs_64
+END(ptregs_\func)
+.endm
+
+/* Instantiate ptregs_stub for each ptregs-using syscall */
+#define __SYSCALL_64_QUAL_(sym)
+#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
+#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
+#include <asm/syscalls_64.h>
 
 /*
  * A newly forked process directly context switches into this address.
@@ -439,7 +360,6 @@ END(stub_x32_rt_sigreturn)
  * rdi: prev task we switched from
  */
 ENTRY(ret_from_fork)
-
 	LOCK ; btr $TIF_FORK, TI_flags(%r8)
 
 	pushq	$0x0002
@@ -447,28 +367,32 @@ ENTRY(ret_from_fork)
 
 	call	schedule_tail			/* rdi: 'prev' task parameter */
 
-	RESTORE_EXTRA_REGS
-
 	testb	$3, CS(%rsp)			/* from kernel_thread? */
+	jnz	1f
 
 	/*
-	 * By the time we get here, we have no idea whether our pt_regs,
-	 * ti flags, and ti status came from the 64-bit SYSCALL fast path,
-	 * the slow path, or one of the 32-bit compat paths.
-	 * Use IRET code path to return, since it can safely handle
-	 * all of the above.
+	 * We came from kernel_thread.  This code path is quite twisted, and
+	 * someone should clean it up.
+	 *
+	 * copy_thread_tls stashes the function pointer in RBX and the
+	 * parameter to be passed in RBP.  The called function is permitted
+	 * to call do_execve and thereby jump to user mode.
 	 */
-	jnz	int_ret_from_sys_call
+	movq	RBP(%rsp), %rdi
+	call	*RBX(%rsp)
+	movl	$0, RAX(%rsp)
 
 	/*
-	 * We came from kernel_thread
-	 * nb: we depend on RESTORE_EXTRA_REGS above
+	 * Fall through as though we're exiting a syscall.  This makes a
+	 * twisted sort of sense if we just called do_execve.
 	 */
-	movq	%rbp, %rdi
-	call	*%rbx
-	movl	$0, RAX(%rsp)
-	RESTORE_EXTRA_REGS
-	jmp	int_ret_from_sys_call
+
+1:
+	movq	%rsp, %rdi
+	call	syscall_return_slowpath	/* returns with IRQs disabled */
+	TRACE_IRQS_ON			/* user mode is traced as IRQS on */
+	SWAPGS
+	jmp	restore_regs_and_iret
 END(ret_from_fork)
 
 /*
......
@@ -6,17 +6,11 @@
 #include <asm/asm-offsets.h>
 #include <asm/syscall.h>
 
-#ifdef CONFIG_IA32_EMULATION
-#define SYM(sym, compat) compat
-#else
-#define SYM(sym, compat) sym
-#endif
-
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long SYM(sym, compat)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
 #include <asm/syscalls_32.h>
 #undef __SYSCALL_I386
 
-#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat),
+#define __SYSCALL_I386(nr, sym, qual) [nr] = sym,
 
 extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
......
@@ -6,19 +6,14 @@
 #include <asm/asm-offsets.h>
 #include <asm/syscall.h>
 
-#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
-
-#ifdef CONFIG_X86_X32_ABI
-# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
-#else
-# define __SYSCALL_X32(nr, sym, compat) /* nothing */
-#endif
-
-#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_64_QUAL_(sym) sym
+#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym
+
+#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
 #include <asm/syscalls_64.h>
 #undef __SYSCALL_64
 
-#define __SYSCALL_64(nr, sym, compat) [nr] = sym,
+#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym),
 
 extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
......
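Editor's note: the new "qual" column drives a token-pasting trick -- pasting the qualifier onto __SYSCALL_64_QUAL_ picks either the plain symbol or its ptregs_-prefixed stub. The standalone sketch below reuses the two QUAL macros and the table-building __SYSCALL_64 exactly as added above; the syscall bodies are fakes of mine. The asm side (entry_64.S, earlier in this commit) redefines the same __SYSCALL_64 so that each ptregs-qualified entry also emits a ptregs_* stub.

/*
 * Minimal demo of the qualifier selection.  Compile and run as a normal
 * userspace program; only the macro definitions come from the diff.
 */
#define __SYSCALL_64_QUAL_(sym) sym
#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym
#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym),

static long sys_read(void)            { return 0; }   /* ordinary syscall (fake) */
static long ptregs_sys_execve(void)   { return 59; }  /* ptregs stub (fake)      */

static long (*const table[])(void) = {
	__SYSCALL_64(0, sys_read, )		/* expands to: [0]  = sys_read,          */
	__SYSCALL_64(59, sys_execve, ptregs)	/* expands to: [59] = ptregs_sys_execve, */
};

int main(void)
{
	/* Both slots are populated; slot 59 points at the ptregs stub. */
	return (int)(table[0]() + table[59]());
}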
@@ -21,7 +21,7 @@
 12	common	brk			sys_brk
 13	64	rt_sigaction		sys_rt_sigaction
 14	common	rt_sigprocmask		sys_rt_sigprocmask
-15	64	rt_sigreturn		stub_rt_sigreturn
+15	64	rt_sigreturn		sys_rt_sigreturn/ptregs
 16	64	ioctl			sys_ioctl
 17	common	pread64			sys_pread64
 18	common	pwrite64		sys_pwrite64
@@ -62,10 +62,10 @@
 53	common	socketpair		sys_socketpair
 54	64	setsockopt		sys_setsockopt
 55	64	getsockopt		sys_getsockopt
-56	common	clone			stub_clone
-57	common	fork			stub_fork
-58	common	vfork			stub_vfork
-59	64	execve			stub_execve
+56	common	clone			sys_clone/ptregs
+57	common	fork			sys_fork/ptregs
+58	common	vfork			sys_vfork/ptregs
+59	64	execve			sys_execve/ptregs
 60	common	exit			sys_exit
 61	common	wait4			sys_wait4
 62	common	kill			sys_kill
@@ -178,7 +178,7 @@
 169	common	reboot			sys_reboot
 170	common	sethostname		sys_sethostname
 171	common	setdomainname		sys_setdomainname
-172	common	iopl			sys_iopl
+172	common	iopl			sys_iopl/ptregs
 173	common	ioperm			sys_ioperm
 174	64	create_module
 175	common	init_module		sys_init_module
@@ -328,7 +328,7 @@
 319	common	memfd_create		sys_memfd_create
 320	common	kexec_file_load		sys_kexec_file_load
 321	common	bpf			sys_bpf
-322	64	execveat		stub_execveat
+322	64	execveat		sys_execveat/ptregs
 323	common	userfaultfd		sys_userfaultfd
 324	common	membarrier		sys_membarrier
 325	common	mlock2			sys_mlock2
@@ -339,14 +339,14 @@
 # for native 64-bit operation.
 #
 512	x32	rt_sigaction		compat_sys_rt_sigaction
-513	x32	rt_sigreturn		stub_x32_rt_sigreturn
+513	x32	rt_sigreturn		sys32_x32_rt_sigreturn
 514	x32	ioctl			compat_sys_ioctl
 515	x32	readv			compat_sys_readv
 516	x32	writev			compat_sys_writev
 517	x32	recvfrom		compat_sys_recvfrom
 518	x32	sendmsg			compat_sys_sendmsg
 519	x32	recvmsg			compat_sys_recvmsg
-520	x32	execve			stub_x32_execve
+520	x32	execve			compat_sys_execve/ptregs
 521	x32	ptrace			compat_sys_ptrace
 522	x32	rt_sigpending		compat_sys_rt_sigpending
 523	x32	rt_sigtimedwait		compat_sys_rt_sigtimedwait
@@ -371,4 +371,4 @@
 542	x32	getsockopt		compat_sys_getsockopt
 543	x32	io_setup		compat_sys_io_setup
 544	x32	io_submit		compat_sys_io_submit
-545	x32	execveat		stub_x32_execveat
+545	x32	execveat		compat_sys_execveat/ptregs
@@ -3,13 +3,63 @@
 in="$1"
 out="$2"
 
+syscall_macro() {
+    abi="$1"
+    nr="$2"
+    entry="$3"
+
+    # Entry can be either just a function name or "function/qualifier"
+    real_entry="${entry%%/*}"
+    qualifier="${entry:${#real_entry}}"	# Strip the function name
+    qualifier="${qualifier:1}"		# Strip the slash, if any
+
+    echo "__SYSCALL_${abi}($nr, $real_entry, $qualifier)"
+}
+
+emit() {
+    abi="$1"
+    nr="$2"
+    entry="$3"
+    compat="$4"
+
+    if [ "$abi" == "64" -a -n "$compat" ]; then
+	echo "a compat entry for a 64-bit syscall makes no sense" >&2
+	exit 1
+    fi
+
+    if [ -z "$compat" ]; then
+	if [ -n "$entry" ]; then
+	    syscall_macro "$abi" "$nr" "$entry"
+	fi
+    else
+	echo "#ifdef CONFIG_X86_32"
+	if [ -n "$entry" ]; then
+	    syscall_macro "$abi" "$nr" "$entry"
+	fi
+	echo "#else"
+	syscall_macro "$abi" "$nr" "$compat"
+	echo "#endif"
+    fi
+}
+
 grep '^[0-9]' "$in" | sort -n | (
     while read nr abi name entry compat; do
 	abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
-	if [ -n "$compat" ]; then
-	    echo "__SYSCALL_${abi}($nr, $entry, $compat)"
-	elif [ -n "$entry" ]; then
-	    echo "__SYSCALL_${abi}($nr, $entry, $entry)"
+	if [ "$abi" == "COMMON" -o "$abi" == "64" ]; then
+	    # COMMON is the same as 64, except that we don't expect X32
+	    # programs to use it.  Our expectation has nothing to do with
+	    # any generated code, so treat them the same.
+	    emit 64 "$nr" "$entry" "$compat"
+	elif [ "$abi" == "X32" ]; then
+	    # X32 is equivalent to 64 on an X32-compatible kernel.
+	    echo "#ifdef CONFIG_X86_X32_ABI"
+	    emit 64 "$nr" "$entry" "$compat"
+	    echo "#endif"
+	elif [ "$abi" == "I386" ]; then
+	    emit "$abi" "$nr" "$entry" "$compat"
+	else
+	    echo "Unknown abi $abi" >&2
+	    exit 1
+	fi
     done
 ) > "$out"
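Editor's note: as a worked example of the new generator (my reading of the script above, not captured output), the table rows "59 64 execve sys_execve/ptregs" and "513 x32 rt_sigreturn sys32_x32_rt_sigreturn" should land in the generated header roughly as the following C-preprocessor lines -- the "/qualifier" suffix becomes the third macro argument, and X32 rows are wrapped in the CONFIG_X86_X32_ABI guard:

/* 59 64 execve sys_execve/ptregs        -> qualifier split off after the slash */
__SYSCALL_64(59, sys_execve, ptregs)
/* 513 x32 rt_sigreturn sys32_x32_rt_sigreturn -> treated as 64-bit, guarded for X32 kernels */
#ifdef CONFIG_X86_X32_ABI
__SYSCALL_64(513, sys32_x32_rt_sigreturn, )
#endif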
@@ -150,16 +150,9 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
 	}
 	fprintf(outfile, "\n};\n\n");
 
-	fprintf(outfile, "static struct page *pages[%lu];\n\n",
-		mapping_size / 4096);
-
 	fprintf(outfile, "const struct vdso_image %s = {\n", name);
 	fprintf(outfile, "\t.data = raw_data,\n");
 	fprintf(outfile, "\t.size = %lu,\n", mapping_size);
-	fprintf(outfile, "\t.text_mapping = {\n");
-	fprintf(outfile, "\t\t.name = \"[vdso]\",\n");
-	fprintf(outfile, "\t\t.pages = pages,\n");
-	fprintf(outfile, "\t},\n");
 	if (alt_sec) {
 		fprintf(outfile, "\t.alt = %lu,\n",
 			(unsigned long)GET_LE(&alt_sec->sh_offset));
......
@@ -11,7 +11,6 @@
 #include <linux/kernel.h>
 #include <linux/mm_types.h>
 
-#include <asm/cpufeature.h>
 #include <asm/processor.h>
 #include <asm/vdso.h>
......
@@ -3,7 +3,7 @@
  */
 
 #include <asm/dwarf2.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
 #include <asm/alternative-asm.h>
 
 /*
......
@@ -20,6 +20,7 @@
 #include <asm/page.h>
 #include <asm/hpet.h>
 #include <asm/desc.h>
+#include <asm/cpufeature.h>
 
 #if defined(CONFIG_X86_64)
 unsigned int __read_mostly vdso64_enabled = 1;
@@ -27,13 +28,7 @@ unsigned int __read_mostly vdso64_enabled = 1;
 
 void __init init_vdso_image(const struct vdso_image *image)
 {
-	int i;
-	int npages = (image->size) / PAGE_SIZE;
-
 	BUG_ON(image->size % PAGE_SIZE != 0);
-	for (i = 0; i < npages; i++)
-		image->text_mapping.pages[i] =
-			virt_to_page(image->data + i*PAGE_SIZE);
 
 	apply_alternatives((struct alt_instr *)(image->data + image->alt),
 			   (struct alt_instr *)(image->data + image->alt +
@@ -90,18 +85,87 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
 #endif
 }
 
+static int vdso_fault(const struct vm_special_mapping *sm,
+		      struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	const struct vdso_image *image = vma->vm_mm->context.vdso_image;
+
+	if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size)
+		return VM_FAULT_SIGBUS;
+
+	vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT));
+	get_page(vmf->page);
+	return 0;
+}
+
+static const struct vm_special_mapping text_mapping = {
+	.name = "[vdso]",
+	.fault = vdso_fault,
+};
+
+static int vvar_fault(const struct vm_special_mapping *sm,
+		      struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	const struct vdso_image *image = vma->vm_mm->context.vdso_image;
+	long sym_offset;
+	int ret = -EFAULT;
+
+	if (!image)
+		return VM_FAULT_SIGBUS;
+
+	sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) +
+		image->sym_vvar_start;
+
+	/*
+	 * Sanity check: a symbol offset of zero means that the page
+	 * does not exist for this vdso image, not that the page is at
+	 * offset zero relative to the text mapping.  This should be
+	 * impossible here, because sym_offset should only be zero for
+	 * the page past the end of the vvar mapping.
+	 */
+	if (sym_offset == 0)
+		return VM_FAULT_SIGBUS;
+
+	if (sym_offset == image->sym_vvar_page) {
+		ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
+				    __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
+	} else if (sym_offset == image->sym_hpet_page) {
+#ifdef CONFIG_HPET_TIMER
+		if (hpet_address && vclock_was_used(VCLOCK_HPET)) {
+			ret = vm_insert_pfn_prot(
+				vma,
+				(unsigned long)vmf->virtual_address,
+				hpet_address >> PAGE_SHIFT,
+				pgprot_noncached(PAGE_READONLY));
+		}
+#endif
+	} else if (sym_offset == image->sym_pvclock_page) {
+		struct pvclock_vsyscall_time_info *pvti =
+			pvclock_pvti_cpu0_va();
+		if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
+			ret = vm_insert_pfn(
+				vma,
+				(unsigned long)vmf->virtual_address,
+				__pa(pvti) >> PAGE_SHIFT);
+		}
+	}
+
+	if (ret == 0 || ret == -EBUSY)
+		return VM_FAULT_NOPAGE;
+
+	return VM_FAULT_SIGBUS;
+}
+
 static int map_vdso(const struct vdso_image *image, bool calculate_addr)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	unsigned long addr, text_start;
 	int ret = 0;
-	static struct page *no_pages[] = {NULL};
-	static struct vm_special_mapping vvar_mapping = {
+	static const struct vm_special_mapping vvar_mapping = {
 		.name = "[vvar]",
-		.pages = no_pages,
+		.fault = vvar_fault,
 	};
-	struct pvclock_vsyscall_time_info *pvti;
 
 	if (calculate_addr) {
 		addr = vdso_addr(current->mm->start_stack,
@@ -121,6 +185,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
 
 	text_start = addr - image->sym_vvar_start;
 	current->mm->context.vdso = (void __user *)text_start;
+	current->mm->context.vdso_image = image;
 
 	/*
 	 * MAYWRITE to allow gdb to COW and set breakpoints
@@ -130,7 +195,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
 				       image->size,
 				       VM_READ|VM_EXEC|
 				       VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
-				       &image->text_mapping);
+				       &text_mapping);
 
 	if (IS_ERR(vma)) {
 		ret = PTR_ERR(vma);
@@ -140,7 +205,8 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
 	vma = _install_special_mapping(mm,
 				       addr,
 				       -image->sym_vvar_start,
-				       VM_READ|VM_MAYREAD,
+				       VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
+				       VM_PFNMAP,
 				       &vvar_mapping);
 
 	if (IS_ERR(vma)) {
@@ -148,41 +214,6 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
 		goto up_fail;
 	}
 
-	if (image->sym_vvar_page)
-		ret = remap_pfn_range(vma,
-				      text_start + image->sym_vvar_page,
-				      __pa_symbol(&__vvar_page) >> PAGE_SHIFT,
-				      PAGE_SIZE,
-				      PAGE_READONLY);
-
-	if (ret)
-		goto up_fail;
-
-#ifdef CONFIG_HPET_TIMER
-	if (hpet_address && image->sym_hpet_page) {
-		ret = io_remap_pfn_range(vma,
-					 text_start + image->sym_hpet_page,
-					 hpet_address >> PAGE_SHIFT,
-					 PAGE_SIZE,
-					 pgprot_noncached(PAGE_READONLY));
-
-		if (ret)
-			goto up_fail;
-	}
-#endif
-
-	pvti = pvclock_pvti_cpu0_va();
-	if (pvti && image->sym_pvclock_page) {
-		ret = remap_pfn_range(vma,
-				      text_start + image->sym_pvclock_page,
-				      __pa(pvti) >> PAGE_SHIFT,
-				      PAGE_SIZE,
-				      PAGE_READONLY);
-
-		if (ret)
-			goto up_fail;
-	}
-
 up_fail:
 	if (ret)
 		current->mm->context.vdso = NULL;
@@ -254,7 +285,7 @@ static void vgetcpu_cpu_init(void *arg)
 #ifdef CONFIG_NUMA
 	node = cpu_to_node(cpu);
 #endif
-	if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
+	if (static_cpu_has(X86_FEATURE_RDTSCP))
 		write_rdtscp_aux((node << 12) | cpu);
 
 	/*
......
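Editor's note: a quick way to observe the two special mappings that map_vdso() installs is to read /proc/self/maps from an ordinary userspace program. The sketch below is independent of the kernel change itself; only the "[vdso]" and "[vvar]" names come from the mappings above.

#include <stdio.h>
#include <string.h>

/* Print this process's [vdso] and [vvar] lines from /proc/self/maps. */
int main(void)
{
	char line[512];
	FILE *maps = fopen("/proc/self/maps", "r");

	if (!maps) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), maps)) {
		if (strstr(line, "[vdso]") || strstr(line, "[vvar]"))
			fputs(line, stdout);
	}
	fclose(maps);
	return 0;
}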
@@ -16,6 +16,8 @@
 #include <asm/vgtod.h>
 #include <asm/vvar.h>
 
+int vclocks_used __read_mostly;
+
 DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
 
 void update_vsyscall_tz(void)
@@ -26,12 +28,17 @@ void update_vsyscall_tz(void)
 
 void update_vsyscall(struct timekeeper *tk)
 {
+	int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
 	struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
 
+	/* Mark the new vclock used. */
+	BUILD_BUG_ON(VCLOCK_MAX >= 32);
+	WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << vclock_mode));
+
 	gtod_write_begin(vdata);
 
 	/* copy vsyscall data */
-	vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
+	vdata->vclock_mode = vclock_mode;
 	vdata->cycle_last = tk->tkr_mono.cycle_last;
 	vdata->mask = tk->tkr_mono.mask;
 	vdata->mult = tk->tkr_mono.mult;
......
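Editor's note: the consumer side, vclock_was_used(), is called by the vvar fault handler earlier in this commit but its definition is not part of the hunks shown; given the BUILD_BUG_ON(VCLOCK_MAX >= 32) and the "1 << vclock_mode" update above, it presumably reduces to a bit test against vclocks_used. A standalone sketch under that assumption -- the enum values and helper bodies are illustrative, only the one-bit-per-vclock-mode scheme comes from the diff:

#include <stdbool.h>
#include <stdio.h>

enum { VCLOCK_NONE, VCLOCK_TSC, VCLOCK_HPET, VCLOCK_PVCLOCK, VCLOCK_MAX };

static int vclocks_used;

static void mark_vclock_used(int vclock_mode)
{
	vclocks_used |= 1 << vclock_mode;	/* what update_vsyscall() does */
}

static bool vclock_was_used(int vclock)
{
	return vclocks_used & (1 << vclock);	/* what the fault handler checks */
}

int main(void)
{
	mark_vclock_used(VCLOCK_TSC);
	printf("hpet used: %d, tsc used: %d\n",
	       vclock_was_used(VCLOCK_HPET), vclock_was_used(VCLOCK_TSC));
	return 0;
}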