diff options
Diffstat (limited to 'arch/x86/entry')
-rw-r--r-- | arch/x86/entry/Makefile | 16 | ||||
-rw-r--r-- | arch/x86/entry/calling.h | 55 | ||||
-rw-r--r-- | arch/x86/entry/common.c | 516 | ||||
-rw-r--r-- | arch/x86/entry/entry_32.S | 589 | ||||
-rw-r--r-- | arch/x86/entry/entry_64.S | 975 | ||||
-rw-r--r-- | arch/x86/entry/entry_64_compat.S | 86 | ||||
-rw-r--r-- | arch/x86/entry/syscall_x32.c | 7 | ||||
-rw-r--r-- | arch/x86/entry/syscalls/syscall_32.tbl | 8 | ||||
-rw-r--r-- | arch/x86/entry/syscalls/syscall_64.tbl | 8 | ||||
-rw-r--r-- | arch/x86/entry/thunk_32.S | 5 | ||||
-rw-r--r-- | arch/x86/entry/thunk_64.S | 14 | ||||
-rw-r--r-- | arch/x86/entry/vdso/Makefile | 25 | ||||
-rw-r--r-- | arch/x86/entry/vdso/vdso2c.c | 4 | ||||
-rw-r--r-- | arch/x86/entry/vdso/vdso2c.h | 16 | ||||
-rw-r--r-- | arch/x86/entry/vdso/vdso32/note.S | 30 | ||||
-rw-r--r-- | arch/x86/entry/vdso/vma.c | 15 |
16 files changed, 771 insertions, 1598 deletions
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 85eb381259c2..08bf95dbc911 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -3,10 +3,24 @@ # Makefile for the x86 low level entry code # -OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y +KASAN_SANITIZE := n +UBSAN_SANITIZE := n +KCOV_INSTRUMENT := n + +CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_syscall_x32.o = $(CC_FLAGS_FTRACE) + +CFLAGS_common.o += -fno-stack-protector +CFLAGS_syscall_64.o += -fno-stack-protector +CFLAGS_syscall_32.o += -fno-stack-protector +CFLAGS_syscall_x32.o += -fno-stack-protector CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,) CFLAGS_syscall_32.o += $(call cc-option,-Wno-override-init,) +CFLAGS_syscall_x32.o += $(call cc-option,-Wno-override-init,) + obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o obj-y += common.o diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 1c7f13bb6728..ae9b0d4615b3 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -6,6 +6,7 @@ #include <asm/percpu.h> #include <asm/asm-offsets.h> #include <asm/processor-flags.h> +#include <asm/inst.h> /* @@ -341,7 +342,16 @@ For 32-bit we have the following conventions - kernel is built with #endif .endm -#endif /* CONFIG_X86_64 */ +.macro SAVE_AND_SET_GSBASE scratch_reg:req save_reg:req + rdgsbase \save_reg + GET_PERCPU_BASE \scratch_reg + wrgsbase \scratch_reg +.endm + +#else /* CONFIG_X86_64 */ +# undef UNWIND_HINT_IRET_REGS +# define UNWIND_HINT_IRET_REGS +#endif /* !CONFIG_X86_64 */ .macro STACKLEAK_ERASE #ifdef CONFIG_GCC_PLUGIN_STACKLEAK @@ -349,22 +359,37 @@ For 32-bit we have the following conventions - kernel is built with #endif .endm +#ifdef CONFIG_SMP + /* - * This does 'call enter_from_user_mode' unless we can avoid it based on - * kernel config or using the static jump infrastructure. 
+ * CPU/node NR is loaded from the limit (size) field of a special segment + * descriptor entry in GDT. */ -.macro CALL_enter_from_user_mode -#ifdef CONFIG_CONTEXT_TRACKING -#ifdef CONFIG_JUMP_LABEL - STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_key, def=0 -#endif - call enter_from_user_mode -.Lafter_call_\@: -#endif +.macro LOAD_CPU_AND_NODE_SEG_LIMIT reg:req + movq $__CPUNODE_SEG, \reg + lsl \reg, \reg +.endm + +/* + * Fetch the per-CPU GSBASE value for this processor and put it in @reg. + * We normally use %gs for accessing per-CPU data, but we are setting up + * %gs here and obviously can not use %gs itself to access per-CPU data. + * + * Do not use RDPID, because KVM loads guest's TSC_AUX on vm-entry and + * may not restore the host's value until the CPU returns to userspace. + * Thus the kernel would consume a guest's TSC_AUX if an NMI arrives + * while running KVM's run loop. + */ +.macro GET_PERCPU_BASE reg:req + LOAD_CPU_AND_NODE_SEG_LIMIT \reg + andq $VDSO_CPUNODE_MASK, \reg + movq __per_cpu_offset(, \reg, 8), \reg .endm -#ifdef CONFIG_PARAVIRT_XXL -#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg #else -#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg -#endif + +.macro GET_PERCPU_BASE reg:req + movq pcpu_unit_offsets(%rip), \reg +.endm + +#endif /* CONFIG_SMP */ diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 76735ec813e6..870efeec8bda 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -10,23 +10,21 @@ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> +#include <linux/entry-common.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/errno.h> #include <linux/ptrace.h> -#include <linux/tracehook.h> -#include <linux/audit.h> -#include <linux/seccomp.h> -#include <linux/signal.h> #include <linux/export.h> -#include <linux/context_tracking.h> -#include <linux/user-return-notifier.h> #include <linux/nospec.h> -#include <linux/uprobes.h> -#include 
<linux/livepatch.h> #include <linux/syscalls.h> #include <linux/uaccess.h> +#ifdef CONFIG_XEN_PV +#include <xen/xen-ops.h> +#include <xen/events.h> +#endif + #include <asm/desc.h> #include <asm/traps.h> #include <asm/vdso.h> @@ -35,329 +33,121 @@ #include <asm/nospec-branch.h> #include <asm/io_bitmap.h> #include <asm/syscall.h> +#include <asm/irq_stack.h> -#define CREATE_TRACE_POINTS -#include <trace/events/syscalls.h> - -#ifdef CONFIG_CONTEXT_TRACKING -/* Called on entry from user mode with IRQs off. */ -__visible inline void enter_from_user_mode(void) -{ - CT_WARN_ON(ct_state() != CONTEXT_USER); - user_exit_irqoff(); -} -#else -static inline void enter_from_user_mode(void) {} -#endif - -static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) -{ #ifdef CONFIG_X86_64 - if (arch == AUDIT_ARCH_X86_64) { - audit_syscall_entry(regs->orig_ax, regs->di, - regs->si, regs->dx, regs->r10); - } else -#endif - { - audit_syscall_entry(regs->orig_ax, regs->bx, - regs->cx, regs->dx, regs->si); - } -} - -/* - * Returns the syscall nr to run (which should match regs->orig_ax) or -1 - * to skip the syscall. - */ -static long syscall_trace_enter(struct pt_regs *regs) +__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) { - u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; - - struct thread_info *ti = current_thread_info(); - unsigned long ret = 0; - u32 work; - - if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) - BUG_ON(regs != task_pt_regs(current)); - - work = READ_ONCE(ti->flags); - - if (work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) { - ret = tracehook_report_syscall_entry(regs); - if (ret || (work & _TIF_SYSCALL_EMU)) - return -1L; - } - -#ifdef CONFIG_SECCOMP - /* - * Do seccomp after ptrace, to catch any tracer changes. 
- */ - if (work & _TIF_SECCOMP) { - struct seccomp_data sd; + nr = syscall_enter_from_user_mode(regs, nr); - sd.arch = arch; - sd.nr = regs->orig_ax; - sd.instruction_pointer = regs->ip; -#ifdef CONFIG_X86_64 - if (arch == AUDIT_ARCH_X86_64) { - sd.args[0] = regs->di; - sd.args[1] = regs->si; - sd.args[2] = regs->dx; - sd.args[3] = regs->r10; - sd.args[4] = regs->r8; - sd.args[5] = regs->r9; - } else + instrumentation_begin(); + if (likely(nr < NR_syscalls)) { + nr = array_index_nospec(nr, NR_syscalls); + regs->ax = sys_call_table[nr](regs); +#ifdef CONFIG_X86_X32_ABI + } else if (likely((nr & __X32_SYSCALL_BIT) && + (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) { + nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT, + X32_NR_syscalls); + regs->ax = x32_sys_call_table[nr](regs); #endif - { - sd.args[0] = regs->bx; - sd.args[1] = regs->cx; - sd.args[2] = regs->dx; - sd.args[3] = regs->si; - sd.args[4] = regs->di; - sd.args[5] = regs->bp; - } - - ret = __secure_computing(&sd); - if (ret == -1) - return ret; } + instrumentation_end(); + syscall_exit_to_user_mode(regs); +} #endif - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_sys_enter(regs, regs->orig_ax); - - do_audit_syscall_entry(regs, arch); +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) +static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs) +{ + if (IS_ENABLED(CONFIG_IA32_EMULATION)) + current_thread_info()->status |= TS_COMPAT; - return ret ?: regs->orig_ax; + return (unsigned int)regs->orig_ax; } -#define EXIT_TO_USERMODE_LOOP_FLAGS \ - (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ - _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING) - -static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) +/* + * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. 
+ */ +static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, + unsigned int nr) { - /* - * In order to return to user mode, we need to have IRQs off with - * none of EXIT_TO_USERMODE_LOOP_FLAGS set. Several of these flags - * can be set at any time on preemptible kernels if we have IRQs on, - * so we need to loop. Disabling preemption wouldn't help: doing the - * work to clear some of the flags can sleep. - */ - while (true) { - /* We have work to do. */ - local_irq_enable(); - - if (cached_flags & _TIF_NEED_RESCHED) - schedule(); - - if (cached_flags & _TIF_UPROBE) - uprobe_notify_resume(regs); - - if (cached_flags & _TIF_PATCH_PENDING) - klp_update_patch_state(current); - - /* deal with pending signal delivery */ - if (cached_flags & _TIF_SIGPENDING) - do_signal(regs); - - if (cached_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); - tracehook_notify_resume(regs); - rseq_handle_notify_resume(NULL, regs); - } - - if (cached_flags & _TIF_USER_RETURN_NOTIFY) - fire_user_return_notifiers(); - - /* Disable IRQs and retry */ - local_irq_disable(); - - cached_flags = READ_ONCE(current_thread_info()->flags); - - if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) - break; + if (likely(nr < IA32_NR_syscalls)) { + instrumentation_begin(); + nr = array_index_nospec(nr, IA32_NR_syscalls); + regs->ax = ia32_sys_call_table[nr](regs); + instrumentation_end(); } } -/* Called with IRQs disabled. 
*/ -__visible inline void prepare_exit_to_usermode(struct pt_regs *regs) +/* Handles int $0x80 */ +__visible noinstr void do_int80_syscall_32(struct pt_regs *regs) { - struct thread_info *ti = current_thread_info(); - u32 cached_flags; + unsigned int nr = syscall_32_enter(regs); - addr_limit_user_check(); - - lockdep_assert_irqs_disabled(); - lockdep_sys_exit(); - - cached_flags = READ_ONCE(ti->flags); - - if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) - exit_to_usermode_loop(regs, cached_flags); - - /* Reload ti->flags; we may have rescheduled above. */ - cached_flags = READ_ONCE(ti->flags); - - if (unlikely(cached_flags & _TIF_IO_BITMAP)) - tss_update_io_bitmap(); - - fpregs_assert_state_consistent(); - if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD)) - switch_fpu_return(); - -#ifdef CONFIG_COMPAT /* - * Compat syscalls set TS_COMPAT. Make sure we clear it before - * returning to user mode. We need to clear it *after* signal - * handling, because syscall restart has a fixup for compat - * syscalls. The fixup is exercised by the ptrace_syscall_32 - * selftest. - * - * We also need to clear TS_REGS_POKED_I386: the 32-bit tracer - * special case only applies after poking regs and before the - * very next return to user mode. + * Subtlety here: if ptrace pokes something larger than 2^32-1 into + * orig_ax, the unsigned int return value truncates it. This may + * or may not be necessary, but it matches the old asm behavior. 
*/ - ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED); -#endif + nr = (unsigned int)syscall_enter_from_user_mode(regs, nr); - user_enter_irqoff(); - - mds_user_clear_cpu_buffers(); + do_syscall_32_irqs_on(regs, nr); + syscall_exit_to_user_mode(regs); } -#define SYSCALL_EXIT_WORK_FLAGS \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ - _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT) - -static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags) +static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) { - bool step; - - audit_syscall_exit(regs); - - if (cached_flags & _TIF_SYSCALL_TRACEPOINT) - trace_sys_exit(regs, regs->ax); + unsigned int nr = syscall_32_enter(regs); + int res; /* - * If TIF_SYSCALL_EMU is set, we only get here because of - * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). - * We already reported this syscall instruction in - * syscall_trace_enter(). + * This cannot use syscall_enter_from_user_mode() as it has to + * fetch EBP before invoking any of the syscall entry work + * functions. */ - step = unlikely( - (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)) - == _TIF_SINGLESTEP); - if (step || cached_flags & _TIF_SYSCALL_TRACE) - tracehook_report_syscall_exit(regs, step); -} + syscall_enter_from_user_mode_prepare(regs); -/* - * Called with IRQs on and fully valid regs. Returns with IRQs off in a - * state such that we can immediately switch to user mode. - */ -__visible inline void syscall_return_slowpath(struct pt_regs *regs) -{ - struct thread_info *ti = current_thread_info(); - u32 cached_flags = READ_ONCE(ti->flags); - - CT_WARN_ON(ct_state() != CONTEXT_KERNEL); - - if (IS_ENABLED(CONFIG_PROVE_LOCKING) && - WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax)) - local_irq_enable(); - - rseq_syscall(regs); - - /* - * First do one-time work. If these work items are enabled, we - * want to run them exactly once per syscall exit with IRQs on. 
- */ - if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS)) - syscall_slow_exit_work(regs, cached_flags); - - local_irq_disable(); - prepare_exit_to_usermode(regs); -} - -#ifdef CONFIG_X86_64 -__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs) -{ - struct thread_info *ti; - - enter_from_user_mode(); - local_irq_enable(); - ti = current_thread_info(); - if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) - nr = syscall_trace_enter(regs); - - if (likely(nr < NR_syscalls)) { - nr = array_index_nospec(nr, NR_syscalls); - regs->ax = sys_call_table[nr](regs); -#ifdef CONFIG_X86_X32_ABI - } else if (likely((nr & __X32_SYSCALL_BIT) && - (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) { - nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT, - X32_NR_syscalls); - regs->ax = x32_sys_call_table[nr](regs); -#endif - } - - syscall_return_slowpath(regs); -} -#endif - -#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) -/* - * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does - * all entry and exit work and returns with IRQs off. This function is - * extremely hot in workloads that use it, and it's usually called from - * do_fast_syscall_32, so forcibly inline it to improve performance. - */ -static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) -{ - struct thread_info *ti = current_thread_info(); - unsigned int nr = (unsigned int)regs->orig_ax; - -#ifdef CONFIG_IA32_EMULATION - ti->status |= TS_COMPAT; -#endif - - if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) { + instrumentation_begin(); + /* Fetch EBP from where the vDSO stashed it. */ + if (IS_ENABLED(CONFIG_X86_64)) { /* - * Subtlety here: if ptrace pokes something larger than - * 2^32-1 into orig_ax, this truncates it. This may or - * may not be necessary, but it matches the old asm - * behavior. + * Micro-optimization: the pointer we're following is + * explicitly 32 bits, so it can't be out of range. 
*/ - nr = syscall_trace_enter(regs); + res = __get_user(*(u32 *)&regs->bp, + (u32 __user __force *)(unsigned long)(u32)regs->sp); + } else { + res = get_user(*(u32 *)&regs->bp, + (u32 __user __force *)(unsigned long)(u32)regs->sp); } + instrumentation_end(); - if (likely(nr < IA32_NR_syscalls)) { - nr = array_index_nospec(nr, IA32_NR_syscalls); - regs->ax = ia32_sys_call_table[nr](regs); + if (res) { + /* User code screwed up. */ + regs->ax = -EFAULT; + syscall_exit_to_user_mode(regs); + return false; } - syscall_return_slowpath(regs); -} + /* The case truncates any ptrace induced syscall nr > 2^32 -1 */ + nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr); -/* Handles int $0x80 */ -__visible void do_int80_syscall_32(struct pt_regs *regs) -{ - enter_from_user_mode(); - local_irq_enable(); - do_syscall_32_irqs_on(regs); + /* Now this is just like a normal syscall. */ + do_syscall_32_irqs_on(regs, nr); + syscall_exit_to_user_mode(regs); + return true; } /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ -__visible long do_fast_syscall_32(struct pt_regs *regs) +__visible noinstr long do_fast_syscall_32(struct pt_regs *regs) { /* * Called using the internal vDSO SYSENTER/SYSCALL32 calling * convention. Adjust regs so it looks like we entered using int80. */ - unsigned long landing_pad = (unsigned long)current->mm->context.vdso + - vdso_image_32.sym_int80_landing_pad; + vdso_image_32.sym_int80_landing_pad; /* * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward @@ -366,34 +156,9 @@ __visible long do_fast_syscall_32(struct pt_regs *regs) */ regs->ip = landing_pad; - enter_from_user_mode(); - - local_irq_enable(); - - /* Fetch EBP from where the vDSO stashed it. */ - if ( -#ifdef CONFIG_X86_64 - /* - * Micro-optimization: the pointer we're following is explicitly - * 32 bits, so it can't be out of range. 
- */ - __get_user(*(u32 *)&regs->bp, - (u32 __user __force *)(unsigned long)(u32)regs->sp) -#else - get_user(*(u32 *)&regs->bp, - (u32 __user __force *)(unsigned long)(u32)regs->sp) -#endif - ) { - - /* User code screwed up. */ - local_irq_disable(); - regs->ax = -EFAULT; - prepare_exit_to_usermode(regs); - return 0; /* Keep it simple: use IRET. */ - } - - /* Now this is just like a normal syscall. */ - do_syscall_32_irqs_on(regs); + /* Invoke the syscall. If it failed, keep it simple: use IRET. */ + if (!__do_fast_syscall_32(regs)) + return 0; #ifdef CONFIG_X86_64 /* @@ -425,9 +190,128 @@ __visible long do_fast_syscall_32(struct pt_regs *regs) (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0; #endif } + +/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ +__visible noinstr long do_SYSENTER_32(struct pt_regs *regs) +{ + /* SYSENTER loses RSP, but the vDSO saved it in RBP. */ + regs->sp = regs->bp; + + /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */ + regs->flags |= X86_EFLAGS_IF; + + return do_fast_syscall_32(regs); +} #endif SYSCALL_DEFINE0(ni_syscall) { return -ENOSYS; } + +noinstr bool idtentry_enter_nmi(struct pt_regs *regs) +{ + bool irq_state = lockdep_hardirqs_enabled(); + + __nmi_enter(); + lockdep_hardirqs_off(CALLER_ADDR0); + lockdep_hardirq_enter(); + rcu_nmi_enter(); + + instrumentation_begin(); + trace_hardirqs_off_finish(); + ftrace_nmi_enter(); + instrumentation_end(); + + return irq_state; +} + +noinstr void idtentry_exit_nmi(struct pt_regs *regs, bool restore) +{ + instrumentation_begin(); + ftrace_nmi_exit(); + if (restore) { + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + } + instrumentation_end(); + + rcu_nmi_exit(); + lockdep_hardirq_exit(); + if (restore) + lockdep_hardirqs_on(CALLER_ADDR0); + __nmi_exit(); +} + +#ifdef CONFIG_XEN_PV +#ifndef CONFIG_PREEMPTION +/* + * Some hypercalls issued by the toolstack can take many 10s of + * seconds. 
Allow tasks running hypercalls via the privcmd driver to + * be voluntarily preempted even if full kernel preemption is + * disabled. + * + * Such preemptible hypercalls are bracketed by + * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end() + * calls. + */ +DEFINE_PER_CPU(bool, xen_in_preemptible_hcall); +EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall); + +/* + * In case of scheduling the flag must be cleared and restored after + * returning from schedule as the task might move to a different CPU. + */ +static __always_inline bool get_and_clear_inhcall(void) +{ + bool inhcall = __this_cpu_read(xen_in_preemptible_hcall); + + __this_cpu_write(xen_in_preemptible_hcall, false); + return inhcall; +} + +static __always_inline void restore_inhcall(bool inhcall) +{ + __this_cpu_write(xen_in_preemptible_hcall, inhcall); +} +#else +static __always_inline bool get_and_clear_inhcall(void) { return false; } +static __always_inline void restore_inhcall(bool inhcall) { } +#endif + +static void __xen_pv_evtchn_do_upcall(void) +{ + irq_enter_rcu(); + inc_irq_stat(irq_hv_callback_count); + + xen_hvm_evtchn_do_upcall(); + + irq_exit_rcu(); +} + +__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) +{ + struct pt_regs *old_regs; + bool inhcall; + irqentry_state_t state; + + state = irqentry_enter(regs); + old_regs = set_irq_regs(regs); + + instrumentation_begin(); + run_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs); + instrumentation_begin(); + + set_irq_regs(old_regs); + + inhcall = get_and_clear_inhcall(); + if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) { + instrumentation_begin(); + irqentry_exit_cond_resched(); + instrumentation_end(); + restore_inhcall(inhcall); + } else { + irqentry_exit(regs, state); + } +} +#endif /* CONFIG_XEN_PV */ diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index b67bae7091d7..df8c017e6161 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -44,40 +44,13 @@ #include <asm/asm.h> 
#include <asm/smap.h> #include <asm/frame.h> +#include <asm/trapnr.h> #include <asm/nospec-branch.h> #include "calling.h" .section .entry.text, "ax" -/* - * We use macros for low-level operations which need to be overridden - * for paravirtualization. The following will never clobber any registers: - * INTERRUPT_RETURN (aka. "iret") - * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") - * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). - * - * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must - * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). - * Allowing a register to be clobbered can shrink the paravirt replacement - * enough to patch inline, increasing performance. - */ - -#ifdef CONFIG_PREEMPTION -# define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF -#else -# define preempt_stop(clobbers) -#endif - -.macro TRACE_IRQS_IRET -#ifdef CONFIG_TRACE_IRQFLAGS - testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off? - jz 1f - TRACE_IRQS_ON -1: -#endif -.endm - #define PTI_SWITCH_MASK (1 << PAGE_SHIFT) /* @@ -476,8 +449,6 @@ .macro SWITCH_TO_KERNEL_STACK - ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV - BUG_IF_WRONG_CR3 SWITCH_TO_KERNEL_CR3 scratch_reg=%eax @@ -626,8 +597,6 @@ */ .macro SWITCH_TO_ENTRY_STACK - ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV - /* Bytes to copy */ movl $PTREGS_SIZE, %ecx @@ -726,10 +695,68 @@ .Lend_\@: .endm + +/** + * idtentry - Macro to generate entry stubs for simple IDT entries + * @vector: Vector number + * @asmsym: ASM symbol for the entry point + * @cfunc: C function to be called + * @has_error_code: Hardware pushed error code on stack + */ +.macro idtentry vector asmsym cfunc has_error_code:req +SYM_CODE_START(\asmsym) + ASM_CLAC + cld + + .if \has_error_code == 0 + pushl $0 /* Clear the error code */ + .endif + + /* Push the C-function address into the GS slot */ + pushl $\cfunc + /* Invoke the common exception entry */ + jmp handle_exception 
+SYM_CODE_END(\asmsym) +.endm + +.macro idtentry_irq vector cfunc + .p2align CONFIG_X86_L1_CACHE_SHIFT +SYM_CODE_START_LOCAL(asm_\cfunc) + ASM_CLAC + SAVE_ALL switch_stacks=1 + ENCODE_FRAME_POINTER + movl %esp, %eax + movl PT_ORIG_EAX(%esp), %edx /* get the vector from stack */ + movl $-1, PT_ORIG_EAX(%esp) /* no syscall to restart */ + call \cfunc + jmp handle_exception_return +SYM_CODE_END(asm_\cfunc) +.endm + +.macro idtentry_sysvec vector cfunc + idtentry \vector asm_\cfunc \cfunc has_error_code=0 +.endm + +/* + * Include the defines which emit the idt entries which are shared + * shared between 32 and 64 bit and emit the __irqentry_text_* markers + * so the stacktrace boundary checks work. + */ + .align 16 + .globl __irqentry_text_start +__irqentry_text_start: + +#include <asm/idtentry.h> + + .align 16 + .globl __irqentry_text_end +__irqentry_text_end: + /* * %eax: prev task * %edx: next task */ +.pushsection .text, "ax" SYM_CODE_START(__switch_to_asm) /* * Save callee-saved registers @@ -776,6 +803,7 @@ SYM_CODE_START(__switch_to_asm) jmp __switch_to SYM_CODE_END(__switch_to_asm) +.popsection /* * The unwinder expects the last frame on the stack to always be at the same @@ -784,6 +812,7 @@ SYM_CODE_END(__switch_to_asm) * asmlinkage function so its argument has to be pushed on the stack. This * wrapper creates a proper "end of stack" frame header before the call. */ +.pushsection .text, "ax" SYM_FUNC_START(schedule_tail_wrapper) FRAME_BEGIN @@ -794,6 +823,8 @@ SYM_FUNC_START(schedule_tail_wrapper) FRAME_END ret SYM_FUNC_END(schedule_tail_wrapper) +.popsection + /* * A newly forked process directly context switches into this address. 
* @@ -801,6 +832,7 @@ SYM_FUNC_END(schedule_tail_wrapper) * ebx: kernel thread func (NULL for user thread) * edi: kernel thread arg */ +.pushsection .text, "ax" SYM_CODE_START(ret_from_fork) call schedule_tail_wrapper @@ -810,53 +842,21 @@ SYM_CODE_START(ret_from_fork) 2: /* When we fork, we trace the syscall return in the child, too. */ movl %esp, %eax - call syscall_return_slowpath - STACKLEAK_ERASE - jmp restore_all + call syscall_exit_to_user_mode + jmp .Lsyscall_32_done /* kernel thread */ 1: movl %edi, %eax - CALL_NOSPEC %ebx + CALL_NOSPEC ebx /* * A kernel thread is allowed to return here after successfully - * calling do_execve(). Exit to userspace to complete the execve() + * calling kernel_execve(). Exit to userspace to complete the execve() * syscall. */ movl $0, PT_EAX(%esp) jmp 2b SYM_CODE_END(ret_from_fork) - -/* - * Return to user mode is not as complex as all this looks, - * but we want the default path for a system call return to - * go as quickly as possible which is why some of this is - * less clear than it otherwise should be. - */ - - # userspace resumption stub bypassing syscall exit tracing -SYM_CODE_START_LOCAL(ret_from_exception) - preempt_stop(CLBR_ANY) -ret_from_intr: -#ifdef CONFIG_VM86 - movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS - movb PT_CS(%esp), %al - andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax -#else - /* - * We can be coming here from child spawned by kernel_thread(). - */ - movl PT_CS(%esp), %eax - andl $SEGMENT_RPL_MASK, %eax -#endif - cmpl $USER_RPL, %eax - jb restore_all_kernel # not returning to v8086 or userspace - - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF - movl %esp, %eax - call prepare_exit_to_usermode - jmp restore_all -SYM_CODE_END(ret_from_exception) +.popsection SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE) /* @@ -868,17 +868,6 @@ SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE) * will ignore all of the single-step traps generated in this range. 
*/ -#ifdef CONFIG_XEN_PV -/* - * Xen doesn't set %esp to be precisely what the normal SYSENTER - * entry point expects, so fix it up before using the normal path. - */ -SYM_CODE_START(xen_sysenter_target) - addl $5*4, %esp /* remove xen-provided frame */ - jmp .Lsysenter_past_esp -SYM_CODE_END(xen_sysenter_target) -#endif - /* * 32-bit SYSENTER entry. * @@ -929,9 +918,8 @@ SYM_FUNC_START(entry_SYSENTER_32) .Lsysenter_past_esp: pushl $__USER_DS /* pt_regs->ss */ - pushl %ebp /* pt_regs->sp (stashed in bp) */ + pushl $0 /* pt_regs->sp (placeholder) */ pushfl /* pt_regs->flags (except IF = 0) */ - orl $X86_EFLAGS_IF, (%esp) /* Fix IF */ pushl $__USER_CS /* pt_regs->cs */ pushl $0 /* pt_regs->ip = 0 (placeholder) */ pushl %eax /* pt_regs->orig_ax */ @@ -960,22 +948,14 @@ SYM_FUNC_START(entry_SYSENTER_32) jnz .Lsysenter_fix_flags .Lsysenter_flags_fixed: - /* - * User mode is traced as though IRQs are on, and SYSENTER - * turned them off. - */ - TRACE_IRQS_OFF - movl %esp, %eax - call do_fast_syscall_32 - /* XEN PV guests always use IRET path */ - ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ - "jmp .Lsyscall_32_done", X86_FEATURE_XENPV + call do_SYSENTER_32 + testl %eax, %eax + jz .Lsyscall_32_done STACKLEAK_ERASE -/* Opportunistic SYSEXIT */ - TRACE_IRQS_ON /* User mode traces as IRQs on. */ + /* Opportunistic SYSEXIT */ /* * Setup entry stack - we keep the pointer in %eax and do the @@ -1075,20 +1055,12 @@ SYM_FUNC_START(entry_INT80_32) SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1 /* save rest */ - /* - * User mode is traced as though IRQs are on, and the interrupt gate - * turned them off. 
- */ - TRACE_IRQS_OFF - movl %esp, %eax call do_int80_syscall_32 .Lsyscall_32_done: - STACKLEAK_ERASE -restore_all: - TRACE_IRQS_ON +restore_all_switch_stack: SWITCH_TO_ENTRY_STACK CHECK_AND_APPLY_ESPFIX @@ -1107,26 +1079,10 @@ restore_all: */ INTERRUPT_RETURN -restore_all_kernel: -#ifdef CONFIG_PREEMPTION - DISABLE_INTERRUPTS(CLBR_ANY) - cmpl $0, PER_CPU_VAR(__preempt_count) - jnz .Lno_preempt - testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? - jz .Lno_preempt - call preempt_schedule_irq -.Lno_preempt: -#endif - TRACE_IRQS_IRET - PARANOID_EXIT_TO_KERNEL_MODE - BUG_IF_WRONG_CR3 - RESTORE_REGS 4 - jmp .Lirq_return - .section .fixup, "ax" -SYM_CODE_START(iret_exc) +SYM_CODE_START(asm_iret_error) pushl $0 # no error code - pushl $do_iret_error + pushl $iret_error #ifdef CONFIG_DEBUG_ENTRY /* @@ -1140,10 +1096,10 @@ SYM_CODE_START(iret_exc) popl %eax #endif - jmp common_exception -SYM_CODE_END(iret_exc) + jmp handle_exception +SYM_CODE_END(asm_iret_error) .previous - _ASM_EXTABLE(.Lirq_return, iret_exc) + _ASM_EXTABLE(.Lirq_return, asm_iret_error) SYM_FUNC_END(entry_INT80_32) .macro FIXUP_ESPFIX_STACK @@ -1193,319 +1149,7 @@ SYM_FUNC_END(entry_INT80_32) #endif .endm -/* - * Build the entry stubs with some assembler magic. - * We pack 1 stub into every 8-byte block. 
- */ - .align 8 -SYM_CODE_START(irq_entries_start) - vector=FIRST_EXTERNAL_VECTOR - .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - pushl $(~vector+0x80) /* Note: always in signed byte range */ - vector=vector+1 - jmp common_interrupt - .align 8 - .endr -SYM_CODE_END(irq_entries_start) - -#ifdef CONFIG_X86_LOCAL_APIC - .align 8 -SYM_CODE_START(spurious_entries_start) - vector=FIRST_SYSTEM_VECTOR - .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR) - pushl $(~vector+0x80) /* Note: always in signed byte range */ - vector=vector+1 - jmp common_spurious - .align 8 - .endr -SYM_CODE_END(spurious_entries_start) - -SYM_CODE_START_LOCAL(common_spurious) - ASM_CLAC - addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ - SAVE_ALL switch_stacks=1 - ENCODE_FRAME_POINTER - TRACE_IRQS_OFF - movl %esp, %eax - call smp_spurious_interrupt - jmp ret_from_intr -SYM_CODE_END(common_spurious) -#endif - -/* - * the CPU automatically disables interrupts when executing an IRQ vector, - * so IRQ-flags tracing has to follow that: - */ - .p2align CONFIG_X86_L1_CACHE_SHIFT -SYM_CODE_START_LOCAL(common_interrupt) - ASM_CLAC - addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ - - SAVE_ALL switch_stacks=1 - ENCODE_FRAME_POINTER - TRACE_IRQS_OFF - movl %esp, %eax - call do_IRQ - jmp ret_from_intr -SYM_CODE_END(common_interrupt) - -#define BUILD_INTERRUPT3(name, nr, fn) \ -SYM_FUNC_START(name) \ - ASM_CLAC; \ - pushl $~(nr); \ - SAVE_ALL switch_stacks=1; \ - ENCODE_FRAME_POINTER; \ - TRACE_IRQS_OFF \ - movl %esp, %eax; \ - call fn; \ - jmp ret_from_intr; \ -SYM_FUNC_END(name) - -#define BUILD_INTERRUPT(name, nr) \ - BUILD_INTERRUPT3(name, nr, smp_##name); \ - -/* The include is where all of the SMP etc. 
interrupts come from */ -#include <asm/entry_arch.h> - -SYM_CODE_START(coprocessor_error) - ASM_CLAC - pushl $0 - pushl $do_coprocessor_error - jmp common_exception -SYM_CODE_END(coprocessor_error) - -SYM_CODE_START(simd_coprocessor_error) - ASM_CLAC - pushl $0 -#ifdef CONFIG_X86_INVD_BUG - /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ - ALTERNATIVE "pushl $do_general_protection", \ - "pushl $do_simd_coprocessor_error", \ - X86_FEATURE_XMM -#else - pushl $do_simd_coprocessor_error -#endif - jmp common_exception -SYM_CODE_END(simd_coprocessor_error) - -SYM_CODE_START(device_not_available) - ASM_CLAC - pushl $0 - pushl $do_device_not_available - jmp common_exception -SYM_CODE_END(device_not_available) - -#ifdef CONFIG_PARAVIRT -SYM_CODE_START(native_iret) - iret - _ASM_EXTABLE(native_iret, iret_exc) -SYM_CODE_END(native_iret) -#endif - -SYM_CODE_START(overflow) - ASM_CLAC - pushl $0 - pushl $do_overflow - jmp common_exception -SYM_CODE_END(overflow) - -SYM_CODE_START(bounds) - ASM_CLAC - pushl $0 - pushl $do_bounds - jmp common_exception -SYM_CODE_END(bounds) - -SYM_CODE_START(invalid_op) - ASM_CLAC - pushl $0 - pushl $do_invalid_op - jmp common_exception -SYM_CODE_END(invalid_op) - -SYM_CODE_START(coprocessor_segment_overrun) - ASM_CLAC - pushl $0 - pushl $do_coprocessor_segment_overrun - jmp common_exception -SYM_CODE_END(coprocessor_segment_overrun) - -SYM_CODE_START(invalid_TSS) - ASM_CLAC - pushl $do_invalid_TSS - jmp common_exception -SYM_CODE_END(invalid_TSS) - -SYM_CODE_START(segment_not_present) - ASM_CLAC - pushl $do_segment_not_present - jmp common_exception -SYM_CODE_END(segment_not_present) - -SYM_CODE_START(stack_segment) - ASM_CLAC - pushl $do_stack_segment - jmp common_exception -SYM_CODE_END(stack_segment) - -SYM_CODE_START(alignment_check) - ASM_CLAC - pushl $do_alignment_check - jmp common_exception -SYM_CODE_END(alignment_check) - -SYM_CODE_START(divide_error) - ASM_CLAC - pushl $0 # no error code - pushl 
$do_divide_error - jmp common_exception -SYM_CODE_END(divide_error) - -#ifdef CONFIG_X86_MCE -SYM_CODE_START(machine_check) - ASM_CLAC - pushl $0 - pushl $do_mce - jmp common_exception -SYM_CODE_END(machine_check) -#endif - -SYM_CODE_START(spurious_interrupt_bug) - ASM_CLAC - pushl $0 - pushl $do_spurious_interrupt_bug - jmp common_exception -SYM_CODE_END(spurious_interrupt_bug) - -#ifdef CONFIG_XEN_PV -SYM_FUNC_START(xen_hypervisor_callback) - /* - * Check to see if we got the event in the critical - * region in xen_iret_direct, after we've reenabled - * events and checked for pending events. This simulates - * iret instruction's behaviour where it delivers a - * pending interrupt when enabling interrupts: - */ - cmpl $xen_iret_start_crit, (%esp) - jb 1f - cmpl $xen_iret_end_crit, (%esp) - jae 1f - call xen_iret_crit_fixup -1: - pushl $-1 /* orig_ax = -1 => not a system call */ - SAVE_ALL - ENCODE_FRAME_POINTER - TRACE_IRQS_OFF - mov %esp, %eax - call xen_evtchn_do_upcall -#ifndef CONFIG_PREEMPTION - call xen_maybe_preempt_hcall -#endif - jmp ret_from_intr -SYM_FUNC_END(xen_hypervisor_callback) - -/* - * Hypervisor uses this for application faults while it executes. - * We get here for two reasons: - * 1. Fault while reloading DS, ES, FS or GS - * 2. Fault while executing IRET - * Category 1 we fix up by reattempting the load, and zeroing the segment - * register if the load fails. - * Category 2 we fix up by jumping to do_iret_error. We cannot use the - * normal Linux return path in this case because if we use the IRET hypercall - * to pop the stack frame we end up in an infinite loop of failsafe callbacks. - * We distinguish between categories by maintaining a status value in EAX. 
- */ -SYM_FUNC_START(xen_failsafe_callback) - pushl %eax - movl $1, %eax -1: mov 4(%esp), %ds -2: mov 8(%esp), %es -3: mov 12(%esp), %fs -4: mov 16(%esp), %gs - /* EAX == 0 => Category 1 (Bad segment) - EAX != 0 => Category 2 (Bad IRET) */ - testl %eax, %eax - popl %eax - lea 16(%esp), %esp - jz 5f - jmp iret_exc -5: pushl $-1 /* orig_ax = -1 => not a system call */ - SAVE_ALL - ENCODE_FRAME_POINTER - jmp ret_from_exception - -.section .fixup, "ax" -6: xorl %eax, %eax - movl %eax, 4(%esp) - jmp 1b -7: xorl %eax, %eax - movl %eax, 8(%esp) - jmp 2b -8: xorl %eax, %eax - movl %eax, 12(%esp) - jmp 3b -9: xorl %eax, %eax - movl %eax, 16(%esp) - jmp 4b -.previous - _ASM_EXTABLE(1b, 6b) - _ASM_EXTABLE(2b, 7b) - _ASM_EXTABLE(3b, 8b) - _ASM_EXTABLE(4b, 9b) -SYM_FUNC_END(xen_failsafe_callback) -#endif /* CONFIG_XEN_PV */ - -#ifdef CONFIG_XEN_PVHVM -BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - xen_evtchn_do_upcall) -#endif - - -#if IS_ENABLED(CONFIG_HYPERV) - -BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - hyperv_vector_handler) - -BUILD_INTERRUPT3(hyperv_reenlightenment_vector, HYPERV_REENLIGHTENMENT_VECTOR, - hyperv_reenlightenment_intr) - -BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR, - hv_stimer0_vector_handler) - -#endif /* CONFIG_HYPERV */ - -SYM_CODE_START(page_fault) - ASM_CLAC - pushl $do_page_fault - jmp common_exception_read_cr2 -SYM_CODE_END(page_fault) - -SYM_CODE_START_LOCAL_NOALIGN(common_exception_read_cr2) - /* the function address is in %gs's slot on the stack */ - SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1 - - ENCODE_FRAME_POINTER - - /* fixup %gs */ - GS_TO_REG %ecx - movl PT_GS(%esp), %edi - REG_TO_PTGS %ecx - SET_KERNEL_GS %ecx - - GET_CR2_INTO(%ecx) # might clobber %eax - - /* fixup orig %eax */ - movl PT_ORIG_EAX(%esp), %edx # get the error code - movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart - - TRACE_IRQS_OFF - movl %esp, %eax # pt_regs pointer - CALL_NOSPEC %edi - 
jmp ret_from_exception -SYM_CODE_END(common_exception_read_cr2) - -SYM_CODE_START_LOCAL_NOALIGN(common_exception) +SYM_CODE_START_LOCAL_NOALIGN(handle_exception) /* the function address is in %gs's slot on the stack */ SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1 ENCODE_FRAME_POINTER @@ -1520,24 +1164,35 @@ SYM_CODE_START_LOCAL_NOALIGN(common_exception) movl PT_ORIG_EAX(%esp), %edx # get the error code movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart - TRACE_IRQS_OFF movl %esp, %eax # pt_regs pointer - CALL_NOSPEC %edi - jmp ret_from_exception -SYM_CODE_END(common_exception) + CALL_NOSPEC edi -SYM_CODE_START(debug) +handle_exception_return: +#ifdef CONFIG_VM86 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax +#else /* - * Entry from sysenter is now handled in common_exception + * We can be coming here from child spawned by kernel_thread(). */ - ASM_CLAC - pushl $0 - pushl $do_debug - jmp common_exception -SYM_CODE_END(debug) + movl PT_CS(%esp), %eax + andl $SEGMENT_RPL_MASK, %eax +#endif + cmpl $USER_RPL, %eax # returning to v8086 or userspace ? + jnb ret_to_user + + PARANOID_EXIT_TO_KERNEL_MODE + BUG_IF_WRONG_CR3 + RESTORE_REGS 4 + jmp .Lirq_return + +ret_to_user: + movl %esp, %eax + jmp restore_all_switch_stack +SYM_CODE_END(handle_exception) -#ifdef CONFIG_DOUBLEFAULT -SYM_CODE_START(double_fault) +SYM_CODE_START(asm_exc_double_fault) 1: /* * This is a task gate handler, not an interrupt gate handler. @@ -1575,8 +1230,7 @@ SYM_CODE_START(double_fault) 1: hlt jmp 1b -SYM_CODE_END(double_fault) -#endif +SYM_CODE_END(asm_exc_double_fault) /* * NMI is doubly nasty. It can happen on the first instruction of @@ -1585,7 +1239,7 @@ SYM_CODE_END(double_fault) * switched stacks. We handle both conditions by simply checking whether we * interrupted kernel code running on the SYSENTER stack. 
*/ -SYM_CODE_START(nmi) +SYM_CODE_START(asm_exc_nmi) ASM_CLAC #ifdef CONFIG_X86_ESPFIX32 @@ -1614,7 +1268,7 @@ SYM_CODE_START(nmi) jb .Lnmi_from_sysenter_stack /* Not on SYSENTER stack. */ - call do_nmi + call exc_nmi jmp .Lnmi_return .Lnmi_from_sysenter_stack: @@ -1624,7 +1278,7 @@ SYM_CODE_START(nmi) */ movl %esp, %ebx movl PER_CPU_VAR(cpu_current_top_of_stack), %esp - call do_nmi + call exc_nmi movl %ebx, %esp .Lnmi_return: @@ -1678,29 +1332,9 @@ SYM_CODE_START(nmi) lss (1+5+6)*4(%esp), %esp # back to espfix stack jmp .Lirq_return #endif -SYM_CODE_END(nmi) - -SYM_CODE_START(int3) - ASM_CLAC - pushl $0 - pushl $do_int3 - jmp common_exception -SYM_CODE_END(int3) - -SYM_CODE_START(general_protection) - ASM_CLAC - pushl $do_general_protection - jmp common_exception -SYM_CODE_END(general_protection) - -#ifdef CONFIG_KVM_GUEST -SYM_CODE_START(async_page_fault) - ASM_CLAC - pushl $do_async_page_fault - jmp common_exception_read_cr2 -SYM_CODE_END(async_page_fault) -#endif +SYM_CODE_END(asm_exc_nmi) +.pushsection .text, "ax" SYM_CODE_START(rewind_stack_do_exit) /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp @@ -1711,3 +1345,4 @@ SYM_CODE_START(rewind_stack_do_exit) call do_exit 1: jmp 1b SYM_CODE_END(rewind_stack_do_exit) +.popsection diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 3063aa9090f9..d977079a7d02 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -16,7 +16,6 @@ * * Some macro usage: * - SYM_FUNC_START/END:Define functions in the symbol table. - * - TRACE_IRQ_*: Trace hardirq state for lock debugging. * - idtentry: Define exception entry points. 
*/ #include <linux/linkage.h> @@ -37,7 +36,9 @@ #include <asm/pgtable_types.h> #include <asm/export.h> #include <asm/frame.h> +#include <asm/trapnr.h> #include <asm/nospec-branch.h> +#include <asm/fsgsbase.h> #include <linux/err.h> #include "calling.h" @@ -53,57 +54,6 @@ SYM_CODE_START(native_usergs_sysret64) SYM_CODE_END(native_usergs_sysret64) #endif /* CONFIG_PARAVIRT */ -.macro TRACE_IRQS_FLAGS flags:req -#ifdef CONFIG_TRACE_IRQFLAGS - btl $9, \flags /* interrupts off? */ - jnc 1f - TRACE_IRQS_ON -1: -#endif -.endm - -.macro TRACE_IRQS_IRETQ - TRACE_IRQS_FLAGS EFLAGS(%rsp) -.endm - -/* - * When dynamic function tracer is enabled it will add a breakpoint - * to all locations that it is about to modify, sync CPUs, update - * all the code, sync CPUs, then remove the breakpoints. In this time - * if lockdep is enabled, it might jump back into the debug handler - * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF). - * - * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to - * make sure the stack pointer does not get reset back to the top - * of the debug stack, and instead just reuses the current stack. - */ -#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS) - -.macro TRACE_IRQS_OFF_DEBUG - call debug_stack_set_zero - TRACE_IRQS_OFF - call debug_stack_reset -.endm - -.macro TRACE_IRQS_ON_DEBUG - call debug_stack_set_zero - TRACE_IRQS_ON - call debug_stack_reset -.endm - -.macro TRACE_IRQS_IRETQ_DEBUG - btl $9, EFLAGS(%rsp) /* interrupts off? */ - jnc 1f - TRACE_IRQS_ON_DEBUG -1: -.endm - -#else -# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF -# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON -# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ -#endif - /* * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. * @@ -144,11 +94,6 @@ SYM_CODE_END(native_usergs_sysret64) SYM_CODE_START(entry_SYSCALL_64) UNWIND_HINT_EMPTY - /* - * Interrupts are off on entry. 
- * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ swapgs /* tss.sp2 is scratch space. */ @@ -167,15 +112,11 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) PUSH_AND_CLEAR_REGS rax=$-ENOSYS - TRACE_IRQS_OFF - /* IRQs are off. */ movq %rax, %rdi movq %rsp, %rsi call do_syscall_64 /* returns with IRQs disabled */ - TRACE_IRQS_ON /* return enables interrupts */ - /* * Try to use SYSRET instead of IRET if we're returning to * a completely clean 64-bit userspace context. If we're not, @@ -279,6 +220,7 @@ SYM_CODE_END(entry_SYSCALL_64) * %rdi: prev task * %rsi: next task */ +.pushsection .text, "ax" SYM_FUNC_START(__switch_to_asm) /* * Save callee-saved registers @@ -321,6 +263,7 @@ SYM_FUNC_START(__switch_to_asm) jmp __switch_to SYM_FUNC_END(__switch_to_asm) +.popsection /* * A newly forked process directly context switches into this address. @@ -329,6 +272,7 @@ SYM_FUNC_END(__switch_to_asm) * rbx: kernel thread func (NULL for user thread) * r12: kernel thread arg */ +.pushsection .text, "ax" SYM_CODE_START(ret_from_fork) UNWIND_HINT_EMPTY movq %rax, %rdi @@ -340,51 +284,23 @@ SYM_CODE_START(ret_from_fork) 2: UNWIND_HINT_REGS movq %rsp, %rdi - call syscall_return_slowpath /* returns with IRQs disabled */ - TRACE_IRQS_ON /* user mode is traced as IRQS on */ + call syscall_exit_to_user_mode /* returns with IRQs disabled */ jmp swapgs_restore_regs_and_return_to_usermode 1: /* kernel thread */ UNWIND_HINT_EMPTY movq %r12, %rdi - CALL_NOSPEC %rbx + CALL_NOSPEC rbx /* * A kernel thread is allowed to return here after successfully - * calling do_execve(). Exit to userspace to complete the execve() + * calling kernel_execve(). Exit to userspace to complete the execve() * syscall. */ movq $0, RAX(%rsp) jmp 2b SYM_CODE_END(ret_from_fork) - -/* - * Build the entry stubs with some assembler magic. - * We pack 1 stub into every 8-byte block. 
- */ - .align 8 -SYM_CODE_START(irq_entries_start) - vector=FIRST_EXTERNAL_VECTOR - .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - UNWIND_HINT_IRET_REGS - pushq $(~vector+0x80) /* Note: always in signed byte range */ - jmp common_interrupt - .align 8 - vector=vector+1 - .endr -SYM_CODE_END(irq_entries_start) - - .align 8 -SYM_CODE_START(spurious_entries_start) - vector=FIRST_SYSTEM_VECTOR - .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR) - UNWIND_HINT_IRET_REGS - pushq $(~vector+0x80) /* Note: always in signed byte range */ - jmp common_spurious - .align 8 - vector=vector+1 - .endr -SYM_CODE_END(spurious_entries_start) +.popsection .macro DEBUG_ENTRY_ASSERT_IRQS_OFF #ifdef CONFIG_DEBUG_ENTRY @@ -398,228 +314,179 @@ SYM_CODE_END(spurious_entries_start) #endif .endm -/* - * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers - * flags and puts old RSP into old_rsp, and leaves all other GPRs alone. - * Requires kernel GSBASE. - * - * The invariant is that, if irq_count != -1, then the IRQ stack is in use. +/** + * idtentry_body - Macro to emit code calling the C function + * @cfunc: C function to be called + * @has_error_code: Hardware pushed error code on stack */ -.macro ENTER_IRQ_STACK regs=1 old_rsp save_ret=0 - DEBUG_ENTRY_ASSERT_IRQS_OFF - - .if \save_ret - /* - * If save_ret is set, the original stack contains one additional - * entry -- the return address. Therefore, move the address one - * entry below %rsp to \old_rsp. - */ - leaq 8(%rsp), \old_rsp - .else - movq %rsp, \old_rsp - .endif +.macro idtentry_body cfunc has_error_code:req - .if \regs - UNWIND_HINT_REGS base=\old_rsp - .endif + call error_entry + UNWIND_HINT_REGS - incl PER_CPU_VAR(irq_count) - jnz .Lirq_stack_push_old_rsp_\@ + movq %rsp, %rdi /* pt_regs pointer into 1st argument*/ - /* - * Right now, if we just incremented irq_count to zero, we've - * claimed the IRQ stack but we haven't switched to it yet. 
- * - * If anything is added that can interrupt us here without using IST, - * it must be *extremely* careful to limit its stack usage. This - * could include kprobes and a hypothetical future IST-less #DB - * handler. - * - * The OOPS unwinder relies on the word at the top of the IRQ - * stack linking back to the previous RSP for the entire time we're - * on the IRQ stack. For this to work reliably, we need to write - * it before we actually move ourselves to the IRQ stack. - */ + .if \has_error_code == 1 + movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ + .endif - movq \old_rsp, PER_CPU_VAR(irq_stack_backing_store + IRQ_STACK_SIZE - 8) - movq PER_CPU_VAR(hardirq_stack_ptr), %rsp + call \cfunc -#ifdef CONFIG_DEBUG_ENTRY - /* - * If the first movq above becomes wrong due to IRQ stack layout - * changes, the only way we'll notice is if we try to unwind right - * here. Assert that we set up the stack right to catch this type - * of bug quickly. - */ - cmpq -8(%rsp), \old_rsp - je .Lirq_stack_okay\@ - ud2 - .Lirq_stack_okay\@: -#endif + jmp error_return +.endm -.Lirq_stack_push_old_rsp_\@: - pushq \old_rsp +/** + * idtentry - Macro to generate entry stubs for simple IDT entries + * @vector: Vector number + * @asmsym: ASM symbol for the entry point + * @cfunc: C function to be called + * @has_error_code: Hardware pushed error code on stack + * + * The macro emits code to set up the kernel context for straight forward + * and simple IDT entries. No IST stack, no paranoid entry checks. + */ +.macro idtentry vector asmsym cfunc has_error_code:req +SYM_CODE_START(\asmsym) + UNWIND_HINT_IRET_REGS offset=\has_error_code*8 + ASM_CLAC - .if \regs - UNWIND_HINT_REGS indirect=1 + .if \has_error_code == 0 + pushq $-1 /* ORIG_RAX: no syscall to restart */ .endif - .if \save_ret - /* - * Push the return address to the stack. 
This return address can - * be found at the "real" original RSP, which was offset by 8 at - * the beginning of this macro. - */ - pushq -8(\old_rsp) + .if \vector == X86_TRAP_BP + /* + * If coming from kernel space, create a 6-word gap to allow the + * int3 handler to emulate a call instruction. + */ + testb $3, CS-ORIG_RAX(%rsp) + jnz .Lfrom_usermode_no_gap_\@ + .rept 6 + pushq 5*8(%rsp) + .endr + UNWIND_HINT_IRET_REGS offset=8 +.Lfrom_usermode_no_gap_\@: .endif + + idtentry_body \cfunc \has_error_code + +_ASM_NOKPROBE(\asmsym) +SYM_CODE_END(\asmsym) .endm /* - * Undoes ENTER_IRQ_STACK. + * Interrupt entry/exit. + * + + The interrupt stubs push (vector) onto the stack, which is the error_code + * position of idtentry exceptions, and jump to one of the two idtentry points + * (common/spurious). + * + * common_interrupt is a hotpath, align it to a cache line */ -.macro LEAVE_IRQ_STACK regs=1 - DEBUG_ENTRY_ASSERT_IRQS_OFF - /* We need to be off the IRQ stack before decrementing irq_count. */ - popq %rsp - - .if \regs - UNWIND_HINT_REGS - .endif - - /* - * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming - * the irq stack but we're not on it. - */ - - decl PER_CPU_VAR(irq_count) +.macro idtentry_irq vector cfunc + .p2align CONFIG_X86_L1_CACHE_SHIFT + idtentry \vector asm_\cfunc \cfunc has_error_code=1 .endm /* - * Interrupt entry helper function. + * System vectors which invoke their handlers directly and are not + * going through the regular common device interrupt handling code. + */ +.macro idtentry_sysvec vector cfunc + idtentry \vector asm_\cfunc \cfunc has_error_code=0 +.endm + +/** + * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB + * @vector: Vector number + * @asmsym: ASM symbol for the entry point + * @cfunc: C function to be called + * + * The macro emits code to set up the kernel context for #MC and #DB * - * Entry runs with interrupts off. 
Stack layout at entry: - * +----------------------------------------------------+ - * | regs->ss | - * | regs->rsp | - * | regs->eflags | - * | regs->cs | - * | regs->ip | - * +----------------------------------------------------+ - * | regs->orig_ax = ~(interrupt number) | - * +----------------------------------------------------+ - * | return address | - * +----------------------------------------------------+ + * If the entry comes from user space it uses the normal entry path + * including the return to user space work and preemption checks on + * exit. + * + * If hits in kernel mode then it needs to go through the paranoid + * entry as the exception can hit any random state. No preemption + * check on exit to keep the paranoid path simple. */ -SYM_CODE_START(interrupt_entry) - UNWIND_HINT_IRET_REGS offset=16 +.macro idtentry_mce_db vector asmsym cfunc +SYM_CODE_START(\asmsym) + UNWIND_HINT_IRET_REGS ASM_CLAC - cld - testb $3, CS-ORIG_RAX+8(%rsp) - jz 1f - SWAPGS - FENCE_SWAPGS_USER_ENTRY + pushq $-1 /* ORIG_RAX: no syscall to restart */ + /* - * Switch to the thread stack. The IRET frame and orig_ax are - * on the stack, as well as the return address. RDI..R12 are - * not (yet) on the stack and space has not (yet) been - * allocated for them. + * If the entry is from userspace, switch stacks and treat it as + * a normal entry. */ - pushq %rdi + testb $3, CS-ORIG_RAX(%rsp) + jnz .Lfrom_usermode_switch_stack_\@ - /* Need to switch before accessing the thread stack. */ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi - movq %rsp, %rdi - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + /* paranoid_entry returns GS information for paranoid_exit in EBX. */ + call paranoid_entry - /* - * We have RDI, return address, and orig_ax on the stack on - * top of the IRET frame. 
That means offset=24 - */ - UNWIND_HINT_IRET_REGS base=%rdi offset=24 - - pushq 7*8(%rdi) /* regs->ss */ - pushq 6*8(%rdi) /* regs->rsp */ - pushq 5*8(%rdi) /* regs->eflags */ - pushq 4*8(%rdi) /* regs->cs */ - pushq 3*8(%rdi) /* regs->ip */ - UNWIND_HINT_IRET_REGS - pushq 2*8(%rdi) /* regs->orig_ax */ - pushq 8(%rdi) /* return address */ + UNWIND_HINT_REGS - movq (%rdi), %rdi - jmp 2f -1: - FENCE_SWAPGS_KERNEL_ENTRY -2: - PUSH_AND_CLEAR_REGS save_ret=1 - ENCODE_FRAME_POINTER 8 + movq %rsp, %rdi /* pt_regs pointer */ - testb $3, CS+8(%rsp) - jz 1f + call \cfunc - /* - * IRQ from user mode. - * - * We need to tell lockdep that IRQs are off. We can't do this until - * we fix gsbase, and we should do it before enter_from_user_mode - * (which can take locks). Since TRACE_IRQS_OFF is idempotent, - * the simplest way to handle it is to just call it twice if - * we enter from user mode. There's no reason to optimize this since - * TRACE_IRQS_OFF is a no-op if lockdep is off. - */ - TRACE_IRQS_OFF + jmp paranoid_exit - CALL_enter_from_user_mode + /* Switch to the regular task stack and use the noist entry point */ +.Lfrom_usermode_switch_stack_\@: + idtentry_body noist_\cfunc, has_error_code=0 -1: - ENTER_IRQ_STACK old_rsp=%rdi save_ret=1 - /* We entered an interrupt context - irqs are off: */ - TRACE_IRQS_OFF +_ASM_NOKPROBE(\asmsym) +SYM_CODE_END(\asmsym) +.endm - ret -SYM_CODE_END(interrupt_entry) -_ASM_NOKPROBE(interrupt_entry) +/* + * Double fault entry. Straight paranoid. No checks from which context + * this comes because for the espfix induced #DF this would do the wrong + * thing. + */ +.macro idtentry_df vector asmsym cfunc +SYM_CODE_START(\asmsym) + UNWIND_HINT_IRET_REGS offset=8 + ASM_CLAC + /* paranoid_entry returns GS information for paranoid_exit in EBX. */ + call paranoid_entry + UNWIND_HINT_REGS -/* Interrupt entry/exit. 
*/ + movq %rsp, %rdi /* pt_regs pointer into first argument */ + movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ + call \cfunc + + jmp paranoid_exit + +_ASM_NOKPROBE(\asmsym) +SYM_CODE_END(\asmsym) +.endm /* - * The interrupt stubs push (~vector+0x80) onto the stack and - * then jump to common_spurious/interrupt. + * Include the defines which emit the idt entries which are shared + * shared between 32 and 64 bit and emit the __irqentry_text_* markers + * so the stacktrace boundary checks work. */ -SYM_CODE_START_LOCAL(common_spurious) - addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */ - call interrupt_entry - UNWIND_HINT_REGS indirect=1 - call smp_spurious_interrupt /* rdi points to pt_regs */ - jmp ret_from_intr -SYM_CODE_END(common_spurious) -_ASM_NOKPROBE(common_spurious) - -/* common_interrupt is a hotpath. Align it */ - .p2align CONFIG_X86_L1_CACHE_SHIFT -SYM_CODE_START_LOCAL(common_interrupt) - addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */ - call interrupt_entry - UNWIND_HINT_REGS indirect=1 - call do_IRQ /* rdi points to pt_regs */ - /* 0(%rsp): old RSP */ -ret_from_intr: - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF - - LEAVE_IRQ_STACK + .align 16 + .globl __irqentry_text_start +__irqentry_text_start: - testb $3, CS(%rsp) - jz retint_kernel +#include <asm/idtentry.h> - /* Interrupt came from user space */ -.Lretint_user: - mov %rsp,%rdi - call prepare_exit_to_usermode - TRACE_IRQS_ON + .align 16 + .globl __irqentry_text_end +__irqentry_text_end: +SYM_CODE_START_LOCAL(common_interrupt_return) SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) #ifdef CONFIG_DEBUG_ENTRY /* Assert that pt_regs indicates user mode. 
*/ @@ -662,23 +529,6 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) INTERRUPT_RETURN -/* Returning to kernel space */ -retint_kernel: -#ifdef CONFIG_PREEMPTION - /* Interrupts are off */ - /* Check if we need preemption */ - btl $9, EFLAGS(%rsp) /* were interrupts off? */ - jnc 1f - cmpl $0, PER_CPU_VAR(__preempt_count) - jnz 1f - call preempt_schedule_irq -1: -#endif - /* - * The iretq could re-enable interrupts: - */ - TRACE_IRQS_IRETQ - SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL) #ifdef CONFIG_DEBUG_ENTRY /* Assert that pt_regs indicates kernel mode. */ @@ -710,7 +560,7 @@ SYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL) /* * This may fault. Non-paranoid faults on return to userspace are * handled by fixup_bad_iret. These include #SS, #GP, and #NP. - * Double-faults due to espfix64 are handled in do_double_fault. + * Double-faults due to espfix64 are handled in exc_double_fault. * Other faults here are fatal. */ iretq @@ -788,280 +638,32 @@ native_irq_return_ldt: */ jmp native_irq_return_iret #endif -SYM_CODE_END(common_interrupt) -_ASM_NOKPROBE(common_interrupt) +SYM_CODE_END(common_interrupt_return) +_ASM_NOKPROBE(common_interrupt_return) /* - * APIC interrupts. 
- */ -.macro apicinterrupt3 num sym do_sym -SYM_CODE_START(\sym) - UNWIND_HINT_IRET_REGS - pushq $~(\num) -.Lcommon_\sym: - call interrupt_entry - UNWIND_HINT_REGS indirect=1 - call \do_sym /* rdi points to pt_regs */ - jmp ret_from_intr -SYM_CODE_END(\sym) -_ASM_NOKPROBE(\sym) -.endm - -/* Make sure APIC interrupt handlers end up in the irqentry section: */ -#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" -#define POP_SECTION_IRQENTRY .popsection - -.macro apicinterrupt num sym do_sym -PUSH_SECTION_IRQENTRY -apicinterrupt3 \num \sym \do_sym -POP_SECTION_IRQENTRY -.endm - -#ifdef CONFIG_SMP -apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt -apicinterrupt3 REBOOT_VECTOR reboot_interrupt smp_reboot_interrupt -#endif - -#ifdef CONFIG_X86_UV -apicinterrupt3 UV_BAU_MESSAGE uv_bau_message_intr1 uv_bau_message_interrupt -#endif - -apicinterrupt LOCAL_TIMER_VECTOR apic_timer_interrupt smp_apic_timer_interrupt -apicinterrupt X86_PLATFORM_IPI_VECTOR x86_platform_ipi smp_x86_platform_ipi - -#ifdef CONFIG_HAVE_KVM -apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi -apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi -apicinterrupt3 POSTED_INTR_NESTED_VECTOR kvm_posted_intr_nested_ipi smp_kvm_posted_intr_nested_ipi -#endif - -#ifdef CONFIG_X86_MCE_THRESHOLD -apicinterrupt THRESHOLD_APIC_VECTOR threshold_interrupt smp_threshold_interrupt -#endif - -#ifdef CONFIG_X86_MCE_AMD -apicinterrupt DEFERRED_ERROR_VECTOR deferred_error_interrupt smp_deferred_error_interrupt -#endif - -#ifdef CONFIG_X86_THERMAL_VECTOR -apicinterrupt THERMAL_APIC_VECTOR thermal_interrupt smp_thermal_interrupt -#endif - -#ifdef CONFIG_SMP -apicinterrupt CALL_FUNCTION_SINGLE_VECTOR call_function_single_interrupt smp_call_function_single_interrupt -apicinterrupt CALL_FUNCTION_VECTOR call_function_interrupt smp_call_function_interrupt -apicinterrupt RESCHEDULE_VECTOR 
reschedule_interrupt smp_reschedule_interrupt -#endif - -apicinterrupt ERROR_APIC_VECTOR error_interrupt smp_error_interrupt -apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt - -#ifdef CONFIG_IRQ_WORK -apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt -#endif - -/* - * Exception entry points. - */ -#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8) - -.macro idtentry_part do_sym, has_error_code:req, read_cr2:req, paranoid:req, shift_ist=-1, ist_offset=0 - - .if \paranoid - call paranoid_entry - /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ - .else - call error_entry - .endif - UNWIND_HINT_REGS - - .if \read_cr2 - /* - * Store CR2 early so subsequent faults cannot clobber it. Use R12 as - * intermediate storage as RDX can be clobbered in enter_from_user_mode(). - * GET_CR2_INTO can clobber RAX. - */ - GET_CR2_INTO(%r12); - .endif - - .if \shift_ist != -1 - TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ - .else - TRACE_IRQS_OFF - .endif - - .if \paranoid == 0 - testb $3, CS(%rsp) - jz .Lfrom_kernel_no_context_tracking_\@ - CALL_enter_from_user_mode -.Lfrom_kernel_no_context_tracking_\@: - .endif - - movq %rsp, %rdi /* pt_regs pointer */ - - .if \has_error_code - movq ORIG_RAX(%rsp), %rsi /* get error code */ - movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ - .else - xorl %esi, %esi /* no error code */ - .endif - - .if \shift_ist != -1 - subq $\ist_offset, CPU_TSS_IST(\shift_ist) - .endif - - .if \read_cr2 - movq %r12, %rdx /* Move CR2 into 3rd argument */ - .endif - - call \do_sym - - .if \shift_ist != -1 - addq $\ist_offset, CPU_TSS_IST(\shift_ist) - .endif - - .if \paranoid - /* this procedure expect "no swapgs" flag in ebx */ - jmp paranoid_exit - .else - jmp error_exit - .endif - -.endm - -/** - * idtentry - Generate an IDT entry stub - * @sym: Name of the generated entry point - * @do_sym: C function to be called - * @has_error_code: True if this IDT 
vector has an error code on the stack - * @paranoid: non-zero means that this vector may be invoked from - * kernel mode with user GSBASE and/or user CR3. - * 2 is special -- see below. - * @shift_ist: Set to an IST index if entries from kernel mode should - * decrement the IST stack so that nested entries get a - * fresh stack. (This is for #DB, which has a nasty habit - * of recursing.) - * @create_gap: create a 6-word stack gap when coming from kernel mode. - * @read_cr2: load CR2 into the 3rd argument; done before calling any C code - * - * idtentry generates an IDT stub that sets up a usable kernel context, - * creates struct pt_regs, and calls @do_sym. The stub has the following - * special behaviors: - * - * On an entry from user mode, the stub switches from the trampoline or - * IST stack to the normal thread stack. On an exit to user mode, the - * normal exit-to-usermode path is invoked. + * Reload gs selector with exception handling + * edi: new selector * - * On an exit to kernel mode, if @paranoid == 0, we check for preemption, - * whereas we omit the preemption check if @paranoid != 0. This is purely - * because the implementation is simpler this way. The kernel only needs - * to check for asynchronous kernel preemption when IRQ handlers return. - * - * If @paranoid == 0, then the stub will handle IRET faults by pretending - * that the fault came from user mode. It will handle gs_change faults by - * pretending that the fault happened with kernel GSBASE. Since this handling - * is omitted for @paranoid != 0, the #GP, #SS, and #NP stubs must have - * @paranoid == 0. This special handling will do the wrong thing for - * espfix-induced #DF on IRET, so #DF must not use @paranoid == 0. - * - * @paranoid == 2 is special: the stub will never switch stacks. This is for - * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS. + * Is in entry.text as it shouldn't be instrumented. 
*/ -.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 read_cr2=0 -SYM_CODE_START(\sym) - UNWIND_HINT_IRET_REGS offset=\has_error_code*8 - - /* Sanity check */ - .if \shift_ist != -1 && \paranoid != 1 - .error "using shift_ist requires paranoid=1" - .endif - - .if \create_gap && \paranoid - .error "using create_gap requires paranoid=0" - .endif - - ASM_CLAC - - .if \has_error_code == 0 - pushq $-1 /* ORIG_RAX: no syscall to restart */ - .endif - - .if \paranoid == 1 - testb $3, CS-ORIG_RAX(%rsp) /* If coming from userspace, switch stacks */ - jnz .Lfrom_usermode_switch_stack_\@ - .endif - - .if \create_gap == 1 - /* - * If coming from kernel space, create a 6-word gap to allow the - * int3 handler to emulate a call instruction. - */ - testb $3, CS-ORIG_RAX(%rsp) - jnz .Lfrom_usermode_no_gap_\@ - .rept 6 - pushq 5*8(%rsp) - .endr - UNWIND_HINT_IRET_REGS offset=8 -.Lfrom_usermode_no_gap_\@: - .endif - - idtentry_part \do_sym, \has_error_code, \read_cr2, \paranoid, \shift_ist, \ist_offset - - .if \paranoid == 1 - /* - * Entry from userspace. Switch stacks and treat it - * as a normal entry. This means that paranoid handlers - * run in real process context if user_mode(regs). 
- */ -.Lfrom_usermode_switch_stack_\@: - idtentry_part \do_sym, \has_error_code, \read_cr2, paranoid=0 - .endif - -_ASM_NOKPROBE(\sym) -SYM_CODE_END(\sym) -.endm - -idtentry divide_error do_divide_error has_error_code=0 -idtentry overflow do_overflow has_error_code=0 -idtentry bounds do_bounds has_error_code=0 -idtentry invalid_op do_invalid_op has_error_code=0 -idtentry device_not_available do_device_not_available has_error_code=0 -idtentry double_fault do_double_fault has_error_code=1 paranoid=2 read_cr2=1 -idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 -idtentry invalid_TSS do_invalid_TSS has_error_code=1 -idtentry segment_not_present do_segment_not_present has_error_code=1 -idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 -idtentry coprocessor_error do_coprocessor_error has_error_code=0 -idtentry alignment_check do_alignment_check has_error_code=1 -idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 - - - /* - * Reload gs selector with exception handling - * edi: new selector - */ -SYM_FUNC_START(native_load_gs_index) +SYM_FUNC_START(asm_load_gs_index) FRAME_BEGIN - pushfq - DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) - TRACE_IRQS_OFF - SWAPGS + swapgs .Lgs_change: movl %edi, %gs 2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE - SWAPGS - TRACE_IRQS_FLAGS (%rsp) - popfq + swapgs FRAME_END ret -SYM_FUNC_END(native_load_gs_index) -EXPORT_SYMBOL(native_load_gs_index) +SYM_FUNC_END(asm_load_gs_index) +EXPORT_SYMBOL(asm_load_gs_index) _ASM_EXTABLE(.Lgs_change, .Lbad_gs) .section .fixup, "ax" /* running with kernelgs */ SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs) - SWAPGS /* switch back to user gs */ + swapgs /* switch back to user gs */ .macro ZAP_GS /* This can't be a string because the preprocessor needs to see it. */ movl $__USER_DS, %eax @@ -1074,20 +676,48 @@ SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs) SYM_CODE_END(.Lbad_gs) .previous -/* Call softirq on interrupt stack. 
Interrupts are off. */ -SYM_FUNC_START(do_softirq_own_stack) - pushq %rbp - mov %rsp, %rbp - ENTER_IRQ_STACK regs=0 old_rsp=%r11 - call __do_softirq - LEAVE_IRQ_STACK regs=0 +/* + * rdi: New stack pointer points to the top word of the stack + * rsi: Function pointer + * rdx: Function argument (can be NULL if none) + */ +SYM_FUNC_START(asm_call_on_stack) +SYM_INNER_LABEL(asm_call_sysvec_on_stack, SYM_L_GLOBAL) +SYM_INNER_LABEL(asm_call_irq_on_stack, SYM_L_GLOBAL) + /* + * Save the frame pointer unconditionally. This allows the ORC + * unwinder to handle the stack switch. + */ + pushq %rbp + mov %rsp, %rbp + + /* + * The unwinder relies on the word at the top of the new stack + * page linking back to the previous RSP. + */ + mov %rsp, (%rdi) + mov %rdi, %rsp + /* Move the argument to the right place */ + mov %rdx, %rdi + +1: + .pushsection .discard.instr_begin + .long 1b - . + .popsection + + CALL_NOSPEC rsi + +2: + .pushsection .discard.instr_end + .long 2b - . + .popsection + + /* Restore the previous stack pointer from RBP. */ leaveq ret -SYM_FUNC_END(do_softirq_own_stack) +SYM_FUNC_END(asm_call_on_stack) #ifdef CONFIG_XEN_PV -idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 - /* * A note on the "critical region" in our callback handler. * We want to avoid stacking callback handlers due to events occurring @@ -1100,9 +730,10 @@ idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 * So, on entry to the handler we detect whether we interrupted an * existing activation in its critical region -- if so, we pop the current * activation and restart the handler using the previous one. 
+ * + * C calling convention: exc_xen_hypervisor_callback(struct *pt_regs) */ -/* do_hypervisor_callback(struct *pt_regs) */ -SYM_CODE_START_LOCAL(xen_do_hypervisor_callback) +SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback) /* * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will @@ -1112,15 +743,10 @@ SYM_CODE_START_LOCAL(xen_do_hypervisor_callback) movq %rdi, %rsp /* we don't return, adjust the stack frame */ UNWIND_HINT_REGS - ENTER_IRQ_STACK old_rsp=%r10 - call xen_evtchn_do_upcall - LEAVE_IRQ_STACK + call xen_pv_evtchn_do_upcall -#ifndef CONFIG_PREEMPTION - call xen_maybe_preempt_hcall -#endif - jmp error_exit -SYM_CODE_END(xen_do_hypervisor_callback) + jmp error_return +SYM_CODE_END(exc_xen_hypervisor_callback) /* * Hypervisor uses this for application faults while it executes. @@ -1155,7 +781,7 @@ SYM_CODE_START(xen_failsafe_callback) addq $0x30, %rsp pushq $0 /* RIP */ UNWIND_HINT_IRET_REGS offset=8 - jmp general_protection + jmp asm_exc_general_protection 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. 
*/ movq (%rsp), %rcx movq 8(%rsp), %r11 @@ -1164,71 +790,26 @@ SYM_CODE_START(xen_failsafe_callback) pushq $-1 /* orig_ax = -1 => not a system call */ PUSH_AND_CLEAR_REGS ENCODE_FRAME_POINTER - jmp error_exit + jmp error_return SYM_CODE_END(xen_failsafe_callback) #endif /* CONFIG_XEN_PV */ -#ifdef CONFIG_XEN_PVHVM -apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ - xen_hvm_callback_vector xen_evtchn_do_upcall -#endif - - -#if IS_ENABLED(CONFIG_HYPERV) -apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ - hyperv_callback_vector hyperv_vector_handler - -apicinterrupt3 HYPERV_REENLIGHTENMENT_VECTOR \ - hyperv_reenlightenment_vector hyperv_reenlightenment_intr - -apicinterrupt3 HYPERV_STIMER0_VECTOR \ - hv_stimer0_callback_vector hv_stimer0_vector_handler -#endif /* CONFIG_HYPERV */ - -#if IS_ENABLED(CONFIG_ACRN_GUEST) -apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ - acrn_hv_callback_vector acrn_hv_vector_handler -#endif - -idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET -idtentry int3 do_int3 has_error_code=0 create_gap=1 -idtentry stack_segment do_stack_segment has_error_code=1 - -#ifdef CONFIG_XEN_PV -idtentry xennmi do_nmi has_error_code=0 -idtentry xendebug do_debug has_error_code=0 -#endif - -idtentry general_protection do_general_protection has_error_code=1 -idtentry page_fault do_page_fault has_error_code=1 read_cr2=1 - -#ifdef CONFIG_KVM_GUEST -idtentry async_page_fault do_async_page_fault has_error_code=1 read_cr2=1 -#endif - -#ifdef CONFIG_X86_MCE -idtentry machine_check do_mce has_error_code=0 paranoid=1 -#endif - /* - * Save all registers in pt_regs, and switch gs if needed. - * Use slow, but surefire "are we in kernel?" check. - * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + * Save all registers in pt_regs. 
Return GSBASE related information + * in EBX depending on the availability of the FSGSBASE instructions: + * + * FSGSBASE R/EBX + * N 0 -> SWAPGS on exit + * 1 -> no SWAPGS on exit + * + * Y GSBASE value at entry, must be restored in paranoid_exit */ SYM_CODE_START_LOCAL(paranoid_entry) UNWIND_HINT_FUNC cld PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 - movl $1, %ebx - movl $MSR_GS_BASE, %ecx - rdmsr - testl %edx, %edx - js 1f /* negative -> in kernel */ - SWAPGS - xorl %ebx, %ebx -1: /* * Always stash CR3 in %r14. This value will be restored, * verbatim, at exit. Needed if paranoid_entry interrupted @@ -1238,16 +819,60 @@ SYM_CODE_START_LOCAL(paranoid_entry) * This is also why CS (stashed in the "iret frame" by the * hardware at entry) can not be used: this may be a return * to kernel code, but with a user CR3 value. + * + * Switching CR3 does not depend on kernel GSBASE so it can + * be done before switching to the kernel GSBASE. This is + * required for FSGSBASE because the kernel GSBASE has to + * be retrieved from a kernel internal table. */ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 /* + * Handling GSBASE depends on the availability of FSGSBASE. + * + * Without FSGSBASE the kernel enforces that negative GSBASE + * values indicate kernel GSBASE. With FSGSBASE no assumptions + * can be made about the GSBASE value when entering from user + * space. + */ + ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE + + /* + * Read the current GSBASE and store it in %rbx unconditionally, + * retrieve and set the current CPUs kernel GSBASE. The stored value + * has to be restored in paranoid_exit unconditionally. + * + * The MSR write ensures that no subsequent load is based on a + * mispredicted GSBASE. No extra FENCE required. 
+ */ + SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx + ret + +.Lparanoid_entry_checkgs: + /* EBX = 1 -> kernel GSBASE active, no restore required */ + movl $1, %ebx + /* + * The kernel-enforced convention is a negative GSBASE indicates + * a kernel value. No SWAPGS needed on entry and exit. + */ + movl $MSR_GS_BASE, %ecx + rdmsr + testl %edx, %edx + jns .Lparanoid_entry_swapgs + ret + +.Lparanoid_entry_swapgs: + SWAPGS + + /* * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an * unconditional CR3 write, even in the PTI case. So do an lfence * to prevent GS speculation, regardless of whether PTI is enabled. */ FENCE_SWAPGS_KERNEL_ENTRY + /* EBX = 0 -> SWAPGS required on exit */ + xorl %ebx, %ebx ret SYM_CODE_END(paranoid_entry) @@ -1258,27 +883,45 @@ SYM_CODE_END(paranoid_entry) * * We may be returning to very strange contexts (e.g. very early * in syscall entry), so checking for preemption here would - * be complicated. Fortunately, we there's no good reason - * to try to handle preemption here. + * be complicated. Fortunately, there's no good reason to try + * to handle preemption here. + * + * R/EBX contains the GSBASE related information depending on the + * availability of the FSGSBASE instructions: + * + * FSGSBASE R/EBX + * N 0 -> SWAPGS on exit + * 1 -> no SWAPGS on exit * - * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) + * Y User space GSBASE, must be restored unconditionally */ SYM_CODE_START_LOCAL(paranoid_exit) UNWIND_HINT_REGS - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF_DEBUG - testl %ebx, %ebx /* swapgs needed? */ - jnz .Lparanoid_exit_no_swapgs - TRACE_IRQS_IRETQ - /* Always restore stashed CR3 value (see paranoid_entry) */ - RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 + /* + * The order of operations is important. RESTORE_CR3 requires + * kernel GSBASE. + * + * NB to anyone to try to optimize this code: this code does + * not execute at all for exceptions from user mode. 
Those + * exceptions go through error_exit instead. + */ + RESTORE_CR3 scratch_reg=%rax save_reg=%r14 + + /* Handle the three GSBASE cases */ + ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE + + /* With FSGSBASE enabled, unconditionally restore GSBASE */ + wrgsbase %rbx + jmp restore_regs_and_return_to_kernel + +.Lparanoid_exit_checkgs: + /* On non-FSGSBASE systems, conditionally do SWAPGS */ + testl %ebx, %ebx + jnz restore_regs_and_return_to_kernel + + /* We are returning to a context with user GSBASE */ SWAPGS_UNSAFE_STACK - jmp restore_regs_and_return_to_kernel -.Lparanoid_exit_no_swapgs: - TRACE_IRQS_IRETQ_DEBUG - /* Always restore stashed CR3 value (see paranoid_entry) */ - RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 - jmp restore_regs_and_return_to_kernel + jmp restore_regs_and_return_to_kernel SYM_CODE_END(paranoid_exit) /* @@ -1339,7 +982,6 @@ SYM_CODE_START_LOCAL(error_entry) */ SWAPGS FENCE_SWAPGS_USER_ENTRY - SWITCH_TO_KERNEL_CR3 scratch_reg=%rax jmp .Lerror_entry_done .Lbstep_iret: @@ -1366,14 +1008,13 @@ SYM_CODE_START_LOCAL(error_entry) jmp .Lerror_entry_from_usermode_after_swapgs SYM_CODE_END(error_entry) -SYM_CODE_START_LOCAL(error_exit) +SYM_CODE_START_LOCAL(error_return) UNWIND_HINT_REGS - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF + DEBUG_ENTRY_ASSERT_IRQS_OFF testb $3, CS(%rsp) - jz retint_kernel - jmp .Lretint_user -SYM_CODE_END(error_exit) + jz restore_regs_and_return_to_kernel + jmp swapgs_restore_regs_and_return_to_usermode +SYM_CODE_END(error_return) /* * Runs on exception stack. Xen PV does not go through this path at all, @@ -1383,7 +1024,7 @@ SYM_CODE_END(error_exit) * %r14: Used to save/restore the CR3 of the interrupted context * when PAGE_TABLE_ISOLATION is in use. Do not clobber. */ -SYM_CODE_START(nmi) +SYM_CODE_START(asm_exc_nmi) UNWIND_HINT_IRET_REGS /* @@ -1468,7 +1109,7 @@ SYM_CODE_START(nmi) movq %rsp, %rdi movq $-1, %rsi - call do_nmi + call exc_nmi /* * Return back to user mode. 
We must *not* do the normal exit @@ -1525,7 +1166,7 @@ SYM_CODE_START(nmi) * end_repeat_nmi, then we are a nested NMI. We must not * modify the "iret" frame because it's being written by * the outer NMI. That's okay; the outer NMI handler is - * about to about to call do_nmi anyway, so we can just + * about to about to call exc_nmi() anyway, so we can just * resume the outer NMI. */ @@ -1644,7 +1285,7 @@ repeat_nmi: * RSP is pointing to "outermost RIP". gsbase is unknown, but, if * we're repeating an NMI, gsbase has the same value that it had on * the first iteration. paranoid_entry will load the kernel - * gsbase if needed before we call do_nmi. "NMI executing" + * gsbase if needed before we call exc_nmi(). "NMI executing" * is zero. */ movq $1, 10*8(%rsp) /* Set "NMI executing". */ @@ -1678,18 +1319,34 @@ end_repeat_nmi: call paranoid_entry UNWIND_HINT_REGS - /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ movq %rsp, %rdi movq $-1, %rsi - call do_nmi + call exc_nmi /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 - testl %ebx, %ebx /* swapgs needed? */ + /* + * The above invocation of paranoid_entry stored the GSBASE + * related information in R/EBX depending on the availability + * of FSGSBASE. + * + * If FSGSBASE is enabled, restore the saved GSBASE value + * unconditionally, otherwise take the conditional SWAPGS path. + */ + ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE + + wrgsbase %rbx + jmp nmi_restore + +nmi_no_fsgsbase: + /* EBX == 0 -> invoke SWAPGS */ + testl %ebx, %ebx jnz nmi_restore + nmi_swapgs: SWAPGS_UNSAFE_STACK + nmi_restore: POP_REGS @@ -1718,7 +1375,7 @@ nmi_restore: * about espfix64 on the way back to kernel mode. 
*/ iretq -SYM_CODE_END(nmi) +SYM_CODE_END(asm_exc_nmi) #ifndef CONFIG_IA32_EMULATION /* @@ -1732,6 +1389,7 @@ SYM_CODE_START(ignore_sysret) SYM_CODE_END(ignore_sysret) #endif +.pushsection .text, "ax" SYM_CODE_START(rewind_stack_do_exit) UNWIND_HINT_FUNC /* Prevent any naive code from trying to unwind to our caller. */ @@ -1743,3 +1401,4 @@ SYM_CODE_START(rewind_stack_do_exit) call do_exit SYM_CODE_END(rewind_stack_do_exit) +.popsection diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index f1d3ccae5dd5..541fdaf64045 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -46,38 +46,41 @@ * ebp user stack * 0(%ebp) arg6 */ -SYM_FUNC_START(entry_SYSENTER_compat) +SYM_CODE_START(entry_SYSENTER_compat) + UNWIND_HINT_EMPTY /* Interrupts are off on entry. */ SWAPGS - /* We are about to clobber %rsp anyway, clobbering here is OK */ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp + pushq %rax + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax + popq %rax movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - /* - * User tracing code (ptrace or signal handlers) might assume that - * the saved RAX contains a 32-bit number when we're invoking a 32-bit - * syscall. Just in case the high bits are nonzero, zero-extend - * the syscall number. (This could almost certainly be deleted - * with no ill effects.) - */ - movl %eax, %eax - /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ - pushq %rbp /* pt_regs->sp (stashed in bp) */ + pushq $0 /* pt_regs->sp = 0 (placeholder) */ /* * Push flags. This is nasty. First, interrupts are currently - * off, but we need pt_regs->flags to have IF set. Second, even - * if TF was set when SYSENTER started, it's clear by now. We fix - * that later using TIF_SINGLESTEP. + * off, but we need pt_regs->flags to have IF set. Second, if TS + * was set in usermode, it's still set, and we're singlestepping + * through this code. do_SYSENTER_32() will fix up IF. 
*/ pushfq /* pt_regs->flags (except IF = 0) */ - orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ pushq $__USER32_CS /* pt_regs->cs */ pushq $0 /* pt_regs->ip = 0 (placeholder) */ +SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL) + + /* + * User tracing code (ptrace or signal handlers) might assume that + * the saved RAX contains a 32-bit number when we're invoking a 32-bit + * syscall. Just in case the high bits are nonzero, zero-extend + * the syscall number. (This could almost certainly be deleted + * with no ill effects.) + */ + movl %eax, %eax + pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -104,6 +107,9 @@ SYM_FUNC_START(entry_SYSENTER_compat) xorl %r14d, %r14d /* nospec r14 */ pushq $0 /* pt_regs->r15 = 0 */ xorl %r15d, %r15d /* nospec r15 */ + + UNWIND_HINT_REGS + cld /* @@ -129,17 +135,11 @@ SYM_FUNC_START(entry_SYSENTER_compat) jnz .Lsysenter_fix_flags .Lsysenter_flags_fixed: - /* - * User mode is traced as though IRQs are on, and SYSENTER - * turned them off. - */ - TRACE_IRQS_OFF - movq %rsp, %rdi - call do_fast_syscall_32 + call do_SYSENTER_32 /* XEN PV guests always use IRET path */ - ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ - "jmp .Lsyscall_32_done", X86_FEATURE_XENPV + ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \ + "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV jmp sysret32_from_system_call .Lsysenter_fix_flags: @@ -147,7 +147,7 @@ SYM_FUNC_START(entry_SYSENTER_compat) popfq jmp .Lsysenter_flags_fixed SYM_INNER_LABEL(__end_entry_SYSENTER_compat, SYM_L_GLOBAL) -SYM_FUNC_END(entry_SYSENTER_compat) +SYM_CODE_END(entry_SYSENTER_compat) /* * 32-bit SYSCALL entry. @@ -197,6 +197,7 @@ SYM_FUNC_END(entry_SYSENTER_compat) * 0(%esp) arg6 */ SYM_CODE_START(entry_SYSCALL_compat) + UNWIND_HINT_EMPTY /* Interrupts are off on entry. 
*/ swapgs @@ -247,17 +248,13 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL) pushq $0 /* pt_regs->r15 = 0 */ xorl %r15d, %r15d /* nospec r15 */ - /* - * User mode is traced as though IRQs are on, and SYSENTER - * turned them off. - */ - TRACE_IRQS_OFF + UNWIND_HINT_REGS movq %rsp, %rdi call do_fast_syscall_32 /* XEN PV guests always use IRET path */ - ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ - "jmp .Lsyscall_32_done", X86_FEATURE_XENPV + ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \ + "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV /* Opportunistic SYSRET */ sysret32_from_system_call: @@ -266,7 +263,7 @@ sysret32_from_system_call: * stack. So let's erase the thread stack right now. */ STACKLEAK_ERASE - TRACE_IRQS_ON /* User mode traces as IRQs on. */ + movq RBX(%rsp), %rbx /* pt_regs->rbx */ movq RBP(%rsp), %rbp /* pt_regs->rbp */ movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */ @@ -340,6 +337,7 @@ SYM_CODE_END(entry_SYSCALL_compat) * ebp arg6 */ SYM_CODE_START(entry_INT80_compat) + UNWIND_HINT_EMPTY /* * Interrupts are off on entry. */ @@ -361,8 +359,11 @@ SYM_CODE_START(entry_INT80_compat) /* Need to switch before accessing the thread stack. */ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi + /* In the Xen PV case we already run on the thread stack. */ - ALTERNATIVE "movq %rsp, %rdi", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV + ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV + + movq %rsp, %rdi movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp pushq 6*8(%rdi) /* regs->ss */ @@ -401,19 +402,12 @@ SYM_CODE_START(entry_INT80_compat) xorl %r14d, %r14d /* nospec r14 */ pushq %r15 /* pt_regs->r15 */ xorl %r15d, %r15d /* nospec r15 */ - cld - /* - * User mode is traced as though IRQs are on, and the interrupt - * gate turned them off. - */ - TRACE_IRQS_OFF + UNWIND_HINT_REGS + + cld movq %rsp, %rdi call do_int80_syscall_32 -.Lsyscall_32_done: - - /* Go back to user mode. 
*/ - TRACE_IRQS_ON jmp swapgs_restore_regs_and_return_to_usermode SYM_CODE_END(entry_INT80_compat) diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c index 3d8d70d3896c..1583831f61a9 100644 --- a/arch/x86/entry/syscall_x32.c +++ b/arch/x86/entry/syscall_x32.c @@ -8,6 +8,13 @@ #include <asm/unistd.h> #include <asm/syscall.h> +/* + * Reuse the 64-bit entry points for the x32 versions that occupy different + * slots in the syscall table. + */ +#define __x32_sys_getsockopt __x64_sys_getsockopt +#define __x32_sys_setsockopt __x64_sys_setsockopt + #define __SYSCALL_64(nr, sym) #define __SYSCALL_X32(nr, sym) extern long __x32_##sym(const struct pt_regs *); diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 54581ac671b4..9d1102873666 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -160,7 +160,7 @@ 146 i386 writev sys_writev compat_sys_writev 147 i386 getsid sys_getsid 148 i386 fdatasync sys_fdatasync -149 i386 _sysctl sys_sysctl compat_sys_sysctl +149 i386 _sysctl sys_ni_syscall 150 i386 mlock sys_mlock 151 i386 munlock sys_munlock 152 i386 mlockall sys_mlockall @@ -376,8 +376,8 @@ 362 i386 connect sys_connect 363 i386 listen sys_listen 364 i386 accept4 sys_accept4 -365 i386 getsockopt sys_getsockopt compat_sys_getsockopt -366 i386 setsockopt sys_setsockopt compat_sys_setsockopt +365 i386 getsockopt sys_getsockopt sys_getsockopt +366 i386 setsockopt sys_setsockopt sys_setsockopt 367 i386 getsockname sys_getsockname 368 i386 getpeername sys_getpeername 369 i386 sendto sys_sendto @@ -440,5 +440,7 @@ 433 i386 fspick sys_fspick 434 i386 pidfd_open sys_pidfd_open 435 i386 clone3 sys_clone3 +436 i386 close_range sys_close_range 437 i386 openat2 sys_openat2 438 i386 pidfd_getfd sys_pidfd_getfd +439 i386 faccessat2 sys_faccessat2 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 37b844f839bc..f30d6ae9a688 100644 
--- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -164,7 +164,7 @@ 153 common vhangup sys_vhangup 154 common modify_ldt sys_modify_ldt 155 common pivot_root sys_pivot_root -156 64 _sysctl sys_sysctl +156 64 _sysctl sys_ni_syscall 157 common prctl sys_prctl 158 common arch_prctl sys_arch_prctl 159 common adjtimex sys_adjtimex @@ -357,8 +357,10 @@ 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open 435 common clone3 sys_clone3 +436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 # # x32-specific system call numbers start at 512 to avoid cache impact @@ -395,8 +397,8 @@ 538 x32 sendmmsg compat_sys_sendmmsg 539 x32 process_vm_readv compat_sys_process_vm_readv 540 x32 process_vm_writev compat_sys_process_vm_writev -541 x32 setsockopt compat_sys_setsockopt -542 x32 getsockopt compat_sys_getsockopt +541 x32 setsockopt sys_setsockopt +542 x32 getsockopt sys_getsockopt 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit 545 x32 execveat compat_sys_execveat diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S index 3a07ce3ec70b..f1f96d4d8cd6 100644 --- a/arch/x86/entry/thunk_32.S +++ b/arch/x86/entry/thunk_32.S @@ -29,11 +29,6 @@ SYM_CODE_START_NOALIGN(\name) SYM_CODE_END(\name) .endm -#ifdef CONFIG_TRACE_IRQFLAGS - THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1 - THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 -#endif - #ifdef CONFIG_PREEMPTION THUNK preempt_schedule_thunk, preempt_schedule THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index dbe4493b534e..ccd32877a3c4 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -3,7 +3,6 @@ * Save registers before calling assembly functions. 
This avoids * disturbance of register allocation in some inline assembly constructs. * Copyright 2001,2002 by Andi Kleen, SuSE Labs. - * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc. */ #include <linux/linkage.h> #include "calling.h" @@ -37,15 +36,6 @@ SYM_FUNC_END(\name) _ASM_NOKPROBE(\name) .endm -#ifdef CONFIG_TRACE_IRQFLAGS - THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1 - THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 -#endif - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - THUNK lockdep_sys_exit_thunk,lockdep_sys_exit -#endif - #ifdef CONFIG_PREEMPTION THUNK preempt_schedule_thunk, preempt_schedule THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace @@ -53,9 +43,7 @@ SYM_FUNC_END(\name) EXPORT_SYMBOL(preempt_schedule_notrace_thunk) #endif -#if defined(CONFIG_TRACE_IRQFLAGS) \ - || defined(CONFIG_DEBUG_LOCK_ALLOC) \ - || defined(CONFIG_PREEMPTION) +#ifdef CONFIG_PREEMPTION SYM_CODE_START_LOCAL_NOALIGN(.L_restore) popq %r11 popq %r10 diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 433a1259f61d..215376d975a2 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -10,8 +10,11 @@ ARCH_REL_TYPE_ABS += R_386_GLOB_DAT|R_386_JMP_SLOT|R_386_RELATIVE include $(srctree)/lib/vdso/Makefile KBUILD_CFLAGS += $(DISABLE_LTO) + +# Sanitizer runtimes are unavailable and cannot be linked here. KASAN_SANITIZE := n UBSAN_SANITIZE := n +KCSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. 
@@ -24,9 +27,14 @@ VDSO32-$(CONFIG_IA32_EMULATION) := y # files to link into the vdso vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o +vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o +vobjs32-y += vdso32/vclock_gettime.o # files to link into kernel obj-y += vma.o +KASAN_SANITIZE_vma.o := y +UBSAN_SANITIZE_vma.o := y +KCSAN_SANITIZE_vma.o := y OBJECT_FILES_NON_STANDARD_vma.o := n # vDSO images to build @@ -37,10 +45,12 @@ vdso_img-$(VDSO32-y) += 32 obj-$(VDSO32-y) += vdso32-setup.o vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) +vobjs32 := $(foreach F,$(vobjs32-y),$(obj)/$F) $(obj)/vdso.o: $(obj)/vdso.so targets += vdso.lds $(vobjs-y) +targets += vdso32/vdso32.lds $(vobjs32-y) # Build the vDSO image C files and link them in. vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o) @@ -72,7 +82,7 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE # optimize sibling calls. # CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ - $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ + $(filter -g%,$(KBUILD_CFLAGS)) -fno-stack-protector \ -fno-omit-frame-pointer -foptimize-sibling-calls \ -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO @@ -130,10 +140,6 @@ $(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE CPPFLAGS_vdso32/vdso32.lds = $(CPPFLAGS_vdso.lds) VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -soname linux-gate.so.1 -targets += vdso32/vdso32.lds -targets += vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o -targets += vdso32/vclock_gettime.o - KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO $(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) $(obj)/vdso32.so.dbg: asflags-$(CONFIG_X86_64) += -m32 @@ -145,7 +151,7 @@ KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 += -m32 
-msoft-float -mregparm=0 -fpic -KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) +KBUILD_CFLAGS_32 += -fno-stack-protector KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) KBUILD_CFLAGS_32 += -fno-omit-frame-pointer KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING @@ -158,12 +164,7 @@ endif $(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) -$(obj)/vdso32.so.dbg: FORCE \ - $(obj)/vdso32/vdso32.lds \ - $(obj)/vdso32/vclock_gettime.o \ - $(obj)/vdso32/note.o \ - $(obj)/vdso32/system_call.o \ - $(obj)/vdso32/sigreturn.o +$(obj)/vdso32.so.dbg: $(obj)/vdso32/vdso32.lds $(vobjs32) FORCE $(call if_changed,vdso_and_check) # diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 3842873b3ae3..7380908045c7 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -187,7 +187,7 @@ static void map_input(const char *name, void **addr, size_t *len, int prot) int fd = open(name, O_RDONLY); if (fd == -1) - err(1, "%s", name); + err(1, "open(%s)", name); tmp_len = lseek(fd, 0, SEEK_END); if (tmp_len == (off_t)-1) @@ -240,7 +240,7 @@ int main(int argc, char **argv) outfilename = argv[3]; outfile = fopen(outfilename, "w"); if (!outfile) - err(1, "%s", argv[2]); + err(1, "fopen(%s)", outfilename); go(raw_addr, raw_len, stripped_addr, stripped_len, outfile, name); diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index a20b134de2a8..6f46e11ce539 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -13,8 +13,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, unsigned long load_size = -1; /* Work around bogus warning */ unsigned long mapping_size; ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr; - int i; - unsigned long j; + unsigned long i, syms_nr; ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr, *alt_sec = NULL; ELF(Dyn) *dyn = 0, *dyn_end = 0; @@ -86,11 +85,10 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, strtab_hdr = raw_addr + 
GET_LE(&hdr->e_shoff) + GET_LE(&hdr->e_shentsize) * GET_LE(&symtab_hdr->sh_link); + syms_nr = GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize); /* Walk the symbol table */ - for (i = 0; - i < GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize); - i++) { - int k; + for (i = 0; i < syms_nr; i++) { + unsigned int k; ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) + GET_LE(&symtab_hdr->sh_entsize) * i; const char *sym_name = raw_addr + @@ -150,11 +148,11 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, fprintf(outfile, "static unsigned char raw_data[%lu] __ro_after_init __aligned(PAGE_SIZE) = {", mapping_size); - for (j = 0; j < stripped_len; j++) { - if (j % 10 == 0) + for (i = 0; i < stripped_len; i++) { + if (i % 10 == 0) fprintf(outfile, "\n\t"); fprintf(outfile, "0x%02X, ", - (int)((unsigned char *)stripped_addr)[j]); + (int)((unsigned char *)stripped_addr)[i]); } fprintf(outfile, "\n};\n\n"); diff --git a/arch/x86/entry/vdso/vdso32/note.S b/arch/x86/entry/vdso/vdso32/note.S index e78047d119f6..2cbd39939dc6 100644 --- a/arch/x86/entry/vdso/vdso32/note.S +++ b/arch/x86/entry/vdso/vdso32/note.S @@ -16,33 +16,3 @@ ELFNOTE_START(Linux, 0, "a") ELFNOTE_END BUILD_SALT - -#ifdef CONFIG_XEN -/* - * Add a special note telling glibc's dynamic linker a fake hardware - * flavor that it will use to choose the search path for libraries in the - * same way it uses real hardware capabilities like "mmx". - * We supply "nosegneg" as the fake capability, to indicate that we - * do not like negative offsets in instructions using segment overrides, - * since we implement those inefficiently. This makes it possible to - * install libraries optimized to avoid those access patterns in someplace - * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file - * corresponding to the bits here is needed to make ldconfig work right. - * It should contain: - * hwcap 1 nosegneg - * to match the mapping of bit to name that we give here. 
- * - * At runtime, the fake hardware feature will be considered to be present - * if its bit is set in the mask word. So, we start with the mask 0, and - * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen. - */ - -#include "../../xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */ - -ELFNOTE_START(GNU, 2, "a") - .long 1 /* ncaps */ -VDSO32_NOTE_MASK: /* Symbol used by arch/x86/xen/setup.c */ - .long 0 /* mask */ - .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */ -ELFNOTE_END -#endif diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 43428cc514c8..9185cb1d13b9 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -144,8 +144,7 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) struct mm_struct *mm = task->mm; struct vm_area_struct *vma; - if (down_write_killable(&mm->mmap_sem)) - return -EINTR; + mmap_read_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) { unsigned long size = vma->vm_end - vma->vm_start; @@ -154,7 +153,7 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) zap_page_range(vma, vma->vm_start, size); } - up_write(&mm->mmap_sem); + mmap_read_unlock(mm); return 0; } #else @@ -268,7 +267,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) unsigned long text_start; int ret = 0; - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; addr = get_unmapped_area(NULL, addr, @@ -311,7 +310,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) } up_fail: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return ret; } @@ -373,7 +372,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); /* * Check if we have already mapped vdso blob - fail to prevent * abusing from userspace install_speciall_mapping, which may @@ 
-384,11 +383,11 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma_is_special_mapping(vma, &vdso_mapping) || vma_is_special_mapping(vma, &vvar_mapping)) { - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return -EEXIST; } } - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return map_vdso(image, addr); } |