summaryrefslogtreecommitdiffstats
path: root/arch/x86/entry
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/entry')
-rw-r--r-- arch/x86/entry/Makefile | 16
-rw-r--r-- arch/x86/entry/calling.h | 55
-rw-r--r-- arch/x86/entry/common.c | 516
-rw-r--r-- arch/x86/entry/entry_32.S | 589
-rw-r--r-- arch/x86/entry/entry_64.S | 975
-rw-r--r-- arch/x86/entry/entry_64_compat.S | 86
-rw-r--r-- arch/x86/entry/syscall_x32.c | 7
-rw-r--r-- arch/x86/entry/syscalls/syscall_32.tbl | 8
-rw-r--r-- arch/x86/entry/syscalls/syscall_64.tbl | 8
-rw-r--r-- arch/x86/entry/thunk_32.S | 5
-rw-r--r-- arch/x86/entry/thunk_64.S | 14
-rw-r--r-- arch/x86/entry/vdso/Makefile | 25
-rw-r--r-- arch/x86/entry/vdso/vdso2c.c | 4
-rw-r--r-- arch/x86/entry/vdso/vdso2c.h | 16
-rw-r--r-- arch/x86/entry/vdso/vdso32/note.S | 30
-rw-r--r-- arch/x86/entry/vdso/vma.c | 15
16 files changed, 771 insertions, 1598 deletions
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index 85eb381259c2..08bf95dbc911 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -3,10 +3,24 @@
# Makefile for the x86 low level entry code
#
-OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y
+KASAN_SANITIZE := n
+UBSAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+
+CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_syscall_x32.o = $(CC_FLAGS_FTRACE)
+
+CFLAGS_common.o += -fno-stack-protector
+CFLAGS_syscall_64.o += -fno-stack-protector
+CFLAGS_syscall_32.o += -fno-stack-protector
+CFLAGS_syscall_x32.o += -fno-stack-protector
CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,)
CFLAGS_syscall_32.o += $(call cc-option,-Wno-override-init,)
+CFLAGS_syscall_x32.o += $(call cc-option,-Wno-override-init,)
+
obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o
obj-y += common.o
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 1c7f13bb6728..ae9b0d4615b3 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -6,6 +6,7 @@
#include <asm/percpu.h>
#include <asm/asm-offsets.h>
#include <asm/processor-flags.h>
+#include <asm/inst.h>
/*
@@ -341,7 +342,16 @@ For 32-bit we have the following conventions - kernel is built with
#endif
.endm
-#endif /* CONFIG_X86_64 */
+.macro SAVE_AND_SET_GSBASE scratch_reg:req save_reg:req
+ rdgsbase \save_reg
+ GET_PERCPU_BASE \scratch_reg
+ wrgsbase \scratch_reg
+.endm
+
+#else /* CONFIG_X86_64 */
+# undef UNWIND_HINT_IRET_REGS
+# define UNWIND_HINT_IRET_REGS
+#endif /* !CONFIG_X86_64 */
.macro STACKLEAK_ERASE
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
@@ -349,22 +359,37 @@ For 32-bit we have the following conventions - kernel is built with
#endif
.endm
+#ifdef CONFIG_SMP
+
/*
- * This does 'call enter_from_user_mode' unless we can avoid it based on
- * kernel config or using the static jump infrastructure.
+ * CPU/node NR is loaded from the limit (size) field of a special segment
+ * descriptor entry in GDT.
*/
-.macro CALL_enter_from_user_mode
-#ifdef CONFIG_CONTEXT_TRACKING
-#ifdef CONFIG_JUMP_LABEL
- STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_key, def=0
-#endif
- call enter_from_user_mode
-.Lafter_call_\@:
-#endif
+.macro LOAD_CPU_AND_NODE_SEG_LIMIT reg:req
+ movq $__CPUNODE_SEG, \reg
+ lsl \reg, \reg
+.endm
+
+/*
+ * Fetch the per-CPU GSBASE value for this processor and put it in @reg.
+ * We normally use %gs for accessing per-CPU data, but we are setting up
+ * %gs here and obviously can not use %gs itself to access per-CPU data.
+ *
+ * Do not use RDPID, because KVM loads guest's TSC_AUX on vm-entry and
+ * may not restore the host's value until the CPU returns to userspace.
+ * Thus the kernel would consume a guest's TSC_AUX if an NMI arrives
+ * while running KVM's run loop.
+ */
+.macro GET_PERCPU_BASE reg:req
+ LOAD_CPU_AND_NODE_SEG_LIMIT \reg
+ andq $VDSO_CPUNODE_MASK, \reg
+ movq __per_cpu_offset(, \reg, 8), \reg
.endm
-#ifdef CONFIG_PARAVIRT_XXL
-#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg
#else
-#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg
-#endif
+
+.macro GET_PERCPU_BASE reg:req
+ movq pcpu_unit_offsets(%rip), \reg
+.endm
+
+#endif /* CONFIG_SMP */
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 76735ec813e6..870efeec8bda 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -10,23 +10,21 @@
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
+#include <linux/entry-common.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/ptrace.h>
-#include <linux/tracehook.h>
-#include <linux/audit.h>
-#include <linux/seccomp.h>
-#include <linux/signal.h>
#include <linux/export.h>
-#include <linux/context_tracking.h>
-#include <linux/user-return-notifier.h>
#include <linux/nospec.h>
-#include <linux/uprobes.h>
-#include <linux/livepatch.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
+#ifdef CONFIG_XEN_PV
+#include <xen/xen-ops.h>
+#include <xen/events.h>
+#endif
+
#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/vdso.h>
@@ -35,329 +33,121 @@
#include <asm/nospec-branch.h>
#include <asm/io_bitmap.h>
#include <asm/syscall.h>
+#include <asm/irq_stack.h>
-#define CREATE_TRACE_POINTS
-#include <trace/events/syscalls.h>
-
-#ifdef CONFIG_CONTEXT_TRACKING
-/* Called on entry from user mode with IRQs off. */
-__visible inline void enter_from_user_mode(void)
-{
- CT_WARN_ON(ct_state() != CONTEXT_USER);
- user_exit_irqoff();
-}
-#else
-static inline void enter_from_user_mode(void) {}
-#endif
-
-static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
-{
#ifdef CONFIG_X86_64
- if (arch == AUDIT_ARCH_X86_64) {
- audit_syscall_entry(regs->orig_ax, regs->di,
- regs->si, regs->dx, regs->r10);
- } else
-#endif
- {
- audit_syscall_entry(regs->orig_ax, regs->bx,
- regs->cx, regs->dx, regs->si);
- }
-}
-
-/*
- * Returns the syscall nr to run (which should match regs->orig_ax) or -1
- * to skip the syscall.
- */
-static long syscall_trace_enter(struct pt_regs *regs)
+__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
- u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
-
- struct thread_info *ti = current_thread_info();
- unsigned long ret = 0;
- u32 work;
-
- if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
- BUG_ON(regs != task_pt_regs(current));
-
- work = READ_ONCE(ti->flags);
-
- if (work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) {
- ret = tracehook_report_syscall_entry(regs);
- if (ret || (work & _TIF_SYSCALL_EMU))
- return -1L;
- }
-
-#ifdef CONFIG_SECCOMP
- /*
- * Do seccomp after ptrace, to catch any tracer changes.
- */
- if (work & _TIF_SECCOMP) {
- struct seccomp_data sd;
+ nr = syscall_enter_from_user_mode(regs, nr);
- sd.arch = arch;
- sd.nr = regs->orig_ax;
- sd.instruction_pointer = regs->ip;
-#ifdef CONFIG_X86_64
- if (arch == AUDIT_ARCH_X86_64) {
- sd.args[0] = regs->di;
- sd.args[1] = regs->si;
- sd.args[2] = regs->dx;
- sd.args[3] = regs->r10;
- sd.args[4] = regs->r8;
- sd.args[5] = regs->r9;
- } else
+ instrumentation_begin();
+ if (likely(nr < NR_syscalls)) {
+ nr = array_index_nospec(nr, NR_syscalls);
+ regs->ax = sys_call_table[nr](regs);
+#ifdef CONFIG_X86_X32_ABI
+ } else if (likely((nr & __X32_SYSCALL_BIT) &&
+ (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) {
+ nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT,
+ X32_NR_syscalls);
+ regs->ax = x32_sys_call_table[nr](regs);
#endif
- {
- sd.args[0] = regs->bx;
- sd.args[1] = regs->cx;
- sd.args[2] = regs->dx;
- sd.args[3] = regs->si;
- sd.args[4] = regs->di;
- sd.args[5] = regs->bp;
- }
-
- ret = __secure_computing(&sd);
- if (ret == -1)
- return ret;
}
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+}
#endif
- if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
- trace_sys_enter(regs, regs->orig_ax);
-
- do_audit_syscall_entry(regs, arch);
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs)
+{
+ if (IS_ENABLED(CONFIG_IA32_EMULATION))
+ current_thread_info()->status |= TS_COMPAT;
- return ret ?: regs->orig_ax;
+ return (unsigned int)regs->orig_ax;
}
-#define EXIT_TO_USERMODE_LOOP_FLAGS \
- (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
- _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)
-
-static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
+/*
+ * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL.
+ */
+static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs,
+ unsigned int nr)
{
- /*
- * In order to return to user mode, we need to have IRQs off with
- * none of EXIT_TO_USERMODE_LOOP_FLAGS set. Several of these flags
- * can be set at any time on preemptible kernels if we have IRQs on,
- * so we need to loop. Disabling preemption wouldn't help: doing the
- * work to clear some of the flags can sleep.
- */
- while (true) {
- /* We have work to do. */
- local_irq_enable();
-
- if (cached_flags & _TIF_NEED_RESCHED)
- schedule();
-
- if (cached_flags & _TIF_UPROBE)
- uprobe_notify_resume(regs);
-
- if (cached_flags & _TIF_PATCH_PENDING)
- klp_update_patch_state(current);
-
- /* deal with pending signal delivery */
- if (cached_flags & _TIF_SIGPENDING)
- do_signal(regs);
-
- if (cached_flags & _TIF_NOTIFY_RESUME) {
- clear_thread_flag(TIF_NOTIFY_RESUME);
- tracehook_notify_resume(regs);
- rseq_handle_notify_resume(NULL, regs);
- }
-
- if (cached_flags & _TIF_USER_RETURN_NOTIFY)
- fire_user_return_notifiers();
-
- /* Disable IRQs and retry */
- local_irq_disable();
-
- cached_flags = READ_ONCE(current_thread_info()->flags);
-
- if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
- break;
+ if (likely(nr < IA32_NR_syscalls)) {
+ instrumentation_begin();
+ nr = array_index_nospec(nr, IA32_NR_syscalls);
+ regs->ax = ia32_sys_call_table[nr](regs);
+ instrumentation_end();
}
}
-/* Called with IRQs disabled. */
-__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
+/* Handles int $0x80 */
+__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
{
- struct thread_info *ti = current_thread_info();
- u32 cached_flags;
+ unsigned int nr = syscall_32_enter(regs);
- addr_limit_user_check();
-
- lockdep_assert_irqs_disabled();
- lockdep_sys_exit();
-
- cached_flags = READ_ONCE(ti->flags);
-
- if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
- exit_to_usermode_loop(regs, cached_flags);
-
- /* Reload ti->flags; we may have rescheduled above. */
- cached_flags = READ_ONCE(ti->flags);
-
- if (unlikely(cached_flags & _TIF_IO_BITMAP))
- tss_update_io_bitmap();
-
- fpregs_assert_state_consistent();
- if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD))
- switch_fpu_return();
-
-#ifdef CONFIG_COMPAT
/*
- * Compat syscalls set TS_COMPAT. Make sure we clear it before
- * returning to user mode. We need to clear it *after* signal
- * handling, because syscall restart has a fixup for compat
- * syscalls. The fixup is exercised by the ptrace_syscall_32
- * selftest.
- *
- * We also need to clear TS_REGS_POKED_I386: the 32-bit tracer
- * special case only applies after poking regs and before the
- * very next return to user mode.
+ * Subtlety here: if ptrace pokes something larger than 2^32-1 into
+ * orig_ax, the unsigned int return value truncates it. This may
+ * or may not be necessary, but it matches the old asm behavior.
*/
- ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
-#endif
+ nr = (unsigned int)syscall_enter_from_user_mode(regs, nr);
- user_enter_irqoff();
-
- mds_user_clear_cpu_buffers();
+ do_syscall_32_irqs_on(regs, nr);
+ syscall_exit_to_user_mode(regs);
}
-#define SYSCALL_EXIT_WORK_FLAGS \
- (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
- _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
-
-static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags)
+static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
{
- bool step;
-
- audit_syscall_exit(regs);
-
- if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
- trace_sys_exit(regs, regs->ax);
+ unsigned int nr = syscall_32_enter(regs);
+ int res;
/*
- * If TIF_SYSCALL_EMU is set, we only get here because of
- * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
- * We already reported this syscall instruction in
- * syscall_trace_enter().
+ * This cannot use syscall_enter_from_user_mode() as it has to
+ * fetch EBP before invoking any of the syscall entry work
+ * functions.
*/
- step = unlikely(
- (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU))
- == _TIF_SINGLESTEP);
- if (step || cached_flags & _TIF_SYSCALL_TRACE)
- tracehook_report_syscall_exit(regs, step);
-}
+ syscall_enter_from_user_mode_prepare(regs);
-/*
- * Called with IRQs on and fully valid regs. Returns with IRQs off in a
- * state such that we can immediately switch to user mode.
- */
-__visible inline void syscall_return_slowpath(struct pt_regs *regs)
-{
- struct thread_info *ti = current_thread_info();
- u32 cached_flags = READ_ONCE(ti->flags);
-
- CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
-
- if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
- WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax))
- local_irq_enable();
-
- rseq_syscall(regs);
-
- /*
- * First do one-time work. If these work items are enabled, we
- * want to run them exactly once per syscall exit with IRQs on.
- */
- if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS))
- syscall_slow_exit_work(regs, cached_flags);
-
- local_irq_disable();
- prepare_exit_to_usermode(regs);
-}
-
-#ifdef CONFIG_X86_64
-__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
-{
- struct thread_info *ti;
-
- enter_from_user_mode();
- local_irq_enable();
- ti = current_thread_info();
- if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
- nr = syscall_trace_enter(regs);
-
- if (likely(nr < NR_syscalls)) {
- nr = array_index_nospec(nr, NR_syscalls);
- regs->ax = sys_call_table[nr](regs);
-#ifdef CONFIG_X86_X32_ABI
- } else if (likely((nr & __X32_SYSCALL_BIT) &&
- (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) {
- nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT,
- X32_NR_syscalls);
- regs->ax = x32_sys_call_table[nr](regs);
-#endif
- }
-
- syscall_return_slowpath(regs);
-}
-#endif
-
-#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
-/*
- * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does
- * all entry and exit work and returns with IRQs off. This function is
- * extremely hot in workloads that use it, and it's usually called from
- * do_fast_syscall_32, so forcibly inline it to improve performance.
- */
-static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
-{
- struct thread_info *ti = current_thread_info();
- unsigned int nr = (unsigned int)regs->orig_ax;
-
-#ifdef CONFIG_IA32_EMULATION
- ti->status |= TS_COMPAT;
-#endif
-
- if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
+ instrumentation_begin();
+ /* Fetch EBP from where the vDSO stashed it. */
+ if (IS_ENABLED(CONFIG_X86_64)) {
/*
- * Subtlety here: if ptrace pokes something larger than
- * 2^32-1 into orig_ax, this truncates it. This may or
- * may not be necessary, but it matches the old asm
- * behavior.
+ * Micro-optimization: the pointer we're following is
+ * explicitly 32 bits, so it can't be out of range.
*/
- nr = syscall_trace_enter(regs);
+ res = __get_user(*(u32 *)&regs->bp,
+ (u32 __user __force *)(unsigned long)(u32)regs->sp);
+ } else {
+ res = get_user(*(u32 *)&regs->bp,
+ (u32 __user __force *)(unsigned long)(u32)regs->sp);
}
+ instrumentation_end();
- if (likely(nr < IA32_NR_syscalls)) {
- nr = array_index_nospec(nr, IA32_NR_syscalls);
- regs->ax = ia32_sys_call_table[nr](regs);
+ if (res) {
+ /* User code screwed up. */
+ regs->ax = -EFAULT;
+ syscall_exit_to_user_mode(regs);
+ return false;
}
- syscall_return_slowpath(regs);
-}
+ /* The cast truncates any ptrace-induced syscall nr > 2^32 - 1 */
+ nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr);
-/* Handles int $0x80 */
-__visible void do_int80_syscall_32(struct pt_regs *regs)
-{
- enter_from_user_mode();
- local_irq_enable();
- do_syscall_32_irqs_on(regs);
+ /* Now this is just like a normal syscall. */
+ do_syscall_32_irqs_on(regs, nr);
+ syscall_exit_to_user_mode(regs);
+ return true;
}
/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
-__visible long do_fast_syscall_32(struct pt_regs *regs)
+__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
{
/*
* Called using the internal vDSO SYSENTER/SYSCALL32 calling
* convention. Adjust regs so it looks like we entered using int80.
*/
-
unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
- vdso_image_32.sym_int80_landing_pad;
+ vdso_image_32.sym_int80_landing_pad;
/*
* SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
@@ -366,34 +156,9 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
*/
regs->ip = landing_pad;
- enter_from_user_mode();
-
- local_irq_enable();
-
- /* Fetch EBP from where the vDSO stashed it. */
- if (
-#ifdef CONFIG_X86_64
- /*
- * Micro-optimization: the pointer we're following is explicitly
- * 32 bits, so it can't be out of range.
- */
- __get_user(*(u32 *)&regs->bp,
- (u32 __user __force *)(unsigned long)(u32)regs->sp)
-#else
- get_user(*(u32 *)&regs->bp,
- (u32 __user __force *)(unsigned long)(u32)regs->sp)
-#endif
- ) {
-
- /* User code screwed up. */
- local_irq_disable();
- regs->ax = -EFAULT;
- prepare_exit_to_usermode(regs);
- return 0; /* Keep it simple: use IRET. */
- }
-
- /* Now this is just like a normal syscall. */
- do_syscall_32_irqs_on(regs);
+ /* Invoke the syscall. If it failed, keep it simple: use IRET. */
+ if (!__do_fast_syscall_32(regs))
+ return 0;
#ifdef CONFIG_X86_64
/*
@@ -425,9 +190,128 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
#endif
}
+
+/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
+__visible noinstr long do_SYSENTER_32(struct pt_regs *regs)
+{
+ /* SYSENTER loses RSP, but the vDSO saved it in RBP. */
+ regs->sp = regs->bp;
+
+ /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */
+ regs->flags |= X86_EFLAGS_IF;
+
+ return do_fast_syscall_32(regs);
+}
#endif
SYSCALL_DEFINE0(ni_syscall)
{
return -ENOSYS;
}
+
+noinstr bool idtentry_enter_nmi(struct pt_regs *regs)
+{
+ bool irq_state = lockdep_hardirqs_enabled();
+
+ __nmi_enter();
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ lockdep_hardirq_enter();
+ rcu_nmi_enter();
+
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ ftrace_nmi_enter();
+ instrumentation_end();
+
+ return irq_state;
+}
+
+noinstr void idtentry_exit_nmi(struct pt_regs *regs, bool restore)
+{
+ instrumentation_begin();
+ ftrace_nmi_exit();
+ if (restore) {
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ }
+ instrumentation_end();
+
+ rcu_nmi_exit();
+ lockdep_hardirq_exit();
+ if (restore)
+ lockdep_hardirqs_on(CALLER_ADDR0);
+ __nmi_exit();
+}
+
+#ifdef CONFIG_XEN_PV
+#ifndef CONFIG_PREEMPTION
+/*
+ * Some hypercalls issued by the toolstack can take many 10s of
+ * seconds. Allow tasks running hypercalls via the privcmd driver to
+ * be voluntarily preempted even if full kernel preemption is
+ * disabled.
+ *
+ * Such preemptible hypercalls are bracketed by
+ * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
+ * calls.
+ */
+DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
+EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
+
+/*
+ * In case of scheduling the flag must be cleared and restored after
+ * returning from schedule as the task might move to a different CPU.
+ */
+static __always_inline bool get_and_clear_inhcall(void)
+{
+ bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
+
+ __this_cpu_write(xen_in_preemptible_hcall, false);
+ return inhcall;
+}
+
+static __always_inline void restore_inhcall(bool inhcall)
+{
+ __this_cpu_write(xen_in_preemptible_hcall, inhcall);
+}
+#else
+static __always_inline bool get_and_clear_inhcall(void) { return false; }
+static __always_inline void restore_inhcall(bool inhcall) { }
+#endif
+
+static void __xen_pv_evtchn_do_upcall(void)
+{
+ irq_enter_rcu();
+ inc_irq_stat(irq_hv_callback_count);
+
+ xen_hvm_evtchn_do_upcall();
+
+ irq_exit_rcu();
+}
+
+/*
+ * Entry point for the Xen PV event channel upcall.
+ *
+ * Enters interrupt context via irqentry_enter(), publishes @regs with
+ * set_irq_regs(), and runs __xen_pv_evtchn_do_upcall() through
+ * run_on_irqstack_cond(). The previous irq regs are restored afterwards.
+ *
+ * On the way out, get_and_clear_inhcall() is consulted: when it reports a
+ * preemptible hypercall and state.exit_rcu is not set (a set exit_rcu
+ * triggers a one-time warning), irqentry_exit_cond_resched() is called and
+ * the flag is written back with restore_inhcall(). In every other case the
+ * regular irqentry_exit() path is taken.
+ *
+ * This function is noinstr, so every call into instrumentable code must be
+ * bracketed by a balanced instrumentation_begin()/instrumentation_end()
+ * pair.
+ */
+__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs;
+	bool inhcall;
+	irqentry_state_t state;
+
+	state = irqentry_enter(regs);
+	old_regs = set_irq_regs(regs);
+
+	instrumentation_begin();
+	run_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
+	/* Close the section opened above; a second begin would unbalance it. */
+	instrumentation_end();
+
+	set_irq_regs(old_regs);
+
+	inhcall = get_and_clear_inhcall();
+	if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
+		instrumentation_begin();
+		irqentry_exit_cond_resched();
+		instrumentation_end();
+		/* Write the flag back after the possible reschedule. */
+		restore_inhcall(inhcall);
+	} else {
+		irqentry_exit(regs, state);
+	}
+}
+#endif /* CONFIG_XEN_PV */
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index b67bae7091d7..df8c017e6161 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -44,40 +44,13 @@
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/frame.h>
+#include <asm/trapnr.h>
#include <asm/nospec-branch.h>
#include "calling.h"
.section .entry.text, "ax"
-/*
- * We use macros for low-level operations which need to be overridden
- * for paravirtualization. The following will never clobber any registers:
- * INTERRUPT_RETURN (aka. "iret")
- * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
- * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
- *
- * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
- * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
- * Allowing a register to be clobbered can shrink the paravirt replacement
- * enough to patch inline, increasing performance.
- */
-
-#ifdef CONFIG_PREEMPTION
-# define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
-#else
-# define preempt_stop(clobbers)
-#endif
-
-.macro TRACE_IRQS_IRET
-#ifdef CONFIG_TRACE_IRQFLAGS
- testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off?
- jz 1f
- TRACE_IRQS_ON
-1:
-#endif
-.endm
-
#define PTI_SWITCH_MASK (1 << PAGE_SHIFT)
/*
@@ -476,8 +449,6 @@
.macro SWITCH_TO_KERNEL_STACK
- ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV
-
BUG_IF_WRONG_CR3
SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
@@ -626,8 +597,6 @@
*/
.macro SWITCH_TO_ENTRY_STACK
- ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV
-
/* Bytes to copy */
movl $PTREGS_SIZE, %ecx
@@ -726,10 +695,68 @@
.Lend_\@:
.endm
+
+/**
+ * idtentry - Macro to generate entry stubs for simple IDT entries
+ * @vector: Vector number
+ * @asmsym: ASM symbol for the entry point
+ * @cfunc: C function to be called
+ * @has_error_code: Hardware pushed error code on stack
+ */
+.macro idtentry vector asmsym cfunc has_error_code:req
+SYM_CODE_START(\asmsym)
+ ASM_CLAC
+ cld
+
+ .if \has_error_code == 0
+ pushl $0 /* Clear the error code */
+ .endif
+
+ /* Push the C-function address into the GS slot */
+ pushl $\cfunc
+ /* Invoke the common exception entry */
+ jmp handle_exception
+SYM_CODE_END(\asmsym)
+.endm
+
+.macro idtentry_irq vector cfunc
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
+SYM_CODE_START_LOCAL(asm_\cfunc)
+ ASM_CLAC
+ SAVE_ALL switch_stacks=1
+ ENCODE_FRAME_POINTER
+ movl %esp, %eax
+ movl PT_ORIG_EAX(%esp), %edx /* get the vector from stack */
+ movl $-1, PT_ORIG_EAX(%esp) /* no syscall to restart */
+ call \cfunc
+ jmp handle_exception_return
+SYM_CODE_END(asm_\cfunc)
+.endm
+
+.macro idtentry_sysvec vector cfunc
+ idtentry \vector asm_\cfunc \cfunc has_error_code=0
+.endm
+
+/*
+ * Include the defines which emit the idt entries which are shared
+ * between 32 and 64 bit and emit the __irqentry_text_* markers
+ * so the stacktrace boundary checks work.
+ */
+ .align 16
+ .globl __irqentry_text_start
+__irqentry_text_start:
+
+#include <asm/idtentry.h>
+
+ .align 16
+ .globl __irqentry_text_end
+__irqentry_text_end:
+
/*
* %eax: prev task
* %edx: next task
*/
+.pushsection .text, "ax"
SYM_CODE_START(__switch_to_asm)
/*
* Save callee-saved registers
@@ -776,6 +803,7 @@ SYM_CODE_START(__switch_to_asm)
jmp __switch_to
SYM_CODE_END(__switch_to_asm)
+.popsection
/*
* The unwinder expects the last frame on the stack to always be at the same
@@ -784,6 +812,7 @@ SYM_CODE_END(__switch_to_asm)
* asmlinkage function so its argument has to be pushed on the stack. This
* wrapper creates a proper "end of stack" frame header before the call.
*/
+.pushsection .text, "ax"
SYM_FUNC_START(schedule_tail_wrapper)
FRAME_BEGIN
@@ -794,6 +823,8 @@ SYM_FUNC_START(schedule_tail_wrapper)
FRAME_END
ret
SYM_FUNC_END(schedule_tail_wrapper)
+.popsection
+
/*
* A newly forked process directly context switches into this address.
*
@@ -801,6 +832,7 @@ SYM_FUNC_END(schedule_tail_wrapper)
* ebx: kernel thread func (NULL for user thread)
* edi: kernel thread arg
*/
+.pushsection .text, "ax"
SYM_CODE_START(ret_from_fork)
call schedule_tail_wrapper
@@ -810,53 +842,21 @@ SYM_CODE_START(ret_from_fork)
2:
/* When we fork, we trace the syscall return in the child, too. */
movl %esp, %eax
- call syscall_return_slowpath
- STACKLEAK_ERASE
- jmp restore_all
+ call syscall_exit_to_user_mode
+ jmp .Lsyscall_32_done
/* kernel thread */
1: movl %edi, %eax
- CALL_NOSPEC %ebx
+ CALL_NOSPEC ebx
/*
* A kernel thread is allowed to return here after successfully
- * calling do_execve(). Exit to userspace to complete the execve()
+ * calling kernel_execve(). Exit to userspace to complete the execve()
* syscall.
*/
movl $0, PT_EAX(%esp)
jmp 2b
SYM_CODE_END(ret_from_fork)
-
-/*
- * Return to user mode is not as complex as all this looks,
- * but we want the default path for a system call return to
- * go as quickly as possible which is why some of this is
- * less clear than it otherwise should be.
- */
-
- # userspace resumption stub bypassing syscall exit tracing
-SYM_CODE_START_LOCAL(ret_from_exception)
- preempt_stop(CLBR_ANY)
-ret_from_intr:
-#ifdef CONFIG_VM86
- movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
- movb PT_CS(%esp), %al
- andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
-#else
- /*
- * We can be coming here from child spawned by kernel_thread().
- */
- movl PT_CS(%esp), %eax
- andl $SEGMENT_RPL_MASK, %eax
-#endif
- cmpl $USER_RPL, %eax
- jb restore_all_kernel # not returning to v8086 or userspace
-
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
- movl %esp, %eax
- call prepare_exit_to_usermode
- jmp restore_all
-SYM_CODE_END(ret_from_exception)
+.popsection
SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
/*
@@ -868,17 +868,6 @@ SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
* will ignore all of the single-step traps generated in this range.
*/
-#ifdef CONFIG_XEN_PV
-/*
- * Xen doesn't set %esp to be precisely what the normal SYSENTER
- * entry point expects, so fix it up before using the normal path.
- */
-SYM_CODE_START(xen_sysenter_target)
- addl $5*4, %esp /* remove xen-provided frame */
- jmp .Lsysenter_past_esp
-SYM_CODE_END(xen_sysenter_target)
-#endif
-
/*
* 32-bit SYSENTER entry.
*
@@ -929,9 +918,8 @@ SYM_FUNC_START(entry_SYSENTER_32)
.Lsysenter_past_esp:
pushl $__USER_DS /* pt_regs->ss */
- pushl %ebp /* pt_regs->sp (stashed in bp) */
+ pushl $0 /* pt_regs->sp (placeholder) */
pushfl /* pt_regs->flags (except IF = 0) */
- orl $X86_EFLAGS_IF, (%esp) /* Fix IF */
pushl $__USER_CS /* pt_regs->cs */
pushl $0 /* pt_regs->ip = 0 (placeholder) */
pushl %eax /* pt_regs->orig_ax */
@@ -960,22 +948,14 @@ SYM_FUNC_START(entry_SYSENTER_32)
jnz .Lsysenter_fix_flags
.Lsysenter_flags_fixed:
- /*
- * User mode is traced as though IRQs are on, and SYSENTER
- * turned them off.
- */
- TRACE_IRQS_OFF
-
movl %esp, %eax
- call do_fast_syscall_32
- /* XEN PV guests always use IRET path */
- ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
- "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
+ call do_SYSENTER_32
+ testl %eax, %eax
+ jz .Lsyscall_32_done
STACKLEAK_ERASE
-/* Opportunistic SYSEXIT */
- TRACE_IRQS_ON /* User mode traces as IRQs on. */
+ /* Opportunistic SYSEXIT */
/*
* Setup entry stack - we keep the pointer in %eax and do the
@@ -1075,20 +1055,12 @@ SYM_FUNC_START(entry_INT80_32)
SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1 /* save rest */
- /*
- * User mode is traced as though IRQs are on, and the interrupt gate
- * turned them off.
- */
- TRACE_IRQS_OFF
-
movl %esp, %eax
call do_int80_syscall_32
.Lsyscall_32_done:
-
STACKLEAK_ERASE
-restore_all:
- TRACE_IRQS_ON
+restore_all_switch_stack:
SWITCH_TO_ENTRY_STACK
CHECK_AND_APPLY_ESPFIX
@@ -1107,26 +1079,10 @@ restore_all:
*/
INTERRUPT_RETURN
-restore_all_kernel:
-#ifdef CONFIG_PREEMPTION
- DISABLE_INTERRUPTS(CLBR_ANY)
- cmpl $0, PER_CPU_VAR(__preempt_count)
- jnz .Lno_preempt
- testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
- jz .Lno_preempt
- call preempt_schedule_irq
-.Lno_preempt:
-#endif
- TRACE_IRQS_IRET
- PARANOID_EXIT_TO_KERNEL_MODE
- BUG_IF_WRONG_CR3
- RESTORE_REGS 4
- jmp .Lirq_return
-
.section .fixup, "ax"
-SYM_CODE_START(iret_exc)
+SYM_CODE_START(asm_iret_error)
pushl $0 # no error code
- pushl $do_iret_error
+ pushl $iret_error
#ifdef CONFIG_DEBUG_ENTRY
/*
@@ -1140,10 +1096,10 @@ SYM_CODE_START(iret_exc)
popl %eax
#endif
- jmp common_exception
-SYM_CODE_END(iret_exc)
+ jmp handle_exception
+SYM_CODE_END(asm_iret_error)
.previous
- _ASM_EXTABLE(.Lirq_return, iret_exc)
+ _ASM_EXTABLE(.Lirq_return, asm_iret_error)
SYM_FUNC_END(entry_INT80_32)
.macro FIXUP_ESPFIX_STACK
@@ -1193,319 +1149,7 @@ SYM_FUNC_END(entry_INT80_32)
#endif
.endm
-/*
- * Build the entry stubs with some assembler magic.
- * We pack 1 stub into every 8-byte block.
- */
- .align 8
-SYM_CODE_START(irq_entries_start)
- vector=FIRST_EXTERNAL_VECTOR
- .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
- pushl $(~vector+0x80) /* Note: always in signed byte range */
- vector=vector+1
- jmp common_interrupt
- .align 8
- .endr
-SYM_CODE_END(irq_entries_start)
-
-#ifdef CONFIG_X86_LOCAL_APIC
- .align 8
-SYM_CODE_START(spurious_entries_start)
- vector=FIRST_SYSTEM_VECTOR
- .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
- pushl $(~vector+0x80) /* Note: always in signed byte range */
- vector=vector+1
- jmp common_spurious
- .align 8
- .endr
-SYM_CODE_END(spurious_entries_start)
-
-SYM_CODE_START_LOCAL(common_spurious)
- ASM_CLAC
- addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */
- SAVE_ALL switch_stacks=1
- ENCODE_FRAME_POINTER
- TRACE_IRQS_OFF
- movl %esp, %eax
- call smp_spurious_interrupt
- jmp ret_from_intr
-SYM_CODE_END(common_spurious)
-#endif
-
-/*
- * the CPU automatically disables interrupts when executing an IRQ vector,
- * so IRQ-flags tracing has to follow that:
- */
- .p2align CONFIG_X86_L1_CACHE_SHIFT
-SYM_CODE_START_LOCAL(common_interrupt)
- ASM_CLAC
- addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */
-
- SAVE_ALL switch_stacks=1
- ENCODE_FRAME_POINTER
- TRACE_IRQS_OFF
- movl %esp, %eax
- call do_IRQ
- jmp ret_from_intr
-SYM_CODE_END(common_interrupt)
-
-#define BUILD_INTERRUPT3(name, nr, fn) \
-SYM_FUNC_START(name) \
- ASM_CLAC; \
- pushl $~(nr); \
- SAVE_ALL switch_stacks=1; \
- ENCODE_FRAME_POINTER; \
- TRACE_IRQS_OFF \
- movl %esp, %eax; \
- call fn; \
- jmp ret_from_intr; \
-SYM_FUNC_END(name)
-
-#define BUILD_INTERRUPT(name, nr) \
- BUILD_INTERRUPT3(name, nr, smp_##name); \
-
-/* The include is where all of the SMP etc. interrupts come from */
-#include <asm/entry_arch.h>
-
-SYM_CODE_START(coprocessor_error)
- ASM_CLAC
- pushl $0
- pushl $do_coprocessor_error
- jmp common_exception
-SYM_CODE_END(coprocessor_error)
-
-SYM_CODE_START(simd_coprocessor_error)
- ASM_CLAC
- pushl $0
-#ifdef CONFIG_X86_INVD_BUG
- /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
- ALTERNATIVE "pushl $do_general_protection", \
- "pushl $do_simd_coprocessor_error", \
- X86_FEATURE_XMM
-#else
- pushl $do_simd_coprocessor_error
-#endif
- jmp common_exception
-SYM_CODE_END(simd_coprocessor_error)
-
-SYM_CODE_START(device_not_available)
- ASM_CLAC
- pushl $0
- pushl $do_device_not_available
- jmp common_exception
-SYM_CODE_END(device_not_available)
-
-#ifdef CONFIG_PARAVIRT
-SYM_CODE_START(native_iret)
- iret
- _ASM_EXTABLE(native_iret, iret_exc)
-SYM_CODE_END(native_iret)
-#endif
-
-SYM_CODE_START(overflow)
- ASM_CLAC
- pushl $0
- pushl $do_overflow
- jmp common_exception
-SYM_CODE_END(overflow)
-
-SYM_CODE_START(bounds)
- ASM_CLAC
- pushl $0
- pushl $do_bounds
- jmp common_exception
-SYM_CODE_END(bounds)
-
-SYM_CODE_START(invalid_op)
- ASM_CLAC
- pushl $0
- pushl $do_invalid_op
- jmp common_exception
-SYM_CODE_END(invalid_op)
-
-SYM_CODE_START(coprocessor_segment_overrun)
- ASM_CLAC
- pushl $0
- pushl $do_coprocessor_segment_overrun
- jmp common_exception
-SYM_CODE_END(coprocessor_segment_overrun)
-
-SYM_CODE_START(invalid_TSS)
- ASM_CLAC
- pushl $do_invalid_TSS
- jmp common_exception
-SYM_CODE_END(invalid_TSS)
-
-SYM_CODE_START(segment_not_present)
- ASM_CLAC
- pushl $do_segment_not_present
- jmp common_exception
-SYM_CODE_END(segment_not_present)
-
-SYM_CODE_START(stack_segment)
- ASM_CLAC
- pushl $do_stack_segment
- jmp common_exception
-SYM_CODE_END(stack_segment)
-
-SYM_CODE_START(alignment_check)
- ASM_CLAC
- pushl $do_alignment_check
- jmp common_exception
-SYM_CODE_END(alignment_check)
-
-SYM_CODE_START(divide_error)
- ASM_CLAC
- pushl $0 # no error code
- pushl $do_divide_error
- jmp common_exception
-SYM_CODE_END(divide_error)
-
-#ifdef CONFIG_X86_MCE
-SYM_CODE_START(machine_check)
- ASM_CLAC
- pushl $0
- pushl $do_mce
- jmp common_exception
-SYM_CODE_END(machine_check)
-#endif
-
-SYM_CODE_START(spurious_interrupt_bug)
- ASM_CLAC
- pushl $0
- pushl $do_spurious_interrupt_bug
- jmp common_exception
-SYM_CODE_END(spurious_interrupt_bug)
-
-#ifdef CONFIG_XEN_PV
-SYM_FUNC_START(xen_hypervisor_callback)
- /*
- * Check to see if we got the event in the critical
- * region in xen_iret_direct, after we've reenabled
- * events and checked for pending events. This simulates
- * iret instruction's behaviour where it delivers a
- * pending interrupt when enabling interrupts:
- */
- cmpl $xen_iret_start_crit, (%esp)
- jb 1f
- cmpl $xen_iret_end_crit, (%esp)
- jae 1f
- call xen_iret_crit_fixup
-1:
- pushl $-1 /* orig_ax = -1 => not a system call */
- SAVE_ALL
- ENCODE_FRAME_POINTER
- TRACE_IRQS_OFF
- mov %esp, %eax
- call xen_evtchn_do_upcall
-#ifndef CONFIG_PREEMPTION
- call xen_maybe_preempt_hcall
-#endif
- jmp ret_from_intr
-SYM_FUNC_END(xen_hypervisor_callback)
-
-/*
- * Hypervisor uses this for application faults while it executes.
- * We get here for two reasons:
- * 1. Fault while reloading DS, ES, FS or GS
- * 2. Fault while executing IRET
- * Category 1 we fix up by reattempting the load, and zeroing the segment
- * register if the load fails.
- * Category 2 we fix up by jumping to do_iret_error. We cannot use the
- * normal Linux return path in this case because if we use the IRET hypercall
- * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
- * We distinguish between categories by maintaining a status value in EAX.
- */
-SYM_FUNC_START(xen_failsafe_callback)
- pushl %eax
- movl $1, %eax
-1: mov 4(%esp), %ds
-2: mov 8(%esp), %es
-3: mov 12(%esp), %fs
-4: mov 16(%esp), %gs
- /* EAX == 0 => Category 1 (Bad segment)
- EAX != 0 => Category 2 (Bad IRET) */
- testl %eax, %eax
- popl %eax
- lea 16(%esp), %esp
- jz 5f
- jmp iret_exc
-5: pushl $-1 /* orig_ax = -1 => not a system call */
- SAVE_ALL
- ENCODE_FRAME_POINTER
- jmp ret_from_exception
-
-.section .fixup, "ax"
-6: xorl %eax, %eax
- movl %eax, 4(%esp)
- jmp 1b
-7: xorl %eax, %eax
- movl %eax, 8(%esp)
- jmp 2b
-8: xorl %eax, %eax
- movl %eax, 12(%esp)
- jmp 3b
-9: xorl %eax, %eax
- movl %eax, 16(%esp)
- jmp 4b
-.previous
- _ASM_EXTABLE(1b, 6b)
- _ASM_EXTABLE(2b, 7b)
- _ASM_EXTABLE(3b, 8b)
- _ASM_EXTABLE(4b, 9b)
-SYM_FUNC_END(xen_failsafe_callback)
-#endif /* CONFIG_XEN_PV */
-
-#ifdef CONFIG_XEN_PVHVM
-BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
- xen_evtchn_do_upcall)
-#endif
-
-
-#if IS_ENABLED(CONFIG_HYPERV)
-
-BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
- hyperv_vector_handler)
-
-BUILD_INTERRUPT3(hyperv_reenlightenment_vector, HYPERV_REENLIGHTENMENT_VECTOR,
- hyperv_reenlightenment_intr)
-
-BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR,
- hv_stimer0_vector_handler)
-
-#endif /* CONFIG_HYPERV */
-
-SYM_CODE_START(page_fault)
- ASM_CLAC
- pushl $do_page_fault
- jmp common_exception_read_cr2
-SYM_CODE_END(page_fault)
-
-SYM_CODE_START_LOCAL_NOALIGN(common_exception_read_cr2)
- /* the function address is in %gs's slot on the stack */
- SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1
-
- ENCODE_FRAME_POINTER
-
- /* fixup %gs */
- GS_TO_REG %ecx
- movl PT_GS(%esp), %edi
- REG_TO_PTGS %ecx
- SET_KERNEL_GS %ecx
-
- GET_CR2_INTO(%ecx) # might clobber %eax
-
- /* fixup orig %eax */
- movl PT_ORIG_EAX(%esp), %edx # get the error code
- movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
-
- TRACE_IRQS_OFF
- movl %esp, %eax # pt_regs pointer
- CALL_NOSPEC %edi
- jmp ret_from_exception
-SYM_CODE_END(common_exception_read_cr2)
-
-SYM_CODE_START_LOCAL_NOALIGN(common_exception)
+SYM_CODE_START_LOCAL_NOALIGN(handle_exception)
/* the function address is in %gs's slot on the stack */
SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1
ENCODE_FRAME_POINTER
@@ -1520,24 +1164,35 @@ SYM_CODE_START_LOCAL_NOALIGN(common_exception)
movl PT_ORIG_EAX(%esp), %edx # get the error code
movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
- TRACE_IRQS_OFF
movl %esp, %eax # pt_regs pointer
- CALL_NOSPEC %edi
- jmp ret_from_exception
-SYM_CODE_END(common_exception)
+ CALL_NOSPEC edi
-SYM_CODE_START(debug)
+handle_exception_return:
+#ifdef CONFIG_VM86
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
+ movb PT_CS(%esp), %al
+ andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
+#else
/*
- * Entry from sysenter is now handled in common_exception
+ * We can be coming here from child spawned by kernel_thread().
*/
- ASM_CLAC
- pushl $0
- pushl $do_debug
- jmp common_exception
-SYM_CODE_END(debug)
+ movl PT_CS(%esp), %eax
+ andl $SEGMENT_RPL_MASK, %eax
+#endif
+ cmpl $USER_RPL, %eax # returning to v8086 or userspace ?
+ jnb ret_to_user
+
+ PARANOID_EXIT_TO_KERNEL_MODE
+ BUG_IF_WRONG_CR3
+ RESTORE_REGS 4
+ jmp .Lirq_return
+
+ret_to_user:
+ movl %esp, %eax
+ jmp restore_all_switch_stack
+SYM_CODE_END(handle_exception)
-#ifdef CONFIG_DOUBLEFAULT
-SYM_CODE_START(double_fault)
+SYM_CODE_START(asm_exc_double_fault)
1:
/*
* This is a task gate handler, not an interrupt gate handler.
@@ -1575,8 +1230,7 @@ SYM_CODE_START(double_fault)
1:
hlt
jmp 1b
-SYM_CODE_END(double_fault)
-#endif
+SYM_CODE_END(asm_exc_double_fault)
/*
* NMI is doubly nasty. It can happen on the first instruction of
@@ -1585,7 +1239,7 @@ SYM_CODE_END(double_fault)
* switched stacks. We handle both conditions by simply checking whether we
* interrupted kernel code running on the SYSENTER stack.
*/
-SYM_CODE_START(nmi)
+SYM_CODE_START(asm_exc_nmi)
ASM_CLAC
#ifdef CONFIG_X86_ESPFIX32
@@ -1614,7 +1268,7 @@ SYM_CODE_START(nmi)
jb .Lnmi_from_sysenter_stack
/* Not on SYSENTER stack. */
- call do_nmi
+ call exc_nmi
jmp .Lnmi_return
.Lnmi_from_sysenter_stack:
@@ -1624,7 +1278,7 @@ SYM_CODE_START(nmi)
*/
movl %esp, %ebx
movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
- call do_nmi
+ call exc_nmi
movl %ebx, %esp
.Lnmi_return:
@@ -1678,29 +1332,9 @@ SYM_CODE_START(nmi)
lss (1+5+6)*4(%esp), %esp # back to espfix stack
jmp .Lirq_return
#endif
-SYM_CODE_END(nmi)
-
-SYM_CODE_START(int3)
- ASM_CLAC
- pushl $0
- pushl $do_int3
- jmp common_exception
-SYM_CODE_END(int3)
-
-SYM_CODE_START(general_protection)
- ASM_CLAC
- pushl $do_general_protection
- jmp common_exception
-SYM_CODE_END(general_protection)
-
-#ifdef CONFIG_KVM_GUEST
-SYM_CODE_START(async_page_fault)
- ASM_CLAC
- pushl $do_async_page_fault
- jmp common_exception_read_cr2
-SYM_CODE_END(async_page_fault)
-#endif
+SYM_CODE_END(asm_exc_nmi)
+.pushsection .text, "ax"
SYM_CODE_START(rewind_stack_do_exit)
/* Prevent any naive code from trying to unwind to our caller. */
xorl %ebp, %ebp
@@ -1711,3 +1345,4 @@ SYM_CODE_START(rewind_stack_do_exit)
call do_exit
1: jmp 1b
SYM_CODE_END(rewind_stack_do_exit)
+.popsection
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 3063aa9090f9..d977079a7d02 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -16,7 +16,6 @@
*
* Some macro usage:
* - SYM_FUNC_START/END:Define functions in the symbol table.
- * - TRACE_IRQ_*: Trace hardirq state for lock debugging.
* - idtentry: Define exception entry points.
*/
#include <linux/linkage.h>
@@ -37,7 +36,9 @@
#include <asm/pgtable_types.h>
#include <asm/export.h>
#include <asm/frame.h>
+#include <asm/trapnr.h>
#include <asm/nospec-branch.h>
+#include <asm/fsgsbase.h>
#include <linux/err.h>
#include "calling.h"
@@ -53,57 +54,6 @@ SYM_CODE_START(native_usergs_sysret64)
SYM_CODE_END(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT */
-.macro TRACE_IRQS_FLAGS flags:req
-#ifdef CONFIG_TRACE_IRQFLAGS
- btl $9, \flags /* interrupts off? */
- jnc 1f
- TRACE_IRQS_ON
-1:
-#endif
-.endm
-
-.macro TRACE_IRQS_IRETQ
- TRACE_IRQS_FLAGS EFLAGS(%rsp)
-.endm
-
-/*
- * When dynamic function tracer is enabled it will add a breakpoint
- * to all locations that it is about to modify, sync CPUs, update
- * all the code, sync CPUs, then remove the breakpoints. In this time
- * if lockdep is enabled, it might jump back into the debug handler
- * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF).
- *
- * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
- * make sure the stack pointer does not get reset back to the top
- * of the debug stack, and instead just reuses the current stack.
- */
-#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
-
-.macro TRACE_IRQS_OFF_DEBUG
- call debug_stack_set_zero
- TRACE_IRQS_OFF
- call debug_stack_reset
-.endm
-
-.macro TRACE_IRQS_ON_DEBUG
- call debug_stack_set_zero
- TRACE_IRQS_ON
- call debug_stack_reset
-.endm
-
-.macro TRACE_IRQS_IRETQ_DEBUG
- btl $9, EFLAGS(%rsp) /* interrupts off? */
- jnc 1f
- TRACE_IRQS_ON_DEBUG
-1:
-.endm
-
-#else
-# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF
-# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON
-# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ
-#endif
-
/*
* 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
*
@@ -144,11 +94,6 @@ SYM_CODE_END(native_usergs_sysret64)
SYM_CODE_START(entry_SYSCALL_64)
UNWIND_HINT_EMPTY
- /*
- * Interrupts are off on entry.
- * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
- * it is too small to ever cause noticeable irq latency.
- */
swapgs
/* tss.sp2 is scratch space. */
@@ -167,15 +112,11 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
PUSH_AND_CLEAR_REGS rax=$-ENOSYS
- TRACE_IRQS_OFF
-
/* IRQs are off. */
movq %rax, %rdi
movq %rsp, %rsi
call do_syscall_64 /* returns with IRQs disabled */
- TRACE_IRQS_ON /* return enables interrupts */
-
/*
* Try to use SYSRET instead of IRET if we're returning to
* a completely clean 64-bit userspace context. If we're not,
@@ -279,6 +220,7 @@ SYM_CODE_END(entry_SYSCALL_64)
* %rdi: prev task
* %rsi: next task
*/
+.pushsection .text, "ax"
SYM_FUNC_START(__switch_to_asm)
/*
* Save callee-saved registers
@@ -321,6 +263,7 @@ SYM_FUNC_START(__switch_to_asm)
jmp __switch_to
SYM_FUNC_END(__switch_to_asm)
+.popsection
/*
* A newly forked process directly context switches into this address.
@@ -329,6 +272,7 @@ SYM_FUNC_END(__switch_to_asm)
* rbx: kernel thread func (NULL for user thread)
* r12: kernel thread arg
*/
+.pushsection .text, "ax"
SYM_CODE_START(ret_from_fork)
UNWIND_HINT_EMPTY
movq %rax, %rdi
@@ -340,51 +284,23 @@ SYM_CODE_START(ret_from_fork)
2:
UNWIND_HINT_REGS
movq %rsp, %rdi
- call syscall_return_slowpath /* returns with IRQs disabled */
- TRACE_IRQS_ON /* user mode is traced as IRQS on */
+ call syscall_exit_to_user_mode /* returns with IRQs disabled */
jmp swapgs_restore_regs_and_return_to_usermode
1:
/* kernel thread */
UNWIND_HINT_EMPTY
movq %r12, %rdi
- CALL_NOSPEC %rbx
+ CALL_NOSPEC rbx
/*
* A kernel thread is allowed to return here after successfully
- * calling do_execve(). Exit to userspace to complete the execve()
+ * calling kernel_execve(). Exit to userspace to complete the execve()
* syscall.
*/
movq $0, RAX(%rsp)
jmp 2b
SYM_CODE_END(ret_from_fork)
-
-/*
- * Build the entry stubs with some assembler magic.
- * We pack 1 stub into every 8-byte block.
- */
- .align 8
-SYM_CODE_START(irq_entries_start)
- vector=FIRST_EXTERNAL_VECTOR
- .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
- UNWIND_HINT_IRET_REGS
- pushq $(~vector+0x80) /* Note: always in signed byte range */
- jmp common_interrupt
- .align 8
- vector=vector+1
- .endr
-SYM_CODE_END(irq_entries_start)
-
- .align 8
-SYM_CODE_START(spurious_entries_start)
- vector=FIRST_SYSTEM_VECTOR
- .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
- UNWIND_HINT_IRET_REGS
- pushq $(~vector+0x80) /* Note: always in signed byte range */
- jmp common_spurious
- .align 8
- vector=vector+1
- .endr
-SYM_CODE_END(spurious_entries_start)
+.popsection
.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
#ifdef CONFIG_DEBUG_ENTRY
@@ -398,228 +314,179 @@ SYM_CODE_END(spurious_entries_start)
#endif
.endm
-/*
- * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers
- * flags and puts old RSP into old_rsp, and leaves all other GPRs alone.
- * Requires kernel GSBASE.
- *
- * The invariant is that, if irq_count != -1, then the IRQ stack is in use.
+/**
+ * idtentry_body - Macro to emit code calling the C function
+ * @cfunc: C function to be called
+ * @has_error_code: Hardware pushed error code on stack
*/
-.macro ENTER_IRQ_STACK regs=1 old_rsp save_ret=0
- DEBUG_ENTRY_ASSERT_IRQS_OFF
-
- .if \save_ret
- /*
- * If save_ret is set, the original stack contains one additional
- * entry -- the return address. Therefore, move the address one
- * entry below %rsp to \old_rsp.
- */
- leaq 8(%rsp), \old_rsp
- .else
- movq %rsp, \old_rsp
- .endif
+.macro idtentry_body cfunc has_error_code:req
- .if \regs
- UNWIND_HINT_REGS base=\old_rsp
- .endif
+ call error_entry
+ UNWIND_HINT_REGS
- incl PER_CPU_VAR(irq_count)
- jnz .Lirq_stack_push_old_rsp_\@
+ movq %rsp, %rdi /* pt_regs pointer into 1st argument*/
- /*
- * Right now, if we just incremented irq_count to zero, we've
- * claimed the IRQ stack but we haven't switched to it yet.
- *
- * If anything is added that can interrupt us here without using IST,
- * it must be *extremely* careful to limit its stack usage. This
- * could include kprobes and a hypothetical future IST-less #DB
- * handler.
- *
- * The OOPS unwinder relies on the word at the top of the IRQ
- * stack linking back to the previous RSP for the entire time we're
- * on the IRQ stack. For this to work reliably, we need to write
- * it before we actually move ourselves to the IRQ stack.
- */
+ .if \has_error_code == 1
+ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
+ .endif
- movq \old_rsp, PER_CPU_VAR(irq_stack_backing_store + IRQ_STACK_SIZE - 8)
- movq PER_CPU_VAR(hardirq_stack_ptr), %rsp
+ call \cfunc
-#ifdef CONFIG_DEBUG_ENTRY
- /*
- * If the first movq above becomes wrong due to IRQ stack layout
- * changes, the only way we'll notice is if we try to unwind right
- * here. Assert that we set up the stack right to catch this type
- * of bug quickly.
- */
- cmpq -8(%rsp), \old_rsp
- je .Lirq_stack_okay\@
- ud2
- .Lirq_stack_okay\@:
-#endif
+ jmp error_return
+.endm
-.Lirq_stack_push_old_rsp_\@:
- pushq \old_rsp
+/**
+ * idtentry - Macro to generate entry stubs for simple IDT entries
+ * @vector: Vector number
+ * @asmsym: ASM symbol for the entry point
+ * @cfunc: C function to be called
+ * @has_error_code: Hardware pushed error code on stack
+ *
+ * The macro emits code to set up the kernel context for straight forward
+ * and simple IDT entries. No IST stack, no paranoid entry checks.
+ */
+.macro idtentry vector asmsym cfunc has_error_code:req
+SYM_CODE_START(\asmsym)
+ UNWIND_HINT_IRET_REGS offset=\has_error_code*8
+ ASM_CLAC
- .if \regs
- UNWIND_HINT_REGS indirect=1
+ .if \has_error_code == 0
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
.endif
- .if \save_ret
- /*
- * Push the return address to the stack. This return address can
- * be found at the "real" original RSP, which was offset by 8 at
- * the beginning of this macro.
- */
- pushq -8(\old_rsp)
+ .if \vector == X86_TRAP_BP
+ /*
+ * If coming from kernel space, create a 6-word gap to allow the
+ * int3 handler to emulate a call instruction.
+ */
+ testb $3, CS-ORIG_RAX(%rsp)
+ jnz .Lfrom_usermode_no_gap_\@
+ .rept 6
+ pushq 5*8(%rsp)
+ .endr
+ UNWIND_HINT_IRET_REGS offset=8
+.Lfrom_usermode_no_gap_\@:
.endif
+
+ idtentry_body \cfunc \has_error_code
+
+_ASM_NOKPROBE(\asmsym)
+SYM_CODE_END(\asmsym)
.endm
/*
- * Undoes ENTER_IRQ_STACK.
+ * Interrupt entry/exit.
+ *
+ * The interrupt stubs push (vector) onto the stack, which is the error_code
+ * position of idtentry exceptions, and jump to one of the two idtentry points
+ * (common/spurious).
+ *
+ * common_interrupt is a hotpath, align it to a cache line
*/
-.macro LEAVE_IRQ_STACK regs=1
- DEBUG_ENTRY_ASSERT_IRQS_OFF
- /* We need to be off the IRQ stack before decrementing irq_count. */
- popq %rsp
-
- .if \regs
- UNWIND_HINT_REGS
- .endif
-
- /*
- * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming
- * the irq stack but we're not on it.
- */
-
- decl PER_CPU_VAR(irq_count)
+.macro idtentry_irq vector cfunc
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
+ idtentry \vector asm_\cfunc \cfunc has_error_code=1
.endm
/*
- * Interrupt entry helper function.
+ * System vectors which invoke their handlers directly and are not
+ * going through the regular common device interrupt handling code.
+ */
+.macro idtentry_sysvec vector cfunc
+ idtentry \vector asm_\cfunc \cfunc has_error_code=0
+.endm
+
+/**
+ * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB
+ * @vector: Vector number
+ * @asmsym: ASM symbol for the entry point
+ * @cfunc: C function to be called
+ *
+ * The macro emits code to set up the kernel context for #MC and #DB
*
- * Entry runs with interrupts off. Stack layout at entry:
- * +----------------------------------------------------+
- * | regs->ss |
- * | regs->rsp |
- * | regs->eflags |
- * | regs->cs |
- * | regs->ip |
- * +----------------------------------------------------+
- * | regs->orig_ax = ~(interrupt number) |
- * +----------------------------------------------------+
- * | return address |
- * +----------------------------------------------------+
+ * If the entry comes from user space it uses the normal entry path
+ * including the return to user space work and preemption checks on
+ * exit.
+ *
+ * If it hits in kernel mode then it needs to go through the paranoid
+ * entry as the exception can hit any random state. No preemption
+ * check on exit to keep the paranoid path simple.
*/
-SYM_CODE_START(interrupt_entry)
- UNWIND_HINT_IRET_REGS offset=16
+.macro idtentry_mce_db vector asmsym cfunc
+SYM_CODE_START(\asmsym)
+ UNWIND_HINT_IRET_REGS
ASM_CLAC
- cld
- testb $3, CS-ORIG_RAX+8(%rsp)
- jz 1f
- SWAPGS
- FENCE_SWAPGS_USER_ENTRY
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
+
/*
- * Switch to the thread stack. The IRET frame and orig_ax are
- * on the stack, as well as the return address. RDI..R12 are
- * not (yet) on the stack and space has not (yet) been
- * allocated for them.
+ * If the entry is from userspace, switch stacks and treat it as
+ * a normal entry.
*/
- pushq %rdi
+ testb $3, CS-ORIG_RAX(%rsp)
+ jnz .Lfrom_usermode_switch_stack_\@
- /* Need to switch before accessing the thread stack. */
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
- movq %rsp, %rdi
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ /* paranoid_entry returns GS information for paranoid_exit in EBX. */
+ call paranoid_entry
- /*
- * We have RDI, return address, and orig_ax on the stack on
- * top of the IRET frame. That means offset=24
- */
- UNWIND_HINT_IRET_REGS base=%rdi offset=24
-
- pushq 7*8(%rdi) /* regs->ss */
- pushq 6*8(%rdi) /* regs->rsp */
- pushq 5*8(%rdi) /* regs->eflags */
- pushq 4*8(%rdi) /* regs->cs */
- pushq 3*8(%rdi) /* regs->ip */
- UNWIND_HINT_IRET_REGS
- pushq 2*8(%rdi) /* regs->orig_ax */
- pushq 8(%rdi) /* return address */
+ UNWIND_HINT_REGS
- movq (%rdi), %rdi
- jmp 2f
-1:
- FENCE_SWAPGS_KERNEL_ENTRY
-2:
- PUSH_AND_CLEAR_REGS save_ret=1
- ENCODE_FRAME_POINTER 8
+ movq %rsp, %rdi /* pt_regs pointer */
- testb $3, CS+8(%rsp)
- jz 1f
+ call \cfunc
- /*
- * IRQ from user mode.
- *
- * We need to tell lockdep that IRQs are off. We can't do this until
- * we fix gsbase, and we should do it before enter_from_user_mode
- * (which can take locks). Since TRACE_IRQS_OFF is idempotent,
- * the simplest way to handle it is to just call it twice if
- * we enter from user mode. There's no reason to optimize this since
- * TRACE_IRQS_OFF is a no-op if lockdep is off.
- */
- TRACE_IRQS_OFF
+ jmp paranoid_exit
- CALL_enter_from_user_mode
+ /* Switch to the regular task stack and use the noist entry point */
+.Lfrom_usermode_switch_stack_\@:
+ idtentry_body noist_\cfunc, has_error_code=0
-1:
- ENTER_IRQ_STACK old_rsp=%rdi save_ret=1
- /* We entered an interrupt context - irqs are off: */
- TRACE_IRQS_OFF
+_ASM_NOKPROBE(\asmsym)
+SYM_CODE_END(\asmsym)
+.endm
- ret
-SYM_CODE_END(interrupt_entry)
-_ASM_NOKPROBE(interrupt_entry)
+/*
+ * Double fault entry. Straight paranoid. No checks from which context
+ * this comes because for the espfix induced #DF this would do the wrong
+ * thing.
+ */
+.macro idtentry_df vector asmsym cfunc
+SYM_CODE_START(\asmsym)
+ UNWIND_HINT_IRET_REGS offset=8
+ ASM_CLAC
+ /* paranoid_entry returns GS information for paranoid_exit in EBX. */
+ call paranoid_entry
+ UNWIND_HINT_REGS
-/* Interrupt entry/exit. */
+ movq %rsp, %rdi /* pt_regs pointer into first argument */
+ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
+ call \cfunc
+
+ jmp paranoid_exit
+
+_ASM_NOKPROBE(\asmsym)
+SYM_CODE_END(\asmsym)
+.endm
/*
- * The interrupt stubs push (~vector+0x80) onto the stack and
- * then jump to common_spurious/interrupt.
+ * Include the defines which emit the idt entries which are shared
+ * between 32 and 64 bit and emit the __irqentry_text_* markers
+ * so the stacktrace boundary checks work.
*/
-SYM_CODE_START_LOCAL(common_spurious)
- addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */
- call interrupt_entry
- UNWIND_HINT_REGS indirect=1
- call smp_spurious_interrupt /* rdi points to pt_regs */
- jmp ret_from_intr
-SYM_CODE_END(common_spurious)
-_ASM_NOKPROBE(common_spurious)
-
-/* common_interrupt is a hotpath. Align it */
- .p2align CONFIG_X86_L1_CACHE_SHIFT
-SYM_CODE_START_LOCAL(common_interrupt)
- addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */
- call interrupt_entry
- UNWIND_HINT_REGS indirect=1
- call do_IRQ /* rdi points to pt_regs */
- /* 0(%rsp): old RSP */
-ret_from_intr:
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
-
- LEAVE_IRQ_STACK
+ .align 16
+ .globl __irqentry_text_start
+__irqentry_text_start:
- testb $3, CS(%rsp)
- jz retint_kernel
+#include <asm/idtentry.h>
- /* Interrupt came from user space */
-.Lretint_user:
- mov %rsp,%rdi
- call prepare_exit_to_usermode
- TRACE_IRQS_ON
+ .align 16
+ .globl __irqentry_text_end
+__irqentry_text_end:
+SYM_CODE_START_LOCAL(common_interrupt_return)
SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
#ifdef CONFIG_DEBUG_ENTRY
/* Assert that pt_regs indicates user mode. */
@@ -662,23 +529,6 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
INTERRUPT_RETURN
-/* Returning to kernel space */
-retint_kernel:
-#ifdef CONFIG_PREEMPTION
- /* Interrupts are off */
- /* Check if we need preemption */
- btl $9, EFLAGS(%rsp) /* were interrupts off? */
- jnc 1f
- cmpl $0, PER_CPU_VAR(__preempt_count)
- jnz 1f
- call preempt_schedule_irq
-1:
-#endif
- /*
- * The iretq could re-enable interrupts:
- */
- TRACE_IRQS_IRETQ
-
SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
#ifdef CONFIG_DEBUG_ENTRY
/* Assert that pt_regs indicates kernel mode. */
@@ -710,7 +560,7 @@ SYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL)
/*
* This may fault. Non-paranoid faults on return to userspace are
* handled by fixup_bad_iret. These include #SS, #GP, and #NP.
- * Double-faults due to espfix64 are handled in do_double_fault.
+ * Double-faults due to espfix64 are handled in exc_double_fault.
* Other faults here are fatal.
*/
iretq
@@ -788,280 +638,32 @@ native_irq_return_ldt:
*/
jmp native_irq_return_iret
#endif
-SYM_CODE_END(common_interrupt)
-_ASM_NOKPROBE(common_interrupt)
+SYM_CODE_END(common_interrupt_return)
+_ASM_NOKPROBE(common_interrupt_return)
/*
- * APIC interrupts.
- */
-.macro apicinterrupt3 num sym do_sym
-SYM_CODE_START(\sym)
- UNWIND_HINT_IRET_REGS
- pushq $~(\num)
-.Lcommon_\sym:
- call interrupt_entry
- UNWIND_HINT_REGS indirect=1
- call \do_sym /* rdi points to pt_regs */
- jmp ret_from_intr
-SYM_CODE_END(\sym)
-_ASM_NOKPROBE(\sym)
-.endm
-
-/* Make sure APIC interrupt handlers end up in the irqentry section: */
-#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax"
-#define POP_SECTION_IRQENTRY .popsection
-
-.macro apicinterrupt num sym do_sym
-PUSH_SECTION_IRQENTRY
-apicinterrupt3 \num \sym \do_sym
-POP_SECTION_IRQENTRY
-.endm
-
-#ifdef CONFIG_SMP
-apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
-apicinterrupt3 REBOOT_VECTOR reboot_interrupt smp_reboot_interrupt
-#endif
-
-#ifdef CONFIG_X86_UV
-apicinterrupt3 UV_BAU_MESSAGE uv_bau_message_intr1 uv_bau_message_interrupt
-#endif
-
-apicinterrupt LOCAL_TIMER_VECTOR apic_timer_interrupt smp_apic_timer_interrupt
-apicinterrupt X86_PLATFORM_IPI_VECTOR x86_platform_ipi smp_x86_platform_ipi
-
-#ifdef CONFIG_HAVE_KVM
-apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
-apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi
-apicinterrupt3 POSTED_INTR_NESTED_VECTOR kvm_posted_intr_nested_ipi smp_kvm_posted_intr_nested_ipi
-#endif
-
-#ifdef CONFIG_X86_MCE_THRESHOLD
-apicinterrupt THRESHOLD_APIC_VECTOR threshold_interrupt smp_threshold_interrupt
-#endif
-
-#ifdef CONFIG_X86_MCE_AMD
-apicinterrupt DEFERRED_ERROR_VECTOR deferred_error_interrupt smp_deferred_error_interrupt
-#endif
-
-#ifdef CONFIG_X86_THERMAL_VECTOR
-apicinterrupt THERMAL_APIC_VECTOR thermal_interrupt smp_thermal_interrupt
-#endif
-
-#ifdef CONFIG_SMP
-apicinterrupt CALL_FUNCTION_SINGLE_VECTOR call_function_single_interrupt smp_call_function_single_interrupt
-apicinterrupt CALL_FUNCTION_VECTOR call_function_interrupt smp_call_function_interrupt
-apicinterrupt RESCHEDULE_VECTOR reschedule_interrupt smp_reschedule_interrupt
-#endif
-
-apicinterrupt ERROR_APIC_VECTOR error_interrupt smp_error_interrupt
-apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt
-
-#ifdef CONFIG_IRQ_WORK
-apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
-#endif
-
-/*
- * Exception entry points.
- */
-#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8)
-
-.macro idtentry_part do_sym, has_error_code:req, read_cr2:req, paranoid:req, shift_ist=-1, ist_offset=0
-
- .if \paranoid
- call paranoid_entry
- /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
- .else
- call error_entry
- .endif
- UNWIND_HINT_REGS
-
- .if \read_cr2
- /*
- * Store CR2 early so subsequent faults cannot clobber it. Use R12 as
- * intermediate storage as RDX can be clobbered in enter_from_user_mode().
- * GET_CR2_INTO can clobber RAX.
- */
- GET_CR2_INTO(%r12);
- .endif
-
- .if \shift_ist != -1
- TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */
- .else
- TRACE_IRQS_OFF
- .endif
-
- .if \paranoid == 0
- testb $3, CS(%rsp)
- jz .Lfrom_kernel_no_context_tracking_\@
- CALL_enter_from_user_mode
-.Lfrom_kernel_no_context_tracking_\@:
- .endif
-
- movq %rsp, %rdi /* pt_regs pointer */
-
- .if \has_error_code
- movq ORIG_RAX(%rsp), %rsi /* get error code */
- movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
- .else
- xorl %esi, %esi /* no error code */
- .endif
-
- .if \shift_ist != -1
- subq $\ist_offset, CPU_TSS_IST(\shift_ist)
- .endif
-
- .if \read_cr2
- movq %r12, %rdx /* Move CR2 into 3rd argument */
- .endif
-
- call \do_sym
-
- .if \shift_ist != -1
- addq $\ist_offset, CPU_TSS_IST(\shift_ist)
- .endif
-
- .if \paranoid
- /* this procedure expect "no swapgs" flag in ebx */
- jmp paranoid_exit
- .else
- jmp error_exit
- .endif
-
-.endm
-
-/**
- * idtentry - Generate an IDT entry stub
- * @sym: Name of the generated entry point
- * @do_sym: C function to be called
- * @has_error_code: True if this IDT vector has an error code on the stack
- * @paranoid: non-zero means that this vector may be invoked from
- * kernel mode with user GSBASE and/or user CR3.
- * 2 is special -- see below.
- * @shift_ist: Set to an IST index if entries from kernel mode should
- * decrement the IST stack so that nested entries get a
- * fresh stack. (This is for #DB, which has a nasty habit
- * of recursing.)
- * @create_gap: create a 6-word stack gap when coming from kernel mode.
- * @read_cr2: load CR2 into the 3rd argument; done before calling any C code
- *
- * idtentry generates an IDT stub that sets up a usable kernel context,
- * creates struct pt_regs, and calls @do_sym. The stub has the following
- * special behaviors:
- *
- * On an entry from user mode, the stub switches from the trampoline or
- * IST stack to the normal thread stack. On an exit to user mode, the
- * normal exit-to-usermode path is invoked.
+ * Reload gs selector with exception handling
+ * edi: new selector
*
- * On an exit to kernel mode, if @paranoid == 0, we check for preemption,
- * whereas we omit the preemption check if @paranoid != 0. This is purely
- * because the implementation is simpler this way. The kernel only needs
- * to check for asynchronous kernel preemption when IRQ handlers return.
- *
- * If @paranoid == 0, then the stub will handle IRET faults by pretending
- * that the fault came from user mode. It will handle gs_change faults by
- * pretending that the fault happened with kernel GSBASE. Since this handling
- * is omitted for @paranoid != 0, the #GP, #SS, and #NP stubs must have
- * @paranoid == 0. This special handling will do the wrong thing for
- * espfix-induced #DF on IRET, so #DF must not use @paranoid == 0.
- *
- * @paranoid == 2 is special: the stub will never switch stacks. This is for
- * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
+ * Is in entry.text as it shouldn't be instrumented.
*/
-.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 read_cr2=0
-SYM_CODE_START(\sym)
- UNWIND_HINT_IRET_REGS offset=\has_error_code*8
-
- /* Sanity check */
- .if \shift_ist != -1 && \paranoid != 1
- .error "using shift_ist requires paranoid=1"
- .endif
-
- .if \create_gap && \paranoid
- .error "using create_gap requires paranoid=0"
- .endif
-
- ASM_CLAC
-
- .if \has_error_code == 0
- pushq $-1 /* ORIG_RAX: no syscall to restart */
- .endif
-
- .if \paranoid == 1
- testb $3, CS-ORIG_RAX(%rsp) /* If coming from userspace, switch stacks */
- jnz .Lfrom_usermode_switch_stack_\@
- .endif
-
- .if \create_gap == 1
- /*
- * If coming from kernel space, create a 6-word gap to allow the
- * int3 handler to emulate a call instruction.
- */
- testb $3, CS-ORIG_RAX(%rsp)
- jnz .Lfrom_usermode_no_gap_\@
- .rept 6
- pushq 5*8(%rsp)
- .endr
- UNWIND_HINT_IRET_REGS offset=8
-.Lfrom_usermode_no_gap_\@:
- .endif
-
- idtentry_part \do_sym, \has_error_code, \read_cr2, \paranoid, \shift_ist, \ist_offset
-
- .if \paranoid == 1
- /*
- * Entry from userspace. Switch stacks and treat it
- * as a normal entry. This means that paranoid handlers
- * run in real process context if user_mode(regs).
- */
-.Lfrom_usermode_switch_stack_\@:
- idtentry_part \do_sym, \has_error_code, \read_cr2, paranoid=0
- .endif
-
-_ASM_NOKPROBE(\sym)
-SYM_CODE_END(\sym)
-.endm
-
-idtentry divide_error do_divide_error has_error_code=0
-idtentry overflow do_overflow has_error_code=0
-idtentry bounds do_bounds has_error_code=0
-idtentry invalid_op do_invalid_op has_error_code=0
-idtentry device_not_available do_device_not_available has_error_code=0
-idtentry double_fault do_double_fault has_error_code=1 paranoid=2 read_cr2=1
-idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
-idtentry invalid_TSS do_invalid_TSS has_error_code=1
-idtentry segment_not_present do_segment_not_present has_error_code=1
-idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
-idtentry coprocessor_error do_coprocessor_error has_error_code=0
-idtentry alignment_check do_alignment_check has_error_code=1
-idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
-
-
- /*
- * Reload gs selector with exception handling
- * edi: new selector
- */
-SYM_FUNC_START(native_load_gs_index)
+SYM_FUNC_START(asm_load_gs_index)
FRAME_BEGIN
- pushfq
- DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
- TRACE_IRQS_OFF
- SWAPGS
+ swapgs
.Lgs_change:
movl %edi, %gs
2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
- SWAPGS
- TRACE_IRQS_FLAGS (%rsp)
- popfq
+ swapgs
FRAME_END
ret
-SYM_FUNC_END(native_load_gs_index)
-EXPORT_SYMBOL(native_load_gs_index)
+SYM_FUNC_END(asm_load_gs_index)
+EXPORT_SYMBOL(asm_load_gs_index)
_ASM_EXTABLE(.Lgs_change, .Lbad_gs)
.section .fixup, "ax"
/* running with kernelgs */
SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs)
- SWAPGS /* switch back to user gs */
+ swapgs /* switch back to user gs */
.macro ZAP_GS
/* This can't be a string because the preprocessor needs to see it. */
movl $__USER_DS, %eax
@@ -1074,20 +676,48 @@ SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs)
SYM_CODE_END(.Lbad_gs)
.previous
-/* Call softirq on interrupt stack. Interrupts are off. */
-SYM_FUNC_START(do_softirq_own_stack)
- pushq %rbp
- mov %rsp, %rbp
- ENTER_IRQ_STACK regs=0 old_rsp=%r11
- call __do_softirq
- LEAVE_IRQ_STACK regs=0
+/*
+ * rdi: New stack pointer, pointing to the top word of the stack
+ * rsi: Function pointer
+ * rdx: Function argument (can be NULL if none)
+ */
+SYM_FUNC_START(asm_call_on_stack)
+SYM_INNER_LABEL(asm_call_sysvec_on_stack, SYM_L_GLOBAL)
+SYM_INNER_LABEL(asm_call_irq_on_stack, SYM_L_GLOBAL)
+ /*
+ * Save the frame pointer unconditionally. This allows the ORC
+ * unwinder to handle the stack switch.
+ */
+ pushq %rbp
+ mov %rsp, %rbp
+
+ /*
+ * The unwinder relies on the word at the top of the new stack
+ * page linking back to the previous RSP.
+ */
+ mov %rsp, (%rdi)
+ mov %rdi, %rsp
+ /* Move the argument to the right place */
+ mov %rdx, %rdi
+
+1:
+ .pushsection .discard.instr_begin
+ .long 1b - .
+ .popsection
+
+ CALL_NOSPEC rsi
+
+2:
+ .pushsection .discard.instr_end
+ .long 2b - .
+ .popsection
+
+ /* Restore the previous stack pointer from RBP. */
leaveq
ret
-SYM_FUNC_END(do_softirq_own_stack)
+SYM_FUNC_END(asm_call_on_stack)
#ifdef CONFIG_XEN_PV
-idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
-
/*
* A note on the "critical region" in our callback handler.
* We want to avoid stacking callback handlers due to events occurring
@@ -1100,9 +730,10 @@ idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
* So, on entry to the handler we detect whether we interrupted an
* existing activation in its critical region -- if so, we pop the current
* activation and restart the handler using the previous one.
+ *
+ * C calling convention: exc_xen_hypervisor_callback(struct pt_regs *)
*/
-/* do_hypervisor_callback(struct *pt_regs) */
-SYM_CODE_START_LOCAL(xen_do_hypervisor_callback)
+SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback)
/*
* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
@@ -1112,15 +743,10 @@ SYM_CODE_START_LOCAL(xen_do_hypervisor_callback)
movq %rdi, %rsp /* we don't return, adjust the stack frame */
UNWIND_HINT_REGS
- ENTER_IRQ_STACK old_rsp=%r10
- call xen_evtchn_do_upcall
- LEAVE_IRQ_STACK
+ call xen_pv_evtchn_do_upcall
-#ifndef CONFIG_PREEMPTION
- call xen_maybe_preempt_hcall
-#endif
- jmp error_exit
-SYM_CODE_END(xen_do_hypervisor_callback)
+ jmp error_return
+SYM_CODE_END(exc_xen_hypervisor_callback)
/*
* Hypervisor uses this for application faults while it executes.
@@ -1155,7 +781,7 @@ SYM_CODE_START(xen_failsafe_callback)
addq $0x30, %rsp
pushq $0 /* RIP */
UNWIND_HINT_IRET_REGS offset=8
- jmp general_protection
+ jmp asm_exc_general_protection
1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
movq (%rsp), %rcx
movq 8(%rsp), %r11
@@ -1164,71 +790,26 @@ SYM_CODE_START(xen_failsafe_callback)
pushq $-1 /* orig_ax = -1 => not a system call */
PUSH_AND_CLEAR_REGS
ENCODE_FRAME_POINTER
- jmp error_exit
+ jmp error_return
SYM_CODE_END(xen_failsafe_callback)
#endif /* CONFIG_XEN_PV */
-#ifdef CONFIG_XEN_PVHVM
-apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
- xen_hvm_callback_vector xen_evtchn_do_upcall
-#endif
-
-
-#if IS_ENABLED(CONFIG_HYPERV)
-apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
- hyperv_callback_vector hyperv_vector_handler
-
-apicinterrupt3 HYPERV_REENLIGHTENMENT_VECTOR \
- hyperv_reenlightenment_vector hyperv_reenlightenment_intr
-
-apicinterrupt3 HYPERV_STIMER0_VECTOR \
- hv_stimer0_callback_vector hv_stimer0_vector_handler
-#endif /* CONFIG_HYPERV */
-
-#if IS_ENABLED(CONFIG_ACRN_GUEST)
-apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
- acrn_hv_callback_vector acrn_hv_vector_handler
-#endif
-
-idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET
-idtentry int3 do_int3 has_error_code=0 create_gap=1
-idtentry stack_segment do_stack_segment has_error_code=1
-
-#ifdef CONFIG_XEN_PV
-idtentry xennmi do_nmi has_error_code=0
-idtentry xendebug do_debug has_error_code=0
-#endif
-
-idtentry general_protection do_general_protection has_error_code=1
-idtentry page_fault do_page_fault has_error_code=1 read_cr2=1
-
-#ifdef CONFIG_KVM_GUEST
-idtentry async_page_fault do_async_page_fault has_error_code=1 read_cr2=1
-#endif
-
-#ifdef CONFIG_X86_MCE
-idtentry machine_check do_mce has_error_code=0 paranoid=1
-#endif
-
/*
- * Save all registers in pt_regs, and switch gs if needed.
- * Use slow, but surefire "are we in kernel?" check.
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ * Save all registers in pt_regs. Return GSBASE related information
+ * in EBX depending on the availability of the FSGSBASE instructions:
+ *
+ * FSGSBASE R/EBX
+ * N 0 -> SWAPGS on exit
+ * 1 -> no SWAPGS on exit
+ *
+ * Y GSBASE value at entry, must be restored in paranoid_exit
*/
SYM_CODE_START_LOCAL(paranoid_entry)
UNWIND_HINT_FUNC
cld
PUSH_AND_CLEAR_REGS save_ret=1
ENCODE_FRAME_POINTER 8
- movl $1, %ebx
- movl $MSR_GS_BASE, %ecx
- rdmsr
- testl %edx, %edx
- js 1f /* negative -> in kernel */
- SWAPGS
- xorl %ebx, %ebx
-1:
/*
* Always stash CR3 in %r14. This value will be restored,
* verbatim, at exit. Needed if paranoid_entry interrupted
@@ -1238,16 +819,60 @@ SYM_CODE_START_LOCAL(paranoid_entry)
* This is also why CS (stashed in the "iret frame" by the
* hardware at entry) can not be used: this may be a return
* to kernel code, but with a user CR3 value.
+ *
+ * Switching CR3 does not depend on kernel GSBASE so it can
+ * be done before switching to the kernel GSBASE. This is
+ * required for FSGSBASE because the kernel GSBASE has to
+ * be retrieved from a kernel internal table.
*/
SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
/*
+ * Handling GSBASE depends on the availability of FSGSBASE.
+ *
+ * Without FSGSBASE the kernel enforces that negative GSBASE
+ * values indicate kernel GSBASE. With FSGSBASE no assumptions
+ * can be made about the GSBASE value when entering from user
+ * space.
+ */
+ ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE
+
+ /*
+ * Read the current GSBASE and store it in %rbx unconditionally,
+ * retrieve and set the current CPU's kernel GSBASE. The stored value
+ * has to be restored in paranoid_exit unconditionally.
+ *
+ * The MSR write ensures that no subsequent load is based on a
+ * mispredicted GSBASE. No extra FENCE required.
+ */
+ SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx
+ ret
+
+.Lparanoid_entry_checkgs:
+ /* EBX = 1 -> kernel GSBASE active, no restore required */
+ movl $1, %ebx
+ /*
+ * The kernel-enforced convention is that a negative GSBASE indicates
+ * a kernel value. No SWAPGS needed on entry and exit.
+ */
+ movl $MSR_GS_BASE, %ecx
+ rdmsr
+ testl %edx, %edx
+ jns .Lparanoid_entry_swapgs
+ ret
+
+.Lparanoid_entry_swapgs:
+ SWAPGS
+
+ /*
* The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
* unconditional CR3 write, even in the PTI case. So do an lfence
* to prevent GS speculation, regardless of whether PTI is enabled.
*/
FENCE_SWAPGS_KERNEL_ENTRY
+ /* EBX = 0 -> SWAPGS required on exit */
+ xorl %ebx, %ebx
ret
SYM_CODE_END(paranoid_entry)
@@ -1258,27 +883,45 @@ SYM_CODE_END(paranoid_entry)
*
* We may be returning to very strange contexts (e.g. very early
* in syscall entry), so checking for preemption here would
- * be complicated. Fortunately, we there's no good reason
- * to try to handle preemption here.
+ * be complicated. Fortunately, there's no good reason to try
+ * to handle preemption here.
+ *
+ * R/EBX contains the GSBASE related information depending on the
+ * availability of the FSGSBASE instructions:
+ *
+ * FSGSBASE R/EBX
+ * N 0 -> SWAPGS on exit
+ * 1 -> no SWAPGS on exit
*
- * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
+ * Y User space GSBASE, must be restored unconditionally
*/
SYM_CODE_START_LOCAL(paranoid_exit)
UNWIND_HINT_REGS
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF_DEBUG
- testl %ebx, %ebx /* swapgs needed? */
- jnz .Lparanoid_exit_no_swapgs
- TRACE_IRQS_IRETQ
- /* Always restore stashed CR3 value (see paranoid_entry) */
- RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
+ /*
+ * The order of operations is important. RESTORE_CR3 requires
+ * kernel GSBASE.
+ *
+ * NB to anyone trying to optimize this code: this code does
+ * not execute at all for exceptions from user mode. Those
+ * exceptions go through error_exit instead.
+ */
+ RESTORE_CR3 scratch_reg=%rax save_reg=%r14
+
+ /* Handle the three GSBASE cases */
+ ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE
+
+ /* With FSGSBASE enabled, unconditionally restore GSBASE */
+ wrgsbase %rbx
+ jmp restore_regs_and_return_to_kernel
+
+.Lparanoid_exit_checkgs:
+ /* On non-FSGSBASE systems, conditionally do SWAPGS */
+ testl %ebx, %ebx
+ jnz restore_regs_and_return_to_kernel
+
+ /* We are returning to a context with user GSBASE */
SWAPGS_UNSAFE_STACK
- jmp restore_regs_and_return_to_kernel
-.Lparanoid_exit_no_swapgs:
- TRACE_IRQS_IRETQ_DEBUG
- /* Always restore stashed CR3 value (see paranoid_entry) */
- RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
- jmp restore_regs_and_return_to_kernel
+ jmp restore_regs_and_return_to_kernel
SYM_CODE_END(paranoid_exit)
/*
@@ -1339,7 +982,6 @@ SYM_CODE_START_LOCAL(error_entry)
*/
SWAPGS
FENCE_SWAPGS_USER_ENTRY
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
jmp .Lerror_entry_done
.Lbstep_iret:
@@ -1366,14 +1008,13 @@ SYM_CODE_START_LOCAL(error_entry)
jmp .Lerror_entry_from_usermode_after_swapgs
SYM_CODE_END(error_entry)
-SYM_CODE_START_LOCAL(error_exit)
+SYM_CODE_START_LOCAL(error_return)
UNWIND_HINT_REGS
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
+ DEBUG_ENTRY_ASSERT_IRQS_OFF
testb $3, CS(%rsp)
- jz retint_kernel
- jmp .Lretint_user
-SYM_CODE_END(error_exit)
+ jz restore_regs_and_return_to_kernel
+ jmp swapgs_restore_regs_and_return_to_usermode
+SYM_CODE_END(error_return)
/*
* Runs on exception stack. Xen PV does not go through this path at all,
@@ -1383,7 +1024,7 @@ SYM_CODE_END(error_exit)
* %r14: Used to save/restore the CR3 of the interrupted context
* when PAGE_TABLE_ISOLATION is in use. Do not clobber.
*/
-SYM_CODE_START(nmi)
+SYM_CODE_START(asm_exc_nmi)
UNWIND_HINT_IRET_REGS
/*
@@ -1468,7 +1109,7 @@ SYM_CODE_START(nmi)
movq %rsp, %rdi
movq $-1, %rsi
- call do_nmi
+ call exc_nmi
/*
* Return back to user mode. We must *not* do the normal exit
@@ -1525,7 +1166,7 @@ SYM_CODE_START(nmi)
* end_repeat_nmi, then we are a nested NMI. We must not
* modify the "iret" frame because it's being written by
* the outer NMI. That's okay; the outer NMI handler is
- * about to about to call do_nmi anyway, so we can just
+ * about to call exc_nmi() anyway, so we can just
* resume the outer NMI.
*/
@@ -1644,7 +1285,7 @@ repeat_nmi:
* RSP is pointing to "outermost RIP". gsbase is unknown, but, if
* we're repeating an NMI, gsbase has the same value that it had on
* the first iteration. paranoid_entry will load the kernel
- * gsbase if needed before we call do_nmi. "NMI executing"
+ * gsbase if needed before we call exc_nmi(). "NMI executing"
* is zero.
*/
movq $1, 10*8(%rsp) /* Set "NMI executing". */
@@ -1678,18 +1319,34 @@ end_repeat_nmi:
call paranoid_entry
UNWIND_HINT_REGS
- /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp, %rdi
movq $-1, %rsi
- call do_nmi
+ call exc_nmi
/* Always restore stashed CR3 value (see paranoid_entry) */
RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
- testl %ebx, %ebx /* swapgs needed? */
+ /*
+ * The above invocation of paranoid_entry stored the GSBASE
+ * related information in R/EBX depending on the availability
+ * of FSGSBASE.
+ *
+ * If FSGSBASE is enabled, restore the saved GSBASE value
+ * unconditionally, otherwise take the conditional SWAPGS path.
+ */
+ ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE
+
+ wrgsbase %rbx
+ jmp nmi_restore
+
+nmi_no_fsgsbase:
+ /* EBX == 0 -> invoke SWAPGS */
+ testl %ebx, %ebx
jnz nmi_restore
+
nmi_swapgs:
SWAPGS_UNSAFE_STACK
+
nmi_restore:
POP_REGS
@@ -1718,7 +1375,7 @@ nmi_restore:
* about espfix64 on the way back to kernel mode.
*/
iretq
-SYM_CODE_END(nmi)
+SYM_CODE_END(asm_exc_nmi)
#ifndef CONFIG_IA32_EMULATION
/*
@@ -1732,6 +1389,7 @@ SYM_CODE_START(ignore_sysret)
SYM_CODE_END(ignore_sysret)
#endif
+.pushsection .text, "ax"
SYM_CODE_START(rewind_stack_do_exit)
UNWIND_HINT_FUNC
/* Prevent any naive code from trying to unwind to our caller. */
@@ -1743,3 +1401,4 @@ SYM_CODE_START(rewind_stack_do_exit)
call do_exit
SYM_CODE_END(rewind_stack_do_exit)
+.popsection
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index f1d3ccae5dd5..541fdaf64045 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -46,38 +46,41 @@
* ebp user stack
* 0(%ebp) arg6
*/
-SYM_FUNC_START(entry_SYSENTER_compat)
+SYM_CODE_START(entry_SYSENTER_compat)
+ UNWIND_HINT_EMPTY
/* Interrupts are off on entry. */
SWAPGS
- /* We are about to clobber %rsp anyway, clobbering here is OK */
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+ pushq %rax
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
+ popq %rax
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
- /*
- * User tracing code (ptrace or signal handlers) might assume that
- * the saved RAX contains a 32-bit number when we're invoking a 32-bit
- * syscall. Just in case the high bits are nonzero, zero-extend
- * the syscall number. (This could almost certainly be deleted
- * with no ill effects.)
- */
- movl %eax, %eax
-
/* Construct struct pt_regs on stack */
pushq $__USER32_DS /* pt_regs->ss */
- pushq %rbp /* pt_regs->sp (stashed in bp) */
+ pushq $0 /* pt_regs->sp = 0 (placeholder) */
/*
* Push flags. This is nasty. First, interrupts are currently
- * off, but we need pt_regs->flags to have IF set. Second, even
- * if TF was set when SYSENTER started, it's clear by now. We fix
- * that later using TIF_SINGLESTEP.
+ * off, but we need pt_regs->flags to have IF set. Second, if TF
+ * was set in usermode, it's still set, and we're singlestepping
+ * through this code. do_SYSENTER_32() will fix up IF.
*/
pushfq /* pt_regs->flags (except IF = 0) */
- orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */
pushq $__USER32_CS /* pt_regs->cs */
pushq $0 /* pt_regs->ip = 0 (placeholder) */
+SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)
+
+ /*
+ * User tracing code (ptrace or signal handlers) might assume that
+ * the saved RAX contains a 32-bit number when we're invoking a 32-bit
+ * syscall. Just in case the high bits are nonzero, zero-extend
+ * the syscall number. (This could almost certainly be deleted
+ * with no ill effects.)
+ */
+ movl %eax, %eax
+
pushq %rax /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
@@ -104,6 +107,9 @@ SYM_FUNC_START(entry_SYSENTER_compat)
xorl %r14d, %r14d /* nospec r14 */
pushq $0 /* pt_regs->r15 = 0 */
xorl %r15d, %r15d /* nospec r15 */
+
+ UNWIND_HINT_REGS
+
cld
/*
@@ -129,17 +135,11 @@ SYM_FUNC_START(entry_SYSENTER_compat)
jnz .Lsysenter_fix_flags
.Lsysenter_flags_fixed:
- /*
- * User mode is traced as though IRQs are on, and SYSENTER
- * turned them off.
- */
- TRACE_IRQS_OFF
-
movq %rsp, %rdi
- call do_fast_syscall_32
+ call do_SYSENTER_32
/* XEN PV guests always use IRET path */
- ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
- "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
+ ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
+ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
jmp sysret32_from_system_call
.Lsysenter_fix_flags:
@@ -147,7 +147,7 @@ SYM_FUNC_START(entry_SYSENTER_compat)
popfq
jmp .Lsysenter_flags_fixed
SYM_INNER_LABEL(__end_entry_SYSENTER_compat, SYM_L_GLOBAL)
-SYM_FUNC_END(entry_SYSENTER_compat)
+SYM_CODE_END(entry_SYSENTER_compat)
/*
* 32-bit SYSCALL entry.
@@ -197,6 +197,7 @@ SYM_FUNC_END(entry_SYSENTER_compat)
* 0(%esp) arg6
*/
SYM_CODE_START(entry_SYSCALL_compat)
+ UNWIND_HINT_EMPTY
/* Interrupts are off on entry. */
swapgs
@@ -247,17 +248,13 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL)
pushq $0 /* pt_regs->r15 = 0 */
xorl %r15d, %r15d /* nospec r15 */
- /*
- * User mode is traced as though IRQs are on, and SYSENTER
- * turned them off.
- */
- TRACE_IRQS_OFF
+ UNWIND_HINT_REGS
movq %rsp, %rdi
call do_fast_syscall_32
/* XEN PV guests always use IRET path */
- ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
- "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
+ ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
+ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
/* Opportunistic SYSRET */
sysret32_from_system_call:
@@ -266,7 +263,7 @@ sysret32_from_system_call:
* stack. So let's erase the thread stack right now.
*/
STACKLEAK_ERASE
- TRACE_IRQS_ON /* User mode traces as IRQs on. */
+
movq RBX(%rsp), %rbx /* pt_regs->rbx */
movq RBP(%rsp), %rbp /* pt_regs->rbp */
movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */
@@ -340,6 +337,7 @@ SYM_CODE_END(entry_SYSCALL_compat)
* ebp arg6
*/
SYM_CODE_START(entry_INT80_compat)
+ UNWIND_HINT_EMPTY
/*
* Interrupts are off on entry.
*/
@@ -361,8 +359,11 @@ SYM_CODE_START(entry_INT80_compat)
/* Need to switch before accessing the thread stack. */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+
/* In the Xen PV case we already run on the thread stack. */
- ALTERNATIVE "movq %rsp, %rdi", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV
+ ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV
+
+ movq %rsp, %rdi
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
pushq 6*8(%rdi) /* regs->ss */
@@ -401,19 +402,12 @@ SYM_CODE_START(entry_INT80_compat)
xorl %r14d, %r14d /* nospec r14 */
pushq %r15 /* pt_regs->r15 */
xorl %r15d, %r15d /* nospec r15 */
- cld
- /*
- * User mode is traced as though IRQs are on, and the interrupt
- * gate turned them off.
- */
- TRACE_IRQS_OFF
+ UNWIND_HINT_REGS
+
+ cld
movq %rsp, %rdi
call do_int80_syscall_32
-.Lsyscall_32_done:
-
- /* Go back to user mode. */
- TRACE_IRQS_ON
jmp swapgs_restore_regs_and_return_to_usermode
SYM_CODE_END(entry_INT80_compat)
diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c
index 3d8d70d3896c..1583831f61a9 100644
--- a/arch/x86/entry/syscall_x32.c
+++ b/arch/x86/entry/syscall_x32.c
@@ -8,6 +8,13 @@
#include <asm/unistd.h>
#include <asm/syscall.h>
+/*
+ * Reuse the 64-bit entry points for the x32 versions that occupy different
+ * slots in the syscall table.
+ */
+#define __x32_sys_getsockopt __x64_sys_getsockopt
+#define __x32_sys_setsockopt __x64_sys_setsockopt
+
#define __SYSCALL_64(nr, sym)
#define __SYSCALL_X32(nr, sym) extern long __x32_##sym(const struct pt_regs *);
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 54581ac671b4..9d1102873666 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -160,7 +160,7 @@
146 i386 writev sys_writev compat_sys_writev
147 i386 getsid sys_getsid
148 i386 fdatasync sys_fdatasync
-149 i386 _sysctl sys_sysctl compat_sys_sysctl
+149 i386 _sysctl sys_ni_syscall
150 i386 mlock sys_mlock
151 i386 munlock sys_munlock
152 i386 mlockall sys_mlockall
@@ -376,8 +376,8 @@
362 i386 connect sys_connect
363 i386 listen sys_listen
364 i386 accept4 sys_accept4
-365 i386 getsockopt sys_getsockopt compat_sys_getsockopt
-366 i386 setsockopt sys_setsockopt compat_sys_setsockopt
+365 i386 getsockopt sys_getsockopt sys_getsockopt
+366 i386 setsockopt sys_setsockopt sys_setsockopt
367 i386 getsockname sys_getsockname
368 i386 getpeername sys_getpeername
369 i386 sendto sys_sendto
@@ -440,5 +440,7 @@
433 i386 fspick sys_fspick
434 i386 pidfd_open sys_pidfd_open
435 i386 clone3 sys_clone3
+436 i386 close_range sys_close_range
437 i386 openat2 sys_openat2
438 i386 pidfd_getfd sys_pidfd_getfd
+439 i386 faccessat2 sys_faccessat2
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 37b844f839bc..f30d6ae9a688 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -164,7 +164,7 @@
153 common vhangup sys_vhangup
154 common modify_ldt sys_modify_ldt
155 common pivot_root sys_pivot_root
-156 64 _sysctl sys_sysctl
+156 64 _sysctl sys_ni_syscall
157 common prctl sys_prctl
158 common arch_prctl sys_arch_prctl
159 common adjtimex sys_adjtimex
@@ -357,8 +357,10 @@
433 common fspick sys_fspick
434 common pidfd_open sys_pidfd_open
435 common clone3 sys_clone3
+436 common close_range sys_close_range
437 common openat2 sys_openat2
438 common pidfd_getfd sys_pidfd_getfd
+439 common faccessat2 sys_faccessat2
#
# x32-specific system call numbers start at 512 to avoid cache impact
@@ -395,8 +397,8 @@
538 x32 sendmmsg compat_sys_sendmmsg
539 x32 process_vm_readv compat_sys_process_vm_readv
540 x32 process_vm_writev compat_sys_process_vm_writev
-541 x32 setsockopt compat_sys_setsockopt
-542 x32 getsockopt compat_sys_getsockopt
+541 x32 setsockopt sys_setsockopt
+542 x32 getsockopt sys_getsockopt
543 x32 io_setup compat_sys_io_setup
544 x32 io_submit compat_sys_io_submit
545 x32 execveat compat_sys_execveat
diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S
index 3a07ce3ec70b..f1f96d4d8cd6 100644
--- a/arch/x86/entry/thunk_32.S
+++ b/arch/x86/entry/thunk_32.S
@@ -29,11 +29,6 @@ SYM_CODE_START_NOALIGN(\name)
SYM_CODE_END(\name)
.endm
-#ifdef CONFIG_TRACE_IRQFLAGS
- THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1
- THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1
-#endif
-
#ifdef CONFIG_PREEMPTION
THUNK preempt_schedule_thunk, preempt_schedule
THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
index dbe4493b534e..ccd32877a3c4 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -3,7 +3,6 @@
* Save registers before calling assembly functions. This avoids
* disturbance of register allocation in some inline assembly constructs.
* Copyright 2001,2002 by Andi Kleen, SuSE Labs.
- * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc.
*/
#include <linux/linkage.h>
#include "calling.h"
@@ -37,15 +36,6 @@ SYM_FUNC_END(\name)
_ASM_NOKPROBE(\name)
.endm
-#ifdef CONFIG_TRACE_IRQFLAGS
- THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1
- THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1
-#endif
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- THUNK lockdep_sys_exit_thunk,lockdep_sys_exit
-#endif
-
#ifdef CONFIG_PREEMPTION
THUNK preempt_schedule_thunk, preempt_schedule
THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace
@@ -53,9 +43,7 @@ SYM_FUNC_END(\name)
EXPORT_SYMBOL(preempt_schedule_notrace_thunk)
#endif
-#if defined(CONFIG_TRACE_IRQFLAGS) \
- || defined(CONFIG_DEBUG_LOCK_ALLOC) \
- || defined(CONFIG_PREEMPTION)
+#ifdef CONFIG_PREEMPTION
SYM_CODE_START_LOCAL_NOALIGN(.L_restore)
popq %r11
popq %r10
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 433a1259f61d..215376d975a2 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -10,8 +10,11 @@ ARCH_REL_TYPE_ABS += R_386_GLOB_DAT|R_386_JMP_SLOT|R_386_RELATIVE
include $(srctree)/lib/vdso/Makefile
KBUILD_CFLAGS += $(DISABLE_LTO)
+
+# Sanitizer runtimes are unavailable and cannot be linked here.
KASAN_SANITIZE := n
UBSAN_SANITIZE := n
+KCSAN_SANITIZE := n
OBJECT_FILES_NON_STANDARD := y
# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
@@ -24,9 +27,14 @@ VDSO32-$(CONFIG_IA32_EMULATION) := y
# files to link into the vdso
vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
+vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
+vobjs32-y += vdso32/vclock_gettime.o
# files to link into kernel
obj-y += vma.o
+KASAN_SANITIZE_vma.o := y
+UBSAN_SANITIZE_vma.o := y
+KCSAN_SANITIZE_vma.o := y
OBJECT_FILES_NON_STANDARD_vma.o := n
# vDSO images to build
@@ -37,10 +45,12 @@ vdso_img-$(VDSO32-y) += 32
obj-$(VDSO32-y) += vdso32-setup.o
vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
+vobjs32 := $(foreach F,$(vobjs32-y),$(obj)/$F)
$(obj)/vdso.o: $(obj)/vdso.so
targets += vdso.lds $(vobjs-y)
+targets += vdso32/vdso32.lds $(vobjs32-y)
# Build the vDSO image C files and link them in.
vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o)
@@ -72,7 +82,7 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE
# optimize sibling calls.
#
CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
- $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \
+ $(filter -g%,$(KBUILD_CFLAGS)) -fno-stack-protector \
-fno-omit-frame-pointer -foptimize-sibling-calls \
-DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
@@ -130,10 +140,6 @@ $(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE
CPPFLAGS_vdso32/vdso32.lds = $(CPPFLAGS_vdso.lds)
VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -soname linux-gate.so.1
-targets += vdso32/vdso32.lds
-targets += vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
-targets += vdso32/vclock_gettime.o
-
KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO
$(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
$(obj)/vdso32.so.dbg: asflags-$(CONFIG_X86_64) += -m32
@@ -145,7 +151,7 @@ KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic
-KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector)
+KBUILD_CFLAGS_32 += -fno-stack-protector
KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
KBUILD_CFLAGS_32 += -fno-omit-frame-pointer
KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING
@@ -158,12 +164,7 @@ endif
$(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
-$(obj)/vdso32.so.dbg: FORCE \
- $(obj)/vdso32/vdso32.lds \
- $(obj)/vdso32/vclock_gettime.o \
- $(obj)/vdso32/note.o \
- $(obj)/vdso32/system_call.o \
- $(obj)/vdso32/sigreturn.o
+$(obj)/vdso32.so.dbg: $(obj)/vdso32/vdso32.lds $(vobjs32) FORCE
$(call if_changed,vdso_and_check)
#
diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
index 3842873b3ae3..7380908045c7 100644
--- a/arch/x86/entry/vdso/vdso2c.c
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -187,7 +187,7 @@ static void map_input(const char *name, void **addr, size_t *len, int prot)
int fd = open(name, O_RDONLY);
if (fd == -1)
- err(1, "%s", name);
+ err(1, "open(%s)", name);
tmp_len = lseek(fd, 0, SEEK_END);
if (tmp_len == (off_t)-1)
@@ -240,7 +240,7 @@ int main(int argc, char **argv)
outfilename = argv[3];
outfile = fopen(outfilename, "w");
if (!outfile)
- err(1, "%s", argv[2]);
+ err(1, "fopen(%s)", outfilename);
go(raw_addr, raw_len, stripped_addr, stripped_len, outfile, name);
diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h
index a20b134de2a8..6f46e11ce539 100644
--- a/arch/x86/entry/vdso/vdso2c.h
+++ b/arch/x86/entry/vdso/vdso2c.h
@@ -13,8 +13,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
unsigned long load_size = -1; /* Work around bogus warning */
unsigned long mapping_size;
ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr;
- int i;
- unsigned long j;
+ unsigned long i, syms_nr;
ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr,
*alt_sec = NULL;
ELF(Dyn) *dyn = 0, *dyn_end = 0;
@@ -86,11 +85,10 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
strtab_hdr = raw_addr + GET_LE(&hdr->e_shoff) +
GET_LE(&hdr->e_shentsize) * GET_LE(&symtab_hdr->sh_link);
+ syms_nr = GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize);
/* Walk the symbol table */
- for (i = 0;
- i < GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize);
- i++) {
- int k;
+ for (i = 0; i < syms_nr; i++) {
+ unsigned int k;
ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) +
GET_LE(&symtab_hdr->sh_entsize) * i;
const char *sym_name = raw_addr +
@@ -150,11 +148,11 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
fprintf(outfile,
"static unsigned char raw_data[%lu] __ro_after_init __aligned(PAGE_SIZE) = {",
mapping_size);
- for (j = 0; j < stripped_len; j++) {
- if (j % 10 == 0)
+ for (i = 0; i < stripped_len; i++) {
+ if (i % 10 == 0)
fprintf(outfile, "\n\t");
fprintf(outfile, "0x%02X, ",
- (int)((unsigned char *)stripped_addr)[j]);
+ (int)((unsigned char *)stripped_addr)[i]);
}
fprintf(outfile, "\n};\n\n");
diff --git a/arch/x86/entry/vdso/vdso32/note.S b/arch/x86/entry/vdso/vdso32/note.S
index e78047d119f6..2cbd39939dc6 100644
--- a/arch/x86/entry/vdso/vdso32/note.S
+++ b/arch/x86/entry/vdso/vdso32/note.S
@@ -16,33 +16,3 @@ ELFNOTE_START(Linux, 0, "a")
ELFNOTE_END
BUILD_SALT
-
-#ifdef CONFIG_XEN
-/*
- * Add a special note telling glibc's dynamic linker a fake hardware
- * flavor that it will use to choose the search path for libraries in the
- * same way it uses real hardware capabilities like "mmx".
- * We supply "nosegneg" as the fake capability, to indicate that we
- * do not like negative offsets in instructions using segment overrides,
- * since we implement those inefficiently. This makes it possible to
- * install libraries optimized to avoid those access patterns in someplace
- * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file
- * corresponding to the bits here is needed to make ldconfig work right.
- * It should contain:
- * hwcap 1 nosegneg
- * to match the mapping of bit to name that we give here.
- *
- * At runtime, the fake hardware feature will be considered to be present
- * if its bit is set in the mask word. So, we start with the mask 0, and
- * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen.
- */
-
-#include "../../xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */
-
-ELFNOTE_START(GNU, 2, "a")
- .long 1 /* ncaps */
-VDSO32_NOTE_MASK: /* Symbol used by arch/x86/xen/setup.c */
- .long 0 /* mask */
- .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */
-ELFNOTE_END
-#endif
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 43428cc514c8..9185cb1d13b9 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -144,8 +144,7 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
struct mm_struct *mm = task->mm;
struct vm_area_struct *vma;
- if (down_write_killable(&mm->mmap_sem))
- return -EINTR;
+ mmap_read_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
unsigned long size = vma->vm_end - vma->vm_start;
@@ -154,7 +153,7 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
zap_page_range(vma, vma->vm_start, size);
}
- up_write(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return 0;
}
#else
@@ -268,7 +267,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr)
unsigned long text_start;
int ret = 0;
- if (down_write_killable(&mm->mmap_sem))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
addr = get_unmapped_area(NULL, addr,
@@ -311,7 +310,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr)
}
up_fail:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return ret;
}
@@ -373,7 +372,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr)
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
/*
* Check if we have already mapped vdso blob - fail to prevent
* abusing from userspace install_speciall_mapping, which may
@@ -384,11 +383,11 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr)
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (vma_is_special_mapping(vma, &vdso_mapping) ||
vma_is_special_mapping(vma, &vvar_mapping)) {
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return -EEXIST;
}
}
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return map_vdso(image, addr);
}