30 files changed, 268 insertions, 175 deletions
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index b35788c909f1..b481b4a7c011 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -83,9 +83,6 @@ endif
 
 ifeq ($(CONFIG_ARM64_MODULE_PLTS),y)
 KBUILD_LDFLAGS_MODULE	+= -T $(srctree)/arch/arm64/kernel/module.lds
-ifeq ($(CONFIG_DYNAMIC_FTRACE),y)
-KBUILD_LDFLAGS_MODULE	+= $(objtree)/arch/arm64/kernel/ftrace-mod.o
-endif
 endif
 
 # Default value
diff --git a/arch/arm64/boot/dts/Makefile b/arch/arm64/boot/dts/Makefile
index d7c22d51bc50..4aa50b9b26bc 100644
--- a/arch/arm64/boot/dts/Makefile
+++ b/arch/arm64/boot/dts/Makefile
@@ -12,6 +12,7 @@ subdir-y += cavium
 subdir-y += exynos
 subdir-y += freescale
 subdir-y += hisilicon
+subdir-y += lg
 subdir-y += marvell
 subdir-y += mediatek
 subdir-y += nvidia
@@ -22,5 +23,4 @@ subdir-y += rockchip
 subdir-y += socionext
 subdir-y += sprd
 subdir-y += xilinx
-subdir-y += lg
 subdir-y += zte
diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi
index ead895a4e9a5..1fb8b9d6cb4e 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-gxbb.dtsi
@@ -753,12 +753,12 @@
 
 &uart_B {
 	clocks = <&xtal>, <&clkc CLKID_UART1>, <&xtal>;
-	clock-names = "xtal", "core", "baud";
+	clock-names = "xtal", "pclk", "baud";
 };
 
 &uart_C {
 	clocks = <&xtal>, <&clkc CLKID_UART2>, <&xtal>;
-	clock-names = "xtal", "core", "baud";
+	clock-names = "xtal", "pclk", "baud";
 };
 
 &vpu {
diff --git a/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi b/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi
index 8ed981f59e5a..6524b89e7115 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-gxl.dtsi
@@ -688,7 +688,7 @@
 
 &uart_A {
 	clocks = <&xtal>, <&clkc CLKID_UART0>, <&xtal>;
-	clock-names = "xtal", "core", "baud";
+	clock-names = "xtal", "pclk", "baud";
 };
 
 &uart_AO {
@@ -703,12 +703,12 @@
 
 &uart_B {
 	clocks = <&xtal>, <&clkc CLKID_UART1>, <&xtal>;
-	clock-names = "xtal", "core", "baud";
+	clock-names = "xtal", "pclk", "baud";
 };
 
 &uart_C {
 	clocks = <&xtal>, <&clkc CLKID_UART2>, <&xtal>;
-	clock-names = "xtal", "core", "baud";
+	clock-names = "xtal", "pclk", "baud";
 };
 
 &vpu {
diff --git a/arch/arm64/boot/dts/socionext/uniphier-ld11-ref.dts b/arch/arm64/boot/dts/socionext/uniphier-ld11-ref.dts
index dd7193acc7df..6bdefb26b329 100644
--- a/arch/arm64/boot/dts/socionext/uniphier-ld11-ref.dts
+++ b/arch/arm64/boot/dts/socionext/uniphier-ld11-ref.dts
@@ -40,7 +40,6 @@
 };
 
 &ethsc {
-	interrupt-parent = <&gpio>;
 	interrupts = <0 8>;
 };
 
diff --git a/arch/arm64/boot/dts/socionext/uniphier-ld20-ref.dts b/arch/arm64/boot/dts/socionext/uniphier-ld20-ref.dts
index d99e3731358c..254d6795c67e 100644
--- a/arch/arm64/boot/dts/socionext/uniphier-ld20-ref.dts
+++ b/arch/arm64/boot/dts/socionext/uniphier-ld20-ref.dts
@@ -40,7 +40,6 @@
 };
 
 &ethsc {
-	interrupt-parent = <&gpio>;
 	interrupts = <0 8>;
 };
 
diff --git a/arch/arm64/boot/dts/socionext/uniphier-pxs3-ref.dts b/arch/arm64/boot/dts/socionext/uniphier-pxs3-ref.dts
index 864feeb35180..f9f06fcfb94a 100644
--- a/arch/arm64/boot/dts/socionext/uniphier-pxs3-ref.dts
+++ b/arch/arm64/boot/dts/socionext/uniphier-pxs3-ref.dts
@@ -38,8 +38,7 @@
 };
 
 &ethsc {
-	interrupt-parent = <&gpio>;
-	interrupts = <0 8>;
+	interrupts = <4 8>;
 };
 
 &serial0 {
diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h
index 76d1cc85d5b1..955130762a3c 100644
--- a/arch/arm64/include/asm/cacheflush.h
+++ b/arch/arm64/include/asm/cacheflush.h
@@ -38,7 +38,7 @@
  *
  *	See Documentation/cachetlb.txt for more information. Please note that
  *	the implementation assumes non-aliasing VIPT D-cache and (aliasing)
- *	VIPT or ASID-tagged VIVT I-cache.
+ *	VIPT I-cache.
  *
  *	flush_cache_mm(mm)
  *
diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
index 650344d01124..c4cd5081d78b 100644
--- a/arch/arm64/include/asm/efi.h
+++ b/arch/arm64/include/asm/efi.h
@@ -132,11 +132,9 @@ static inline void efi_set_pgd(struct mm_struct *mm)
 			 * Defer the switch to the current thread's TTBR0_EL1
 			 * until uaccess_enable(). Restore the current
 			 * thread's saved ttbr0 corresponding to its active_mm
-			 * (if different from init_mm).
 			 */
 			cpu_set_reserved_ttbr0();
-			if (current->active_mm != &init_mm)
-				update_saved_ttbr0(current, current->active_mm);
+			update_saved_ttbr0(current, current->active_mm);
 		}
 	}
 }
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 7f069ff37f06..715d395ef45b 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -170,8 +170,7 @@
 #define VTCR_EL2_FLAGS			(VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN_FLAGS)
 #define VTTBR_X				(VTTBR_X_TGRAN_MAGIC - VTCR_EL2_T0SZ_IPA)
 
-#define VTTBR_BADDR_SHIFT (VTTBR_X - 1)
-#define VTTBR_BADDR_MASK  (((UL(1) << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT)
+#define VTTBR_BADDR_MASK  (((UL(1) << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_X)
 #define VTTBR_VMID_SHIFT  (UL(48))
 #define VTTBR_VMID_MASK(size) (_AT(u64, (1 << size) - 1) << VTTBR_VMID_SHIFT)
 
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 674912d7a571..ea6cb5b24258 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -370,6 +370,7 @@ void kvm_arm_init_debug(void);
 void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
 void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
 void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu);
+bool kvm_arm_handle_step_debug(struct kvm_vcpu *vcpu, struct kvm_run *run);
 int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
 			       struct kvm_device_attr *attr);
 int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index 3257895a9b5e..9d155fa9a507 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -156,29 +156,21 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu);
 
 #define init_new_context(tsk,mm)	({ atomic64_set(&(mm)->context.id, 0); 0; })
 
-/*
- * This is called when "tsk" is about to enter lazy TLB mode.
- *
- * mm:  describes the currently active mm context
- * tsk: task which is entering lazy tlb
- * cpu: cpu number which is entering lazy tlb
- *
- * tsk->mm will be NULL
- */
-static inline void
-enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-}
-
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
 static inline void update_saved_ttbr0(struct task_struct *tsk,
 				      struct mm_struct *mm)
 {
-	if (system_uses_ttbr0_pan()) {
-		BUG_ON(mm->pgd == swapper_pg_dir);
-		task_thread_info(tsk)->ttbr0 =
-			virt_to_phys(mm->pgd) | ASID(mm) << 48;
-	}
+	u64 ttbr;
+
+	if (!system_uses_ttbr0_pan())
+		return;
+
+	if (mm == &init_mm)
+		ttbr = __pa_symbol(empty_zero_page);
+	else
+		ttbr = virt_to_phys(mm->pgd) | ASID(mm) << 48;
+
+	task_thread_info(tsk)->ttbr0 = ttbr;
 }
 #else
 static inline void update_saved_ttbr0(struct task_struct *tsk,
@@ -187,6 +179,16 @@ static inline void update_saved_ttbr0(struct task_struct *tsk,
 }
 #endif
 
+static inline void
+enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+	/*
+	 * We don't actually care about the ttbr0 mapping, so point it at the
+	 * zero page.
+	 */
+	update_saved_ttbr0(tsk, &init_mm);
+}
+
 static inline void __switch_mm(struct mm_struct *next)
 {
 	unsigned int cpu = smp_processor_id();
@@ -214,11 +216,9 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	 * Update the saved TTBR0_EL1 of the scheduled-in task as the previous
 	 * value may have not been initialised yet (activate_mm caller) or the
 	 * ASID has changed since the last run (following the context switch
-	 * of another thread of the same process). Avoid setting the reserved
-	 * TTBR0_EL1 to swapper_pg_dir (init_mm; e.g. via idle_task_exit).
+	 * of another thread of the same process).
 	 */
-	if (next != &init_mm)
-		update_saved_ttbr0(tsk, next);
+	update_saved_ttbr0(tsk, next);
 }
 
 #define deactivate_mm(tsk,mm)	do { } while (0)
diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h
index 19bd97671bb8..4f766178fa6f 100644
--- a/arch/arm64/include/asm/module.h
+++ b/arch/arm64/include/asm/module.h
@@ -32,7 +32,7 @@ struct mod_arch_specific {
 	struct mod_plt_sec	init;
 
 	/* for CONFIG_DYNAMIC_FTRACE */
-	void			*ftrace_trampoline;
+	struct plt_entry 	*ftrace_trampoline;
 };
 #endif
 
@@ -45,4 +45,48 @@ extern u64 module_alloc_base;
 #define module_alloc_base	((u64)_etext - MODULES_VSIZE)
 #endif
 
+struct plt_entry {
+	/*
+	 * A program that conforms to the AArch64 Procedure Call Standard
+	 * (AAPCS64) must assume that a veneer that alters IP0 (x16) and/or
+	 * IP1 (x17) may be inserted at any branch instruction that is
+	 * exposed to a relocation that supports long branches. Since that
+	 * is exactly what we are dealing with here, we are free to use x16
+	 * as a scratch register in the PLT veneers.
+	 */
+	__le32	mov0;	/* movn	x16, #0x....			*/
+	__le32	mov1;	/* movk	x16, #0x...., lsl #16		*/
+	__le32	mov2;	/* movk	x16, #0x...., lsl #32		*/
+	__le32	br;	/* br	x16				*/
+};
+
+static inline struct plt_entry get_plt_entry(u64 val)
+{
+	/*
+	 * MOVK/MOVN/MOVZ opcode:
+	 * +--------+------------+--------+-----------+-------------+---------+
+	 * | sf[31] | opc[30:29] | 100101 | hw[22:21] | imm16[20:5] | Rd[4:0] |
+	 * +--------+------------+--------+-----------+-------------+---------+
+	 *
+	 * Rd     := 0x10 (x16)
+	 * hw     := 0b00 (no shift), 0b01 (lsl #16), 0b10 (lsl #32)
+	 * opc    := 0b11 (MOVK), 0b00 (MOVN), 0b10 (MOVZ)
+	 * sf     := 1 (64-bit variant)
+	 */
+	return (struct plt_entry){
+		cpu_to_le32(0x92800010 | (((~val      ) & 0xffff)) << 5),
+		cpu_to_le32(0xf2a00010 | ((( val >> 16) & 0xffff)) << 5),
+		cpu_to_le32(0xf2c00010 | ((( val >> 32) & 0xffff)) << 5),
+		cpu_to_le32(0xd61f0200)
+	};
+}
+
+static inline bool plt_entries_equal(const struct plt_entry *a,
+				     const struct plt_entry *b)
+{
+	return a->mov0 == b->mov0 &&
+	       a->mov1 == b->mov1 &&
+	       a->mov2 == b->mov2;
+}
+
 #endif /* __ASM_MODULE_H */
diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h
index 8d5cbec17d80..f9ccc36d3dc3 100644
--- a/arch/arm64/include/asm/perf_event.h
+++ b/arch/arm64/include/asm/perf_event.h
@@ -18,6 +18,7 @@
 #define __ASM_PERF_EVENT_H
 
 #include <asm/stack_pointer.h>
+#include <asm/ptrace.h>
 
 #define	ARMV8_PMU_MAX_COUNTERS	32
 #define	ARMV8_PMU_COUNTER_MASK	(ARMV8_PMU_MAX_COUNTERS - 1)
@@ -79,6 +80,7 @@ struct pt_regs;
 extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
 extern unsigned long perf_misc_flags(struct pt_regs *regs);
 #define perf_misc_flags(regs)	perf_misc_flags(regs)
+#define perf_arch_bpf_user_pt_regs(regs) &regs->user_regs
 #endif
 
 #define perf_arch_fetch_caller_regs(regs, __ip) { \
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index c9530b5b5ca8..149d05fb9421 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -345,7 +345,6 @@ static inline int pmd_protnone(pmd_t pmd)
 
 #define pmd_thp_or_huge(pmd)	(pmd_huge(pmd) || pmd_trans_huge(pmd))
 
-#define __HAVE_ARCH_PMD_WRITE
 #define pmd_write(pmd)		pte_write(pmd_pte(pmd))
 
 #define pmd_mkhuge(pmd)		(__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT))
diff --git a/arch/arm64/include/uapi/asm/bpf_perf_event.h b/arch/arm64/include/uapi/asm/bpf_perf_event.h
new file mode 100644
index 000000000000..b551b741653d
--- /dev/null
+++ b/arch/arm64/include/uapi/asm/bpf_perf_event.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _UAPI__ASM_BPF_PERF_EVENT_H__
+#define _UAPI__ASM_BPF_PERF_EVENT_H__
+
+#include <asm/ptrace.h>
+
+typedef struct user_pt_regs bpf_user_pt_regs_t;
+
+#endif /* _UAPI__ASM_BPF_PERF_EVENT_H__ */
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 8265dd790895..067baace74a0 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -61,6 +61,3 @@ extra-y					+= $(head-y) vmlinux.lds
 ifeq ($(CONFIG_DEBUG_EFI),y)
 AFLAGS_head.o += -DVMLINUX_PATH="\"$(realpath $(objtree)/vmlinux)\""
 endif
-
-# will be included by each individual module but not by the core kernel itself
-extra-$(CONFIG_DYNAMIC_FTRACE) += ftrace-mod.o
diff --git a/arch/arm64/kernel/cpu_ops.c b/arch/arm64/kernel/cpu_ops.c
index d16978213c5b..ea001241bdd4 100644
--- a/arch/arm64/kernel/cpu_ops.c
+++ b/arch/arm64/kernel/cpu_ops.c
@@ -31,13 +31,13 @@ extern const struct cpu_operations cpu_psci_ops;
 
 const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init;
 
-static const struct cpu_operations *dt_supported_cpu_ops[] __initconst = {
+static const struct cpu_operations *const dt_supported_cpu_ops[] __initconst = {
 	&smp_spin_table_ops,
 	&cpu_psci_ops,
 	NULL,
 };
 
-static const struct cpu_operations *acpi_supported_cpu_ops[] __initconst = {
+static const struct cpu_operations *const acpi_supported_cpu_ops[] __initconst = {
 #ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL
 	&acpi_parking_protocol_ops,
 #endif
@@ -47,7 +47,7 @@ static const struct cpu_operations *acpi_supported_cpu_ops[] __initconst = {
 
 static const struct cpu_operations * __init cpu_get_ops(const char *name)
 {
-	const struct cpu_operations **ops;
+	const struct cpu_operations *const *ops;
 
 	ops = acpi_disabled ? dt_supported_cpu_ops : acpi_supported_cpu_ops;
 
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 143b3e72c25e..540a1e010eb5 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -114,7 +114,12 @@
  *   returned from the 2nd syscall yet, TIF_FOREIGN_FPSTATE is still set so
  *   whatever is in the FPSIMD registers is not saved to memory, but discarded.
  */
-static DEFINE_PER_CPU(struct fpsimd_state *, fpsimd_last_state);
+struct fpsimd_last_state_struct {
+	struct fpsimd_state *st;
+	bool sve_in_use;
+};
+
+static DEFINE_PER_CPU(struct fpsimd_last_state_struct, fpsimd_last_state);
 
 /* Default VL for tasks that don't set it explicitly: */
 static int sve_default_vl = -1;
@@ -905,7 +910,7 @@ void fpsimd_thread_switch(struct task_struct *next)
 		 */
 		struct fpsimd_state *st = &next->thread.fpsimd_state;
 
-		if (__this_cpu_read(fpsimd_last_state) == st
+		if (__this_cpu_read(fpsimd_last_state.st) == st
 		    && st->cpu == smp_processor_id())
 			clear_tsk_thread_flag(next, TIF_FOREIGN_FPSTATE);
 		else
@@ -992,6 +997,21 @@ void fpsimd_signal_preserve_current_state(void)
 }
 
 /*
+ * Associate current's FPSIMD context with this cpu
+ * Preemption must be disabled when calling this function.
+ */
+static void fpsimd_bind_to_cpu(void)
+{
+	struct fpsimd_last_state_struct *last =
+		this_cpu_ptr(&fpsimd_last_state);
+	struct fpsimd_state *st = &current->thread.fpsimd_state;
+
+	last->st = st;
+	last->sve_in_use = test_thread_flag(TIF_SVE);
+	st->cpu = smp_processor_id();
+}
+
+/*
  * Load the userland FPSIMD state of 'current' from memory, but only if the
  * FPSIMD state already held in the registers is /not/ the most recent FPSIMD
  * state of 'current'
@@ -1004,11 +1024,8 @@ void fpsimd_restore_current_state(void)
 	local_bh_disable();
 
 	if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) {
-		struct fpsimd_state *st = &current->thread.fpsimd_state;
-
 		task_fpsimd_load();
-		__this_cpu_write(fpsimd_last_state, st);
-		st->cpu = smp_processor_id();
+		fpsimd_bind_to_cpu();
 	}
 
 	local_bh_enable();
@@ -1026,18 +1043,14 @@ void fpsimd_update_current_state(struct fpsimd_state *state)
 
 	local_bh_disable();
 
-	if (system_supports_sve() && test_thread_flag(TIF_SVE)) {
-		current->thread.fpsimd_state = *state;
+	current->thread.fpsimd_state = *state;
+	if (system_supports_sve() && test_thread_flag(TIF_SVE))
 		fpsimd_to_sve(current);
-	}
-	task_fpsimd_load();
 
-	if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) {
-		struct fpsimd_state *st = &current->thread.fpsimd_state;
+	task_fpsimd_load();
 
-		__this_cpu_write(fpsimd_last_state, st);
-		st->cpu = smp_processor_id();
-	}
+	if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE))
+		fpsimd_bind_to_cpu();
 
 	local_bh_enable();
 }
@@ -1052,7 +1065,7 @@ void fpsimd_flush_task_state(struct task_struct *t)
 
 static inline void fpsimd_flush_cpu_state(void)
 {
-	__this_cpu_write(fpsimd_last_state, NULL);
+	__this_cpu_write(fpsimd_last_state.st, NULL);
 }
 
 /*
@@ -1065,14 +1078,10 @@ static inline void fpsimd_flush_cpu_state(void)
 #ifdef CONFIG_ARM64_SVE
 void sve_flush_cpu_state(void)
 {
-	struct fpsimd_state *const fpstate = __this_cpu_read(fpsimd_last_state);
-	struct task_struct *tsk;
-
-	if (!fpstate)
-		return;
+	struct fpsimd_last_state_struct const *last =
+		this_cpu_ptr(&fpsimd_last_state);
 
-	tsk = container_of(fpstate, struct task_struct, thread.fpsimd_state);
-	if (test_tsk_thread_flag(tsk, TIF_SVE))
+	if (last->st && last->sve_in_use)
 		fpsimd_flush_cpu_state();
 }
 #endif /* CONFIG_ARM64_SVE */
@@ -1267,7 +1276,7 @@ static inline void fpsimd_pm_init(void) { }
 #ifdef CONFIG_HOTPLUG_CPU
 static int fpsimd_cpu_dead(unsigned int cpu)
 {
-	per_cpu(fpsimd_last_state, cpu) = NULL;
+	per_cpu(fpsimd_last_state.st, cpu) = NULL;
 	return 0;
 }
 
diff --git a/arch/arm64/kernel/ftrace-mod.S b/arch/arm64/kernel/ftrace-mod.S
deleted file mode 100644
index 00c4025be4ff..000000000000
--- a/arch/arm64/kernel/ftrace-mod.S
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-	.section	".text.ftrace_trampoline", "ax"
-	.align		3
-0:	.quad		0
-__ftrace_trampoline:
-	ldr		x16, 0b
-	br		x16
-ENDPROC(__ftrace_trampoline)
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
index c13b1fca0e5b..50986e388d2b 100644
--- a/arch/arm64/kernel/ftrace.c
+++ b/arch/arm64/kernel/ftrace.c
@@ -76,7 +76,7 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 
 	if (offset < -SZ_128M || offset >= SZ_128M) {
 #ifdef CONFIG_ARM64_MODULE_PLTS
-		unsigned long *trampoline;
+		struct plt_entry trampoline;
 		struct module *mod;
 
 		/*
@@ -104,22 +104,24 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 		 * is added in the future, but for now, the pr_err() below
 		 * deals with a theoretical issue only.
 		 */
-		trampoline = (unsigned long *)mod->arch.ftrace_trampoline;
-		if (trampoline[0] != addr) {
-			if (trampoline[0] != 0) {
+		trampoline = get_plt_entry(addr);
+		if (!plt_entries_equal(mod->arch.ftrace_trampoline,
+				       &trampoline)) {
+			if (!plt_entries_equal(mod->arch.ftrace_trampoline,
+					       &(struct plt_entry){})) {
 				pr_err("ftrace: far branches to multiple entry points unsupported inside a single module\n");
 				return -EINVAL;
 			}
 
 			/* point the trampoline to our ftrace entry point */
 			module_disable_ro(mod);
-			trampoline[0] = addr;
+			*mod->arch.ftrace_trampoline = trampoline;
 			module_enable_ro(mod, true);
 
 			/* update trampoline before patching in the branch */
 			smp_wmb();
 		}
-		addr = (unsigned long)&trampoline[1];
+		addr = (unsigned long)(void *)mod->arch.ftrace_trampoline;
 #else /* CONFIG_ARM64_MODULE_PLTS */
 		return -EINVAL;
 #endif /* CONFIG_ARM64_MODULE_PLTS */
diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c
index d05dbe658409..ea640f92fe5a 100644
--- a/arch/arm64/kernel/module-plts.c
+++ b/arch/arm64/kernel/module-plts.c
@@ -11,21 +11,6 @@
 #include <linux/module.h>
 #include <linux/sort.h>
 
-struct plt_entry {
-	/*
-	 * A program that conforms to the AArch64 Procedure Call Standard
-	 * (AAPCS64) must assume that a veneer that alters IP0 (x16) and/or
-	 * IP1 (x17) may be inserted at any branch instruction that is
-	 * exposed to a relocation that supports long branches. Since that
-	 * is exactly what we are dealing with here, we are free to use x16
-	 * as a scratch register in the PLT veneers.
-	 */
-	__le32	mov0;	/* movn	x16, #0x....			*/
-	__le32	mov1;	/* movk	x16, #0x...., lsl #16		*/
-	__le32	mov2;	/* movk	x16, #0x...., lsl #32		*/
-	__le32	br;	/* br	x16				*/
-};
-
 static bool in_init(const struct module *mod, void *loc)
 {
 	return (u64)loc - (u64)mod->init_layout.base < mod->init_layout.size;
@@ -40,33 +25,14 @@ u64 module_emit_plt_entry(struct module *mod, void *loc, const Elf64_Rela *rela,
 	int i = pltsec->plt_num_entries;
 	u64 val = sym->st_value + rela->r_addend;
 
-	/*
-	 * MOVK/MOVN/MOVZ opcode:
-	 * +--------+------------+--------+-----------+-------------+---------+
-	 * | sf[31] | opc[30:29] | 100101 | hw[22:21] | imm16[20:5] | Rd[4:0] |
-	 * +--------+------------+--------+-----------+-------------+---------+
-	 *
-	 * Rd     := 0x10 (x16)
-	 * hw     := 0b00 (no shift), 0b01 (lsl #16), 0b10 (lsl #32)
-	 * opc    := 0b11 (MOVK), 0b00 (MOVN), 0b10 (MOVZ)
-	 * sf     := 1 (64-bit variant)
-	 */
-	plt[i] = (struct plt_entry){
-		cpu_to_le32(0x92800010 | (((~val      ) & 0xffff)) << 5),
-		cpu_to_le32(0xf2a00010 | ((( val >> 16) & 0xffff)) << 5),
-		cpu_to_le32(0xf2c00010 | ((( val >> 32) & 0xffff)) << 5),
-		cpu_to_le32(0xd61f0200)
-	};
+	plt[i] = get_plt_entry(val);
 
 	/*
 	 * Check if the entry we just created is a duplicate. Given that the
 	 * relocations are sorted, this will be the last entry we allocated.
 	 * (if one exists).
 	 */
-	if (i > 0 &&
-	    plt[i].mov0 == plt[i - 1].mov0 &&
-	    plt[i].mov1 == plt[i - 1].mov1 &&
-	    plt[i].mov2 == plt[i - 1].mov2)
+	if (i > 0 && plt_entries_equal(plt + i, plt + i - 1))
 		return (u64)&plt[i - 1];
 
 	pltsec->plt_num_entries++;
@@ -154,6 +120,7 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 	unsigned long core_plts = 0;
 	unsigned long init_plts = 0;
 	Elf64_Sym *syms = NULL;
+	Elf_Shdr *tramp = NULL;
 	int i;
 
 	/*
@@ -165,6 +132,10 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 			mod->arch.core.plt = sechdrs + i;
 		else if (!strcmp(secstrings + sechdrs[i].sh_name, ".init.plt"))
 			mod->arch.init.plt = sechdrs + i;
+		else if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE) &&
+			 !strcmp(secstrings + sechdrs[i].sh_name,
+				 ".text.ftrace_trampoline"))
+			tramp = sechdrs + i;
 		else if (sechdrs[i].sh_type == SHT_SYMTAB)
 			syms = (Elf64_Sym *)sechdrs[i].sh_addr;
 	}
@@ -215,5 +186,12 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 	mod->arch.init.plt_num_entries = 0;
 	mod->arch.init.plt_max_entries = init_plts;
 
+	if (tramp) {
+		tramp->sh_type = SHT_NOBITS;
+		tramp->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
+		tramp->sh_addralign = __alignof__(struct plt_entry);
+		tramp->sh_size = sizeof(struct plt_entry);
+	}
+
 	return 0;
 }
diff --git a/arch/arm64/kernel/module.lds b/arch/arm64/kernel/module.lds
index f7c9781a9d48..22e36a21c113 100644
--- a/arch/arm64/kernel/module.lds
+++ b/arch/arm64/kernel/module.lds
@@ -1,4 +1,5 @@
 SECTIONS {
 	.plt (NOLOAD) : { BYTE(0) }
 	.init.plt (NOLOAD) : { BYTE(0) }
+	.text.ftrace_trampoline (NOLOAD) : { BYTE(0) }
 }
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index 9eaef51f83ff..3affca3dd96a 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -262,12 +262,6 @@ static const unsigned armv8_a73_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
 
 	[C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD,
 	[C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR,
-
-	[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD,
-	[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR,
-
-	[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD,
-	[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR,
 };
 
 static const unsigned armv8_thunder_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index b2adcce7bc18..6b7dcf4310ac 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -314,6 +314,15 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
 	clear_tsk_thread_flag(p, TIF_SVE);
 	p->thread.sve_state = NULL;
 
+	/*
+	 * In case p was allocated the same task_struct pointer as some
+	 * other recently-exited task, make sure p is disassociated from
+	 * any cpu that may have run that now-exited task recently.
+	 * Otherwise we could erroneously skip reloading the FPSIMD
+	 * registers for p.
+	 */
+	fpsimd_flush_task_state(p);
+
 	if (likely(!(p->flags & PF_KTHREAD))) {
 		*childregs = *current_pt_regs();
 		childregs->regs[0] = 0;
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index dbadfaf850a7..fa63b28c65e0 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -221,3 +221,24 @@ void kvm_arm_clear_debug(struct kvm_vcpu *vcpu)
 		}
 	}
 }
+
+
+/*
+ * After successfully emulating an instruction, we might want to
+ * return to user space with a KVM_EXIT_DEBUG. We can only do this
+ * once the emulation is complete, though, so for userspace emulations
+ * we have to wait until we have re-entered KVM before calling this
+ * helper.
+ *
+ * Return true (and set exit_reason) to return to userspace or false
+ * if no further action is required.
+ */
+bool kvm_arm_handle_step_debug(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+		run->exit_reason = KVM_EXIT_DEBUG;
+		run->debug.arch.hsr = ESR_ELx_EC_SOFTSTP_LOW << ESR_ELx_EC_SHIFT;
+		return true;
+	}
+	return false;
+}
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index b71247995469..304203fa9e33 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -28,6 +28,7 @@
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_mmu.h>
 #include <asm/kvm_psci.h>
+#include <asm/debug-monitors.h>
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
@@ -187,14 +188,46 @@ static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu)
 }
 
 /*
+ * We may be single-stepping an emulated instruction. If the emulation
+ * has been completed in the kernel, we can return to userspace with a
+ * KVM_EXIT_DEBUG, otherwise userspace needs to complete its
+ * emulation first.
+ */
+static int handle_trap_exceptions(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+	int handled;
+
+	/*
+	 * See ARM ARM B1.14.1: "Hyp traps on instructions
+	 * that fail their condition code check"
+	 */
+	if (!kvm_condition_valid(vcpu)) {
+		kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+		handled = 1;
+	} else {
+		exit_handle_fn exit_handler;
+
+		exit_handler = kvm_get_exit_handler(vcpu);
+		handled = exit_handler(vcpu, run);
+	}
+
+	/*
+	 * kvm_arm_handle_step_debug() sets the exit_reason on the kvm_run
+	 * structure if we need to return to userspace.
+	 */
+	if (handled > 0 && kvm_arm_handle_step_debug(vcpu, run))
+		handled = 0;
+
+	return handled;
+}
+
+/*
  * Return > 0 to return to guest, < 0 on error, 0 (and set exit_reason) on
  * proper exit to userspace.
  */
 int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 		       int exception_index)
 {
-	exit_handle_fn exit_handler;
-
 	if (ARM_SERROR_PENDING(exception_index)) {
 		u8 hsr_ec = ESR_ELx_EC(kvm_vcpu_get_hsr(vcpu));
 
@@ -220,20 +253,14 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 		return 1;
 	case ARM_EXCEPTION_EL1_SERROR:
 		kvm_inject_vabt(vcpu);
-		return 1;
-	case ARM_EXCEPTION_TRAP:
-		/*
-		 * See ARM ARM B1.14.1: "Hyp traps on instructions
-		 * that fail their condition code check"
-		 */
-		if (!kvm_condition_valid(vcpu)) {
-			kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+		/* We may still need to return for single-step */
+		if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS)
+			&& kvm_arm_handle_step_debug(vcpu, run))
+			return 0;
+		else
 			return 1;
-		}
-
-		exit_handler = kvm_get_exit_handler(vcpu);
-
-		return exit_handler(vcpu, run);
+	case ARM_EXCEPTION_TRAP:
+		return handle_trap_exceptions(vcpu, run);
 	case ARM_EXCEPTION_HYP_GONE:
 		/*
 		 * EL2 has been reset to the hyp-stub. This happens when a guest
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 525c01f48867..f7c651f3a8c0 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -22,6 +22,7 @@
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_hyp.h>
 #include <asm/fpsimd.h>
+#include <asm/debug-monitors.h>
 
 static bool __hyp_text __fpsimd_enabled_nvhe(void)
 {
@@ -269,7 +270,11 @@ static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu)
 	return true;
 }
 
-static void __hyp_text __skip_instr(struct kvm_vcpu *vcpu)
+/* Skip an instruction which has been emulated. Returns true if
+ * execution can continue or false if we need to exit hyp mode because
+ * single-step was in effect.
+ */
+static bool __hyp_text __skip_instr(struct kvm_vcpu *vcpu)
 {
 	*vcpu_pc(vcpu) = read_sysreg_el2(elr);
 
@@ -282,6 +287,14 @@ static void __hyp_text __skip_instr(struct kvm_vcpu *vcpu)
 	}
 
 	write_sysreg_el2(*vcpu_pc(vcpu), elr);
+
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+		vcpu->arch.fault.esr_el2 =
+			(ESR_ELx_EC_SOFTSTP_LOW << ESR_ELx_EC_SHIFT) | 0x22;
+		return false;
+	} else {
+		return true;
+	}
 }
 
 int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
@@ -342,13 +355,21 @@ again:
 			int ret = __vgic_v2_perform_cpuif_access(vcpu);
 
 			if (ret == 1) {
-				__skip_instr(vcpu);
-				goto again;
+				if (__skip_instr(vcpu))
+					goto again;
+				else
+					exit_code = ARM_EXCEPTION_TRAP;
 			}
 
 			if (ret == -1) {
-				/* Promote an illegal access to an SError */
-				__skip_instr(vcpu);
+				/* Promote an illegal access to an
+				 * SError. If we would be returning
+				 * due to single-step clear the SS
+				 * bit so handle_exit knows what to
+				 * do after dealing with the error.
+				 */
+				if (!__skip_instr(vcpu))
+					*vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS;
 				exit_code = ARM_EXCEPTION_EL1_SERROR;
 			}
 
@@ -363,8 +384,10 @@ again:
 		int ret = __vgic_v3_perform_cpuif_access(vcpu);
 
 		if (ret == 1) {
-			__skip_instr(vcpu);
-			goto again;
+			if (__skip_instr(vcpu))
+				goto again;
+			else
+				exit_code = ARM_EXCEPTION_TRAP;
 		}
 
 		/* 0 falls through to be handled out of EL2 */
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index ab9f5f0fb2c7..6f4017046323 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -96,12 +96,6 @@ static void flush_context(unsigned int cpu)
 
 	set_reserved_asid_bits();
 
-	/*
-	 * Ensure the generation bump is observed before we xchg the
-	 * active_asids.
-	 */
-	smp_wmb();
-
 	for_each_possible_cpu(i) {
 		asid = atomic64_xchg_relaxed(&per_cpu(active_asids, i), 0);
 		/*
@@ -117,7 +111,10 @@ static void flush_context(unsigned int cpu)
 		per_cpu(reserved_asids, i) = asid;
 	}
 
-	/* Queue a TLB invalidate and flush the I-cache if necessary. */
+	/*
+	 * Queue a TLB invalidation for each CPU to perform on next
+	 * context-switch
+	 */
 	cpumask_setall(&tlb_flush_pending);
 }
 
@@ -202,11 +199,18 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
 	asid = atomic64_read(&mm->context.id);
 
 	/*
-	 * The memory ordering here is subtle. We rely on the control
-	 * dependency between the generation read and the update of
-	 * active_asids to ensure that we are synchronised with a
-	 * parallel rollover (i.e. this pairs with the smp_wmb() in
-	 * flush_context).
+	 * The memory ordering here is subtle.
+	 * If our ASID matches the current generation, then we update
+	 * our active_asids entry with a relaxed xchg. Racing with a
+	 * concurrent rollover means that either:
+	 *
+	 * - We get a zero back from the xchg and end up waiting on the
+	 *   lock. Taking the lock synchronises with the rollover and so
+	 *   we are forced to see the updated generation.
+	 *
+	 * - We get a valid ASID back from the xchg, which means the
+	 *   relaxed xchg in flush_context will treat us as reserved
+	 *   because atomic RmWs are totally ordered for a given location.
 	 */
 	if (!((asid ^ atomic64_read(&asid_generation)) >> asid_bits)
 	    && atomic64_xchg_relaxed(&per_cpu(active_asids, cpu), asid))
diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c
index 371c5f03a170..051e71ec3335 100644
--- a/arch/arm64/mm/pgd.c
+++ b/arch/arm64/mm/pgd.c
@@ -26,7 +26,7 @@
 #include <asm/page.h>
 #include <asm/tlbflush.h>
 
-static struct kmem_cache *pgd_cache;
+static struct kmem_cache *pgd_cache __ro_after_init;
 
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {