Diffstat (limited to 'arch/x86')
88 files changed, 966 insertions, 4169 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d3338a87761f..595193bc2d31 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -32,6 +32,7 @@ config X86_64 select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE select SWIOTLB + select ARCH_HAS_ELFCORE_COMPAT config FORCE_DYNAMIC_FTRACE def_bool y @@ -206,7 +207,6 @@ config X86 select HAVE_MOVE_PMD select HAVE_MOVE_PUD select HAVE_NMI - select HAVE_OPROFILE select HAVE_OPTPROBES select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS @@ -891,7 +891,7 @@ config HPET_TIMER config HPET_EMULATE_RTC def_bool y - depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y) + depends on HPET_TIMER && (RTC_DRV_CMOS=m || RTC_DRV_CMOS=y) config APB_TIMER def_bool y if X86_INTEL_MID @@ -1159,10 +1159,6 @@ config X86_MCE_INJECT If you don't know what a machine check is and you don't do kernel QA it is safe to say n. -config X86_THERMAL_VECTOR - def_bool y - depends on X86_MCE_INTEL - source "arch/x86/events/Kconfig" config X86_LEGACY_VM86 @@ -2865,7 +2861,6 @@ config IA32_EMULATION depends on X86_64 select ARCH_WANT_OLD_COMPAT_IPC select BINFMT_ELF - select COMPAT_BINFMT_ELF select COMPAT_OLD_SIGACTION help Include code to run legacy 32-bit programs under a diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 30920d70b48b..b797f1561943 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -232,9 +232,6 @@ core-y += arch/x86/ drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/ drivers-$(CONFIG_PCI) += arch/x86/pci/ -# must be linked after kernel/ -drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/ - # suspend and hibernation support drivers-$(CONFIG_PM) += arch/x86/power/ @@ -295,16 +292,20 @@ archclean: $(Q)$(MAKE) $(clean)=arch/x86/tools define archhelp - echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' - echo ' install - Install kernel using' - echo ' (your) ~/bin/$(INSTALLKERNEL) or' - echo ' (distribution) /sbin/$(INSTALLKERNEL) or' - echo ' install to $$(INSTALL_PATH) and run lilo' - echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' - echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' - echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)' - echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)' - echo ' bzdisk/fdimage*/isoimage also accept:' - echo ' FDARGS="..." arguments for the booted kernel' - echo ' FDINITRD=file initrd for the booted kernel' + echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' + echo ' install - Install kernel using (your) ~/bin/$(INSTALLKERNEL) or' + echo ' (distribution) /sbin/$(INSTALLKERNEL) or install to ' + echo ' $$(INSTALL_PATH) and run lilo' + echo '' + echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' + echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' + echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)' + echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)' + echo ' bzdisk/fdimage*/isoimage also accept:' + echo ' FDARGS="..." 
arguments for the booted kernel' + echo ' FDINITRD=file initrd for the booted kernel' + echo '' + echo ' kvm_guest.config - Enable Kconfig items for running this kernel as a KVM guest' + echo ' xen.config - Enable Kconfig items for running this kernel as a Xen guest' + endef diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index cad08703c4ad..ce0464d630a2 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -46,14 +46,6 @@ .code64 .section .entry.text, "ax" -#ifdef CONFIG_PARAVIRT_XXL -SYM_CODE_START(native_usergs_sysret64) - UNWIND_HINT_EMPTY - swapgs - sysretq -SYM_CODE_END(native_usergs_sysret64) -#endif /* CONFIG_PARAVIRT_XXL */ - /* * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. * @@ -123,7 +115,12 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) * Try to use SYSRET instead of IRET if we're returning to * a completely clean 64-bit userspace context. If we're not, * go to the slow exit path. + * In the Xen PV case we must use iret anyway. */ + + ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \ + X86_FEATURE_XENPV + movq RCX(%rsp), %rcx movq RIP(%rsp), %r11 @@ -215,7 +212,8 @@ syscall_return_via_sysret: popq %rdi popq %rsp - USERGS_SYSRET64 + swapgs + sysretq SYM_CODE_END(entry_SYSCALL_64) /* @@ -669,7 +667,7 @@ native_irq_return_ldt: */ pushq %rdi /* Stash user RDI */ - SWAPGS /* to kernel GS */ + swapgs /* to kernel GS */ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */ movq PER_CPU_VAR(espfix_waddr), %rdi @@ -699,7 +697,7 @@ native_irq_return_ldt: orq PER_CPU_VAR(espfix_stack), %rax SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi - SWAPGS /* to user GS */ + swapgs /* to user GS */ popq %rdi /* Restore user RDI */ movq %rax, %rsp @@ -943,7 +941,7 @@ SYM_CODE_START_LOCAL(paranoid_entry) ret .Lparanoid_entry_swapgs: - SWAPGS + swapgs /* * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an @@ -1001,7 +999,7 @@ SYM_CODE_START_LOCAL(paranoid_exit) jnz restore_regs_and_return_to_kernel /* We are returning to a context with user GSBASE */ - SWAPGS_UNSAFE_STACK + swapgs jmp restore_regs_and_return_to_kernel SYM_CODE_END(paranoid_exit) @@ -1426,7 +1424,7 @@ nmi_no_fsgsbase: jnz nmi_restore nmi_swapgs: - SWAPGS_UNSAFE_STACK + swapgs nmi_restore: POP_REGS diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index f145e3326c6d..be09c7eac89f 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -159,17 +159,6 @@ struct compat_shmid64_ds { compat_ulong_t __unused5; }; -/* - * The type of struct elf_prstatus.pr_reg in compatible core dumps. - */ -typedef struct user_regs_struct compat_elf_gregset_t; - -/* Full regset -- prstatus on x32, otherwise on ia32 */ -#define PRSTATUS_SIZE(S, R) (R != sizeof(S.pr_reg) ? 
144 : 296) -#define SET_PR_FPVALID(S, V, R) \ - do { *(int *) (((void *) &((S)->pr_reg)) + R) = (V); } \ - while (0) - #ifdef CONFIG_X86_X32_ABI #define COMPAT_USE_64BIT_TIME \ (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 59bf91c57aa8..1728d4ce5730 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -30,6 +30,7 @@ enum cpuid_leafs CPUID_7_ECX, CPUID_8000_0007_EBX, CPUID_7_EDX, + CPUID_8000_001F_EAX, }; #ifdef CONFIG_X86_FEATURE_NAMES @@ -88,8 +89,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \ REQUIRED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 19)) + BUILD_BUG_ON_ZERO(NCAPINTS != 20)) #define DISABLED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \ @@ -111,8 +113,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \ DISABLED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 19)) + BUILD_BUG_ON_ZERO(NCAPINTS != 20)) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 84b887825f12..1feb6c089ba2 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -13,7 +13,7 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 19 /* N 32-bit words worth of info */ +#define NCAPINTS 20 /* N 32-bit words worth of info */ #define NBUGINTS 1 /* N 32-bit bug flags */ /* @@ -96,7 +96,7 @@ #define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ -#define X86_FEATURE_SME_COHERENT ( 3*32+17) /* "" AMD hardware-enforced cache coherency */ +/* FREE! ( 3*32+17) */ #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ @@ -201,7 +201,7 @@ #define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ -#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ +/* FREE! 
( 7*32+10) */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */ @@ -211,7 +211,7 @@ #define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ -#define X86_FEATURE_SEV ( 7*32+20) /* AMD Secure Encrypted Virtualization */ +/* FREE! ( 7*32+20) */ #define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ #define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */ @@ -236,8 +236,6 @@ #define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ #define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ #define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ -#define X86_FEATURE_SEV_ES ( 8*32+20) /* AMD Secure Encrypted Virtualization - Encrypted State */ -#define X86_FEATURE_VM_PAGE_FLUSH ( 8*32+21) /* "" VM Page Flush MSR is supported */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ @@ -385,6 +383,13 @@ #define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* "" IA32_CORE_CAPABILITIES MSR */ #define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ +/* AMD-defined memory encryption features, CPUID level 0x8000001f (EAX), word 19 */ +#define X86_FEATURE_SME (19*32+ 0) /* AMD Secure Memory Encryption */ +#define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */ +#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */ +#define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */ +#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */ + /* * BUG word(s) */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 7947cb1782da..b7dd944dc867 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -91,6 +91,7 @@ DISABLE_ENQCMD) #define DISABLED_MASK17 0 #define DISABLED_MASK18 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) +#define DISABLED_MASK19 0 +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index c98f78330b09..4d0b126835b8 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -12,6 +12,7 @@ #include <linux/pgtable.h> extern unsigned long efi_fw_vendor, efi_config_table; +extern unsigned long efi_mixed_mode_stack_pa; /* * We map the EFI regions needed for runtime services non-contiguously, @@ -68,17 +69,33 @@ extern unsigned long efi_fw_vendor, efi_config_table; #f " called with too many arguments (" #p ">" #n ")"); \ }) +static inline void efi_fpu_begin(void) +{ + /* + * The UEFI calling convention (UEFI spec 2.3.2 and 2.3.4) requires + * that FCW and MXCSR (64-bit) must be initialized prior to calling + * UEFI code. 
(Oddly the spec does not require that the FPU stack + * be empty.) + */ + kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR); +} + +static inline void efi_fpu_end(void) +{ + kernel_fpu_end(); +} + #ifdef CONFIG_X86_32 #define arch_efi_call_virt_setup() \ ({ \ - kernel_fpu_begin(); \ + efi_fpu_begin(); \ firmware_restrict_branch_speculation_start(); \ }) #define arch_efi_call_virt_teardown() \ ({ \ firmware_restrict_branch_speculation_end(); \ - kernel_fpu_end(); \ + efi_fpu_end(); \ }) #define arch_efi_call_virt(p, f, args...) p->f(args) @@ -94,22 +111,12 @@ extern asmlinkage u64 __efi_call(void *fp, ...); __efi_call(__VA_ARGS__); \ }) -/* - * struct efi_scratch - Scratch space used while switching to/from efi_mm - * @phys_stack: stack used during EFI Mixed Mode - * @prev_mm: store/restore stolen mm_struct while switching to/from efi_mm - */ -struct efi_scratch { - u64 phys_stack; - struct mm_struct *prev_mm; -} __packed; - #define arch_efi_call_virt_setup() \ ({ \ efi_sync_low_kernel_mappings(); \ - kernel_fpu_begin(); \ + efi_fpu_begin(); \ firmware_restrict_branch_speculation_start(); \ - efi_switch_mm(&efi_mm); \ + efi_enter_mm(); \ }) #define arch_efi_call_virt(p, f, args...) \ @@ -117,9 +124,9 @@ struct efi_scratch { #define arch_efi_call_virt_teardown() \ ({ \ - efi_switch_mm(efi_scratch.prev_mm); \ + efi_leave_mm(); \ firmware_restrict_branch_speculation_end(); \ - kernel_fpu_end(); \ + efi_fpu_end(); \ }) #ifdef CONFIG_KASAN @@ -136,7 +143,6 @@ struct efi_scratch { #endif /* CONFIG_X86_32 */ -extern struct efi_scratch efi_scratch; extern int __init efi_memblock_x86_reserve_range(void); extern void __init efi_print_memmap(void); extern void __init efi_map_region(efi_memory_desc_t *md); @@ -149,10 +155,12 @@ extern void __init efi_dump_pagetable(void); extern void __init efi_apply_memmap_quirks(void); extern int __init efi_reuse_config(u64 tables, int nr_tables); extern void efi_delete_dummy_variable(void); -extern void efi_switch_mm(struct mm_struct *mm); -extern void efi_recover_from_page_fault(unsigned long phys_addr); +extern void efi_crash_gracefully_on_page_fault(unsigned long phys_addr); extern void efi_free_boot_services(void); +void efi_enter_mm(void); +void efi_leave_mm(void); + /* kexec external ABI */ struct efi_setup_data { u64 fw_vendor; diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 66bdfe838d61..9224d40cdefe 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -364,7 +364,7 @@ do { \ #define COMPAT_ARCH_DLINFO \ if (exec->e_machine == EM_X86_64) \ ARCH_DLINFO_X32; \ -else \ +else if (IS_ENABLED(CONFIG_IA32_EMULATION)) \ ARCH_DLINFO_IA32 #define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000) diff --git a/arch/x86/include/asm/elfcore-compat.h b/arch/x86/include/asm/elfcore-compat.h new file mode 100644 index 000000000000..f1b6c7a8d8fc --- /dev/null +++ b/arch/x86/include/asm/elfcore-compat.h @@ -0,0 +1,31 @@ +#ifndef _ASM_X86_ELFCORE_COMPAT_H +#define _ASM_X86_ELFCORE_COMPAT_H + +#include <asm/user32.h> + +/* + * On amd64 we have two 32bit ABIs - i386 and x32. The latter + * has bigger registers, so we use it for compat_elf_regset_t. + * The former uses i386_elf_prstatus and PRSTATUS_SIZE/SET_PR_FPVALID + * are used to choose the size and location of ->pr_fpvalid of + * the layout actually used. 
+ */ +typedef struct user_regs_struct compat_elf_gregset_t; + +struct i386_elf_prstatus +{ + struct compat_elf_prstatus_common common; + struct user_regs_struct32 pr_reg; + compat_int_t pr_fpvalid; +}; + +#define PRSTATUS_SIZE \ + (user_64bit_mode(task_pt_regs(current)) \ + ? sizeof(struct compat_elf_prstatus) \ + : sizeof(struct i386_elf_prstatus)) +#define SET_PR_FPVALID(S) \ + (*(user_64bit_mode(task_pt_regs(current)) \ + ? &(S)->pr_fpvalid \ + : &((struct i386_elf_prstatus *)(S))->pr_fpvalid) = 1) + +#endif diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 67a4f1cb2aac..ed33a14188f6 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -32,7 +32,19 @@ extern void fpregs_mark_activate(void); /* Code that is unaware of kernel_fpu_begin_mask() can use this */ static inline void kernel_fpu_begin(void) { +#ifdef CONFIG_X86_64 + /* + * Any 64-bit code that uses 387 instructions must explicitly request + * KFPU_387. + */ + kernel_fpu_begin_mask(KFPU_MXCSR); +#else + /* + * 32-bit kernel code may use 387 operations as well as SSE2, etc, + * as long as it checks that the CPU has the required capability. + */ kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR); +#endif } /* diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index f656aabd1545..41e2e2e1b439 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -585,6 +585,9 @@ DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check); #else DECLARE_IDTENTRY_RAW(X86_TRAP_MC, exc_machine_check); #endif +#ifdef CONFIG_XEN_PV +DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check); +#endif #endif /* NMI */ @@ -605,6 +608,9 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_DB, xenpv_exc_debug); /* #DF */ DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault); +#ifdef CONFIG_XEN_PV +DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_DF, xenpv_exc_double_fault); +#endif /* #VC */ #ifdef CONFIG_AMD_MEM_ENCRYPT diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 528c8a71fe7f..76d389691b5b 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -40,8 +40,6 @@ extern void native_init_IRQ(void); extern void __handle_irq(struct irq_desc *desc, struct pt_regs *regs); -extern __visible void do_IRQ(struct pt_regs *regs, unsigned long vector); - extern void init_ISA_irqs(void); extern void __init init_IRQ(void); diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 2dfc8d380dab..144d70ea4393 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -35,15 +35,6 @@ extern __always_inline unsigned long native_save_fl(void) return flags; } -extern inline void native_restore_fl(unsigned long flags); -extern inline void native_restore_fl(unsigned long flags) -{ - asm volatile("push %0 ; popf" - : /* no output */ - :"g" (flags) - :"memory", "cc"); -} - static __always_inline void native_irq_disable(void) { asm volatile("cli": : :"memory"); @@ -79,11 +70,6 @@ static __always_inline unsigned long arch_local_save_flags(void) return native_save_fl(); } -static __always_inline void arch_local_irq_restore(unsigned long flags) -{ - native_restore_fl(flags); -} - static __always_inline void arch_local_irq_disable(void) { native_irq_disable(); @@ -131,25 +117,7 @@ static __always_inline unsigned long arch_local_irq_save(void) #define SAVE_FLAGS(x) pushfq; popq %rax #endif -#define SWAPGS swapgs -/* - * Currently paravirt can't handle swapgs nicely when we - * don't have a stack we can rely on 
(such as a user space - * stack). So we either find a way around these or just fault - * and emulate if a guest tries to call swapgs directly. - * - * Either way, this is a good way to document that we don't - * have a reliable stack. x86_64 only. - */ -#define SWAPGS_UNSAFE_STACK swapgs - #define INTERRUPT_RETURN jmp native_iret -#define USERGS_SYSRET64 \ - swapgs; \ - sysretq; -#define USERGS_SYSRET32 \ - swapgs; \ - sysretl #else #define INTERRUPT_RETURN iret @@ -170,6 +138,20 @@ static __always_inline int arch_irqs_disabled(void) return arch_irqs_disabled_flags(flags); } + +static __always_inline void arch_local_irq_restore(unsigned long flags) +{ + if (!arch_irqs_disabled_flags(flags)) + arch_local_irq_enable(); +} +#else +#ifdef CONFIG_X86_64 +#ifdef CONFIG_XEN_PV +#define SWAPGS ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV +#else +#define SWAPGS swapgs +#endif +#endif #endif /* !__ASSEMBLY__ */ #endif diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 56cdeaac76a0..ddfb3cad8dff 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -289,28 +289,6 @@ extern void (*mce_threshold_vector)(void); extern void (*deferred_error_int_vector)(void); /* - * Thermal handler - */ - -void intel_init_thermal(struct cpuinfo_x86 *c); - -/* Interrupt Handler for core thermal thresholds */ -extern int (*platform_thermal_notify)(__u64 msr_val); - -/* Interrupt Handler for package thermal thresholds */ -extern int (*platform_thermal_package_notify)(__u64 msr_val); - -/* Callback support of rate control, return true, if - * callback has rate control */ -extern bool (*platform_thermal_package_rate_control)(void); - -#ifdef CONFIG_X86_THERMAL_VECTOR -extern void mcheck_intel_therm_init(void); -#else -static inline void mcheck_intel_therm_init(void) { } -#endif - -/* * Used by APEI to report memory error via /dev/mcelog */ diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 2b7cc5397f80..ab45a220fac4 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -127,14 +127,12 @@ static inline unsigned int x86_cpuid_family(void) } #ifdef CONFIG_MICROCODE -int __init microcode_init(void); extern void __init load_ucode_bsp(void); extern void load_ucode_ap(void); void reload_early_microcode(void); extern bool get_builtin_firmware(struct cpio_data *cd, const char *name); extern bool initrd_gone; #else -static inline int __init microcode_init(void) { return 0; }; static inline void __init load_ucode_bsp(void) { } static inline void load_ucode_ap(void) { } static inline void reload_early_microcode(void) { } diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 9d5d949e662e..1cb9c17a4cb4 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -9,7 +9,6 @@ #ifdef CONFIG_X86_LOCAL_APIC -extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); extern int reserve_perfctr_nmi(unsigned int); extern void release_perfctr_nmi(unsigned int); extern int reserve_evntsel_nmi(unsigned int); diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 645bd1d0ee07..64297eabad63 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -66,7 +66,7 @@ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical * address, then that syscall will enter the kernel with a * non-canonical return address, and SYSRET will explode dangerously. 
- * We avoid this particular problem by preventing anything executable + * We avoid this particular problem by preventing anything * from being mapped at the maximum canonical address. * * On AMD CPUs in the Ryzen family, there's a nasty bug in which the diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index f8dce11d2bc1..4abf110e2243 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -648,11 +648,6 @@ static inline notrace unsigned long arch_local_save_flags(void) return PVOP_CALLEE0(unsigned long, irq.save_fl); } -static inline notrace void arch_local_irq_restore(unsigned long f) -{ - PVOP_VCALLEE1(irq.restore_fl, f); -} - static inline notrace void arch_local_irq_disable(void) { PVOP_VCALLEE0(irq.irq_disable); @@ -776,31 +771,6 @@ extern void default_banner(void); #ifdef CONFIG_X86_64 #ifdef CONFIG_PARAVIRT_XXL -/* - * If swapgs is used while the userspace stack is still current, - * there's no way to call a pvop. The PV replacement *must* be - * inlined, or the swapgs instruction must be trapped and emulated. - */ -#define SWAPGS_UNSAFE_STACK \ - PARA_SITE(PARA_PATCH(PV_CPU_swapgs), swapgs) - -/* - * Note: swapgs is very special, and in practise is either going to be - * implemented with a single "swapgs" instruction or something very - * special. Either way, we don't need to save any registers for - * it. - */ -#define SWAPGS \ - PARA_SITE(PARA_PATCH(PV_CPU_swapgs), \ - ANNOTATE_RETPOLINE_SAFE; \ - call PARA_INDIRECT(pv_ops+PV_CPU_swapgs); \ - ) - -#define USERGS_SYSRET64 \ - PARA_SITE(PARA_PATCH(PV_CPU_usergs_sysret64), \ - ANNOTATE_RETPOLINE_SAFE; \ - jmp PARA_INDIRECT(pv_ops+PV_CPU_usergs_sysret64);) - #ifdef CONFIG_DEBUG_ENTRY #define SAVE_FLAGS(clobbers) \ PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), \ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index b6b02b7c19cc..de87087d3bde 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -156,20 +156,10 @@ struct pv_cpu_ops { u64 (*read_pmc)(int counter); - /* - * Switch to usermode gs and return to 64-bit usermode using - * sysret. Only used in 64-bit kernels to return to 64-bit - * processes. Usermode register state, including %rsp, must - * already be restored. - */ - void (*usergs_sysret64)(void); - /* Normal iret. Jump to this with the standard iret stack frame set up. */ void (*iret)(void); - void (*swapgs)(void); - void (*start_context_switch)(struct task_struct *prev); void (*end_context_switch)(struct task_struct *next); #endif @@ -178,16 +168,13 @@ struct pv_cpu_ops { struct pv_irq_ops { #ifdef CONFIG_PARAVIRT_XXL /* - * Get/set interrupt state. save_fl and restore_fl are only - * expected to use X86_EFLAGS_IF; all other bits - * returned from save_fl are undefined, and may be ignored by - * restore_fl. + * Get/set interrupt state. save_fl is expected to use X86_EFLAGS_IF; + * all other bits returned from save_fl are undefined. * * NOTE: These functions callers expect the callee to preserve * more registers than the standard C calling convention. 
*/ struct paravirt_callee_save save_fl; - struct paravirt_callee_save restore_fl; struct paravirt_callee_save irq_disable; struct paravirt_callee_save irq_enable; diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 394757ee030a..f24d7ef8fffa 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -177,8 +177,6 @@ enum page_cache_mode { #define __pgprot(x) ((pgprot_t) { (x) } ) #define __pg(x) __pgprot(x) -#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) - #define PAGE_NONE __pg( 0| 0| 0|___A| 0| 0| 0|___G) #define PAGE_SHARED __pg(__PP|__RW|_USR|___A|__NX| 0| 0| 0) #define PAGE_SHARED_EXEC __pg(__PP|__RW|_USR|___A| 0| 0| 0| 0) diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index 3ff0d48469f2..b2d504f11937 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -101,6 +101,7 @@ #define REQUIRED_MASK16 0 #define REQUIRED_MASK17 0 #define REQUIRED_MASK18 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) +#define REQUIRED_MASK19 0 +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20) #endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 07603064df8f..d60ed0668a59 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -56,19 +56,22 @@ static void __resctrl_sched_in(void) struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state); u32 closid = state->default_closid; u32 rmid = state->default_rmid; + u32 tmp; /* * If this task has a closid/rmid assigned, use it. * Else use the closid/rmid assigned to this cpu. */ if (static_branch_likely(&rdt_alloc_enable_key)) { - if (current->closid) - closid = current->closid; + tmp = READ_ONCE(current->closid); + if (tmp) + closid = tmp; } if (static_branch_likely(&rdt_mon_enable_key)) { - if (current->rmid) - rmid = current->rmid; + tmp = READ_ONCE(current->rmid); + if (tmp) + rmid = tmp; } if (closid != state->cur_closid || rmid != state->cur_rmid) { diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index cc177b4431ae..1d3cbaef4bb7 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -243,10 +243,10 @@ static inline void serialize(void) } /* The dst parameter must be 64-bytes aligned */ -static inline void movdir64b(void *dst, const void *src) +static inline void movdir64b(void __iomem *dst, const void *src) { const struct { char _[64]; } *__src = src; - struct { char _[64]; } *__dst = dst; + struct { char _[64]; } __iomem *__dst = dst; /* * MOVDIR64B %(rdx), rax. 
@@ -286,7 +286,7 @@ static inline void movdir64b(void *dst, const void *src) static inline int enqcmds(void __iomem *dst, const void *src) { const struct { char _[64]; } *__src = src; - struct { char _[64]; } *__dst = dst; + struct { char _[64]; } __iomem *__dst = dst; int zf; /* diff --git a/arch/x86/include/asm/thermal.h b/arch/x86/include/asm/thermal.h new file mode 100644 index 000000000000..ddbdefd5b94f --- /dev/null +++ b/arch/x86/include/asm/thermal.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_THERMAL_H +#define _ASM_X86_THERMAL_H + +#ifdef CONFIG_X86_THERMAL_VECTOR +void intel_init_thermal(struct cpuinfo_x86 *c); +bool x86_thermal_enabled(void); +void intel_thermal_interrupt(void); +#else +static inline void intel_init_thermal(struct cpuinfo_x86 *c) { } +#endif + +#endif /* _ASM_X86_THERMAL_H */ diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index 820082bd6880..1bfe979bb9bc 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -4,7 +4,6 @@ #define tlb_start_vma(tlb, vma) do { } while (0) #define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) #define tlb_flush tlb_flush static inline void tlb_flush(struct mmu_gather *tlb); diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h index 26efbec94448..9e8ac5073ecb 100644 --- a/arch/x86/include/asm/vm86.h +++ b/arch/x86/include/asm/vm86.h @@ -36,7 +36,6 @@ struct vm86 { unsigned long saved_sp0; unsigned long flags; - unsigned long screen_bitmap; unsigned long cpu_type; struct revectored_struct int_revectored; struct revectored_struct int21_revectored; diff --git a/arch/x86/include/uapi/asm/vm86.h b/arch/x86/include/uapi/asm/vm86.h index d2ee4e307ef8..18909b8050bc 100644 --- a/arch/x86/include/uapi/asm/vm86.h +++ b/arch/x86/include/uapi/asm/vm86.h @@ -97,7 +97,7 @@ struct revectored_struct { struct vm86_struct { struct vm86_regs regs; unsigned long flags; - unsigned long screen_bitmap; + unsigned long screen_bitmap; /* unused, preserved by vm86() */ unsigned long cpu_type; struct revectored_struct int_revectored; struct revectored_struct int21_revectored; @@ -106,7 +106,7 @@ struct vm86_struct { /* * flags masks */ -#define VM86_SCREEN_BITMAP 0x0001 +#define VM86_SCREEN_BITMAP 0x0001 /* no longer supported */ struct vm86plus_info_struct { unsigned long force_return_for_pic:1; diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 828be792231e..b14533af7676 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -13,9 +13,6 @@ int main(void) { #ifdef CONFIG_PARAVIRT #ifdef CONFIG_PARAVIRT_XXL - OFFSET(PV_CPU_usergs_sysret64, paravirt_patch_template, - cpu.usergs_sysret64); - OFFSET(PV_CPU_swapgs, paravirt_patch_template, cpu.swapgs); #ifdef CONFIG_DEBUG_ENTRY OFFSET(PV_IRQ_save_fl, paravirt_patch_template, irq.save_fl); #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 35ad8480c464..9215b91bc044 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -960,6 +960,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x8000000a) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); + if (c->extended_cpuid_level >= 0x8000001f) + c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f); + init_scattered_cpuid_features(c); init_speculation_control(c); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 
816fdbec795a..0e422a544835 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -24,6 +24,7 @@ #include <asm/traps.h> #include <asm/resctrl.h> #include <asm/numa.h> +#include <asm/thermal.h> #ifdef CONFIG_X86_64 #include <linux/topology.h> @@ -719,6 +720,8 @@ static void init_intel(struct cpuinfo_x86 *c) tsx_disable(); split_lock_init(); + + intel_init_thermal(c); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/mce/Makefile b/arch/x86/kernel/cpu/mce/Makefile index 9f020c994154..015856abdbb1 100644 --- a/arch/x86/kernel/cpu/mce/Makefile +++ b/arch/x86/kernel/cpu/mce/Makefile @@ -9,8 +9,6 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o mce-inject-y := inject.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o -obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o - obj-$(CONFIG_ACPI_APEI) += apei.o obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index e133ce1e562b..7962355436da 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -878,6 +878,12 @@ static atomic_t mce_executing; static atomic_t mce_callin; /* + * Track which CPUs entered the MCA broadcast synchronization and which not in + * order to print holdouts. + */ +static cpumask_t mce_missing_cpus = CPU_MASK_ALL; + +/* * Check if a timeout waiting for other CPUs happened. */ static int mce_timed_out(u64 *t, const char *msg) @@ -894,8 +900,12 @@ static int mce_timed_out(u64 *t, const char *msg) if (!mca_cfg.monarch_timeout) goto out; if ((s64)*t < SPINUNIT) { - if (mca_cfg.tolerant <= 1) + if (mca_cfg.tolerant <= 1) { + if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus)) + pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n", + cpumask_pr_args(&mce_missing_cpus)); mce_panic(msg, NULL, NULL); + } cpu_missing = 1; return 1; } @@ -1006,6 +1016,7 @@ static int mce_start(int *no_way_out) * is updated before mce_callin. */ order = atomic_inc_return(&mce_callin); + cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus); /* * Wait for everyone. 
@@ -1114,6 +1125,7 @@ static int mce_end(int order) reset: atomic_set(&global_nwo, 0); atomic_set(&mce_callin, 0); + cpumask_setall(&mce_missing_cpus); barrier(); /* @@ -2178,7 +2190,6 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { - mcheck_intel_therm_init(); mce_register_decode_chain(&early_nb); mce_register_decode_chain(&mce_uc_nb); mce_register_decode_chain(&mce_default_nb); @@ -2713,6 +2724,7 @@ static void mce_reset(void) atomic_set(&mce_executing, 0); atomic_set(&mce_callin, 0); atomic_set(&global_nwo, 0); + cpumask_setall(&mce_missing_cpus); } static int fake_panic_get(void *data, u64 *val) diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index c2476fe0682e..e309476743b7 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -531,7 +531,6 @@ static void intel_imc_init(struct cpuinfo_x86 *c) void mce_intel_feature_init(struct cpuinfo_x86 *c) { - intel_init_thermal(c); intel_init_cmci(); intel_init_lmce(); intel_ppin_init(c); diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c deleted file mode 100644 index a7cd2d203ced..000000000000 --- a/arch/x86/kernel/cpu/mce/therm_throt.c +++ /dev/null @@ -1,739 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Thermal throttle event support code (such as syslog messaging and rate - * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). - * - * This allows consistent reporting of CPU thermal throttle events. - * - * Maintains a counter in /sys that keeps track of the number of thermal - * events, such that the user knows how bad the thermal problem might be - * (since the logging to syslog is rate limited). - * - * Author: Dmitriy Zavin (dmitriyz@google.com) - * - * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. - * Inspired by Ross Biro's and Al Borchers' counter code. - */ -#include <linux/interrupt.h> -#include <linux/notifier.h> -#include <linux/jiffies.h> -#include <linux/kernel.h> -#include <linux/percpu.h> -#include <linux/export.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/cpu.h> - -#include <asm/processor.h> -#include <asm/traps.h> -#include <asm/apic.h> -#include <asm/mce.h> -#include <asm/msr.h> -#include <asm/trace/irq_vectors.h> - -#include "internal.h" - -/* How long to wait between reporting thermal events */ -#define CHECK_INTERVAL (300 * HZ) - -#define THERMAL_THROTTLING_EVENT 0 -#define POWER_LIMIT_EVENT 1 - -/** - * struct _thermal_state - Represent the current thermal event state - * @next_check: Stores the next timestamp, when it is allowed - * to log the next warning message. - * @last_interrupt_time: Stores the timestamp for the last threshold - * high event. - * @therm_work: Delayed workqueue structure - * @count: Stores the current running count for thermal - * or power threshold interrupts. - * @last_count: Stores the previous running count for thermal - * or power threshold interrupts. - * @max_time_ms: This shows the maximum amount of time CPU was - * in throttled state for a single thermal - * threshold high to low state. - * @total_time_ms: This is a cumulative time during which CPU was - * in the throttled state. - * @rate_control_active: Set when a throttling message is logged. - * This is used for the purpose of rate-control. - * @new_event: Stores the last high/low status of the - * THERM_STATUS_PROCHOT or - * THERM_STATUS_POWER_LIMIT. 
- * @level: Stores whether this _thermal_state instance is - * for a CORE level or for PACKAGE level. - * @sample_index: Index for storing the next sample in the buffer - * temp_samples[]. - * @sample_count: Total number of samples collected in the buffer - * temp_samples[]. - * @average: The last moving average of temperature samples - * @baseline_temp: Temperature at which thermal threshold high - * interrupt was generated. - * @temp_samples: Storage for temperature samples to calculate - * moving average. - * - * This structure is used to represent data related to thermal state for a CPU. - * There is a separate storage for core and package level for each CPU. - */ -struct _thermal_state { - u64 next_check; - u64 last_interrupt_time; - struct delayed_work therm_work; - unsigned long count; - unsigned long last_count; - unsigned long max_time_ms; - unsigned long total_time_ms; - bool rate_control_active; - bool new_event; - u8 level; - u8 sample_index; - u8 sample_count; - u8 average; - u8 baseline_temp; - u8 temp_samples[3]; -}; - -struct thermal_state { - struct _thermal_state core_throttle; - struct _thermal_state core_power_limit; - struct _thermal_state package_throttle; - struct _thermal_state package_power_limit; - struct _thermal_state core_thresh0; - struct _thermal_state core_thresh1; - struct _thermal_state pkg_thresh0; - struct _thermal_state pkg_thresh1; -}; - -/* Callback to handle core threshold interrupts */ -int (*platform_thermal_notify)(__u64 msr_val); -EXPORT_SYMBOL(platform_thermal_notify); - -/* Callback to handle core package threshold_interrupts */ -int (*platform_thermal_package_notify)(__u64 msr_val); -EXPORT_SYMBOL_GPL(platform_thermal_package_notify); - -/* Callback support of rate control, return true, if - * callback has rate control */ -bool (*platform_thermal_package_rate_control)(void); -EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control); - - -static DEFINE_PER_CPU(struct thermal_state, thermal_state); - -static atomic_t therm_throt_en = ATOMIC_INIT(0); - -static u32 lvtthmr_init __read_mostly; - -#ifdef CONFIG_SYSFS -#define define_therm_throt_device_one_ro(_name) \ - static DEVICE_ATTR(_name, 0444, \ - therm_throt_device_show_##_name, \ - NULL) \ - -#define define_therm_throt_device_show_func(event, name) \ - \ -static ssize_t therm_throt_device_show_##event##_##name( \ - struct device *dev, \ - struct device_attribute *attr, \ - char *buf) \ -{ \ - unsigned int cpu = dev->id; \ - ssize_t ret; \ - \ - preempt_disable(); /* CPU hotplug */ \ - if (cpu_online(cpu)) { \ - ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_state, cpu).event.name); \ - } else \ - ret = 0; \ - preempt_enable(); \ - \ - return ret; \ -} - -define_therm_throt_device_show_func(core_throttle, count); -define_therm_throt_device_one_ro(core_throttle_count); - -define_therm_throt_device_show_func(core_power_limit, count); -define_therm_throt_device_one_ro(core_power_limit_count); - -define_therm_throt_device_show_func(package_throttle, count); -define_therm_throt_device_one_ro(package_throttle_count); - -define_therm_throt_device_show_func(package_power_limit, count); -define_therm_throt_device_one_ro(package_power_limit_count); - -define_therm_throt_device_show_func(core_throttle, max_time_ms); -define_therm_throt_device_one_ro(core_throttle_max_time_ms); - -define_therm_throt_device_show_func(package_throttle, max_time_ms); -define_therm_throt_device_one_ro(package_throttle_max_time_ms); - -define_therm_throt_device_show_func(core_throttle, total_time_ms); 
-define_therm_throt_device_one_ro(core_throttle_total_time_ms); - -define_therm_throt_device_show_func(package_throttle, total_time_ms); -define_therm_throt_device_one_ro(package_throttle_total_time_ms); - -static struct attribute *thermal_throttle_attrs[] = { - &dev_attr_core_throttle_count.attr, - &dev_attr_core_throttle_max_time_ms.attr, - &dev_attr_core_throttle_total_time_ms.attr, - NULL -}; - -static const struct attribute_group thermal_attr_group = { - .attrs = thermal_throttle_attrs, - .name = "thermal_throttle" -}; -#endif /* CONFIG_SYSFS */ - -#define CORE_LEVEL 0 -#define PACKAGE_LEVEL 1 - -#define THERM_THROT_POLL_INTERVAL HZ -#define THERM_STATUS_PROCHOT_LOG BIT(1) - -#define THERM_STATUS_CLEAR_CORE_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(13) | BIT(15)) -#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11)) - -static void clear_therm_status_log(int level) -{ - int msr; - u64 mask, msr_val; - - if (level == CORE_LEVEL) { - msr = MSR_IA32_THERM_STATUS; - mask = THERM_STATUS_CLEAR_CORE_MASK; - } else { - msr = MSR_IA32_PACKAGE_THERM_STATUS; - mask = THERM_STATUS_CLEAR_PKG_MASK; - } - - rdmsrl(msr, msr_val); - msr_val &= mask; - wrmsrl(msr, msr_val & ~THERM_STATUS_PROCHOT_LOG); -} - -static void get_therm_status(int level, bool *proc_hot, u8 *temp) -{ - int msr; - u64 msr_val; - - if (level == CORE_LEVEL) - msr = MSR_IA32_THERM_STATUS; - else - msr = MSR_IA32_PACKAGE_THERM_STATUS; - - rdmsrl(msr, msr_val); - if (msr_val & THERM_STATUS_PROCHOT_LOG) - *proc_hot = true; - else - *proc_hot = false; - - *temp = (msr_val >> 16) & 0x7F; -} - -static void __maybe_unused throttle_active_work(struct work_struct *work) -{ - struct _thermal_state *state = container_of(to_delayed_work(work), - struct _thermal_state, therm_work); - unsigned int i, avg, this_cpu = smp_processor_id(); - u64 now = get_jiffies_64(); - bool hot; - u8 temp; - - get_therm_status(state->level, &hot, &temp); - /* temperature value is offset from the max so lesser means hotter */ - if (!hot && temp > state->baseline_temp) { - if (state->rate_control_active) - pr_info("CPU%d: %s temperature/speed normal (total events = %lu)\n", - this_cpu, - state->level == CORE_LEVEL ? "Core" : "Package", - state->count); - - state->rate_control_active = false; - return; - } - - if (time_before64(now, state->next_check) && - state->rate_control_active) - goto re_arm; - - state->next_check = now + CHECK_INTERVAL; - - if (state->count != state->last_count) { - /* There was one new thermal interrupt */ - state->last_count = state->count; - state->average = 0; - state->sample_count = 0; - state->sample_index = 0; - } - - state->temp_samples[state->sample_index] = temp; - state->sample_count++; - state->sample_index = (state->sample_index + 1) % ARRAY_SIZE(state->temp_samples); - if (state->sample_count < ARRAY_SIZE(state->temp_samples)) - goto re_arm; - - avg = 0; - for (i = 0; i < ARRAY_SIZE(state->temp_samples); ++i) - avg += state->temp_samples[i]; - - avg /= ARRAY_SIZE(state->temp_samples); - - if (state->average > avg) { - pr_warn("CPU%d: %s temperature is above threshold, cpu clock is throttled (total events = %lu)\n", - this_cpu, - state->level == CORE_LEVEL ? 
"Core" : "Package", - state->count); - state->rate_control_active = true; - } - - state->average = avg; - -re_arm: - clear_therm_status_log(state->level); - schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL); -} - -/*** - * therm_throt_process - Process thermal throttling event from interrupt - * @curr: Whether the condition is current or not (boolean), since the - * thermal interrupt normally gets called both when the thermal - * event begins and once the event has ended. - * - * This function is called by the thermal interrupt after the - * IRQ has been acknowledged. - * - * It will take care of rate limiting and printing messages to the syslog. - */ -static void therm_throt_process(bool new_event, int event, int level) -{ - struct _thermal_state *state; - unsigned int this_cpu = smp_processor_id(); - bool old_event; - u64 now; - struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); - - now = get_jiffies_64(); - if (level == CORE_LEVEL) { - if (event == THERMAL_THROTTLING_EVENT) - state = &pstate->core_throttle; - else if (event == POWER_LIMIT_EVENT) - state = &pstate->core_power_limit; - else - return; - } else if (level == PACKAGE_LEVEL) { - if (event == THERMAL_THROTTLING_EVENT) - state = &pstate->package_throttle; - else if (event == POWER_LIMIT_EVENT) - state = &pstate->package_power_limit; - else - return; - } else - return; - - old_event = state->new_event; - state->new_event = new_event; - - if (new_event) - state->count++; - - if (event != THERMAL_THROTTLING_EVENT) - return; - - if (new_event && !state->last_interrupt_time) { - bool hot; - u8 temp; - - get_therm_status(state->level, &hot, &temp); - /* - * Ignore short temperature spike as the system is not close - * to PROCHOT. 10C offset is large enough to ignore. It is - * already dropped from the high threshold temperature. - */ - if (temp > 10) - return; - - state->baseline_temp = temp; - state->last_interrupt_time = now; - schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL); - } else if (old_event && state->last_interrupt_time) { - unsigned long throttle_time; - - throttle_time = jiffies_delta_to_msecs(now - state->last_interrupt_time); - if (throttle_time > state->max_time_ms) - state->max_time_ms = throttle_time; - state->total_time_ms += throttle_time; - state->last_interrupt_time = 0; - } -} - -static int thresh_event_valid(int level, int event) -{ - struct _thermal_state *state; - unsigned int this_cpu = smp_processor_id(); - struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); - u64 now = get_jiffies_64(); - - if (level == PACKAGE_LEVEL) - state = (event == 0) ? &pstate->pkg_thresh0 : - &pstate->pkg_thresh1; - else - state = (event == 0) ? 
&pstate->core_thresh0 : - &pstate->core_thresh1; - - if (time_before64(now, state->next_check)) - return 0; - - state->next_check = now + CHECK_INTERVAL; - - return 1; -} - -static bool int_pln_enable; -static int __init int_pln_enable_setup(char *s) -{ - int_pln_enable = true; - - return 1; -} -__setup("int_pln_enable", int_pln_enable_setup); - -#ifdef CONFIG_SYSFS -/* Add/Remove thermal_throttle interface for CPU device: */ -static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu) -{ - int err; - struct cpuinfo_x86 *c = &cpu_data(cpu); - - err = sysfs_create_group(&dev->kobj, &thermal_attr_group); - if (err) - return err; - - if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) { - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_core_power_limit_count.attr, - thermal_attr_group.name); - if (err) - goto del_group; - } - - if (cpu_has(c, X86_FEATURE_PTS)) { - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_package_throttle_count.attr, - thermal_attr_group.name); - if (err) - goto del_group; - - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_package_throttle_max_time_ms.attr, - thermal_attr_group.name); - if (err) - goto del_group; - - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_package_throttle_total_time_ms.attr, - thermal_attr_group.name); - if (err) - goto del_group; - - if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) { - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_package_power_limit_count.attr, - thermal_attr_group.name); - if (err) - goto del_group; - } - } - - return 0; - -del_group: - sysfs_remove_group(&dev->kobj, &thermal_attr_group); - - return err; -} - -static void thermal_throttle_remove_dev(struct device *dev) -{ - sysfs_remove_group(&dev->kobj, &thermal_attr_group); -} - -/* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static int thermal_throttle_online(unsigned int cpu) -{ - struct thermal_state *state = &per_cpu(thermal_state, cpu); - struct device *dev = get_cpu_device(cpu); - u32 l; - - state->package_throttle.level = PACKAGE_LEVEL; - state->core_throttle.level = CORE_LEVEL; - - INIT_DELAYED_WORK(&state->package_throttle.therm_work, throttle_active_work); - INIT_DELAYED_WORK(&state->core_throttle.therm_work, throttle_active_work); - - /* Unmask the thermal vector after the above workqueues are initialized. */ - l = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - - return thermal_throttle_add_dev(dev, cpu); -} - -static int thermal_throttle_offline(unsigned int cpu) -{ - struct thermal_state *state = &per_cpu(thermal_state, cpu); - struct device *dev = get_cpu_device(cpu); - u32 l; - - /* Mask the thermal vector before draining evtl. pending work */ - l = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, l | APIC_LVT_MASKED); - - cancel_delayed_work_sync(&state->package_throttle.therm_work); - cancel_delayed_work_sync(&state->core_throttle.therm_work); - - state->package_throttle.rate_control_active = false; - state->core_throttle.rate_control_active = false; - - thermal_throttle_remove_dev(dev); - return 0; -} - -static __init int thermal_throttle_init_device(void) -{ - int ret; - - if (!atomic_read(&therm_throt_en)) - return 0; - - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online", - thermal_throttle_online, - thermal_throttle_offline); - return ret < 0 ? 
ret : 0; -} -device_initcall(thermal_throttle_init_device); - -#endif /* CONFIG_SYSFS */ - -static void notify_package_thresholds(__u64 msr_val) -{ - bool notify_thres_0 = false; - bool notify_thres_1 = false; - - if (!platform_thermal_package_notify) - return; - - /* lower threshold check */ - if (msr_val & THERM_LOG_THRESHOLD0) - notify_thres_0 = true; - /* higher threshold check */ - if (msr_val & THERM_LOG_THRESHOLD1) - notify_thres_1 = true; - - if (!notify_thres_0 && !notify_thres_1) - return; - - if (platform_thermal_package_rate_control && - platform_thermal_package_rate_control()) { - /* Rate control is implemented in callback */ - platform_thermal_package_notify(msr_val); - return; - } - - /* lower threshold reached */ - if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0)) - platform_thermal_package_notify(msr_val); - /* higher threshold reached */ - if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1)) - platform_thermal_package_notify(msr_val); -} - -static void notify_thresholds(__u64 msr_val) -{ - /* check whether the interrupt handler is defined; - * otherwise simply return - */ - if (!platform_thermal_notify) - return; - - /* lower threshold reached */ - if ((msr_val & THERM_LOG_THRESHOLD0) && - thresh_event_valid(CORE_LEVEL, 0)) - platform_thermal_notify(msr_val); - /* higher threshold reached */ - if ((msr_val & THERM_LOG_THRESHOLD1) && - thresh_event_valid(CORE_LEVEL, 1)) - platform_thermal_notify(msr_val); -} - -/* Thermal transition interrupt handler */ -static void intel_thermal_interrupt(void) -{ - __u64 msr_val; - - if (static_cpu_has(X86_FEATURE_HWP)) - wrmsrl_safe(MSR_HWP_STATUS, 0); - - rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - - /* Check for violation of core thermal thresholds*/ - notify_thresholds(msr_val); - - therm_throt_process(msr_val & THERM_STATUS_PROCHOT, - THERMAL_THROTTLING_EVENT, - CORE_LEVEL); - - if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) - therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, - POWER_LIMIT_EVENT, - CORE_LEVEL); - - if (this_cpu_has(X86_FEATURE_PTS)) { - rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); - /* check violations of package thermal thresholds */ - notify_package_thresholds(msr_val); - therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, - THERMAL_THROTTLING_EVENT, - PACKAGE_LEVEL); - if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) - therm_throt_process(msr_val & - PACKAGE_THERM_STATUS_POWER_LIMIT, - POWER_LIMIT_EVENT, - PACKAGE_LEVEL); - } -} - -static void unexpected_thermal_interrupt(void) -{ - pr_err("CPU%d: Unexpected LVT thermal interrupt!\n", - smp_processor_id()); -} - -static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; - -DEFINE_IDTENTRY_SYSVEC(sysvec_thermal) -{ - trace_thermal_apic_entry(THERMAL_APIC_VECTOR); - inc_irq_stat(irq_thermal_count); - smp_thermal_vector(); - trace_thermal_apic_exit(THERMAL_APIC_VECTOR); - ack_APIC_irq(); -} - -/* Thermal monitoring depends on APIC, ACPI and clock modulation */ -static int intel_thermal_supported(struct cpuinfo_x86 *c) -{ - if (!boot_cpu_has(X86_FEATURE_APIC)) - return 0; - if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) - return 0; - return 1; -} - -void __init mcheck_intel_therm_init(void) -{ - /* - * This function is only called on boot CPU. 
Save the init thermal - * LVT value on BSP and use that value to restore APs' thermal LVT - * entry BIOS programmed later - */ - if (intel_thermal_supported(&boot_cpu_data)) - lvtthmr_init = apic_read(APIC_LVTTHMR); -} - -void intel_init_thermal(struct cpuinfo_x86 *c) -{ - unsigned int cpu = smp_processor_id(); - int tm2 = 0; - u32 l, h; - - if (!intel_thermal_supported(c)) - return; - - /* - * First check if its enabled already, in which case there might - * be some SMM goo which handles it, so we can't even put a handler - * since it might be delivered via SMI already: - */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - - h = lvtthmr_init; - /* - * The initial value of thermal LVT entries on all APs always reads - * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI - * sequence to them and LVT registers are reset to 0s except for - * the mask bits which are set to 1s when APs receive INIT IPI. - * If BIOS takes over the thermal interrupt and sets its interrupt - * delivery mode to SMI (not fixed), it restores the value that the - * BIOS has programmed on AP based on BSP's info we saved since BIOS - * is always setting the same value for all threads/cores. - */ - if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED) - apic_write(APIC_LVTTHMR, lvtthmr_init); - - - if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { - if (system_state == SYSTEM_BOOTING) - pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu); - return; - } - - /* early Pentium M models use different method for enabling TM2 */ - if (cpu_has(c, X86_FEATURE_TM2)) { - if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) { - rdmsr(MSR_THERM2_CTL, l, h); - if (l & MSR_THERM2_CTL_TM_SELECT) - tm2 = 1; - } else if (l & MSR_IA32_MISC_ENABLE_TM2) - tm2 = 1; - } - - /* We'll mask the thermal vector in the lapic till we're ready: */ - h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; - apic_write(APIC_LVTTHMR, h); - - rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) - wrmsr(MSR_IA32_THERM_INTERRUPT, - (l | (THERM_INT_LOW_ENABLE - | THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h); - else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) - wrmsr(MSR_IA32_THERM_INTERRUPT, - l | (THERM_INT_LOW_ENABLE - | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h); - else - wrmsr(MSR_IA32_THERM_INTERRUPT, - l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); - - if (cpu_has(c, X86_FEATURE_PTS)) { - rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); - if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) - wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, - (l | (PACKAGE_THERM_INT_LOW_ENABLE - | PACKAGE_THERM_INT_HIGH_ENABLE)) - & ~PACKAGE_THERM_INT_PLN_ENABLE, h); - else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) - wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, - l | (PACKAGE_THERM_INT_LOW_ENABLE - | PACKAGE_THERM_INT_HIGH_ENABLE - | PACKAGE_THERM_INT_PLN_ENABLE), h); - else - wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, - l | (PACKAGE_THERM_INT_LOW_ENABLE - | PACKAGE_THERM_INT_HIGH_ENABLE), h); - } - - smp_thermal_vector = intel_thermal_interrupt; - - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); - - pr_info_once("CPU0: Thermal monitoring enabled (%s)\n", - tm2 ? 
"TM2" : "TM1"); - - /* enable thermal throttle processing */ - atomic_set(&therm_throt_en, 1); -} diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index ec6f0415bc6d..b935e1b5f115 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -830,7 +830,7 @@ static const struct attribute_group cpu_root_microcode_group = { .attrs = cpu_root_microcode_attrs, }; -int __init microcode_init(void) +static int __init microcode_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; int error; diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 5bd011737272..9231640782fa 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -537,9 +537,9 @@ static void __init print_out_mtrr_range_state(void) if (!size_base) continue; - size_base = to_size_factor(size_base, &size_factor), + size_base = to_size_factor(size_base, &size_factor); start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); - start_base = to_size_factor(start_base, &start_factor), + start_base = to_size_factor(start_base, &start_factor); type = range_state[i].type; pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n", diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index a29997e6cf9e..b90f3f437765 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -3,7 +3,6 @@ * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong * because MTRRs can span up to 40 bits (36bits on most modern x86) */ -#define DEBUG #include <linux/export.h> #include <linux/init.h> diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 61eb26edc6d2..28c8a23aa42e 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -31,8 +31,6 @@ System Programming Guide; Section 9.11. (1997 edition - PPro). */ -#define DEBUG - #include <linux/types.h> /* FIXME: kvm_para.h needs this */ #include <linux/stop_machine.h> diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index a5ee607a3b89..3ef5868ac588 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -3,7 +3,7 @@ * local apic based NMI watchdog for various CPUs. * * This file also handles reservation of performance counters for coordination - * with other users (like oprofile). + * with other users. * * Note that these events normally don't tick when the CPU idles. This means * the frequency varies with CPU load. 
@@ -105,15 +105,6 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) } -/* checks for a bit availability (hack for oprofile) */ -int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) -{ - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - return !test_bit(counter, perfctr_nmi_owner); -} -EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); - int reserve_perfctr_nmi(unsigned int msr) { unsigned int counter; diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index ee71c47844cb..c4d320d02fd5 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -572,6 +572,7 @@ union cpuid_0x10_x_edx { void rdt_last_cmd_clear(void); void rdt_last_cmd_puts(const char *s); +__printf(1, 2) void rdt_last_cmd_printf(const char *fmt, ...); void rdt_ctrl_update(void *arg); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 460f3e0df106..f9190adc52cb 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -563,11 +563,11 @@ static int __rdtgroup_move_task(struct task_struct *tsk, */ if (rdtgrp->type == RDTCTRL_GROUP) { - tsk->closid = rdtgrp->closid; - tsk->rmid = rdtgrp->mon.rmid; + WRITE_ONCE(tsk->closid, rdtgrp->closid); + WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); } else if (rdtgrp->type == RDTMON_GROUP) { if (rdtgrp->mon.parent->closid == tsk->closid) { - tsk->rmid = rdtgrp->mon.rmid; + WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); } else { rdt_last_cmd_puts("Can't move task to different control group\n"); return -EINVAL; @@ -2310,22 +2310,18 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, for_each_process_thread(p, t) { if (!from || is_closid_match(t, from) || is_rmid_match(t, from)) { - t->closid = to->closid; - t->rmid = to->mon.rmid; + WRITE_ONCE(t->closid, to->closid); + WRITE_ONCE(t->rmid, to->mon.rmid); -#ifdef CONFIG_SMP /* - * This is safe on x86 w/o barriers as the ordering - * of writing to task_cpu() and t->on_cpu is - * reverse to the reading here. The detection is - * inaccurate as tasks might move or schedule - * before the smp function call takes place. In - * such a case the function call is pointless, but + * If the task is on a CPU, set the CPU in the mask. + * The detection is inaccurate as tasks might move or + * schedule before the smp function call takes place. + * In such a case the function call is pointless, but * there is no other side effect. 
*/ - if (mask && t->on_cpu) + if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t)) cpumask_set_cpu(task_cpu(t), mask); -#endif } } read_unlock(&tasklist_lock); diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 236924930bf0..972ec3bfa9c0 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -40,11 +40,6 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, - { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, - { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 }, - { X86_FEATURE_SEV_ES, CPUID_EAX, 3, 0x8000001f, 0 }, - { X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 }, - { X86_FEATURE_VM_PAGE_FLUSH, CPUID_EAX, 2, 0x8000001f, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index f65564a94b9b..7449ef33f081 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -141,7 +141,6 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) struct sgx_encl_page *entry; unsigned long phys_addr; struct sgx_encl *encl; - unsigned long pfn; vm_fault_t ret; encl = vma->vm_private_data; @@ -168,13 +167,6 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) phys_addr = sgx_get_epc_phys_addr(entry->epc_page); - /* Check if another thread got here first to insert the PTE. */ - if (!follow_pfn(vma, addr, &pfn)) { - mutex_unlock(&encl->lock); - - return VM_FAULT_NOPAGE; - } - ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr)); if (ret != VM_FAULT_NOPAGE) { mutex_unlock(&encl->lock); diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index c519fc5f6948..8df81a3ed945 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -700,25 +700,27 @@ static bool __init sgx_page_cache_init(void) return true; } -static void __init sgx_init(void) +static int __init sgx_init(void) { int ret; int i; if (!cpu_feature_enabled(X86_FEATURE_SGX)) - return; + return -ENODEV; if (!sgx_page_cache_init()) - return; + return -ENOMEM; - if (!sgx_page_reclaimer_init()) + if (!sgx_page_reclaimer_init()) { + ret = -ENOMEM; goto err_page_cache; + } ret = sgx_drv_init(); if (ret) goto err_kthread; - return; + return 0; err_kthread: kthread_stop(ksgxd_tsk); @@ -728,6 +730,8 @@ err_page_cache: vfree(sgx_epc_sections[i].pages); memunmap(sgx_epc_sections[i].virt_addr); } + + return ret; } device_initcall(sgx_init); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 5d8047441a0a..683749b80ae2 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -167,14 +167,14 @@ void fpstate_sanitize_xstate(struct fpu *fpu) fx->fop = 0; fx->rip = 0; fx->rdp = 0; - memset(&fx->st_space[0], 0, 128); + memset(fx->st_space, 0, sizeof(fx->st_space)); } /* * SSE is in init state */ if (!(xfeatures & XFEATURE_MASK_SSE)) - memset(&fx->xmm_space[0], 0, 256); + memset(fx->xmm_space, 0, sizeof(fx->xmm_space)); /* * First two features are FPU and SSE, which above we handled diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index c5dd50369e2f..d4ad344e80bf 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -21,6 +21,7 @@ #include <asm/hw_irq.h> #include <asm/desc.h> #include <asm/traps.h> +#include <asm/thermal.h> #define CREATE_TRACE_POINTS #include <asm/trace/irq_vectors.h> @@ -374,3 +375,23 @@ void fixup_irqs(void) } } #endif + +#ifdef 
CONFIG_X86_THERMAL_VECTOR +static void smp_thermal_vector(void) +{ + if (x86_thermal_enabled()) + intel_thermal_interrupt(); + else + pr_err("CPU%d: Unexpected LVT thermal interrupt!\n", + smp_processor_id()); +} + +DEFINE_IDTENTRY_SYSVEC(sysvec_thermal) +{ + trace_thermal_apic_entry(THERMAL_APIC_VECTOR); + inc_irq_stat(irq_thermal_count); + smp_thermal_vector(); + trace_thermal_apic_exit(THERMAL_APIC_VECTOR); + ack_APIC_irq(); +} +#endif diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S index 0db0375235b4..8ef35063964b 100644 --- a/arch/x86/kernel/irqflags.S +++ b/arch/x86/kernel/irqflags.S @@ -13,14 +13,3 @@ SYM_FUNC_START(native_save_fl) ret SYM_FUNC_END(native_save_fl) EXPORT_SYMBOL(native_save_fl) - -/* - * void native_restore_fl(unsigned long flags) - * %eax/%rdi: flags - */ -SYM_FUNC_START(native_restore_fl) - push %_ASM_ARG1 - popf - ret -SYM_FUNC_END(native_restore_fl) -EXPORT_SYMBOL(native_restore_fl) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index b8aee71840ae..aa15132228da 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -398,9 +398,15 @@ static void free_ldt_pgtables(struct mm_struct *mm) if (!boot_cpu_has(X86_FEATURE_PTI)) return; - tlb_gather_mmu(&tlb, mm, start, end); + /* + * Although free_pgd_range() is intended for freeing user + * page-tables, it also works out for kernel mappings on x86. + * We use tlb_gather_mmu_fullmm() to avoid confusing the + * range-tracking logic in __tlb_adjust_range(). + */ + tlb_gather_mmu_fullmm(&tlb, mm); free_pgd_range(&tlb, start, end, start, end); - tlb_finish_mmu(&tlb, start, end); + tlb_finish_mmu(&tlb); #endif } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 34b153cbd4ac..5e9a34b5bd74 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -114,6 +114,7 @@ int apply_relocate(Elf32_Shdr *sechdrs, *location += sym->st_value; break; case R_386_PC32: + case R_386_PLT32: /* Add the value, subtract its position */ *location += sym->st_value - (uint32_t)location; break; diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 8a67d1fa8dc5..ed8ac6bcbafb 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -182,6 +182,13 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) err = security_locked_down(LOCKDOWN_MSR); if (err) break; + + err = filter_write(regs[1]); + if (err) + return err; + + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + err = wrmsr_safe_regs_on_cpu(cpu, regs); if (err) break; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 6c3407ba6ee9..c60222ab8ab9 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -135,8 +135,7 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff, else if (opfunc == _paravirt_ident_64) ret = paravirt_patch_ident_64(insn_buff, len); - else if (type == PARAVIRT_PATCH(cpu.iret) || - type == PARAVIRT_PATCH(cpu.usergs_sysret64)) + else if (type == PARAVIRT_PATCH(cpu.iret)) /* If operation requires a jmp, then jmp */ ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len); #endif @@ -170,7 +169,6 @@ static u64 native_steal_clock(int cpu) /* These are in entry.S */ extern void native_iret(void); -extern void native_usergs_sysret64(void); static struct resource reserve_ioports = { .start = 0, @@ -310,9 +308,7 @@ struct paravirt_patch_template pv_ops = { .cpu.load_sp0 = native_load_sp0, - .cpu.usergs_sysret64 = native_usergs_sysret64, .cpu.iret = native_iret, - .cpu.swapgs = native_swapgs, #ifdef 
CONFIG_X86_IOPL_IOPERM .cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap, @@ -324,7 +320,6 @@ struct paravirt_patch_template pv_ops = { /* Irq ops. */ .irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), - .irq.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), .irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), .irq.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), .irq.safe_halt = native_safe_halt, diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c index ace6e334cb39..abd27ec67397 100644 --- a/arch/x86/kernel/paravirt_patch.c +++ b/arch/x86/kernel/paravirt_patch.c @@ -25,10 +25,7 @@ struct patch_xxl { const unsigned char mmu_read_cr2[3]; const unsigned char mmu_read_cr3[3]; const unsigned char mmu_write_cr3[3]; - const unsigned char irq_restore_fl[2]; const unsigned char cpu_wbinvd[2]; - const unsigned char cpu_usergs_sysret64[6]; - const unsigned char cpu_swapgs[3]; const unsigned char mov64[3]; }; @@ -39,11 +36,7 @@ static const struct patch_xxl patch_data_xxl = { .mmu_read_cr2 = { 0x0f, 0x20, 0xd0 }, // mov %cr2, %[re]ax .mmu_read_cr3 = { 0x0f, 0x20, 0xd8 }, // mov %cr3, %[re]ax .mmu_write_cr3 = { 0x0f, 0x22, 0xdf }, // mov %rdi, %cr3 - .irq_restore_fl = { 0x57, 0x9d }, // push %rdi; popfq .cpu_wbinvd = { 0x0f, 0x09 }, // wbinvd - .cpu_usergs_sysret64 = { 0x0f, 0x01, 0xf8, - 0x48, 0x0f, 0x07 }, // swapgs; sysretq - .cpu_swapgs = { 0x0f, 0x01, 0xf8 }, // swapgs .mov64 = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax }; @@ -76,7 +69,6 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr, switch (type) { #ifdef CONFIG_PARAVIRT_XXL - PATCH_CASE(irq, restore_fl, xxl, insn_buff, len); PATCH_CASE(irq, save_fl, xxl, insn_buff, len); PATCH_CASE(irq, irq_enable, xxl, insn_buff, len); PATCH_CASE(irq, irq_disable, xxl, insn_buff, len); @@ -85,8 +77,6 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr, PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len); PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len); - PATCH_CASE(cpu, usergs_sysret64, xxl, insn_buff, len); - PATCH_CASE(cpu, swapgs, xxl, insn_buff, len); PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len); #endif diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c index 2e9006c1e240..42e92ec62973 100644 --- a/arch/x86/kernel/pci-iommu_table.c +++ b/arch/x86/kernel/pci-iommu_table.c @@ -4,9 +4,6 @@ #include <linux/string.h> #include <linux/kallsyms.h> - -#define DEBUG 1 - static struct iommu_table_entry * __init find_dependents_of(struct iommu_table_entry *start, struct iommu_table_entry *finish, diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index bedca011459c..87a4143aa7d7 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -704,6 +704,9 @@ void ptrace_disable(struct task_struct *child) #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION static const struct user_regset_view user_x86_32_view; /* Initialized below. */ #endif +#ifdef CONFIG_X86_64 +static const struct user_regset_view user_x86_64_view; /* Initialized below. 
*/ +#endif long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) @@ -711,6 +714,14 @@ long arch_ptrace(struct task_struct *child, long request, int ret; unsigned long __user *datap = (unsigned long __user *)data; +#ifdef CONFIG_X86_64 + /* This is native 64-bit ptrace() */ + const struct user_regset_view *regset_view = &user_x86_64_view; +#else + /* This is native 32-bit ptrace() */ + const struct user_regset_view *regset_view = &user_x86_32_view; +#endif + switch (request) { /* read the word at location addr in the USER area. */ case PTRACE_PEEKUSR: { @@ -749,28 +760,28 @@ long arch_ptrace(struct task_struct *child, long request, case PTRACE_GETREGS: /* Get all gp regs from the child. */ return copy_regset_to_user(child, - task_user_regset_view(current), + regset_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, - task_user_regset_view(current), + regset_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ return copy_regset_to_user(child, - task_user_regset_view(current), + regset_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); case PTRACE_SETFPREGS: /* Set the child FPU state. */ return copy_regset_from_user(child, - task_user_regset_view(current), + regset_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); @@ -1152,28 +1163,28 @@ static long x32_arch_ptrace(struct task_struct *child, case PTRACE_GETREGS: /* Get all gp regs from the child. */ return copy_regset_to_user(child, - task_user_regset_view(current), + &user_x86_64_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, - task_user_regset_view(current), + &user_x86_64_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ return copy_regset_to_user(child, - task_user_regset_view(current), + &user_x86_64_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); case PTRACE_SETFPREGS: /* Set the child FPU state. */ return copy_regset_from_user(child, - task_user_regset_view(current), + &user_x86_64_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); @@ -1309,6 +1320,25 @@ void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask) xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask; } +/* + * This is used by the core dump code to decide which regset to dump. The + * core dump code writes out the resulting .e_machine and the corresponding + * regsets. This is suboptimal if the task is messing around with its CS.L + * field, but at worst the core dump will end up missing some information. + * + * Unfortunately, it is also used by the broken PTRACE_GETREGSET and + * PTRACE_SETREGSET APIs. These APIs look at the .regsets field but have + * no way to make sure that the e_machine they use matches the caller's + * expectations. The result is that the data format returned by + * PTRACE_GETREGSET depends on the returned CS field (and even the offset + * of the returned CS field depends on its value!) and the data format + * accepted by PTRACE_SETREGSET is determined by the old CS value. The + * upshot is that it is basically impossible to use these APIs correctly. 
+ * + * The best way to fix it in the long run would probably be to add new + * improved ptrace() APIs to read and write registers reliably, possibly by + * allowing userspace to select the ELF e_machine variant that they expect. + */ const struct user_regset_view *task_user_regset_view(struct task_struct *task) { #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index db115943e8bd..9991c5920aac 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -477,6 +477,15 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = { }, }, + { /* PCIe Wifi card isn't detected after reboot otherwise */ + .callback = set_pci_reboot, + .ident = "Zotac ZBOX CI327 nano", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "NA"), + DMI_MATCH(DMI_PRODUCT_NAME, "ZBOX-CI327NANO-GS-01"), + }, + }, + /* Sony */ { /* Handle problems with rebooting on Sony VGN-Z540N */ .callback = set_bios_reboot, diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 504fa5425bce..660b78827638 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -90,14 +90,10 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, off) { - long error; - error = -EINVAL; if (off & ~PAGE_MASK) - goto out; + return -EINVAL; - error = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); -out: - return error; + return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); } static void find_start_end(unsigned long addr, unsigned long flags, diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 764573de3996..e5a7a10a0164 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -134,7 +134,11 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) unsafe_put_user(regs->ds, &user->regs.ds, Efault_end); unsafe_put_user(regs->fs, &user->regs.fs, Efault_end); unsafe_put_user(regs->gs, &user->regs.gs, Efault_end); - unsafe_put_user(vm86->screen_bitmap, &user->screen_bitmap, Efault_end); + + /* + * Don't write screen_bitmap in case some user had a value there + * and expected it to remain unchanged. 
+ */ user_access_end(); @@ -160,49 +164,6 @@ Efault: do_exit(SIGSEGV); } -static void mark_screen_rdonly(struct mm_struct *mm) -{ - struct vm_area_struct *vma; - spinlock_t *ptl; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - int i; - - mmap_write_lock(mm); - pgd = pgd_offset(mm, 0xA0000); - if (pgd_none_or_clear_bad(pgd)) - goto out; - p4d = p4d_offset(pgd, 0xA0000); - if (p4d_none_or_clear_bad(p4d)) - goto out; - pud = pud_offset(p4d, 0xA0000); - if (pud_none_or_clear_bad(pud)) - goto out; - pmd = pmd_offset(pud, 0xA0000); - - if (pmd_trans_huge(*pmd)) { - vma = find_vma(mm, 0xA0000); - split_huge_pmd(vma, pmd, 0xA0000); - } - if (pmd_none_or_clear_bad(pmd)) - goto out; - pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); - for (i = 0; i < 32; i++) { - if (pte_present(*pte)) - set_pte(pte, pte_wrprotect(*pte)); - pte++; - } - pte_unmap_unlock(pte, ptl); -out: - mmap_write_unlock(mm); - flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false); -} - - - static int do_vm86_irq_handling(int subfunction, int irqnumber); static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus); @@ -282,6 +243,15 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) offsetof(struct vm86_struct, int_revectored))) return -EFAULT; + + /* VM86_SCREEN_BITMAP had numerous bugs and appears to have no users. */ + if (v.flags & VM86_SCREEN_BITMAP) { + char comm[TASK_COMM_LEN]; + + pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no longer supported\n", get_task_comm(comm, current)); + return -EINVAL; + } + memset(&vm86regs, 0, sizeof(vm86regs)); vm86regs.pt.bx = v.regs.ebx; @@ -302,7 +272,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) vm86regs.gs = v.regs.gs; vm86->flags = v.flags; - vm86->screen_bitmap = v.screen_bitmap; vm86->cpu_type = v.cpu_type; if (copy_from_user(&vm86->int_revectored, @@ -370,9 +339,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) update_task_stack(tsk); preempt_enable(); - if (vm86->flags & VM86_SCREEN_BITMAP) - mark_screen_rdonly(tsk->mm); - memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs)); return regs->ax; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f1f1b5a0956a..525197381baa 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -16,7 +16,7 @@ #include <linux/prefetch.h> /* prefetchw */ #include <linux/context_tracking.h> /* exception_enter(), ... */ #include <linux/uaccess.h> /* faulthandler_disabled() */ -#include <linux/efi.h> /* efi_recover_from_page_fault()*/ +#include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/ #include <linux/mm_types.h> #include <asm/cpufeature.h> /* boot_cpu_has, ... */ @@ -25,7 +25,7 @@ #include <asm/vsyscall.h> /* emulate_vsyscall */ #include <asm/vm86.h> /* struct vm86 */ #include <asm/mmu_context.h> /* vma_pkey() */ -#include <asm/efi.h> /* efi_recover_from_page_fault()*/ +#include <asm/efi.h> /* efi_crash_gracefully_on_page_fault()*/ #include <asm/desc.h> /* store_idt(), ... */ #include <asm/cpu_entry_area.h> /* exception stack */ #include <asm/pgtable_areas.h> /* VMALLOC_START, ... */ @@ -54,7 +54,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr) * 32-bit mode: * * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. - * Check that here and ignore it. + * Check that here and ignore it. This is AMD erratum #91. 
* * 64-bit mode: * @@ -83,11 +83,7 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, #ifdef CONFIG_X86_64 case 0x40: /* - * In AMD64 long mode 0x40..0x4F are valid REX prefixes - * Need to figure out under what instruction mode the - * instruction was issued. Could check the LDT for lm, - * but for now it's good enough to assume that long - * mode only uses well known segments or kernel. + * In 64-bit mode 0x40..0x4F are valid REX prefixes */ return (!user_mode(regs) || user_64bit_mode(regs)); #endif @@ -110,6 +106,15 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, } } +static bool is_amd_k8_pre_npt(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) && + c->x86_vendor == X86_VENDOR_AMD && + c->x86 == 0xf && c->x86_model < 0x40); +} + static int is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) { @@ -117,6 +122,10 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) unsigned char *instr; int prefetch = 0; + /* Erratum #91 affects AMD K8, pre-NPT CPUs */ + if (!is_amd_k8_pre_npt()) + return 0; + /* * If it was a exec (instruction fetch) fault on NX page, then * do not ignore the fault: @@ -127,20 +136,31 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) instr = (void *)convert_ip_to_linear(current, regs); max_instr = instr + 15; - if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX) - return 0; + /* + * This code has historically always bailed out if IP points to a + * not-present page (e.g. due to a race). No one has ever + * complained about this. + */ + pagefault_disable(); while (instr < max_instr) { unsigned char opcode; - if (get_kernel_nofault(opcode, instr)) - break; + if (user_mode(regs)) { + if (get_user(opcode, instr)) + break; + } else { + if (get_kernel_nofault(opcode, instr)) + break; + } instr++; if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) break; } + + pagefault_enable(); return prefetch; } @@ -262,25 +282,6 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end) } } -/* - * Did it hit the DOS screen memory VA from vm86 mode? 
- */ -static inline void -check_v8086_mode(struct pt_regs *regs, unsigned long address, - struct task_struct *tsk) -{ -#ifdef CONFIG_VM86 - unsigned long bit; - - if (!v8086_mode(regs) || !tsk->thread.vm86) - return; - - bit = (address - 0xA0000) >> PAGE_SHIFT; - if (bit < 32) - tsk->thread.vm86->screen_bitmap |= 1 << bit; -#endif -} - static bool low_pfn(unsigned long pfn) { return pfn < max_low_pfn; @@ -335,15 +336,6 @@ KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; #endif -/* - * No vm86 mode in 64-bit mode: - */ -static inline void -check_v8086_mode(struct pt_regs *regs, unsigned long address, - struct task_struct *tsk) -{ -} - static int bad_address(void *p) { unsigned long dummy; @@ -427,6 +419,9 @@ static int is_errata93(struct pt_regs *regs, unsigned long address) || boot_cpu_data.x86 != 0xf) return 0; + if (user_mode(regs)) + return 0; + if (address != regs->ip) return 0; @@ -462,10 +457,12 @@ static int is_errata100(struct pt_regs *regs, unsigned long address) } /* Pentium F0 0F C7 C8 bug workaround: */ -static int is_f00f_bug(struct pt_regs *regs, unsigned long address) +static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { #ifdef CONFIG_X86_F00F_BUG - if (boot_cpu_has_bug(X86_BUG_F00F) && idt_is_f00f_address(address)) { + if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) && + idt_is_f00f_address(address)) { handle_invalid_op(regs); return 1; } @@ -630,53 +627,20 @@ static void set_signal_archinfo(unsigned long address, } static noinline void -no_context(struct pt_regs *regs, unsigned long error_code, - unsigned long address, int signal, int si_code) +page_fault_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { - struct task_struct *tsk = current; unsigned long flags; int sig; if (user_mode(regs)) { /* - * This is an implicit supervisor-mode access from user - * mode. Bypass all the kernel-mode recovery code and just - * OOPS. + * Implicit kernel access from user mode? Skip the stack + * overflow and EFI special cases. */ goto oops; } - /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) { - /* - * Any interrupt that takes a fault gets the fixup. This makes - * the below recursive fault logic only apply to a faults from - * task context. - */ - if (in_interrupt()) - return; - - /* - * Per the above we're !in_interrupt(), aka. task context. - * - * In this case we need to make sure we're not recursively - * faulting through the emulate_vsyscall() logic. - */ - if (current->thread.sig_on_uaccess_err && signal) { - sanitize_error_code(address, &error_code); - - set_signal_archinfo(address, error_code); - - /* XXX: hwpoison faults will set the wrong code. */ - force_sig_fault(signal, si_code, (void __user *)address); - } - - /* - * Barring that, we can do the fixup and be happy. - */ - return; - } - #ifdef CONFIG_VMAP_STACK /* * Stack overflow? During boot, we can fault near the initial @@ -684,8 +648,8 @@ no_context(struct pt_regs *regs, unsigned long error_code, * that we're in vmalloc space to avoid this. 
*/ if (is_vmalloc_addr((void *)address) && - (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || - address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { + (((unsigned long)current->stack - 1 - address < PAGE_SIZE) || + address - ((unsigned long)current->stack + THREAD_SIZE) < PAGE_SIZE)) { unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *); /* * We're likely to be running with very little stack space @@ -709,28 +673,12 @@ no_context(struct pt_regs *regs, unsigned long error_code, #endif /* - * 32-bit: - * - * Valid to do another page fault here, because if this fault - * had been triggered by is_prefetch fixup_exception would have - * handled it. - * - * 64-bit: - * - * Hall of shame of CPU/BIOS bugs. - */ - if (is_prefetch(regs, error_code, address)) - return; - - if (is_errata93(regs, address)) - return; - - /* - * Buggy firmware could access regions which might page fault, try to - * recover from such faults. + * Buggy firmware could access regions which might page fault. If + * this happens, EFI has a special OOPS path that will try to + * avoid hanging the system. */ if (IS_ENABLED(CONFIG_EFI)) - efi_recover_from_page_fault(address); + efi_crash_gracefully_on_page_fault(address); oops: /* @@ -741,7 +689,7 @@ oops: show_fault_oops(regs, error_code, address); - if (task_stack_end_corrupted(tsk)) + if (task_stack_end_corrupted(current)) printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); sig = SIGKILL; @@ -754,6 +702,53 @@ oops: oops_end(flags, regs, sig); } +static noinline void +kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int signal, int si_code) +{ + WARN_ON_ONCE(user_mode(regs)); + + /* Are we prepared to handle this kernel fault? */ + if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) { + /* + * Any interrupt that takes a fault gets the fixup. This makes + * the below recursive fault logic only apply to a faults from + * task context. + */ + if (in_interrupt()) + return; + + /* + * Per the above we're !in_interrupt(), aka. task context. + * + * In this case we need to make sure we're not recursively + * faulting through the emulate_vsyscall() logic. + */ + if (current->thread.sig_on_uaccess_err && signal) { + sanitize_error_code(address, &error_code); + + set_signal_archinfo(address, error_code); + + /* XXX: hwpoison faults will set the wrong code. */ + force_sig_fault(signal, si_code, (void __user *)address); + } + + /* + * Barring that, we can do the fixup and be happy. + */ + return; + } + + /* + * AMD erratum #91 manifests as a spurious page fault on a PREFETCH + * instruction. 
+ */ + if (is_prefetch(regs, error_code, address)) + return; + + page_fault_oops(regs, error_code, address); +} + /* * Print out info about fatal segfaults, if the show_unhandled_signals * sysctl is set: @@ -796,47 +791,49 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, { struct task_struct *tsk = current; - /* User mode accesses just cause a SIGSEGV */ - if (user_mode(regs) && (error_code & X86_PF_USER)) { - /* - * It's possible to have interrupts off here: - */ - local_irq_enable(); - - /* - * Valid to do another page fault here because this one came - * from user space: - */ - if (is_prefetch(regs, error_code, address)) - return; + if (!user_mode(regs)) { + kernelmode_fixup_or_oops(regs, error_code, address, pkey, si_code); + return; + } - if (is_errata100(regs, address)) - return; + if (!(error_code & X86_PF_USER)) { + /* Implicit user access to kernel memory -- just oops */ + page_fault_oops(regs, error_code, address); + return; + } - sanitize_error_code(address, &error_code); + /* + * User mode accesses just cause a SIGSEGV. + * It's possible to have interrupts off here: + */ + local_irq_enable(); - if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) - return; + /* + * Valid to do another page fault here because this one came + * from user space: + */ + if (is_prefetch(regs, error_code, address)) + return; - if (likely(show_unhandled_signals)) - show_signal_msg(regs, error_code, address, tsk); + if (is_errata100(regs, address)) + return; - set_signal_archinfo(address, error_code); + sanitize_error_code(address, &error_code); - if (si_code == SEGV_PKUERR) - force_sig_pkuerr((void __user *)address, pkey); + if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) + return; - force_sig_fault(SIGSEGV, si_code, (void __user *)address); + if (likely(show_unhandled_signals)) + show_signal_msg(regs, error_code, address, tsk); - local_irq_disable(); + set_signal_archinfo(address, error_code); - return; - } + if (si_code == SEGV_PKUERR) + force_sig_pkuerr((void __user *)address, pkey); - if (is_f00f_bug(regs, address)) - return; + force_sig_fault(SIGSEGV, si_code, (void __user *)address); - no_context(regs, error_code, address, SIGSEGV, si_code); + local_irq_disable(); } static noinline void @@ -926,8 +923,8 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, vm_fault_t fault) { /* Kernel mode? Handle exceptions or die: */ - if (!(error_code & X86_PF_USER)) { - no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); + if (!user_mode(regs)) { + kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR); return; } @@ -961,40 +958,6 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); } -static noinline void -mm_fault_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address, vm_fault_t fault) -{ - if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) { - no_context(regs, error_code, address, 0, 0); - return; - } - - if (fault & VM_FAULT_OOM) { - /* Kernel mode? 
Handle exceptions or die: */ - if (!(error_code & X86_PF_USER)) { - no_context(regs, error_code, address, - SIGSEGV, SEGV_MAPERR); - return; - } - - /* - * We ran out of memory, call the OOM killer, and return the - * userspace (which will retry the fault, or kill us if we got - * oom-killed): - */ - pagefault_out_of_memory(); - } else { - if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| - VM_FAULT_HWPOISON_LARGE)) - do_sigbus(regs, error_code, address, fault); - else if (fault & VM_FAULT_SIGSEGV) - bad_area_nosemaphore(regs, error_code, address); - else - BUG(); - } -} - static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) @@ -1209,6 +1172,9 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, } #endif + if (is_f00f_bug(regs, hw_error_code, address)) + return; + /* Was the fault spurious, caused by lazy TLB invalidation? */ if (spurious_kernel_fault(hw_error_code, address)) return; @@ -1229,10 +1195,17 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, } NOKPROBE_SYMBOL(do_kern_addr_fault); -/* Handle faults in the user portion of the address space */ +/* + * Handle faults in the user portion of the address space. Nothing in here + * should check X86_PF_USER without a specific justification: for almost + * all purposes, we should treat a normal kernel access to user memory + * (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction. + * The one exception is AC flag handling, which is, per the x86 + * architecture, special for WRUSS. + */ static inline void do_user_addr_fault(struct pt_regs *regs, - unsigned long hw_error_code, + unsigned long error_code, unsigned long address) { struct vm_area_struct *vma; @@ -1244,6 +1217,21 @@ void do_user_addr_fault(struct pt_regs *regs, tsk = current; mm = tsk->mm; + if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) { + /* + * Whoops, this is kernel mode code trying to execute from + * user memory. Unless this is AMD erratum #93, which + * corrupts RIP such that it looks like a user address, + * this is unrecoverable. Don't even try to look up the + * VMA or look for extable entries. + */ + if (is_errata93(regs, address)) + return; + + page_fault_oops(regs, error_code, address); + return; + } + /* kprobes don't want to hook the spurious faults: */ if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF))) return; @@ -1252,8 +1240,8 @@ void do_user_addr_fault(struct pt_regs *regs, * Reserved bits are never expected to be set on * entries in the user portion of the page tables. */ - if (unlikely(hw_error_code & X86_PF_RSVD)) - pgtable_bad(regs, hw_error_code, address); + if (unlikely(error_code & X86_PF_RSVD)) + pgtable_bad(regs, error_code, address); /* * If SMAP is on, check for invalid kernel (supervisor) access to user @@ -1263,10 +1251,13 @@ void do_user_addr_fault(struct pt_regs *regs, * enforcement appears to be consistent with the USER bit. */ if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) && - !(hw_error_code & X86_PF_USER) && - !(regs->flags & X86_EFLAGS_AC))) - { - bad_area_nosemaphore(regs, hw_error_code, address); + !(error_code & X86_PF_USER) && + !(regs->flags & X86_EFLAGS_AC))) { + /* + * No extable entry here. This was a kernel access to an + * invalid pointer. get_kernel_nofault() will not get here. 
+ */ + page_fault_oops(regs, error_code, address); return; } @@ -1275,7 +1266,7 @@ void do_user_addr_fault(struct pt_regs *regs, * in a region with pagefaults disabled then we must not take the fault */ if (unlikely(faulthandler_disabled() || !mm)) { - bad_area_nosemaphore(regs, hw_error_code, address); + bad_area_nosemaphore(regs, error_code, address); return; } @@ -1296,9 +1287,9 @@ void do_user_addr_fault(struct pt_regs *regs, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - if (hw_error_code & X86_PF_WRITE) + if (error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; - if (hw_error_code & X86_PF_INSTR) + if (error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; #ifdef CONFIG_X86_64 @@ -1314,7 +1305,7 @@ void do_user_addr_fault(struct pt_regs *regs, * to consider the PF_PK bit. */ if (is_vsyscall_vaddr(address)) { - if (emulate_vsyscall(hw_error_code, regs, address)) + if (emulate_vsyscall(error_code, regs, address)) return; } #endif @@ -1337,7 +1328,7 @@ void do_user_addr_fault(struct pt_regs *regs, * Fault from code in kernel from * which we do not expect faults. */ - bad_area_nosemaphore(regs, hw_error_code, address); + bad_area_nosemaphore(regs, error_code, address); return; } retry: @@ -1353,17 +1344,17 @@ retry: vma = find_vma(mm, address); if (unlikely(!vma)) { - bad_area(regs, hw_error_code, address); + bad_area(regs, error_code, address); return; } if (likely(vma->vm_start <= address)) goto good_area; if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { - bad_area(regs, hw_error_code, address); + bad_area(regs, error_code, address); return; } if (unlikely(expand_stack(vma, address))) { - bad_area(regs, hw_error_code, address); + bad_area(regs, error_code, address); return; } @@ -1372,8 +1363,8 @@ retry: * we can handle it.. */ good_area: - if (unlikely(access_error(hw_error_code, vma))) { - bad_area_access_error(regs, hw_error_code, address, vma); + if (unlikely(access_error(error_code, vma))) { + bad_area_access_error(regs, error_code, address, vma); return; } @@ -1392,11 +1383,14 @@ good_area: */ fault = handle_mm_fault(vma, address, flags, regs); - /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { + /* + * Quick path to respond to signals. The core mm code + * has unlocked the mm for us if we get here. + */ if (!user_mode(regs)) - no_context(regs, hw_error_code, address, SIGBUS, - BUS_ADRERR); + kernelmode_fixup_or_oops(regs, error_code, address, + SIGBUS, BUS_ADRERR); return; } @@ -1412,12 +1406,37 @@ good_area: } mmap_read_unlock(mm); - if (unlikely(fault & VM_FAULT_ERROR)) { - mm_fault_error(regs, hw_error_code, address, fault); + if (likely(!(fault & VM_FAULT_ERROR))) + return; + + if (fatal_signal_pending(current) && !user_mode(regs)) { + kernelmode_fixup_or_oops(regs, error_code, address, 0, 0); return; } - check_v8086_mode(regs, address, tsk); + if (fault & VM_FAULT_OOM) { + /* Kernel mode? 
Handle exceptions or die: */ + if (!user_mode(regs)) { + kernelmode_fixup_or_oops(regs, error_code, address, + SIGSEGV, SEGV_MAPERR); + return; + } + + /* + * We ran out of memory, call the OOM killer, and return the + * userspace (which will retry the fault, or kill us if we got + * oom-killed): + */ + pagefault_out_of_memory(); + } else { + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| + VM_FAULT_HWPOISON_LARGE)) + do_sigbus(regs, error_code, address, fault); + else if (fault & VM_FAULT_SIGSEGV) + bad_area_nosemaphore(regs, error_code, address); + else + BUG(); + } } NOKPROBE_SYMBOL(do_user_addr_fault); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index e26f5c5c6565..dd694fb93916 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -157,16 +157,25 @@ __ref void *alloc_low_pages(unsigned int num) } /* - * By default need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS. - * With KASLR memory randomization, depending on the machine e820 memory - * and the PUD alignment. We may need twice more pages when KASLR memory + * By default need to be able to allocate page tables below PGD firstly for + * the 0-ISA_END_ADDRESS range and secondly for the initial PMD_SIZE mapping. + * With KASLR memory randomization, depending on the machine e820 memory and the + * PUD alignment, twice that many pages may be needed when KASLR memory * randomization is enabled. */ + +#ifndef CONFIG_X86_5LEVEL +#define INIT_PGD_PAGE_TABLES 3 +#else +#define INIT_PGD_PAGE_TABLES 4 +#endif + #ifndef CONFIG_RANDOMIZE_MEMORY -#define INIT_PGD_PAGE_COUNT 6 +#define INIT_PGD_PAGE_COUNT (2 * INIT_PGD_PAGE_TABLES) #else -#define INIT_PGD_PAGE_COUNT 12 +#define INIT_PGD_PAGE_COUNT (4 * INIT_PGD_PAGE_TABLES) #endif + #define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE) RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); void __init early_alloc_pgt_buf(void) diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index c3d5f0236f35..4b01f7dbaf30 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -475,9 +475,10 @@ void __init mem_encrypt_init(void) swiotlb_update_mem_attributes(); /* - * With SEV, we need to unroll the rep string I/O instructions. + * With SEV, we need to unroll the rep string I/O instructions, + * but SEV-ES supports them through the #VC handler. 
*/ - if (sev_active()) + if (sev_active() && !sev_es_active()) static_branch_enable(&sev_enable_key); print_mem_encrypt_feature_info(); diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index bd7aff5c51f7..cd768dafca9e 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -10,8 +10,6 @@ #define pr_fmt(fmt) "mmiotrace: " fmt -#define DEBUG 1 - #include <linux/moduleparam.h> #include <linux/debugfs.h> #include <linux/slab.h> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 796506dcfc42..79e7a0ec1da5 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -205,6 +205,18 @@ static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg) return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3); } +/* Some 1-byte opcodes for binary ALU operations */ +static u8 simple_alu_opcodes[] = { + [BPF_ADD] = 0x01, + [BPF_SUB] = 0x29, + [BPF_AND] = 0x21, + [BPF_OR] = 0x09, + [BPF_XOR] = 0x31, + [BPF_LSH] = 0xE0, + [BPF_RSH] = 0xE8, + [BPF_ARSH] = 0xF8, +}; + static void jit_fill_hole(void *area, unsigned int size) { /* Fill whole space with INT3 instructions */ @@ -681,6 +693,42 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg) *pprog = prog; } +/* Emit the suffix (ModR/M etc) for addressing *(ptr_reg + off) and val_reg */ +static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (is_imm8(off)) { + /* 1-byte signed displacement. + * + * If off == 0 we could skip this and save one extra byte, but + * special case of x86 R13 which always needs an offset is not + * worth the hassle + */ + EMIT2(add_2reg(0x40, ptr_reg, val_reg), off); + } else { + /* 4-byte signed displacement */ + EMIT1_off32(add_2reg(0x80, ptr_reg, val_reg), off); + } + *pprog = prog; +} + +/* + * Emit a REX byte if it will be necessary to address these registers + */ +static void maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (is64) + EMIT1(add_2mod(0x48, dst_reg, src_reg)); + else if (is_ereg(dst_reg) || is_ereg(src_reg)) + EMIT1(add_2mod(0x40, dst_reg, src_reg)); + *pprog = prog; +} + /* LDX: dst_reg = *(u8*)(src_reg + off) */ static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { @@ -708,15 +756,7 @@ static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B); break; } - /* - * If insn->off == 0 we can save one extra byte, but - * special case of x86 R13 which always needs an offset - * is not worth the hassle - */ - if (is_imm8(off)) - EMIT2(add_2reg(0x40, src_reg, dst_reg), off); - else - EMIT1_off32(add_2reg(0x80, src_reg, dst_reg), off); + emit_insn_suffix(&prog, src_reg, dst_reg, off); *pprog = prog; } @@ -751,11 +791,51 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89); break; } - if (is_imm8(off)) - EMIT2(add_2reg(0x40, dst_reg, src_reg), off); - else - EMIT1_off32(add_2reg(0x80, dst_reg, src_reg), off); + emit_insn_suffix(&prog, dst_reg, src_reg, off); + *pprog = prog; +} + +static int emit_atomic(u8 **pprog, u8 atomic_op, + u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size) +{ + u8 *prog = *pprog; + int cnt = 0; + + EMIT1(0xF0); /* lock prefix */ + + maybe_emit_mod(&prog, dst_reg, src_reg, bpf_size == BPF_DW); + + /* emit opcode */ + switch (atomic_op) { + case BPF_ADD: + case BPF_SUB: + case BPF_AND: + case BPF_OR: + case BPF_XOR: + /* lock 
*(u32/u64*)(dst_reg + off) <op>= src_reg */ + EMIT1(simple_alu_opcodes[atomic_op]); + break; + case BPF_ADD | BPF_FETCH: + /* src_reg = atomic_fetch_add(dst_reg + off, src_reg); */ + EMIT2(0x0F, 0xC1); + break; + case BPF_XCHG: + /* src_reg = atomic_xchg(dst_reg + off, src_reg); */ + EMIT1(0x87); + break; + case BPF_CMPXCHG: + /* r0 = atomic_cmpxchg(dst_reg + off, r0, src_reg); */ + EMIT2(0x0F, 0xB1); + break; + default: + pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op); + return -EFAULT; + } + + emit_insn_suffix(&prog, dst_reg, src_reg, off); + *pprog = prog; + return 0; } static bool ex_handler_bpf(const struct exception_table_entry *x, @@ -789,8 +869,31 @@ static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt, } } +static int emit_nops(u8 **pprog, int len) +{ + u8 *prog = *pprog; + int i, noplen, cnt = 0; + + while (len > 0) { + noplen = len; + + if (noplen > ASM_NOP_MAX) + noplen = ASM_NOP_MAX; + + for (i = 0; i < noplen; i++) + EMIT1(ideal_nops[noplen][i]); + len -= noplen; + } + + *pprog = prog; + + return cnt; +} + +#define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp))) + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, - int oldproglen, struct jit_context *ctx) + int oldproglen, struct jit_context *ctx, bool jmp_padding) { bool tail_call_reachable = bpf_prog->aux->tail_call_reachable; struct bpf_insn *insn = bpf_prog->insnsi; @@ -800,8 +903,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; int i, cnt = 0, excnt = 0; - int proglen = 0; + int ilen, proglen = 0; u8 *prog = temp; + int err; detect_reg_usage(insn, insn_cnt, callee_regs_used, &tail_call_seen); @@ -813,17 +917,24 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, bpf_prog_was_classic(bpf_prog), tail_call_reachable, bpf_prog->aux->func_idx != 0); push_callee_regs(&prog, callee_regs_used); - addrs[0] = prog - temp; + + ilen = prog - temp; + if (image) + memcpy(image + proglen, temp, ilen); + proglen += ilen; + addrs[0] = proglen; + prog = temp; for (i = 1; i <= insn_cnt; i++, insn++) { const s32 imm32 = insn->imm; u32 dst_reg = insn->dst_reg; u32 src_reg = insn->src_reg; u8 b2 = 0, b3 = 0; + u8 *start_of_ldx; s64 jmp_offset; u8 jmp_cond; - int ilen; u8 *func; + int nops; switch (insn->code) { /* ALU */ @@ -837,17 +948,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_ALU64 | BPF_AND | BPF_X: case BPF_ALU64 | BPF_OR | BPF_X: case BPF_ALU64 | BPF_XOR | BPF_X: - switch (BPF_OP(insn->code)) { - case BPF_ADD: b2 = 0x01; break; - case BPF_SUB: b2 = 0x29; break; - case BPF_AND: b2 = 0x21; break; - case BPF_OR: b2 = 0x09; break; - case BPF_XOR: b2 = 0x31; break; - } - if (BPF_CLASS(insn->code) == BPF_ALU64) - EMIT1(add_2mod(0x48, dst_reg, src_reg)); - else if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT1(add_2mod(0x40, dst_reg, src_reg)); + maybe_emit_mod(&prog, dst_reg, src_reg, + BPF_CLASS(insn->code) == BPF_ALU64); + b2 = simple_alu_opcodes[BPF_OP(insn->code)]; EMIT2(b2, add_2reg(0xC0, dst_reg, src_reg)); break; @@ -1027,12 +1130,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, else if (is_ereg(dst_reg)) EMIT1(add_1mod(0x40, dst_reg)); - switch (BPF_OP(insn->code)) { - case BPF_LSH: b3 = 0xE0; break; - case BPF_RSH: b3 = 0xE8; break; - case BPF_ARSH: b3 = 0xF8; break; - } - + b3 = simple_alu_opcodes[BPF_OP(insn->code)]; if (imm32 == 1) EMIT2(0xD1, add_1reg(b3, dst_reg)); else @@ -1066,11 +1164,7 @@ static int 
do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, else if (is_ereg(dst_reg)) EMIT1(add_1mod(0x40, dst_reg)); - switch (BPF_OP(insn->code)) { - case BPF_LSH: b3 = 0xE0; break; - case BPF_RSH: b3 = 0xE8; break; - case BPF_ARSH: b3 = 0xF8; break; - } + b3 = simple_alu_opcodes[BPF_OP(insn->code)]; EMIT2(0xD3, add_1reg(b3, dst_reg)); if (src_reg != BPF_REG_4) @@ -1185,12 +1279,30 @@ st: if (is_imm8(insn->off)) case BPF_LDX | BPF_PROBE_MEM | BPF_W: case BPF_LDX | BPF_MEM | BPF_DW: case BPF_LDX | BPF_PROBE_MEM | BPF_DW: + if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { + /* test src_reg, src_reg */ + maybe_emit_mod(&prog, src_reg, src_reg, true); /* always 1 byte */ + EMIT2(0x85, add_2reg(0xC0, src_reg, src_reg)); + /* jne start_of_ldx */ + EMIT2(X86_JNE, 0); + /* xor dst_reg, dst_reg */ + emit_mov_imm32(&prog, false, dst_reg, 0); + /* jmp byte_after_ldx */ + EMIT2(0xEB, 0); + + /* populate jmp_offset for JNE above */ + temp[4] = prog - temp - 5 /* sizeof(test + jne) */; + start_of_ldx = prog; + } emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { struct exception_table_entry *ex; u8 *_insn = image + proglen; s64 delta; + /* populate jmp_offset for JMP above */ + start_of_ldx[-1] = prog - start_of_ldx; + if (!bpf_prog->aux->extable) break; @@ -1230,21 +1342,56 @@ st: if (is_imm8(insn->off)) } break; - /* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */ - case BPF_STX | BPF_XADD | BPF_W: - /* Emit 'lock add dword ptr [rax + off], eax' */ - if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT3(0xF0, add_2mod(0x40, dst_reg, src_reg), 0x01); - else - EMIT2(0xF0, 0x01); - goto xadd; - case BPF_STX | BPF_XADD | BPF_DW: - EMIT3(0xF0, add_2mod(0x48, dst_reg, src_reg), 0x01); -xadd: if (is_imm8(insn->off)) - EMIT2(add_2reg(0x40, dst_reg, src_reg), insn->off); - else - EMIT1_off32(add_2reg(0x80, dst_reg, src_reg), - insn->off); + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (insn->imm == (BPF_AND | BPF_FETCH) || + insn->imm == (BPF_OR | BPF_FETCH) || + insn->imm == (BPF_XOR | BPF_FETCH)) { + u8 *branch_target; + bool is64 = BPF_SIZE(insn->code) == BPF_DW; + + /* + * Can't be implemented with a single x86 insn. + * Need to do a CMPXCHG loop. + */ + + /* Will need RAX as a CMPXCHG operand so save R0 */ + emit_mov_reg(&prog, true, BPF_REG_AX, BPF_REG_0); + branch_target = prog; + /* Load old value */ + emit_ldx(&prog, BPF_SIZE(insn->code), + BPF_REG_0, dst_reg, insn->off); + /* + * Perform the (commutative) operation locally, + * put the result in the AUX_REG. + */ + emit_mov_reg(&prog, is64, AUX_REG, BPF_REG_0); + maybe_emit_mod(&prog, AUX_REG, src_reg, is64); + EMIT2(simple_alu_opcodes[BPF_OP(insn->imm)], + add_2reg(0xC0, AUX_REG, src_reg)); + /* Attempt to swap in new value */ + err = emit_atomic(&prog, BPF_CMPXCHG, + dst_reg, AUX_REG, insn->off, + BPF_SIZE(insn->code)); + if (WARN_ON(err)) + return err; + /* + * ZF tells us whether we won the race. If it's + * cleared we need to try again. 
+ */ + EMIT2(X86_JNE, -(prog - branch_target) - 2); + /* Return the pre-modification value */ + emit_mov_reg(&prog, is64, src_reg, BPF_REG_0); + /* Restore R0 after clobbering RAX */ + emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX); + break; + + } + + err = emit_atomic(&prog, insn->imm, dst_reg, src_reg, + insn->off, BPF_SIZE(insn->code)); + if (err) + return err; break; /* call */ @@ -1295,20 +1442,16 @@ xadd: if (is_imm8(insn->off)) case BPF_JMP32 | BPF_JSGE | BPF_X: case BPF_JMP32 | BPF_JSLE | BPF_X: /* cmp dst_reg, src_reg */ - if (BPF_CLASS(insn->code) == BPF_JMP) - EMIT1(add_2mod(0x48, dst_reg, src_reg)); - else if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT1(add_2mod(0x40, dst_reg, src_reg)); + maybe_emit_mod(&prog, dst_reg, src_reg, + BPF_CLASS(insn->code) == BPF_JMP); EMIT2(0x39, add_2reg(0xC0, dst_reg, src_reg)); goto emit_cond_jmp; case BPF_JMP | BPF_JSET | BPF_X: case BPF_JMP32 | BPF_JSET | BPF_X: /* test dst_reg, src_reg */ - if (BPF_CLASS(insn->code) == BPF_JMP) - EMIT1(add_2mod(0x48, dst_reg, src_reg)); - else if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT1(add_2mod(0x40, dst_reg, src_reg)); + maybe_emit_mod(&prog, dst_reg, src_reg, + BPF_CLASS(insn->code) == BPF_JMP); EMIT2(0x85, add_2reg(0xC0, dst_reg, src_reg)); goto emit_cond_jmp; @@ -1344,10 +1487,8 @@ xadd: if (is_imm8(insn->off)) case BPF_JMP32 | BPF_JSLE | BPF_K: /* test dst_reg, dst_reg to save one extra byte */ if (imm32 == 0) { - if (BPF_CLASS(insn->code) == BPF_JMP) - EMIT1(add_2mod(0x48, dst_reg, dst_reg)); - else if (is_ereg(dst_reg)) - EMIT1(add_2mod(0x40, dst_reg, dst_reg)); + maybe_emit_mod(&prog, dst_reg, dst_reg, + BPF_CLASS(insn->code) == BPF_JMP); EMIT2(0x85, add_2reg(0xC0, dst_reg, dst_reg)); goto emit_cond_jmp; } @@ -1409,6 +1550,30 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */ } jmp_offset = addrs[i + insn->off] - addrs[i]; if (is_imm8(jmp_offset)) { + if (jmp_padding) { + /* To keep the jmp_offset valid, the extra bytes are + * padded before the jump insn, so we substract the + * 2 bytes of jmp_cond insn from INSN_SZ_DIFF. + * + * If the previous pass already emits an imm8 + * jmp_cond, then this BPF insn won't shrink, so + * "nops" is 0. + * + * On the other hand, if the previous pass emits an + * imm32 jmp_cond, the extra 4 bytes(*) is padded to + * keep the image from shrinking further. + * + * (*) imm32 jmp_cond is 6 bytes, and imm8 jmp_cond + * is 2 bytes, so the size difference is 4 bytes. + */ + nops = INSN_SZ_DIFF - 2; + if (nops != 0 && nops != 4) { + pr_err("unexpected jmp_cond padding: %d bytes\n", + nops); + return -EFAULT; + } + cnt += emit_nops(&prog, nops); + } EMIT2(jmp_cond, jmp_offset); } else if (is_simm32(jmp_offset)) { EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset); @@ -1431,11 +1596,55 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */ else jmp_offset = addrs[i + insn->off] - addrs[i]; - if (!jmp_offset) - /* Optimize out nop jumps */ + if (!jmp_offset) { + /* + * If jmp_padding is enabled, the extra nops will + * be inserted. Otherwise, optimize out nop jumps. + */ + if (jmp_padding) { + /* There are 3 possible conditions. + * (1) This BPF_JA is already optimized out in + * the previous run, so there is no need + * to pad any extra byte (0 byte). + * (2) The previous pass emits an imm8 jmp, + * so we pad 2 bytes to match the previous + * insn size. + * (3) Similarly, the previous pass emits an + * imm32 jmp, and 5 bytes is padded. 
+ */ + nops = INSN_SZ_DIFF; + if (nops != 0 && nops != 2 && nops != 5) { + pr_err("unexpected nop jump padding: %d bytes\n", + nops); + return -EFAULT; + } + cnt += emit_nops(&prog, nops); + } break; + } emit_jmp: if (is_imm8(jmp_offset)) { + if (jmp_padding) { + /* To avoid breaking jmp_offset, the extra bytes + * are padded before the actual jmp insn, so + * 2 bytes is substracted from INSN_SZ_DIFF. + * + * If the previous pass already emits an imm8 + * jmp, there is nothing to pad (0 byte). + * + * If it emits an imm32 jmp (5 bytes) previously + * and now an imm8 jmp (2 bytes), then we pad + * (5 - 2 = 3) bytes to stop the image from + * shrinking further. + */ + nops = INSN_SZ_DIFF - 2; + if (nops != 0 && nops != 3) { + pr_err("unexpected jump padding: %d bytes\n", + nops); + return -EFAULT; + } + cnt += emit_nops(&prog, INSN_SZ_DIFF - 2); + } EMIT2(0xEB, jmp_offset); } else if (is_simm32(jmp_offset)) { EMIT1_off32(0xE9, jmp_offset); @@ -1531,17 +1740,25 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, struct bpf_prog *p, int stack_size, bool mod_ret) { u8 *prog = *pprog; + u8 *jmp_insn; int cnt = 0; - if (p->aux->sleepable) { - if (emit_call(&prog, __bpf_prog_enter_sleepable, prog)) - return -EINVAL; - } else { - if (emit_call(&prog, __bpf_prog_enter, prog)) + /* arg1: mov rdi, progs[i] */ + emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); + if (emit_call(&prog, + p->aux->sleepable ? __bpf_prog_enter_sleepable : + __bpf_prog_enter, prog)) return -EINVAL; - /* remember prog start time returned by __bpf_prog_enter */ - emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); - } + /* remember prog start time returned by __bpf_prog_enter */ + emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); + + /* if (__bpf_prog_enter*(prog) == 0) + * goto skip_exec_of_prog; + */ + EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */ + /* emit 2 nops that will be replaced with JE insn */ + jmp_insn = prog; + emit_nops(&prog, 2); /* arg1: lea rdi, [rbp - stack_size] */ EMIT4(0x48, 0x8D, 0x7D, -stack_size); @@ -1561,43 +1778,23 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (mod_ret) emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); - if (p->aux->sleepable) { - if (emit_call(&prog, __bpf_prog_exit_sleepable, prog)) + /* replace 2 nops with JE insn, since jmp target is known */ + jmp_insn[0] = X86_JE; + jmp_insn[1] = prog - jmp_insn - 2; + + /* arg1: mov rdi, progs[i] */ + emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); + /* arg2: mov rsi, rbx <- start time in nsec */ + emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); + if (emit_call(&prog, + p->aux->sleepable ? 
__bpf_prog_exit_sleepable : + __bpf_prog_exit, prog)) return -EINVAL; - } else { - /* arg1: mov rdi, progs[i] */ - emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, - (u32) (long) p); - /* arg2: mov rsi, rbx <- start time in nsec */ - emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); - if (emit_call(&prog, __bpf_prog_exit, prog)) - return -EINVAL; - } *pprog = prog; return 0; } -static void emit_nops(u8 **pprog, unsigned int len) -{ - unsigned int i, noplen; - u8 *prog = *pprog; - int cnt = 0; - - while (len > 0) { - noplen = len; - - if (noplen > ASM_NOP_MAX) - noplen = ASM_NOP_MAX; - - for (i = 0; i < noplen; i++) - EMIT1(ideal_nops[noplen][i]); - len -= noplen; - } - - *pprog = prog; -} - static void emit_align(u8 **pprog, u32 align) { u8 *target, *prog = *pprog; @@ -1972,6 +2169,9 @@ struct x64_jit_data { struct jit_context ctx; }; +#define MAX_PASSES 20 +#define PADDING_PASSES (MAX_PASSES - 5) + struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_binary_header *header = NULL; @@ -1981,6 +2181,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) struct jit_context ctx = {}; bool tmp_blinded = false; bool extra_pass = false; + bool padding = false; u8 *image = NULL; int *addrs; int pass; @@ -2017,6 +2218,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) image = jit_data->image; header = jit_data->header; extra_pass = true; + padding = true; goto skip_init_addrs; } addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); @@ -2042,8 +2244,10 @@ skip_init_addrs: * may converge on the last pass. In such case do one more * pass to emit the final image. */ - for (pass = 0; pass < 20 || image; pass++) { - proglen = do_jit(prog, addrs, image, oldproglen, &ctx); + for (pass = 0; pass < MAX_PASSES || image; pass++) { + if (!padding && pass >= PADDING_PASSES) + padding = true; + proglen = do_jit(prog, addrs, image, oldproglen, &ctx, padding); if (proglen <= 0) { out_image: image = NULL; diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index 96fde03aa987..d17b67c69f89 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -2243,10 +2243,8 @@ emit_jmp: return -EFAULT; } break; - /* STX XADD: lock *(u32 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_W: - /* STX XADD: lock *(u64 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_DW: + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: goto notyet; case BPF_JMP | BPF_EXIT: if (seen_exit) { diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile deleted file mode 100644 index 4d49b5a27025..000000000000 --- a/arch/x86/oprofile/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_OPROFILE) += oprofile.o - -DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ - oprof.o cpu_buffer.o buffer_sync.o \ - event_buffer.o oprofile_files.o \ - oprofilefs.o oprofile_stats.o \ - timer_int.o nmi_timer_int.o ) - -oprofile-y := $(DRIVER_OBJS) init.o backtrace.o -oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \ - op_model_ppro.o op_model_p4.o diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c deleted file mode 100644 index 1d8391fcca68..000000000000 --- a/arch/x86/oprofile/backtrace.c +++ /dev/null @@ -1,127 +0,0 @@ -/** - * @file backtrace.c - * - * @remark Copyright 2002 OProfile authors - * @remark Read the file COPYING - * - * @author John Levon - * @author David Smith - */ - -#include <linux/oprofile.h> -#include <linux/sched.h> 
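Illustrative sketch (not part of the patch), referring back to the trampoline change above: each attached program is now bracketed by __bpf_prog_enter()/__bpf_prog_exit(), and the program body is skipped -- while the exit call still runs -- when enter returns 0. In C-level shape, with hypothetical stubs for the kernel helpers:

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-ins for the real helpers; only the emitted control flow
     * (test rax,rax plus the two NOPs later patched into a JE) matters. */
    static uint64_t prog_enter(void *prog)            { (void)prog; return 1; /* 0 = skip */ }
    static void     prog_exit(void *prog, uint64_t t) { (void)prog; (void)t; }
    static int      bpf_func(void *ctx)               { (void)ctx; return 0; }

    static void invoke_one_prog(void *prog, void *ctx)
    {
            uint64_t start = prog_enter(prog);  /* start time in ns, or 0    */

            if (start)                          /* the patched JE skips this */
                    bpf_func(ctx);              /* run the BPF program       */

            prog_exit(prog, start);             /* always paired with enter  */
    }

    int main(void)
    {
            invoke_one_prog(NULL, NULL);
            return 0;
    }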
-#include <linux/mm.h> -#include <linux/compat.h> -#include <linux/uaccess.h> - -#include <asm/ptrace.h> -#include <asm/stacktrace.h> -#include <asm/unwind.h> - -#ifdef CONFIG_COMPAT -static struct stack_frame_ia32 * -dump_user_backtrace_32(struct stack_frame_ia32 *head) -{ - /* Also check accessibility of one struct frame_head beyond: */ - struct stack_frame_ia32 bufhead[2]; - struct stack_frame_ia32 *fp; - unsigned long bytes; - - bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead)); - if (bytes != 0) - return NULL; - - fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame); - - oprofile_add_trace(bufhead[0].return_address); - - /* frame pointers should strictly progress back up the stack - * (towards higher addresses) */ - if (head >= fp) - return NULL; - - return fp; -} - -static inline int -x86_backtrace_32(struct pt_regs * const regs, unsigned int depth) -{ - struct stack_frame_ia32 *head; - - /* User process is IA32 */ - if (!current || user_64bit_mode(regs)) - return 0; - - head = (struct stack_frame_ia32 *) regs->bp; - while (depth-- && head) - head = dump_user_backtrace_32(head); - - return 1; -} - -#else -static inline int -x86_backtrace_32(struct pt_regs * const regs, unsigned int depth) -{ - return 0; -} -#endif /* CONFIG_COMPAT */ - -static struct stack_frame *dump_user_backtrace(struct stack_frame *head) -{ - /* Also check accessibility of one struct frame_head beyond: */ - struct stack_frame bufhead[2]; - unsigned long bytes; - - bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead)); - if (bytes != 0) - return NULL; - - oprofile_add_trace(bufhead[0].return_address); - - /* frame pointers should strictly progress back up the stack - * (towards higher addresses) */ - if (head >= bufhead[0].next_frame) - return NULL; - - return bufhead[0].next_frame; -} - -void -x86_backtrace(struct pt_regs * const regs, unsigned int depth) -{ - struct stack_frame *head = (struct stack_frame *)frame_pointer(regs); - - if (!user_mode(regs)) { - struct unwind_state state; - unsigned long addr; - - if (!depth) - return; - - oprofile_add_trace(regs->ip); - - if (!--depth) - return; - - for (unwind_start(&state, current, regs, NULL); - !unwind_done(&state); unwind_next_frame(&state)) { - addr = unwind_get_return_address(&state); - if (!addr) - break; - - oprofile_add_trace(addr); - - if (!--depth) - break; - } - - return; - } - - if (x86_backtrace_32(regs, depth)) - return; - - while (depth-- && head) - head = dump_user_backtrace(head); -} diff --git a/arch/x86/oprofile/init.c b/arch/x86/oprofile/init.c deleted file mode 100644 index 9e138d00ad36..000000000000 --- a/arch/x86/oprofile/init.c +++ /dev/null @@ -1,38 +0,0 @@ -/** - * @file init.c - * - * @remark Copyright 2002 OProfile authors - * @remark Read the file COPYING - * - * @author John Levon <levon@movementarian.org> - */ - -#include <linux/oprofile.h> -#include <linux/init.h> -#include <linux/errno.h> - -/* - * We support CPUs that have performance counters like the Pentium Pro - * with the NMI mode driver. 
- */ - -#ifdef CONFIG_X86_LOCAL_APIC -extern int op_nmi_init(struct oprofile_operations *ops); -extern void op_nmi_exit(void); -#else -static int op_nmi_init(struct oprofile_operations *ops) { return -ENODEV; } -static void op_nmi_exit(void) { } -#endif - -extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth); - -int __init oprofile_arch_init(struct oprofile_operations *ops) -{ - ops->backtrace = x86_backtrace; - return op_nmi_init(ops); -} - -void oprofile_arch_exit(void) -{ - op_nmi_exit(); -} diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c deleted file mode 100644 index a7a7677265b6..000000000000 --- a/arch/x86/oprofile/nmi_int.c +++ /dev/null @@ -1,780 +0,0 @@ -/** - * @file nmi_int.c - * - * @remark Copyright 2002-2009 OProfile authors - * @remark Read the file COPYING - * - * @author John Levon <levon@movementarian.org> - * @author Robert Richter <robert.richter@amd.com> - * @author Barry Kasindorf <barry.kasindorf@amd.com> - * @author Jason Yeh <jason.yeh@amd.com> - * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> - */ - -#include <linux/init.h> -#include <linux/notifier.h> -#include <linux/smp.h> -#include <linux/oprofile.h> -#include <linux/syscore_ops.h> -#include <linux/slab.h> -#include <linux/moduleparam.h> -#include <linux/kdebug.h> -#include <linux/cpu.h> -#include <asm/nmi.h> -#include <asm/msr.h> -#include <asm/apic.h> - -#include "op_counter.h" -#include "op_x86_model.h" - -static struct op_x86_model_spec *model; -static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); -static DEFINE_PER_CPU(unsigned long, saved_lvtpc); - -/* must be protected with get_online_cpus()/put_online_cpus(): */ -static int nmi_enabled; -static int ctr_running; - -struct op_counter_config counter_config[OP_MAX_COUNTER]; - -/* common functions */ - -u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, - struct op_counter_config *counter_config) -{ - u64 val = 0; - u16 event = (u16)counter_config->event; - - val |= ARCH_PERFMON_EVENTSEL_INT; - val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0; - val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0; - val |= (counter_config->unit_mask & 0xFF) << 8; - counter_config->extra &= (ARCH_PERFMON_EVENTSEL_INV | - ARCH_PERFMON_EVENTSEL_EDGE | - ARCH_PERFMON_EVENTSEL_CMASK); - val |= counter_config->extra; - event &= model->event_mask ? 
model->event_mask : 0xFF; - val |= event & 0xFF; - val |= (u64)(event & 0x0F00) << 24; - - return val; -} - - -static int profile_exceptions_notify(unsigned int val, struct pt_regs *regs) -{ - if (ctr_running) - model->check_ctrs(regs, this_cpu_ptr(&cpu_msrs)); - else if (!nmi_enabled) - return NMI_DONE; - else - model->stop(this_cpu_ptr(&cpu_msrs)); - return NMI_HANDLED; -} - -static void nmi_cpu_save_registers(struct op_msrs *msrs) -{ - struct op_msr *counters = msrs->counters; - struct op_msr *controls = msrs->controls; - unsigned int i; - - for (i = 0; i < model->num_counters; ++i) { - if (counters[i].addr) - rdmsrl(counters[i].addr, counters[i].saved); - } - - for (i = 0; i < model->num_controls; ++i) { - if (controls[i].addr) - rdmsrl(controls[i].addr, controls[i].saved); - } -} - -static void nmi_cpu_start(void *dummy) -{ - struct op_msrs const *msrs = this_cpu_ptr(&cpu_msrs); - if (!msrs->controls) - WARN_ON_ONCE(1); - else - model->start(msrs); -} - -static int nmi_start(void) -{ - get_online_cpus(); - ctr_running = 1; - /* make ctr_running visible to the nmi handler: */ - smp_mb(); - on_each_cpu(nmi_cpu_start, NULL, 1); - put_online_cpus(); - return 0; -} - -static void nmi_cpu_stop(void *dummy) -{ - struct op_msrs const *msrs = this_cpu_ptr(&cpu_msrs); - if (!msrs->controls) - WARN_ON_ONCE(1); - else - model->stop(msrs); -} - -static void nmi_stop(void) -{ - get_online_cpus(); - on_each_cpu(nmi_cpu_stop, NULL, 1); - ctr_running = 0; - put_online_cpus(); -} - -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - -static DEFINE_PER_CPU(int, switch_index); - -static inline int has_mux(void) -{ - return !!model->switch_ctrl; -} - -inline int op_x86_phys_to_virt(int phys) -{ - return __this_cpu_read(switch_index) + phys; -} - -inline int op_x86_virt_to_phys(int virt) -{ - return virt % model->num_counters; -} - -static void nmi_shutdown_mux(void) -{ - int i; - - if (!has_mux()) - return; - - for_each_possible_cpu(i) { - kfree(per_cpu(cpu_msrs, i).multiplex); - per_cpu(cpu_msrs, i).multiplex = NULL; - per_cpu(switch_index, i) = 0; - } -} - -static int nmi_setup_mux(void) -{ - size_t multiplex_size = - sizeof(struct op_msr) * model->num_virt_counters; - int i; - - if (!has_mux()) - return 1; - - for_each_possible_cpu(i) { - per_cpu(cpu_msrs, i).multiplex = - kzalloc(multiplex_size, GFP_KERNEL); - if (!per_cpu(cpu_msrs, i).multiplex) - return 0; - } - - return 1; -} - -static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) -{ - int i; - struct op_msr *multiplex = msrs->multiplex; - - if (!has_mux()) - return; - - for (i = 0; i < model->num_virt_counters; ++i) { - if (counter_config[i].enabled) { - multiplex[i].saved = -(u64)counter_config[i].count; - } else { - multiplex[i].saved = 0; - } - } - - per_cpu(switch_index, cpu) = 0; -} - -static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) -{ - struct op_msr *counters = msrs->counters; - struct op_msr *multiplex = msrs->multiplex; - int i; - - for (i = 0; i < model->num_counters; ++i) { - int virt = op_x86_phys_to_virt(i); - if (counters[i].addr) - rdmsrl(counters[i].addr, multiplex[virt].saved); - } -} - -static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) -{ - struct op_msr *counters = msrs->counters; - struct op_msr *multiplex = msrs->multiplex; - int i; - - for (i = 0; i < model->num_counters; ++i) { - int virt = op_x86_phys_to_virt(i); - if (counters[i].addr) - wrmsrl(counters[i].addr, multiplex[virt].saved); - } -} - -static void nmi_cpu_switch(void *dummy) -{ - int cpu = smp_processor_id(); - int si = 
per_cpu(switch_index, cpu); - struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); - - nmi_cpu_stop(NULL); - nmi_cpu_save_mpx_registers(msrs); - - /* move to next set */ - si += model->num_counters; - if ((si >= model->num_virt_counters) || (counter_config[si].count == 0)) - per_cpu(switch_index, cpu) = 0; - else - per_cpu(switch_index, cpu) = si; - - model->switch_ctrl(model, msrs); - nmi_cpu_restore_mpx_registers(msrs); - - nmi_cpu_start(NULL); -} - - -/* - * Quick check to see if multiplexing is necessary. - * The check should be sufficient since counters are used - * in ordre. - */ -static int nmi_multiplex_on(void) -{ - return counter_config[model->num_counters].count ? 0 : -EINVAL; -} - -static int nmi_switch_event(void) -{ - if (!has_mux()) - return -ENOSYS; /* not implemented */ - if (nmi_multiplex_on() < 0) - return -EINVAL; /* not necessary */ - - get_online_cpus(); - if (ctr_running) - on_each_cpu(nmi_cpu_switch, NULL, 1); - put_online_cpus(); - - return 0; -} - -static inline void mux_init(struct oprofile_operations *ops) -{ - if (has_mux()) - ops->switch_events = nmi_switch_event; -} - -static void mux_clone(int cpu) -{ - if (!has_mux()) - return; - - memcpy(per_cpu(cpu_msrs, cpu).multiplex, - per_cpu(cpu_msrs, 0).multiplex, - sizeof(struct op_msr) * model->num_virt_counters); -} - -#else - -inline int op_x86_phys_to_virt(int phys) { return phys; } -inline int op_x86_virt_to_phys(int virt) { return virt; } -static inline void nmi_shutdown_mux(void) { } -static inline int nmi_setup_mux(void) { return 1; } -static inline void -nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { } -static inline void mux_init(struct oprofile_operations *ops) { } -static void mux_clone(int cpu) { } - -#endif - -static void free_msrs(void) -{ - int i; - for_each_possible_cpu(i) { - kfree(per_cpu(cpu_msrs, i).counters); - per_cpu(cpu_msrs, i).counters = NULL; - kfree(per_cpu(cpu_msrs, i).controls); - per_cpu(cpu_msrs, i).controls = NULL; - } - nmi_shutdown_mux(); -} - -static int allocate_msrs(void) -{ - size_t controls_size = sizeof(struct op_msr) * model->num_controls; - size_t counters_size = sizeof(struct op_msr) * model->num_counters; - - int i; - for_each_possible_cpu(i) { - per_cpu(cpu_msrs, i).counters = kzalloc(counters_size, - GFP_KERNEL); - if (!per_cpu(cpu_msrs, i).counters) - goto fail; - per_cpu(cpu_msrs, i).controls = kzalloc(controls_size, - GFP_KERNEL); - if (!per_cpu(cpu_msrs, i).controls) - goto fail; - } - - if (!nmi_setup_mux()) - goto fail; - - return 1; - -fail: - free_msrs(); - return 0; -} - -static void nmi_cpu_setup(void) -{ - int cpu = smp_processor_id(); - struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); - - nmi_cpu_save_registers(msrs); - raw_spin_lock(&oprofilefs_lock); - model->setup_ctrs(model, msrs); - nmi_cpu_setup_mux(cpu, msrs); - raw_spin_unlock(&oprofilefs_lock); - per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC); - apic_write(APIC_LVTPC, APIC_DM_NMI); -} - -static void nmi_cpu_restore_registers(struct op_msrs *msrs) -{ - struct op_msr *counters = msrs->counters; - struct op_msr *controls = msrs->controls; - unsigned int i; - - for (i = 0; i < model->num_controls; ++i) { - if (controls[i].addr) - wrmsrl(controls[i].addr, controls[i].saved); - } - - for (i = 0; i < model->num_counters; ++i) { - if (counters[i].addr) - wrmsrl(counters[i].addr, counters[i].saved); - } -} - -static void nmi_cpu_shutdown(void) -{ - unsigned int v; - int cpu = smp_processor_id(); - struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); - - /* restoring APIC_LVTPC can trigger an apic 
error because the delivery - * mode and vector nr combination can be illegal. That's by design: on - * power on apic lvt contain a zero vector nr which are legal only for - * NMI delivery mode. So inhibit apic err before restoring lvtpc - */ - v = apic_read(APIC_LVTERR); - apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); - apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); - apic_write(APIC_LVTERR, v); - nmi_cpu_restore_registers(msrs); -} - -static int nmi_cpu_online(unsigned int cpu) -{ - local_irq_disable(); - if (nmi_enabled) - nmi_cpu_setup(); - if (ctr_running) - nmi_cpu_start(NULL); - local_irq_enable(); - return 0; -} - -static int nmi_cpu_down_prep(unsigned int cpu) -{ - local_irq_disable(); - if (ctr_running) - nmi_cpu_stop(NULL); - if (nmi_enabled) - nmi_cpu_shutdown(); - local_irq_enable(); - return 0; -} - -static int nmi_create_files(struct dentry *root) -{ - unsigned int i; - - for (i = 0; i < model->num_virt_counters; ++i) { - struct dentry *dir; - char buf[4]; - - /* quick little hack to _not_ expose a counter if it is not - * available for use. This should protect userspace app. - * NOTE: assumes 1:1 mapping here (that counters are organized - * sequentially in their struct assignment). - */ - if (!avail_to_resrv_perfctr_nmi_bit(op_x86_virt_to_phys(i))) - continue; - - snprintf(buf, sizeof(buf), "%d", i); - dir = oprofilefs_mkdir(root, buf); - oprofilefs_create_ulong(dir, "enabled", &counter_config[i].enabled); - oprofilefs_create_ulong(dir, "event", &counter_config[i].event); - oprofilefs_create_ulong(dir, "count", &counter_config[i].count); - oprofilefs_create_ulong(dir, "unit_mask", &counter_config[i].unit_mask); - oprofilefs_create_ulong(dir, "kernel", &counter_config[i].kernel); - oprofilefs_create_ulong(dir, "user", &counter_config[i].user); - oprofilefs_create_ulong(dir, "extra", &counter_config[i].extra); - } - - return 0; -} - -static enum cpuhp_state cpuhp_nmi_online; - -static int nmi_setup(void) -{ - int err = 0; - int cpu; - - if (!allocate_msrs()) - return -ENOMEM; - - /* We need to serialize save and setup for HT because the subset - * of msrs are distinct for save and setup operations - */ - - /* Assume saved/restored counters are the same on all CPUs */ - err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0)); - if (err) - goto fail; - - for_each_possible_cpu(cpu) { - if (!IS_ENABLED(CONFIG_SMP) || !cpu) - continue; - - memcpy(per_cpu(cpu_msrs, cpu).counters, - per_cpu(cpu_msrs, 0).counters, - sizeof(struct op_msr) * model->num_counters); - - memcpy(per_cpu(cpu_msrs, cpu).controls, - per_cpu(cpu_msrs, 0).controls, - sizeof(struct op_msr) * model->num_controls); - - mux_clone(cpu); - } - - nmi_enabled = 0; - ctr_running = 0; - /* make variables visible to the nmi handler: */ - smp_mb(); - err = register_nmi_handler(NMI_LOCAL, profile_exceptions_notify, - 0, "oprofile"); - if (err) - goto fail; - - nmi_enabled = 1; - /* make nmi_enabled visible to the nmi handler: */ - smp_mb(); - err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/oprofile:online", - nmi_cpu_online, nmi_cpu_down_prep); - if (err < 0) - goto fail_nmi; - cpuhp_nmi_online = err; - return 0; -fail_nmi: - unregister_nmi_handler(NMI_LOCAL, "oprofile"); -fail: - free_msrs(); - return err; -} - -static void nmi_shutdown(void) -{ - struct op_msrs *msrs; - - cpuhp_remove_state(cpuhp_nmi_online); - nmi_enabled = 0; - ctr_running = 0; - - /* make variables visible to the nmi handler: */ - smp_mb(); - unregister_nmi_handler(NMI_LOCAL, "oprofile"); - msrs = &get_cpu_var(cpu_msrs); - model->shutdown(msrs); - 
free_msrs(); - put_cpu_var(cpu_msrs); -} - -#ifdef CONFIG_PM - -static int nmi_suspend(void) -{ - /* Only one CPU left, just stop that one */ - if (nmi_enabled == 1) - nmi_cpu_stop(NULL); - return 0; -} - -static void nmi_resume(void) -{ - if (nmi_enabled == 1) - nmi_cpu_start(NULL); -} - -static struct syscore_ops oprofile_syscore_ops = { - .resume = nmi_resume, - .suspend = nmi_suspend, -}; - -static void __init init_suspend_resume(void) -{ - register_syscore_ops(&oprofile_syscore_ops); -} - -static void exit_suspend_resume(void) -{ - unregister_syscore_ops(&oprofile_syscore_ops); -} - -#else - -static inline void init_suspend_resume(void) { } -static inline void exit_suspend_resume(void) { } - -#endif /* CONFIG_PM */ - -static int __init p4_init(char **cpu_type) -{ - __u8 cpu_model = boot_cpu_data.x86_model; - - if (cpu_model > 6 || cpu_model == 5) - return 0; - -#ifndef CONFIG_SMP - *cpu_type = "i386/p4"; - model = &op_p4_spec; - return 1; -#else - switch (smp_num_siblings) { - case 1: - *cpu_type = "i386/p4"; - model = &op_p4_spec; - return 1; - - case 2: - *cpu_type = "i386/p4-ht"; - model = &op_p4_ht2_spec; - return 1; - } -#endif - - printk(KERN_INFO "oprofile: P4 HyperThreading detected with > 2 threads\n"); - printk(KERN_INFO "oprofile: Reverting to timer mode.\n"); - return 0; -} - -enum __force_cpu_type { - reserved = 0, /* do not force */ - timer, - arch_perfmon, -}; - -static int force_cpu_type; - -static int set_cpu_type(const char *str, const struct kernel_param *kp) -{ - if (!strcmp(str, "timer")) { - force_cpu_type = timer; - printk(KERN_INFO "oprofile: forcing NMI timer mode\n"); - } else if (!strcmp(str, "arch_perfmon")) { - force_cpu_type = arch_perfmon; - printk(KERN_INFO "oprofile: forcing architectural perfmon\n"); - } else { - force_cpu_type = 0; - } - - return 0; -} -module_param_call(cpu_type, set_cpu_type, NULL, NULL, 0); - -static int __init ppro_init(char **cpu_type) -{ - __u8 cpu_model = boot_cpu_data.x86_model; - struct op_x86_model_spec *spec = &op_ppro_spec; /* default */ - - if (force_cpu_type == arch_perfmon && boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) - return 0; - - /* - * Documentation on identifying Intel processors by CPU family - * and model can be found in the Intel Software Developer's - * Manuals (SDM): - * - * http://www.intel.com/products/processor/manuals/ - * - * As of May 2010 the documentation for this was in the: - * "Intel 64 and IA-32 Architectures Software Developer's - * Manual Volume 3B: System Programming Guide", "Table B-1 - * CPUID Signature Values of DisplayFamily_DisplayModel". - */ - switch (cpu_model) { - case 0 ... 2: - *cpu_type = "i386/ppro"; - break; - case 3 ... 5: - *cpu_type = "i386/pii"; - break; - case 6 ... 8: - case 10 ... 
11: - *cpu_type = "i386/piii"; - break; - case 9: - case 13: - *cpu_type = "i386/p6_mobile"; - break; - case 14: - *cpu_type = "i386/core"; - break; - case 0x0f: - case 0x16: - case 0x17: - case 0x1d: - *cpu_type = "i386/core_2"; - break; - case 0x1a: - case 0x1e: - case 0x2e: - spec = &op_arch_perfmon_spec; - *cpu_type = "i386/core_i7"; - break; - case 0x1c: - *cpu_type = "i386/atom"; - break; - default: - /* Unknown */ - return 0; - } - - model = spec; - return 1; -} - -int __init op_nmi_init(struct oprofile_operations *ops) -{ - __u8 vendor = boot_cpu_data.x86_vendor; - __u8 family = boot_cpu_data.x86; - char *cpu_type = NULL; - int ret = 0; - - if (!boot_cpu_has(X86_FEATURE_APIC)) - return -ENODEV; - - if (force_cpu_type == timer) - return -ENODEV; - - switch (vendor) { - case X86_VENDOR_AMD: - /* Needs to be at least an Athlon (or hammer in 32bit mode) */ - - switch (family) { - case 6: - cpu_type = "i386/athlon"; - break; - case 0xf: - /* - * Actually it could be i386/hammer too, but - * give user space an consistent name. - */ - cpu_type = "x86-64/hammer"; - break; - case 0x10: - cpu_type = "x86-64/family10"; - break; - case 0x11: - cpu_type = "x86-64/family11h"; - break; - case 0x12: - cpu_type = "x86-64/family12h"; - break; - case 0x14: - cpu_type = "x86-64/family14h"; - break; - case 0x15: - cpu_type = "x86-64/family15h"; - break; - default: - return -ENODEV; - } - model = &op_amd_spec; - break; - - case X86_VENDOR_INTEL: - switch (family) { - /* Pentium IV */ - case 0xf: - p4_init(&cpu_type); - break; - - /* A P6-class processor */ - case 6: - ppro_init(&cpu_type); - break; - - default: - break; - } - - if (cpu_type) - break; - - if (!boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) - return -ENODEV; - - /* use arch perfmon as fallback */ - cpu_type = "i386/arch_perfmon"; - model = &op_arch_perfmon_spec; - break; - - default: - return -ENODEV; - } - - /* default values, can be overwritten by model */ - ops->create_files = nmi_create_files; - ops->setup = nmi_setup; - ops->shutdown = nmi_shutdown; - ops->start = nmi_start; - ops->stop = nmi_stop; - ops->cpu_type = cpu_type; - - if (model->init) - ret = model->init(ops); - if (ret) - return ret; - - if (!model->num_virt_counters) - model->num_virt_counters = model->num_counters; - - mux_init(ops); - - init_suspend_resume(); - - printk(KERN_INFO "oprofile: using NMI interrupt.\n"); - return 0; -} - -void op_nmi_exit(void) -{ - exit_suspend_resume(); -} diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h deleted file mode 100644 index 0b7b7b179cbe..000000000000 --- a/arch/x86/oprofile/op_counter.h +++ /dev/null @@ -1,30 +0,0 @@ -/** - * @file op_counter.h - * - * @remark Copyright 2002 OProfile authors - * @remark Read the file COPYING - * - * @author John Levon - */ - -#ifndef OP_COUNTER_H -#define OP_COUNTER_H - -#define OP_MAX_COUNTER 32 - -/* Per-perfctr configuration as set via - * oprofilefs. 
- */ -struct op_counter_config { - unsigned long count; - unsigned long enabled; - unsigned long event; - unsigned long kernel; - unsigned long user; - unsigned long unit_mask; - unsigned long extra; -}; - -extern struct op_counter_config counter_config[]; - -#endif /* OP_COUNTER_H */ diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c deleted file mode 100644 index 660a83c8287b..000000000000 --- a/arch/x86/oprofile/op_model_amd.c +++ /dev/null @@ -1,542 +0,0 @@ -/* - * @file op_model_amd.c - * athlon / K7 / K8 / Family 10h model-specific MSR operations - * - * @remark Copyright 2002-2009 OProfile authors - * @remark Read the file COPYING - * - * @author John Levon - * @author Philippe Elie - * @author Graydon Hoare - * @author Robert Richter <robert.richter@amd.com> - * @author Barry Kasindorf <barry.kasindorf@amd.com> - * @author Jason Yeh <jason.yeh@amd.com> - * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> - */ - -#include <linux/oprofile.h> -#include <linux/device.h> -#include <linux/pci.h> -#include <linux/percpu.h> - -#include <asm/ptrace.h> -#include <asm/msr.h> -#include <asm/nmi.h> -#include <asm/apic.h> -#include <asm/processor.h> - -#include "op_x86_model.h" -#include "op_counter.h" - -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX -#define NUM_VIRT_COUNTERS 32 -#else -#define NUM_VIRT_COUNTERS 0 -#endif - -#define OP_EVENT_MASK 0x0FFF -#define OP_CTR_OVERFLOW (1ULL<<31) - -#define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) - -static int num_counters; -static unsigned long reset_value[OP_MAX_COUNTER]; - -#define IBS_FETCH_SIZE 6 -#define IBS_OP_SIZE 12 - -static u32 ibs_caps; - -struct ibs_config { - unsigned long op_enabled; - unsigned long fetch_enabled; - unsigned long max_cnt_fetch; - unsigned long max_cnt_op; - unsigned long rand_en; - unsigned long dispatched_ops; - unsigned long branch_target; -}; - -struct ibs_state { - u64 ibs_op_ctl; - int branch_target; - unsigned long sample_size; -}; - -static struct ibs_config ibs_config; -static struct ibs_state ibs_state; - -/* - * IBS randomization macros - */ -#define IBS_RANDOM_BITS 12 -#define IBS_RANDOM_MASK ((1ULL << IBS_RANDOM_BITS) - 1) -#define IBS_RANDOM_MAXCNT_OFFSET (1ULL << (IBS_RANDOM_BITS - 5)) - -/* - * 16-bit Linear Feedback Shift Register (LFSR) - * - * 16 14 13 11 - * Feedback polynomial = X + X + X + X + 1 - */ -static unsigned int lfsr_random(void) -{ - static unsigned int lfsr_value = 0xF00D; - unsigned int bit; - - /* Compute next bit to shift in */ - bit = ((lfsr_value >> 0) ^ - (lfsr_value >> 2) ^ - (lfsr_value >> 3) ^ - (lfsr_value >> 5)) & 0x0001; - - /* Advance to next register value */ - lfsr_value = (lfsr_value >> 1) | (bit << 15); - - return lfsr_value; -} - -/* - * IBS software randomization - * - * The IBS periodic op counter is randomized in software. The lower 12 - * bits of the 20 bit counter are randomized. IbsOpCurCnt is - * initialized with a 12 bit random value. - */ -static inline u64 op_amd_randomize_ibs_op(u64 val) -{ - unsigned int random = lfsr_random(); - - if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) - /* - * Work around if the hw can not write to IbsOpCurCnt - * - * Randomize the lower 8 bits of the 16 bit - * IbsOpMaxCnt [15:0] value in the range of -128 to - * +127 by adding/subtracting an offset to the - * maximum count (IbsOpMaxCnt). - * - * To avoid over or underflows and protect upper bits - * starting at bit 16, the initial value for - * IbsOpMaxCnt must fit in the range from 0x0081 to - * 0xff80. 
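Illustrative sketch (not part of the patch): in the removed randomization described above, the +/- offset is just bits [11:4] of the 16-bit LFSR value reinterpreted as a signed byte, and the 0x0081..0xff80 clamp is what keeps the adjusted maximum count from under- or overflowing its 16-bit field (the values below are arbitrary examples):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            unsigned int lfsr = 0xBEEF;             /* some LFSR output           */
            int8_t offset = (int8_t)(lfsr >> 4);    /* bits 11:4 as s8: -128..127 */
            uint64_t max_cnt = 0x8000;              /* a max count in the clamped
                                                     * 0x0081..0xff80 range       */

            max_cnt += offset;   /* 0x0081 - 128 >= 1 and 0xff80 + 127 <= 0xffff */
            printf("offset %d, randomized max count %#llx\n",
                   offset, (unsigned long long)max_cnt);
            return 0;
    }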
- */ - val += (s8)(random >> 4); - else - val |= (u64)(random & IBS_RANDOM_MASK) << 32; - - return val; -} - -static inline void -op_amd_handle_ibs(struct pt_regs * const regs, - struct op_msrs const * const msrs) -{ - u64 val, ctl; - struct op_entry entry; - - if (!ibs_caps) - return; - - if (ibs_config.fetch_enabled) { - rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl); - if (ctl & IBS_FETCH_VAL) { - rdmsrl(MSR_AMD64_IBSFETCHLINAD, val); - oprofile_write_reserve(&entry, regs, val, - IBS_FETCH_CODE, IBS_FETCH_SIZE); - oprofile_add_data64(&entry, val); - oprofile_add_data64(&entry, ctl); - rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, val); - oprofile_add_data64(&entry, val); - oprofile_write_commit(&entry); - - /* reenable the IRQ */ - ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT); - ctl |= IBS_FETCH_ENABLE; - wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl); - } - } - - if (ibs_config.op_enabled) { - rdmsrl(MSR_AMD64_IBSOPCTL, ctl); - if (ctl & IBS_OP_VAL) { - rdmsrl(MSR_AMD64_IBSOPRIP, val); - oprofile_write_reserve(&entry, regs, val, IBS_OP_CODE, - ibs_state.sample_size); - oprofile_add_data64(&entry, val); - rdmsrl(MSR_AMD64_IBSOPDATA, val); - oprofile_add_data64(&entry, val); - rdmsrl(MSR_AMD64_IBSOPDATA2, val); - oprofile_add_data64(&entry, val); - rdmsrl(MSR_AMD64_IBSOPDATA3, val); - oprofile_add_data64(&entry, val); - rdmsrl(MSR_AMD64_IBSDCLINAD, val); - oprofile_add_data64(&entry, val); - rdmsrl(MSR_AMD64_IBSDCPHYSAD, val); - oprofile_add_data64(&entry, val); - if (ibs_state.branch_target) { - rdmsrl(MSR_AMD64_IBSBRTARGET, val); - oprofile_add_data(&entry, (unsigned long)val); - } - oprofile_write_commit(&entry); - - /* reenable the IRQ */ - ctl = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl); - wrmsrl(MSR_AMD64_IBSOPCTL, ctl); - } - } -} - -static inline void op_amd_start_ibs(void) -{ - u64 val; - - if (!ibs_caps) - return; - - memset(&ibs_state, 0, sizeof(ibs_state)); - - /* - * Note: Since the max count settings may out of range we - * write back the actual used values so that userland can read - * it. - */ - - if (ibs_config.fetch_enabled) { - val = ibs_config.max_cnt_fetch >> 4; - val = min(val, IBS_FETCH_MAX_CNT); - ibs_config.max_cnt_fetch = val << 4; - val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; - val |= IBS_FETCH_ENABLE; - wrmsrl(MSR_AMD64_IBSFETCHCTL, val); - } - - if (ibs_config.op_enabled) { - val = ibs_config.max_cnt_op >> 4; - if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) { - /* - * IbsOpCurCnt not supported. See - * op_amd_randomize_ibs_op() for details. - */ - val = clamp(val, 0x0081ULL, 0xFF80ULL); - ibs_config.max_cnt_op = val << 4; - } else { - /* - * The start value is randomized with a - * positive offset, we need to compensate it - * with the half of the randomized range. Also - * avoid underflows. - */ - val += IBS_RANDOM_MAXCNT_OFFSET; - if (ibs_caps & IBS_CAPS_OPCNTEXT) - val = min(val, IBS_OP_MAX_CNT_EXT); - else - val = min(val, IBS_OP_MAX_CNT); - ibs_config.max_cnt_op = - (val - IBS_RANDOM_MAXCNT_OFFSET) << 4; - } - val = ((val & ~IBS_OP_MAX_CNT) << 4) | (val & IBS_OP_MAX_CNT); - val |= ibs_config.dispatched_ops ? 
IBS_OP_CNT_CTL : 0; - val |= IBS_OP_ENABLE; - ibs_state.ibs_op_ctl = val; - ibs_state.sample_size = IBS_OP_SIZE; - if (ibs_config.branch_target) { - ibs_state.branch_target = 1; - ibs_state.sample_size++; - } - val = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl); - wrmsrl(MSR_AMD64_IBSOPCTL, val); - } -} - -static void op_amd_stop_ibs(void) -{ - if (!ibs_caps) - return; - - if (ibs_config.fetch_enabled) - /* clear max count and enable */ - wrmsrl(MSR_AMD64_IBSFETCHCTL, 0); - - if (ibs_config.op_enabled) - /* clear max count and enable */ - wrmsrl(MSR_AMD64_IBSOPCTL, 0); -} - -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - -static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, - struct op_msrs const * const msrs) -{ - u64 val; - int i; - - /* enable active counters */ - for (i = 0; i < num_counters; ++i) { - int virt = op_x86_phys_to_virt(i); - if (!reset_value[virt]) - continue; - rdmsrl(msrs->controls[i].addr, val); - val &= model->reserved; - val |= op_x86_get_ctrl(model, &counter_config[virt]); - wrmsrl(msrs->controls[i].addr, val); - } -} - -#endif - -/* functions for op_amd_spec */ - -static void op_amd_shutdown(struct op_msrs const * const msrs) -{ - int i; - - for (i = 0; i < num_counters; ++i) { - if (!msrs->counters[i].addr) - continue; - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } -} - -static int op_amd_fill_in_addresses(struct op_msrs * const msrs) -{ - int i; - - for (i = 0; i < num_counters; i++) { - if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) - goto fail; - if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) { - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - goto fail; - } - /* both registers must be reserved */ - if (num_counters == AMD64_NUM_COUNTERS_CORE) { - msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1); - msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1); - } else { - msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; - msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; - } - continue; - fail: - if (!counter_config[i].enabled) - continue; - op_x86_warn_reserved(i); - op_amd_shutdown(msrs); - return -EBUSY; - } - - return 0; -} - -static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, - struct op_msrs const * const msrs) -{ - u64 val; - int i; - - /* setup reset_value */ - for (i = 0; i < OP_MAX_COUNTER; ++i) { - if (counter_config[i].enabled - && msrs->counters[op_x86_virt_to_phys(i)].addr) - reset_value[i] = counter_config[i].count; - else - reset_value[i] = 0; - } - - /* clear all counters */ - for (i = 0; i < num_counters; ++i) { - if (!msrs->controls[i].addr) - continue; - rdmsrl(msrs->controls[i].addr, val); - if (val & ARCH_PERFMON_EVENTSEL_ENABLE) - op_x86_warn_in_use(i); - val &= model->reserved; - wrmsrl(msrs->controls[i].addr, val); - /* - * avoid a false detection of ctr overflows in NMI - * handler - */ - wrmsrl(msrs->counters[i].addr, -1LL); - } - - /* enable active counters */ - for (i = 0; i < num_counters; ++i) { - int virt = op_x86_phys_to_virt(i); - if (!reset_value[virt]) - continue; - - /* setup counter registers */ - wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]); - - /* setup control registers */ - rdmsrl(msrs->controls[i].addr, val); - val &= model->reserved; - val |= op_x86_get_ctrl(model, &counter_config[virt]); - wrmsrl(msrs->controls[i].addr, val); - } -} - -static int op_amd_check_ctrs(struct pt_regs * const regs, - struct op_msrs const * const msrs) -{ - u64 val; - int i; - - for (i = 0; i < num_counters; ++i) { - int virt = op_x86_phys_to_virt(i); - if 
(!reset_value[virt]) - continue; - rdmsrl(msrs->counters[i].addr, val); - /* bit is clear if overflowed: */ - if (val & OP_CTR_OVERFLOW) - continue; - oprofile_add_sample(regs, virt); - wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]); - } - - op_amd_handle_ibs(regs, msrs); - - /* See op_model_ppro.c */ - return 1; -} - -static void op_amd_start(struct op_msrs const * const msrs) -{ - u64 val; - int i; - - for (i = 0; i < num_counters; ++i) { - if (!reset_value[op_x86_phys_to_virt(i)]) - continue; - rdmsrl(msrs->controls[i].addr, val); - val |= ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(msrs->controls[i].addr, val); - } - - op_amd_start_ibs(); -} - -static void op_amd_stop(struct op_msrs const * const msrs) -{ - u64 val; - int i; - - /* - * Subtle: stop on all counters to avoid race with setting our - * pm callback - */ - for (i = 0; i < num_counters; ++i) { - if (!reset_value[op_x86_phys_to_virt(i)]) - continue; - rdmsrl(msrs->controls[i].addr, val); - val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(msrs->controls[i].addr, val); - } - - op_amd_stop_ibs(); -} - -/* - * check and reserve APIC extended interrupt LVT offset for IBS if - * available - */ - -static void init_ibs(void) -{ - ibs_caps = get_ibs_caps(); - - if (!ibs_caps) - return; - - printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps); -} - -static int (*create_arch_files)(struct dentry *root); - -static int setup_ibs_files(struct dentry *root) -{ - struct dentry *dir; - int ret = 0; - - /* architecture specific files */ - if (create_arch_files) - ret = create_arch_files(root); - - if (ret) - return ret; - - if (!ibs_caps) - return ret; - - /* model specific files */ - - /* setup some reasonable defaults */ - memset(&ibs_config, 0, sizeof(ibs_config)); - ibs_config.max_cnt_fetch = 250000; - ibs_config.max_cnt_op = 250000; - - if (ibs_caps & IBS_CAPS_FETCHSAM) { - dir = oprofilefs_mkdir(root, "ibs_fetch"); - oprofilefs_create_ulong(dir, "enable", - &ibs_config.fetch_enabled); - oprofilefs_create_ulong(dir, "max_count", - &ibs_config.max_cnt_fetch); - oprofilefs_create_ulong(dir, "rand_enable", - &ibs_config.rand_en); - } - - if (ibs_caps & IBS_CAPS_OPSAM) { - dir = oprofilefs_mkdir(root, "ibs_op"); - oprofilefs_create_ulong(dir, "enable", - &ibs_config.op_enabled); - oprofilefs_create_ulong(dir, "max_count", - &ibs_config.max_cnt_op); - if (ibs_caps & IBS_CAPS_OPCNT) - oprofilefs_create_ulong(dir, "dispatched_ops", - &ibs_config.dispatched_ops); - if (ibs_caps & IBS_CAPS_BRNTRGT) - oprofilefs_create_ulong(dir, "branch_target", - &ibs_config.branch_target); - } - - return 0; -} - -struct op_x86_model_spec op_amd_spec; - -static int op_amd_init(struct oprofile_operations *ops) -{ - init_ibs(); - create_arch_files = ops->create_files; - ops->create_files = setup_ibs_files; - - if (boot_cpu_data.x86 == 0x15) { - num_counters = AMD64_NUM_COUNTERS_CORE; - } else { - num_counters = AMD64_NUM_COUNTERS; - } - - op_amd_spec.num_counters = num_counters; - op_amd_spec.num_controls = num_counters; - op_amd_spec.num_virt_counters = max(num_counters, NUM_VIRT_COUNTERS); - - return 0; -} - -struct op_x86_model_spec op_amd_spec = { - /* num_counters/num_controls filled in at runtime */ - .reserved = MSR_AMD_EVENTSEL_RESERVED, - .event_mask = OP_EVENT_MASK, - .init = op_amd_init, - .fill_in_addresses = &op_amd_fill_in_addresses, - .setup_ctrs = &op_amd_setup_ctrs, - .check_ctrs = &op_amd_check_ctrs, - .start = &op_amd_start, - .stop = &op_amd_stop, - .shutdown = &op_amd_shutdown, -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - 
.switch_ctrl = &op_mux_switch_ctrl, -#endif -}; diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c deleted file mode 100644 index ad1d91f475ab..000000000000 --- a/arch/x86/oprofile/op_model_p4.c +++ /dev/null @@ -1,723 +0,0 @@ -/** - * @file op_model_p4.c - * P4 model-specific MSR operations - * - * @remark Copyright 2002 OProfile authors - * @remark Read the file COPYING - * - * @author Graydon Hoare - */ - -#include <linux/oprofile.h> -#include <linux/smp.h> -#include <linux/ptrace.h> -#include <asm/nmi.h> -#include <asm/msr.h> -#include <asm/fixmap.h> -#include <asm/apic.h> - - -#include "op_x86_model.h" -#include "op_counter.h" - -#define NUM_EVENTS 39 - -#define NUM_COUNTERS_NON_HT 8 -#define NUM_ESCRS_NON_HT 45 -#define NUM_CCCRS_NON_HT 18 -#define NUM_CONTROLS_NON_HT (NUM_ESCRS_NON_HT + NUM_CCCRS_NON_HT) - -#define NUM_COUNTERS_HT2 4 -#define NUM_ESCRS_HT2 23 -#define NUM_CCCRS_HT2 9 -#define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2) - -#define OP_CTR_OVERFLOW (1ULL<<31) - -static unsigned int num_counters = NUM_COUNTERS_NON_HT; -static unsigned int num_controls = NUM_CONTROLS_NON_HT; - -/* this has to be checked dynamically since the - hyper-threadedness of a chip is discovered at - kernel boot-time. */ -static inline void setup_num_counters(void) -{ -#ifdef CONFIG_SMP - if (smp_num_siblings == 2) { - num_counters = NUM_COUNTERS_HT2; - num_controls = NUM_CONTROLS_HT2; - } -#endif -} - -static inline int addr_increment(void) -{ -#ifdef CONFIG_SMP - return smp_num_siblings == 2 ? 2 : 1; -#else - return 1; -#endif -} - - -/* tables to simulate simplified hardware view of p4 registers */ -struct p4_counter_binding { - int virt_counter; - int counter_address; - int cccr_address; -}; - -struct p4_event_binding { - int escr_select; /* value to put in CCCR */ - int event_select; /* value to put in ESCR */ - struct { - int virt_counter; /* for this counter... */ - int escr_address; /* use this ESCR */ - } bindings[2]; -}; - -/* nb: these CTR_* defines are a duplicate of defines in - event/i386.p4*events. */ - - -#define CTR_BPU_0 (1 << 0) -#define CTR_MS_0 (1 << 1) -#define CTR_FLAME_0 (1 << 2) -#define CTR_IQ_4 (1 << 3) -#define CTR_BPU_2 (1 << 4) -#define CTR_MS_2 (1 << 5) -#define CTR_FLAME_2 (1 << 6) -#define CTR_IQ_5 (1 << 7) - -static struct p4_counter_binding p4_counters[NUM_COUNTERS_NON_HT] = { - { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 }, - { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 }, - { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 }, - { CTR_IQ_4, MSR_P4_IQ_PERFCTR4, MSR_P4_IQ_CCCR4 }, - { CTR_BPU_2, MSR_P4_BPU_PERFCTR2, MSR_P4_BPU_CCCR2 }, - { CTR_MS_2, MSR_P4_MS_PERFCTR2, MSR_P4_MS_CCCR2 }, - { CTR_FLAME_2, MSR_P4_FLAME_PERFCTR2, MSR_P4_FLAME_CCCR2 }, - { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 } -}; - -#define NUM_UNUSED_CCCRS (NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT) - -/* p4 event codes in libop/op_event.h are indices into this table. 
*/ - -static struct p4_event_binding p4_events[NUM_EVENTS] = { - - { /* BRANCH_RETIRED */ - 0x05, 0x06, - { {CTR_IQ_4, MSR_P4_CRU_ESCR2}, - {CTR_IQ_5, MSR_P4_CRU_ESCR3} } - }, - - { /* MISPRED_BRANCH_RETIRED */ - 0x04, 0x03, - { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, - { CTR_IQ_5, MSR_P4_CRU_ESCR1} } - }, - - { /* TC_DELIVER_MODE */ - 0x01, 0x01, - { { CTR_MS_0, MSR_P4_TC_ESCR0}, - { CTR_MS_2, MSR_P4_TC_ESCR1} } - }, - - { /* BPU_FETCH_REQUEST */ - 0x00, 0x03, - { { CTR_BPU_0, MSR_P4_BPU_ESCR0}, - { CTR_BPU_2, MSR_P4_BPU_ESCR1} } - }, - - { /* ITLB_REFERENCE */ - 0x03, 0x18, - { { CTR_BPU_0, MSR_P4_ITLB_ESCR0}, - { CTR_BPU_2, MSR_P4_ITLB_ESCR1} } - }, - - { /* MEMORY_CANCEL */ - 0x05, 0x02, - { { CTR_FLAME_0, MSR_P4_DAC_ESCR0}, - { CTR_FLAME_2, MSR_P4_DAC_ESCR1} } - }, - - { /* MEMORY_COMPLETE */ - 0x02, 0x08, - { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, - { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } - }, - - { /* LOAD_PORT_REPLAY */ - 0x02, 0x04, - { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, - { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } - }, - - { /* STORE_PORT_REPLAY */ - 0x02, 0x05, - { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, - { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } - }, - - { /* MOB_LOAD_REPLAY */ - 0x02, 0x03, - { { CTR_BPU_0, MSR_P4_MOB_ESCR0}, - { CTR_BPU_2, MSR_P4_MOB_ESCR1} } - }, - - { /* PAGE_WALK_TYPE */ - 0x04, 0x01, - { { CTR_BPU_0, MSR_P4_PMH_ESCR0}, - { CTR_BPU_2, MSR_P4_PMH_ESCR1} } - }, - - { /* BSQ_CACHE_REFERENCE */ - 0x07, 0x0c, - { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, - { CTR_BPU_2, MSR_P4_BSU_ESCR1} } - }, - - { /* IOQ_ALLOCATION */ - 0x06, 0x03, - { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, - { 0, 0 } } - }, - - { /* IOQ_ACTIVE_ENTRIES */ - 0x06, 0x1a, - { { CTR_BPU_2, MSR_P4_FSB_ESCR1}, - { 0, 0 } } - }, - - { /* FSB_DATA_ACTIVITY */ - 0x06, 0x17, - { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, - { CTR_BPU_2, MSR_P4_FSB_ESCR1} } - }, - - { /* BSQ_ALLOCATION */ - 0x07, 0x05, - { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, - { 0, 0 } } - }, - - { /* BSQ_ACTIVE_ENTRIES */ - 0x07, 0x06, - { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */}, - { 0, 0 } } - }, - - { /* X87_ASSIST */ - 0x05, 0x03, - { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, - { CTR_IQ_5, MSR_P4_CRU_ESCR3} } - }, - - { /* SSE_INPUT_ASSIST */ - 0x01, 0x34, - { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, - { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } - }, - - { /* PACKED_SP_UOP */ - 0x01, 0x08, - { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, - { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } - }, - - { /* PACKED_DP_UOP */ - 0x01, 0x0c, - { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, - { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } - }, - - { /* SCALAR_SP_UOP */ - 0x01, 0x0a, - { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, - { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } - }, - - { /* SCALAR_DP_UOP */ - 0x01, 0x0e, - { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, - { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } - }, - - { /* 64BIT_MMX_UOP */ - 0x01, 0x02, - { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, - { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } - }, - - { /* 128BIT_MMX_UOP */ - 0x01, 0x1a, - { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, - { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } - }, - - { /* X87_FP_UOP */ - 0x01, 0x04, - { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, - { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } - }, - - { /* X87_SIMD_MOVES_UOP */ - 0x01, 0x2e, - { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, - { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } - }, - - { /* MACHINE_CLEAR */ - 0x05, 0x02, - { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, - { CTR_IQ_5, MSR_P4_CRU_ESCR3} } - }, - - { /* GLOBAL_POWER_EVENTS */ - 0x06, 0x13 /* older manual says 0x05, newer 0x13 */, - { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, - { CTR_BPU_2, MSR_P4_FSB_ESCR1} } - }, - - { /* TC_MS_XFER */ - 0x00, 0x05, - 
{ { CTR_MS_0, MSR_P4_MS_ESCR0}, - { CTR_MS_2, MSR_P4_MS_ESCR1} } - }, - - { /* UOP_QUEUE_WRITES */ - 0x00, 0x09, - { { CTR_MS_0, MSR_P4_MS_ESCR0}, - { CTR_MS_2, MSR_P4_MS_ESCR1} } - }, - - { /* FRONT_END_EVENT */ - 0x05, 0x08, - { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, - { CTR_IQ_5, MSR_P4_CRU_ESCR3} } - }, - - { /* EXECUTION_EVENT */ - 0x05, 0x0c, - { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, - { CTR_IQ_5, MSR_P4_CRU_ESCR3} } - }, - - { /* REPLAY_EVENT */ - 0x05, 0x09, - { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, - { CTR_IQ_5, MSR_P4_CRU_ESCR3} } - }, - - { /* INSTR_RETIRED */ - 0x04, 0x02, - { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, - { CTR_IQ_5, MSR_P4_CRU_ESCR1} } - }, - - { /* UOPS_RETIRED */ - 0x04, 0x01, - { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, - { CTR_IQ_5, MSR_P4_CRU_ESCR1} } - }, - - { /* UOP_TYPE */ - 0x02, 0x02, - { { CTR_IQ_4, MSR_P4_RAT_ESCR0}, - { CTR_IQ_5, MSR_P4_RAT_ESCR1} } - }, - - { /* RETIRED_MISPRED_BRANCH_TYPE */ - 0x02, 0x05, - { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, - { CTR_MS_2, MSR_P4_TBPU_ESCR1} } - }, - - { /* RETIRED_BRANCH_TYPE */ - 0x02, 0x04, - { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, - { CTR_MS_2, MSR_P4_TBPU_ESCR1} } - } -}; - - -#define MISC_PMC_ENABLED_P(x) ((x) & 1 << 7) - -#define ESCR_RESERVED_BITS 0x80000003 -#define ESCR_CLEAR(escr) ((escr) &= ESCR_RESERVED_BITS) -#define ESCR_SET_USR_0(escr, usr) ((escr) |= (((usr) & 1) << 2)) -#define ESCR_SET_OS_0(escr, os) ((escr) |= (((os) & 1) << 3)) -#define ESCR_SET_USR_1(escr, usr) ((escr) |= (((usr) & 1))) -#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) -#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25)) -#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9)) - -#define CCCR_RESERVED_BITS 0x38030FFF -#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) -#define CCCR_SET_REQUIRED_BITS(cccr) ((cccr) |= 0x00030000) -#define CCCR_SET_ESCR_SELECT(cccr, sel) ((cccr) |= (((sel) & 0x07) << 13)) -#define CCCR_SET_PMI_OVF_0(cccr) ((cccr) |= (1<<26)) -#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) -#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) -#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) -#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) -#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) - - -/* this assigns a "stagger" to the current CPU, which is used throughout - the code in this module as an extra array offset, to select the "even" - or "odd" part of all the divided resources. */ -static unsigned int get_stagger(void) -{ -#ifdef CONFIG_SMP - int cpu = smp_processor_id(); - return cpu != cpumask_first(this_cpu_cpumask_var_ptr(cpu_sibling_map)); -#endif - return 0; -} - - -/* finally, mediate access to a real hardware counter - by passing a "virtual" counter numer to this macro, - along with your stagger setting. */ -#define VIRT_CTR(stagger, i) ((i) + ((num_counters) * (stagger))) - -static unsigned long reset_value[NUM_COUNTERS_NON_HT]; - -static void p4_shutdown(struct op_msrs const * const msrs) -{ - int i; - - for (i = 0; i < num_counters; ++i) { - if (msrs->counters[i].addr) - release_perfctr_nmi(msrs->counters[i].addr); - } - /* - * some of the control registers are specially reserved in - * conjunction with the counter registers (hence the starting offset). - * This saves a few bits. 
- */ - for (i = num_counters; i < num_controls; ++i) { - if (msrs->controls[i].addr) - release_evntsel_nmi(msrs->controls[i].addr); - } -} - -static int p4_fill_in_addresses(struct op_msrs * const msrs) -{ - unsigned int i; - unsigned int addr, cccraddr, stag; - - setup_num_counters(); - stag = get_stagger(); - - /* the counter & cccr registers we pay attention to */ - for (i = 0; i < num_counters; ++i) { - addr = p4_counters[VIRT_CTR(stag, i)].counter_address; - cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address; - if (reserve_perfctr_nmi(addr)) { - msrs->counters[i].addr = addr; - msrs->controls[i].addr = cccraddr; - } - } - - /* 43 ESCR registers in three or four discontiguous group */ - for (addr = MSR_P4_BSU_ESCR0 + stag; - addr < MSR_P4_IQ_ESCR0; ++i, addr += addr_increment()) { - if (reserve_evntsel_nmi(addr)) - msrs->controls[i].addr = addr; - } - - /* no IQ_ESCR0/1 on some models, we save a seconde time BSU_ESCR0/1 - * to avoid special case in nmi_{save|restore}_registers() */ - if (boot_cpu_data.x86_model >= 0x3) { - for (addr = MSR_P4_BSU_ESCR0 + stag; - addr <= MSR_P4_BSU_ESCR1; ++i, addr += addr_increment()) { - if (reserve_evntsel_nmi(addr)) - msrs->controls[i].addr = addr; - } - } else { - for (addr = MSR_P4_IQ_ESCR0 + stag; - addr <= MSR_P4_IQ_ESCR1; ++i, addr += addr_increment()) { - if (reserve_evntsel_nmi(addr)) - msrs->controls[i].addr = addr; - } - } - - for (addr = MSR_P4_RAT_ESCR0 + stag; - addr <= MSR_P4_SSU_ESCR0; ++i, addr += addr_increment()) { - if (reserve_evntsel_nmi(addr)) - msrs->controls[i].addr = addr; - } - - for (addr = MSR_P4_MS_ESCR0 + stag; - addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) { - if (reserve_evntsel_nmi(addr)) - msrs->controls[i].addr = addr; - } - - for (addr = MSR_P4_IX_ESCR0 + stag; - addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) { - if (reserve_evntsel_nmi(addr)) - msrs->controls[i].addr = addr; - } - - /* there are 2 remaining non-contiguously located ESCRs */ - - if (num_counters == NUM_COUNTERS_NON_HT) { - /* standard non-HT CPUs handle both remaining ESCRs*/ - if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) - msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; - if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR4)) - msrs->controls[i++].addr = MSR_P4_CRU_ESCR4; - - } else if (stag == 0) { - /* HT CPUs give the first remainder to the even thread, as - the 32nd control register */ - if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR4)) - msrs->controls[i++].addr = MSR_P4_CRU_ESCR4; - - } else { - /* and two copies of the second to the odd thread, - for the 22st and 23nd control registers */ - if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) { - msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; - msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; - } - } - - for (i = 0; i < num_counters; ++i) { - if (!counter_config[i].enabled) - continue; - if (msrs->controls[i].addr) - continue; - op_x86_warn_reserved(i); - p4_shutdown(msrs); - return -EBUSY; - } - - return 0; -} - - -static void pmc_setup_one_p4_counter(unsigned int ctr) -{ - int i; - int const maxbind = 2; - unsigned int cccr = 0; - unsigned int escr = 0; - unsigned int high = 0; - unsigned int counter_bit; - struct p4_event_binding *ev = NULL; - unsigned int stag; - - stag = get_stagger(); - - /* convert from counter *number* to counter *bit* */ - counter_bit = 1 << VIRT_CTR(stag, ctr); - - /* find our event binding structure. 
*/ - if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) { - printk(KERN_ERR - "oprofile: P4 event code 0x%lx out of range\n", - counter_config[ctr].event); - return; - } - - ev = &(p4_events[counter_config[ctr].event - 1]); - - for (i = 0; i < maxbind; i++) { - if (ev->bindings[i].virt_counter & counter_bit) { - - /* modify ESCR */ - rdmsr(ev->bindings[i].escr_address, escr, high); - ESCR_CLEAR(escr); - if (stag == 0) { - ESCR_SET_USR_0(escr, counter_config[ctr].user); - ESCR_SET_OS_0(escr, counter_config[ctr].kernel); - } else { - ESCR_SET_USR_1(escr, counter_config[ctr].user); - ESCR_SET_OS_1(escr, counter_config[ctr].kernel); - } - ESCR_SET_EVENT_SELECT(escr, ev->event_select); - ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); - wrmsr(ev->bindings[i].escr_address, escr, high); - - /* modify CCCR */ - rdmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address, - cccr, high); - CCCR_CLEAR(cccr); - CCCR_SET_REQUIRED_BITS(cccr); - CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); - if (stag == 0) - CCCR_SET_PMI_OVF_0(cccr); - else - CCCR_SET_PMI_OVF_1(cccr); - wrmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address, - cccr, high); - return; - } - } - - printk(KERN_ERR - "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n", - counter_config[ctr].event, stag, ctr); -} - - -static void p4_setup_ctrs(struct op_x86_model_spec const *model, - struct op_msrs const * const msrs) -{ - unsigned int i; - unsigned int low, high; - unsigned int stag; - - stag = get_stagger(); - - rdmsr(MSR_IA32_MISC_ENABLE, low, high); - if (!MISC_PMC_ENABLED_P(low)) { - printk(KERN_ERR "oprofile: P4 PMC not available\n"); - return; - } - - /* clear the cccrs we will use */ - for (i = 0; i < num_counters; i++) { - if (unlikely(!msrs->controls[i].addr)) - continue; - rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); - CCCR_CLEAR(low); - CCCR_SET_REQUIRED_BITS(low); - wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); - } - - /* clear all escrs (including those outside our concern) */ - for (i = num_counters; i < num_controls; i++) { - if (unlikely(!msrs->controls[i].addr)) - continue; - wrmsr(msrs->controls[i].addr, 0, 0); - } - - /* setup all counters */ - for (i = 0; i < num_counters; ++i) { - if (counter_config[i].enabled && msrs->controls[i].addr) { - reset_value[i] = counter_config[i].count; - pmc_setup_one_p4_counter(i); - wrmsrl(p4_counters[VIRT_CTR(stag, i)].counter_address, - -(u64)counter_config[i].count); - } else { - reset_value[i] = 0; - } - } -} - - -static int p4_check_ctrs(struct pt_regs * const regs, - struct op_msrs const * const msrs) -{ - unsigned long ctr, low, high, stag, real; - int i; - - stag = get_stagger(); - - for (i = 0; i < num_counters; ++i) { - - if (!reset_value[i]) - continue; - - /* - * there is some eccentricity in the hardware which - * requires that we perform 2 extra corrections: - * - * - check both the CCCR:OVF flag for overflow and the - * counter high bit for un-flagged overflows. - * - * - write the counter back twice to ensure it gets - * updated properly. - * - * the former seems to be related to extra NMIs happening - * during the current NMI; the latter is reported as errata - * N15 in intel doc 249199-029, pentium 4 specification - * update, though their suggested work-around does not - * appear to solve the problem. 
- */ - - real = VIRT_CTR(stag, i); - - rdmsr(p4_counters[real].cccr_address, low, high); - rdmsr(p4_counters[real].counter_address, ctr, high); - if (CCCR_OVF_P(low) || !(ctr & OP_CTR_OVERFLOW)) { - oprofile_add_sample(regs, i); - wrmsrl(p4_counters[real].counter_address, - -(u64)reset_value[i]); - CCCR_CLEAR_OVF(low); - wrmsr(p4_counters[real].cccr_address, low, high); - wrmsrl(p4_counters[real].counter_address, - -(u64)reset_value[i]); - } - } - - /* P4 quirk: you have to re-unmask the apic vector */ - apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); - - /* See op_model_ppro.c */ - return 1; -} - - -static void p4_start(struct op_msrs const * const msrs) -{ - unsigned int low, high, stag; - int i; - - stag = get_stagger(); - - for (i = 0; i < num_counters; ++i) { - if (!reset_value[i]) - continue; - rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); - CCCR_SET_ENABLE(low); - wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); - } -} - - -static void p4_stop(struct op_msrs const * const msrs) -{ - unsigned int low, high, stag; - int i; - - stag = get_stagger(); - - for (i = 0; i < num_counters; ++i) { - if (!reset_value[i]) - continue; - rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); - CCCR_SET_DISABLE(low); - wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); - } -} - -#ifdef CONFIG_SMP -struct op_x86_model_spec op_p4_ht2_spec = { - .num_counters = NUM_COUNTERS_HT2, - .num_controls = NUM_CONTROLS_HT2, - .fill_in_addresses = &p4_fill_in_addresses, - .setup_ctrs = &p4_setup_ctrs, - .check_ctrs = &p4_check_ctrs, - .start = &p4_start, - .stop = &p4_stop, - .shutdown = &p4_shutdown -}; -#endif - -struct op_x86_model_spec op_p4_spec = { - .num_counters = NUM_COUNTERS_NON_HT, - .num_controls = NUM_CONTROLS_NON_HT, - .fill_in_addresses = &p4_fill_in_addresses, - .setup_ctrs = &p4_setup_ctrs, - .check_ctrs = &p4_check_ctrs, - .start = &p4_start, - .stop = &p4_stop, - .shutdown = &p4_shutdown -}; diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c deleted file mode 100644 index 7913b6921959..000000000000 --- a/arch/x86/oprofile/op_model_ppro.c +++ /dev/null @@ -1,245 +0,0 @@ -/* - * @file op_model_ppro.h - * Family 6 perfmon and architectural perfmon MSR operations - * - * @remark Copyright 2002 OProfile authors - * @remark Copyright 2008 Intel Corporation - * @remark Read the file COPYING - * - * @author John Levon - * @author Philippe Elie - * @author Graydon Hoare - * @author Andi Kleen - * @author Robert Richter <robert.richter@amd.com> - */ - -#include <linux/oprofile.h> -#include <linux/slab.h> -#include <asm/ptrace.h> -#include <asm/msr.h> -#include <asm/apic.h> -#include <asm/nmi.h> - -#include "op_x86_model.h" -#include "op_counter.h" - -static int num_counters = 2; -static int counter_width = 32; - -#define MSR_PPRO_EVENTSEL_RESERVED ((0xFFFFFFFFULL<<32)|(1ULL<<21)) - -static u64 reset_value[OP_MAX_COUNTER]; - -static void ppro_shutdown(struct op_msrs const * const msrs) -{ - int i; - - for (i = 0; i < num_counters; ++i) { - if (!msrs->counters[i].addr) - continue; - release_perfctr_nmi(MSR_P6_PERFCTR0 + i); - release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); - } -} - -static int ppro_fill_in_addresses(struct op_msrs * const msrs) -{ - int i; - - for (i = 0; i < num_counters; i++) { - if (!reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i)) - goto fail; - if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) { - release_perfctr_nmi(MSR_P6_PERFCTR0 + i); - goto fail; - } - /* both registers must be reserved */ - 
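/*
 * [Editorial note, not part of the patch] The deleted oprofile models in the
 * hunks above all follow the same NMI-profiling pattern: each performance
 * counter is armed with the two's-complement of the requested sample count so
 * that it overflows (and raises a PMI/NMI) after exactly that many events, and
 * the NMI handler records a sample and re-arms it the same way (see the
 * -(u64)counter_config[i].count and -reset_value[i] writes above). A minimal,
 * self-contained sketch of that idea, with a plain array standing in for the
 * perfctr MSRs; all names here are illustrative, not kernel API:
 */
#include <stdint.h>
#include <stdio.h>

#define NCTRS 2
static uint64_t fake_perfctr[NCTRS];   /* stand-in for MSR_P6_PERFCTR0 + i */
static uint64_t reset_count[NCTRS];

static void arm_counter(int i, uint64_t count)
{
	reset_count[i] = count;
	/* counter overflows after 'count' increments */
	fake_perfctr[i] = (uint64_t)-count;
}

static void on_overflow_nmi(int i)
{
	/* a real handler would log a sample here, then re-arm as above */
	fake_perfctr[i] = (uint64_t)-reset_count[i];
}

int main(void)
{
	arm_counter(0, 100000);
	on_overflow_nmi(0);
	printf("counter 0 re-armed to %#llx\n",
	       (unsigned long long)fake_perfctr[0]);
	return 0;
}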
msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; - msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; - continue; - fail: - if (!counter_config[i].enabled) - continue; - op_x86_warn_reserved(i); - ppro_shutdown(msrs); - return -EBUSY; - } - - return 0; -} - - -static void ppro_setup_ctrs(struct op_x86_model_spec const *model, - struct op_msrs const * const msrs) -{ - u64 val; - int i; - - if (boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) { - union cpuid10_eax eax; - eax.full = cpuid_eax(0xa); - - /* - * For Core2 (family 6, model 15), don't reset the - * counter width: - */ - if (!(eax.split.version_id == 0 && - __this_cpu_read(cpu_info.x86) == 6 && - __this_cpu_read(cpu_info.x86_model) == 15)) { - - if (counter_width < eax.split.bit_width) - counter_width = eax.split.bit_width; - } - } - - /* clear all counters */ - for (i = 0; i < num_counters; ++i) { - if (!msrs->controls[i].addr) - continue; - rdmsrl(msrs->controls[i].addr, val); - if (val & ARCH_PERFMON_EVENTSEL_ENABLE) - op_x86_warn_in_use(i); - val &= model->reserved; - wrmsrl(msrs->controls[i].addr, val); - /* - * avoid a false detection of ctr overflows in NMI * - * handler - */ - wrmsrl(msrs->counters[i].addr, -1LL); - } - - /* enable active counters */ - for (i = 0; i < num_counters; ++i) { - if (counter_config[i].enabled && msrs->counters[i].addr) { - reset_value[i] = counter_config[i].count; - wrmsrl(msrs->counters[i].addr, -reset_value[i]); - rdmsrl(msrs->controls[i].addr, val); - val &= model->reserved; - val |= op_x86_get_ctrl(model, &counter_config[i]); - wrmsrl(msrs->controls[i].addr, val); - } else { - reset_value[i] = 0; - } - } -} - - -static int ppro_check_ctrs(struct pt_regs * const regs, - struct op_msrs const * const msrs) -{ - u64 val; - int i; - - for (i = 0; i < num_counters; ++i) { - if (!reset_value[i]) - continue; - rdmsrl(msrs->counters[i].addr, val); - if (val & (1ULL << (counter_width - 1))) - continue; - oprofile_add_sample(regs, i); - wrmsrl(msrs->counters[i].addr, -reset_value[i]); - } - - /* Only P6 based Pentium M need to re-unmask the apic vector but it - * doesn't hurt other P6 variant */ - apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); - - /* We can't work out if we really handled an interrupt. We - * might have caught a *second* counter just after overflowing - * the interrupt for this counter then arrives - * and we don't find a counter that's overflowed, so we - * would return 0 and get dazed + confused. Instead we always - * assume we found an overflow. This sucks. - */ - return 1; -} - - -static void ppro_start(struct op_msrs const * const msrs) -{ - u64 val; - int i; - - for (i = 0; i < num_counters; ++i) { - if (reset_value[i]) { - rdmsrl(msrs->controls[i].addr, val); - val |= ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(msrs->controls[i].addr, val); - } - } -} - - -static void ppro_stop(struct op_msrs const * const msrs) -{ - u64 val; - int i; - - for (i = 0; i < num_counters; ++i) { - if (!reset_value[i]) - continue; - rdmsrl(msrs->controls[i].addr, val); - val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(msrs->controls[i].addr, val); - } -} - -struct op_x86_model_spec op_ppro_spec = { - .num_counters = 2, - .num_controls = 2, - .reserved = MSR_PPRO_EVENTSEL_RESERVED, - .fill_in_addresses = &ppro_fill_in_addresses, - .setup_ctrs = &ppro_setup_ctrs, - .check_ctrs = &ppro_check_ctrs, - .start = &ppro_start, - .stop = &ppro_stop, - .shutdown = &ppro_shutdown -}; - -/* - * Architectural performance monitoring. 
- * - * Newer Intel CPUs (Core1+) have support for architectural - * events described in CPUID 0xA. See the IA32 SDM Vol3b.18 for details. - * The advantage of this is that it can be done without knowing about - * the specific CPU. - */ - -static void arch_perfmon_setup_counters(void) -{ - union cpuid10_eax eax; - - eax.full = cpuid_eax(0xa); - - /* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */ - if (eax.split.version_id == 0 && boot_cpu_data.x86 == 6 && - boot_cpu_data.x86_model == 15) { - eax.split.version_id = 2; - eax.split.num_counters = 2; - eax.split.bit_width = 40; - } - - num_counters = min((int)eax.split.num_counters, OP_MAX_COUNTER); - - op_arch_perfmon_spec.num_counters = num_counters; - op_arch_perfmon_spec.num_controls = num_counters; -} - -static int arch_perfmon_init(struct oprofile_operations *ignore) -{ - arch_perfmon_setup_counters(); - return 0; -} - -struct op_x86_model_spec op_arch_perfmon_spec = { - .reserved = MSR_PPRO_EVENTSEL_RESERVED, - .init = &arch_perfmon_init, - /* num_counters/num_controls filled in at runtime */ - .fill_in_addresses = &ppro_fill_in_addresses, - /* user space does the cpuid check for available events */ - .setup_ctrs = &ppro_setup_ctrs, - .check_ctrs = &ppro_check_ctrs, - .start = &ppro_start, - .stop = &ppro_stop, - .shutdown = &ppro_shutdown -}; diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h deleted file mode 100644 index 276cf79b5d24..000000000000 --- a/arch/x86/oprofile/op_x86_model.h +++ /dev/null @@ -1,90 +0,0 @@ -/** - * @file op_x86_model.h - * interface to x86 model-specific MSR operations - * - * @remark Copyright 2002 OProfile authors - * @remark Read the file COPYING - * - * @author Graydon Hoare - * @author Robert Richter <robert.richter@amd.com> - */ - -#ifndef OP_X86_MODEL_H -#define OP_X86_MODEL_H - -#include <asm/types.h> -#include <asm/perf_event.h> - -struct op_msr { - unsigned long addr; - u64 saved; -}; - -struct op_msrs { - struct op_msr *counters; - struct op_msr *controls; - struct op_msr *multiplex; -}; - -struct pt_regs; - -struct oprofile_operations; - -/* The model vtable abstracts the differences between - * various x86 CPU models' perfctr support. - */ -struct op_x86_model_spec { - unsigned int num_counters; - unsigned int num_controls; - unsigned int num_virt_counters; - u64 reserved; - u16 event_mask; - int (*init)(struct oprofile_operations *ops); - int (*fill_in_addresses)(struct op_msrs * const msrs); - void (*setup_ctrs)(struct op_x86_model_spec const *model, - struct op_msrs const * const msrs); - int (*check_ctrs)(struct pt_regs * const regs, - struct op_msrs const * const msrs); - void (*start)(struct op_msrs const * const msrs); - void (*stop)(struct op_msrs const * const msrs); - void (*shutdown)(struct op_msrs const * const msrs); -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - void (*switch_ctrl)(struct op_x86_model_spec const *model, - struct op_msrs const * const msrs); -#endif -}; - -struct op_counter_config; - -static inline void op_x86_warn_in_use(int counter) -{ - /* - * The warning indicates an already running counter. If - * oprofile doesn't collect data, then try using a different - * performance counter on your platform to monitor the desired - * event. Delete counter #%d from the desired event by editing - * the /usr/share/oprofile/%s/<cpu>/events file. If the event - * cannot be monitored by any other counter, contact your - * hardware or BIOS vendor. 
- */ - pr_warn("oprofile: counter #%d on cpu #%d may already be used\n", - counter, smp_processor_id()); -} - -static inline void op_x86_warn_reserved(int counter) -{ - pr_warn("oprofile: counter #%d is already reserved\n", counter); -} - -extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, - struct op_counter_config *counter_config); -extern int op_x86_phys_to_virt(int phys); -extern int op_x86_virt_to_phys(int virt); - -extern struct op_x86_model_spec op_ppro_spec; -extern struct op_x86_model_spec op_p4_spec; -extern struct op_x86_model_spec op_p4_ht2_spec; -extern struct op_x86_model_spec op_amd_spec; -extern struct op_x86_model_spec op_arch_perfmon_spec; - -#endif /* OP_X86_MODEL_H */ diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index d0e835470d01..b2f90a1a89f1 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -4,7 +4,6 @@ obj-y += atom/ obj-y += ce4100/ obj-y += efi/ obj-y += geode/ -obj-y += goldfish/ obj-y += iris/ obj-y += intel/ obj-y += intel-mid/ diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 8efd003540ca..1b82d77019b1 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -54,10 +54,7 @@ * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G. */ static u64 efi_va = EFI_VA_START; - -struct efi_scratch efi_scratch; - -EXPORT_SYMBOL_GPL(efi_mm); +static struct mm_struct *efi_prev_mm; /* * We need our own copy of the higher levels of the page tables @@ -237,7 +234,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) return 1; } - efi_scratch.phys_stack = page_to_phys(page + 1); /* stack grows down */ + efi_mixed_mode_stack_pa = page_to_phys(page + 1); /* stack grows down */ npages = (_etext - _text) >> PAGE_SHIFT; text = __pa(_text); @@ -462,11 +459,17 @@ void __init efi_dump_pagetable(void) * can not change under us. * It should be ensured that there are no concurent calls to this function. 
*/ -void efi_switch_mm(struct mm_struct *mm) +void efi_enter_mm(void) +{ + efi_prev_mm = current->active_mm; + current->active_mm = &efi_mm; + switch_mm(efi_prev_mm, &efi_mm, NULL); +} + +void efi_leave_mm(void) { - efi_scratch.prev_mm = current->active_mm; - current->active_mm = mm; - switch_mm(efi_scratch.prev_mm, mm, NULL); + current->active_mm = efi_prev_mm; + switch_mm(&efi_mm, efi_prev_mm, NULL); } static DEFINE_SPINLOCK(efi_runtime_lock); @@ -530,12 +533,12 @@ efi_thunk_set_virtual_address_map(unsigned long memory_map_size, efi_sync_low_kernel_mappings(); local_irq_save(flags); - efi_switch_mm(&efi_mm); + efi_enter_mm(); status = __efi_thunk(set_virtual_address_map, memory_map_size, descriptor_size, descriptor_version, virtual_map); - efi_switch_mm(efi_scratch.prev_mm); + efi_leave_mm(); local_irq_restore(flags); return status; @@ -829,9 +832,9 @@ efi_set_virtual_address_map(unsigned long memory_map_size, descriptor_size, descriptor_version, virtual_map); - efi_switch_mm(&efi_mm); + efi_enter_mm(); - kernel_fpu_begin(); + efi_fpu_begin(); /* Disable interrupts around EFI calls: */ local_irq_save(flags); @@ -840,12 +843,12 @@ efi_set_virtual_address_map(unsigned long memory_map_size, descriptor_version, virtual_map); local_irq_restore(flags); - kernel_fpu_end(); + efi_fpu_end(); /* grab the virtually remapped EFI runtime services table pointer */ efi.runtime = READ_ONCE(systab->runtime); - efi_switch_mm(efi_scratch.prev_mm); + efi_leave_mm(); return status; } diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S index 26f0da238c1c..fd3dd1708eba 100644 --- a/arch/x86/platform/efi/efi_thunk_64.S +++ b/arch/x86/platform/efi/efi_thunk_64.S @@ -33,7 +33,7 @@ SYM_CODE_START(__efi64_thunk) * Switch to 1:1 mapped 32-bit stack pointer. */ movq %rsp, %rax - movq efi_scratch(%rip), %rsp + movq efi_mixed_mode_stack_pa(%rip), %rsp push %rax /* @@ -70,3 +70,7 @@ SYM_CODE_START(__efi64_thunk) pushl %ebp lret SYM_CODE_END(__efi64_thunk) + + .bss + .balign 8 +SYM_DATA(efi_mixed_mode_stack_pa, .quad 0) diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 5a40fe411ebd..67d93a243c35 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -687,15 +687,25 @@ int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff, * @return: Returns, if the page fault is not handled. This function * will never return if the page fault is handled successfully. */ -void efi_recover_from_page_fault(unsigned long phys_addr) +void efi_crash_gracefully_on_page_fault(unsigned long phys_addr) { if (!IS_ENABLED(CONFIG_X86_64)) return; /* + * If we get an interrupt/NMI while processing an EFI runtime service + * then this is a regular OOPS, not an EFI failure. + */ + if (in_interrupt()) + return; + + /* * Make sure that an efi runtime service caused the page fault. + * READ_ONCE() because we might be OOPSing in a different thread, + * and we don't want to trip KTSAN while trying to OOPS. 
*/ - if (efi_rts_work.efi_rts_id == EFI_NONE) + if (READ_ONCE(efi_rts_work.efi_rts_id) == EFI_NONE || + current_work() != &efi_rts_work.work) return; /* @@ -747,6 +757,4 @@ void efi_recover_from_page_fault(unsigned long phys_addr) set_current_state(TASK_IDLE); schedule(); } - - return; } diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c index c33f744b5388..b39bf3b5e108 100644 --- a/arch/x86/platform/geode/alix.c +++ b/arch/x86/platform/geode/alix.c @@ -22,6 +22,7 @@ #include <linux/platform_device.h> #include <linux/input.h> #include <linux/gpio_keys.h> +#include <linux/gpio/machine.h> #include <linux/dmi.h> #include <asm/geode.h> @@ -69,21 +70,15 @@ static struct platform_device alix_buttons_dev = { static struct gpio_led alix_leds[] = { { .name = "alix:1", - .gpio = 6, .default_trigger = "default-on", - .active_low = 1, }, { .name = "alix:2", - .gpio = 25, .default_trigger = "default-off", - .active_low = 1, }, { .name = "alix:3", - .gpio = 27, .default_trigger = "default-off", - .active_low = 1, }, }; @@ -92,6 +87,17 @@ static struct gpio_led_platform_data alix_leds_data = { .leds = alix_leds, }; +static struct gpiod_lookup_table alix_leds_gpio_table = { + .dev_id = "leds-gpio", + .table = { + /* The Geode GPIOs should be on the CS5535 companion chip */ + GPIO_LOOKUP_IDX("cs5535-gpio", 6, NULL, 0, GPIO_ACTIVE_LOW), + GPIO_LOOKUP_IDX("cs5535-gpio", 25, NULL, 1, GPIO_ACTIVE_LOW), + GPIO_LOOKUP_IDX("cs5535-gpio", 27, NULL, 2, GPIO_ACTIVE_LOW), + { } + }, +}; + static struct platform_device alix_leds_dev = { .name = "leds-gpio", .id = -1, @@ -106,6 +112,7 @@ static struct platform_device *alix_devs[] __initdata = { static void __init register_alix(void) { /* Setup LED control through leds-gpio driver */ + gpiod_add_lookup_table(&alix_leds_gpio_table); platform_add_devices(alix_devs, ARRAY_SIZE(alix_devs)); } diff --git a/arch/x86/platform/geode/geos.c b/arch/x86/platform/geode/geos.c index 73a3f49b4eb6..d263528c90bb 100644 --- a/arch/x86/platform/geode/geos.c +++ b/arch/x86/platform/geode/geos.c @@ -20,6 +20,7 @@ #include <linux/platform_device.h> #include <linux/input.h> #include <linux/gpio_keys.h> +#include <linux/gpio/machine.h> #include <linux/dmi.h> #include <asm/geode.h> @@ -53,21 +54,15 @@ static struct platform_device geos_buttons_dev = { static struct gpio_led geos_leds[] = { { .name = "geos:1", - .gpio = 6, .default_trigger = "default-on", - .active_low = 1, }, { .name = "geos:2", - .gpio = 25, .default_trigger = "default-off", - .active_low = 1, }, { .name = "geos:3", - .gpio = 27, .default_trigger = "default-off", - .active_low = 1, }, }; @@ -76,6 +71,17 @@ static struct gpio_led_platform_data geos_leds_data = { .leds = geos_leds, }; +static struct gpiod_lookup_table geos_leds_gpio_table = { + .dev_id = "leds-gpio", + .table = { + /* The Geode GPIOs should be on the CS5535 companion chip */ + GPIO_LOOKUP_IDX("cs5535-gpio", 6, NULL, 0, GPIO_ACTIVE_LOW), + GPIO_LOOKUP_IDX("cs5535-gpio", 25, NULL, 1, GPIO_ACTIVE_LOW), + GPIO_LOOKUP_IDX("cs5535-gpio", 27, NULL, 2, GPIO_ACTIVE_LOW), + { } + }, +}; + static struct platform_device geos_leds_dev = { .name = "leds-gpio", .id = -1, @@ -90,6 +96,7 @@ static struct platform_device *geos_devs[] __initdata = { static void __init register_geos(void) { /* Setup LED control through leds-gpio driver */ + gpiod_add_lookup_table(&geos_leds_gpio_table); platform_add_devices(geos_devs, ARRAY_SIZE(geos_devs)); } diff --git a/arch/x86/platform/geode/net5501.c b/arch/x86/platform/geode/net5501.c index 
163e1b545517..558384acd777 100644 --- a/arch/x86/platform/geode/net5501.c +++ b/arch/x86/platform/geode/net5501.c @@ -20,6 +20,7 @@ #include <linux/platform_device.h> #include <linux/input.h> #include <linux/gpio_keys.h> +#include <linux/gpio/machine.h> #include <asm/geode.h> @@ -55,9 +56,7 @@ static struct platform_device net5501_buttons_dev = { static struct gpio_led net5501_leds[] = { { .name = "net5501:1", - .gpio = 6, .default_trigger = "default-on", - .active_low = 0, }, }; @@ -66,6 +65,15 @@ static struct gpio_led_platform_data net5501_leds_data = { .leds = net5501_leds, }; +static struct gpiod_lookup_table net5501_leds_gpio_table = { + .dev_id = "leds-gpio", + .table = { + /* The Geode GPIOs should be on the CS5535 companion chip */ + GPIO_LOOKUP_IDX("cs5535-gpio", 6, NULL, 0, GPIO_ACTIVE_HIGH), + { } + }, +}; + static struct platform_device net5501_leds_dev = { .name = "leds-gpio", .id = -1, @@ -80,6 +88,7 @@ static struct platform_device *net5501_devs[] __initdata = { static void __init register_net5501(void) { /* Setup LED control through leds-gpio driver */ + gpiod_add_lookup_table(&net5501_leds_gpio_table); platform_add_devices(net5501_devs, ARRAY_SIZE(net5501_devs)); } diff --git a/arch/x86/platform/goldfish/Makefile b/arch/x86/platform/goldfish/Makefile deleted file mode 100644 index 072c395379ac..000000000000 --- a/arch/x86/platform/goldfish/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_GOLDFISH) += goldfish.o diff --git a/arch/x86/platform/goldfish/goldfish.c b/arch/x86/platform/goldfish/goldfish.c deleted file mode 100644 index 6b6f8b4360dd..000000000000 --- a/arch/x86/platform/goldfish/goldfish.c +++ /dev/null @@ -1,54 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2007 Google, Inc. - * Copyright (C) 2011 Intel, Inc. - * Copyright (C) 2013 Intel, Inc. 
- */ - -#include <linux/kernel.h> -#include <linux/irq.h> -#include <linux/platform_device.h> - -/* - * Where in virtual device memory the IO devices (timers, system controllers - * and so on) - */ - -#define GOLDFISH_PDEV_BUS_BASE (0xff001000) -#define GOLDFISH_PDEV_BUS_END (0xff7fffff) -#define GOLDFISH_PDEV_BUS_IRQ (4) - -#define GOLDFISH_TTY_BASE (0x2000) - -static struct resource goldfish_pdev_bus_resources[] = { - { - .start = GOLDFISH_PDEV_BUS_BASE, - .end = GOLDFISH_PDEV_BUS_END, - .flags = IORESOURCE_MEM, - }, - { - .start = GOLDFISH_PDEV_BUS_IRQ, - .end = GOLDFISH_PDEV_BUS_IRQ, - .flags = IORESOURCE_IRQ, - } -}; - -static bool goldfish_enable __initdata; - -static int __init goldfish_setup(char *str) -{ - goldfish_enable = true; - return 0; -} -__setup("goldfish", goldfish_setup); - -static int __init goldfish_init(void) -{ - if (!goldfish_enable) - return -ENODEV; - - platform_device_register_simple("goldfish_pdev_bus", -1, - goldfish_pdev_bus_resources, 2); - return 0; -} -device_initcall(goldfish_init); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c index 31dda18bb370..2930b6e9473e 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_bt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_bt.c @@ -88,8 +88,8 @@ static int __init bt_sfi_init(void) memset(&info, 0, sizeof(info)); info.fwnode = ddata->dev->fwnode; info.parent = ddata->dev; - info.name = ddata->name, - info.id = PLATFORM_DEVID_NONE, + info.name = ddata->name; + info.id = PLATFORM_DEVID_NONE; pdev = platform_device_register_full(&info); if (IS_ERR(pdev)) diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index ce7188cbdae5..1c3a1962cade 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -867,9 +867,11 @@ static int do_reloc32(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, case R_386_PC32: case R_386_PC16: case R_386_PC8: + case R_386_PLT32: /* - * NONE can be ignored and PC relative relocations don't - * need to be adjusted. + * NONE can be ignored and PC relative relocations don't need + * to be adjusted. Because sym must be defined, R_386_PLT32 can + * be treated the same way as R_386_PC32. */ break; @@ -910,9 +912,11 @@ static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, case R_386_PC32: case R_386_PC16: case R_386_PC8: + case R_386_PLT32: /* - * NONE can be ignored and PC relative relocations don't - * need to be adjusted. + * NONE can be ignored and PC relative relocations don't need + * to be adjusted. Because sym must be defined, R_386_PLT32 can + * be treated the same way as R_386_PC32. */ break; diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 9a5a50cdaab5..dc0a337f985b 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -567,10 +567,16 @@ void noist_exc_debug(struct pt_regs *regs); DEFINE_IDTENTRY_RAW(xenpv_exc_nmi) { - /* On Xen PV, NMI doesn't use IST. The C part is the sane as native. */ + /* On Xen PV, NMI doesn't use IST. The C part is the same as native. */ exc_nmi(regs); } +DEFINE_IDTENTRY_RAW_ERRORCODE(xenpv_exc_double_fault) +{ + /* On Xen PV, DF doesn't use IST. The C part is the same as native. 
*/ + exc_double_fault(regs, error_code); +} + DEFINE_IDTENTRY_RAW(xenpv_exc_debug) { /* @@ -590,6 +596,20 @@ DEFINE_IDTENTRY_RAW(exc_xen_unknown_trap) BUG(); } +#ifdef CONFIG_X86_MCE +DEFINE_IDTENTRY_RAW(xenpv_exc_machine_check) +{ + /* + * There's no IST on Xen PV, but we still need to dispatch + * to the correct handler. + */ + if (user_mode(regs)) + noist_exc_machine_check(regs); + else + exc_machine_check(regs); +} +#endif + struct trap_array_entry { void (*orig)(void); void (*xen)(void); @@ -608,9 +628,9 @@ struct trap_array_entry { static struct trap_array_entry trap_array[] = { TRAP_ENTRY_REDIR(exc_debug, true ), - TRAP_ENTRY(exc_double_fault, true ), + TRAP_ENTRY_REDIR(exc_double_fault, true ), #ifdef CONFIG_X86_MCE - TRAP_ENTRY(exc_machine_check, true ), + TRAP_ENTRY_REDIR(exc_machine_check, true ), #endif TRAP_ENTRY_REDIR(exc_nmi, true ), TRAP_ENTRY(exc_int3, false ), @@ -1015,8 +1035,6 @@ void __init xen_setup_vcpu_info_placement(void) */ if (xen_have_vcpu_info_placement) { pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); - pv_ops.irq.restore_fl = - __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); pv_ops.irq.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); pv_ops.irq.irq_enable = @@ -1053,7 +1071,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .read_pmc = xen_read_pmc, .iret = xen_iret, - .usergs_sysret64 = xen_sysret64, .load_tr_desc = paravirt_nop, .set_ldt = xen_set_ldt, @@ -1078,9 +1095,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { #endif .io_delay = xen_io_delay, - /* Xen takes care of %gs when switching to usermode for us */ - .swapgs = paravirt_nop, - .start_context_switch = paravirt_start_context_switch, .end_context_switch = xen_end_context_switch, }; diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 850c93f346c7..dfa091d79c2e 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -42,28 +42,6 @@ asmlinkage __visible unsigned long xen_save_fl(void) } PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl); -__visible void xen_restore_fl(unsigned long flags) -{ - struct vcpu_info *vcpu; - - /* convert from IF type flag */ - flags = !(flags & X86_EFLAGS_IF); - - /* See xen_irq_enable() for why preemption must be disabled. */ - preempt_disable(); - vcpu = this_cpu_read(xen_vcpu); - vcpu->evtchn_upcall_mask = flags; - - if (flags == 0) { - barrier(); /* unmask then check (avoid races) */ - if (unlikely(vcpu->evtchn_upcall_pending)) - xen_force_evtchn_callback(); - preempt_enable(); - } else - preempt_enable_no_resched(); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl); - asmlinkage __visible void xen_irq_disable(void) { /* There's a one instruction preempt window here. We need to @@ -118,7 +96,6 @@ static void xen_halt(void) static const struct pv_irq_ops xen_irq_ops __initconst = { .save_fl = PV_CALLEE_SAVE(xen_save_fl), - .restore_fl = PV_CALLEE_SAVE(xen_restore_fl), .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), .irq_enable = PV_CALLEE_SAVE(xen_irq_enable), diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 53cf8aa35032..02f31341e435 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -72,34 +72,6 @@ SYM_FUNC_START(xen_save_fl_direct) ret SYM_FUNC_END(xen_save_fl_direct) - -/* - * In principle the caller should be passing us a value return from - * xen_save_fl_direct, but for robustness sake we test only the - * X86_EFLAGS_IF flag rather than the whole byte. 
After setting the - * interrupt mask state, it checks for unmasked pending events and - * enters the hypervisor to get them delivered if so. - */ -SYM_FUNC_START(xen_restore_fl_direct) - FRAME_BEGIN - testw $X86_EFLAGS_IF, %di - setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask - /* - * Preempt here doesn't matter because that will deal with any - * pending interrupts. The pending check may end up being run - * on the wrong CPU, but that doesn't hurt. - */ - - /* check for unmasked and pending */ - cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending - jnz 1f - call check_events -1: - FRAME_END - ret -SYM_FUNC_END(xen_restore_fl_direct) - - /* * Force an event check by making a hypercall, but preserve regs * before making the call. @@ -161,7 +133,7 @@ xen_pv_trap asm_exc_overflow xen_pv_trap asm_exc_bounds xen_pv_trap asm_exc_invalid_op xen_pv_trap asm_exc_device_not_available -xen_pv_trap asm_exc_double_fault +xen_pv_trap asm_xenpv_exc_double_fault xen_pv_trap asm_exc_coproc_segment_overrun xen_pv_trap asm_exc_invalid_tss xen_pv_trap asm_exc_segment_not_present @@ -172,7 +144,7 @@ xen_pv_trap asm_exc_spurious_interrupt_bug xen_pv_trap asm_exc_coprocessor_error xen_pv_trap asm_exc_alignment_check #ifdef CONFIG_X86_MCE -xen_pv_trap asm_exc_machine_check +xen_pv_trap asm_xenpv_exc_machine_check #endif /* CONFIG_X86_MCE */ xen_pv_trap asm_exc_simd_coprocessor_error #ifdef CONFIG_IA32_EMULATION @@ -215,26 +187,6 @@ SYM_CODE_START(xen_iret) jmp hypercall_iret SYM_CODE_END(xen_iret) -SYM_CODE_START(xen_sysret64) - /* - * We're already on the usermode stack at this point, but - * still with the kernel gs, so we can easily switch back. - * - * tss.sp2 is scratch space. - */ - movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - - pushq $__USER_DS - pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) - pushq %r11 - pushq $__USER_CS - pushq %rcx - - pushq $VGCF_in_syscall - jmp hypercall_iret -SYM_CODE_END(xen_sysret64) - /* * Xen handles syscall callbacks much like ordinary exceptions, which * means we have: diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9546c3384c75..8d7ec49a35fb 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -131,15 +131,12 @@ static inline void __init xen_efi_init(struct boot_params *boot_params) __visible void xen_irq_enable_direct(void); __visible void xen_irq_disable_direct(void); __visible unsigned long xen_save_fl_direct(void); -__visible void xen_restore_fl_direct(unsigned long); __visible unsigned long xen_read_cr2(void); __visible unsigned long xen_read_cr2_direct(void); /* These are not functions, and cannot be called normally */ __visible void xen_iret(void); -__visible void xen_sysret32(void); -__visible void xen_sysret64(void); extern int xen_panic_handler_init(void); |
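/*
 * [Editorial note, appended after the quoted portion of the patch] The
 * xen_restore_fl()/xen_restore_fl_direct() helpers removed in the xen/irq.c
 * and xen-asm.S hunks above implement one small idea: the Xen PV "interrupt
 * flag" is the inverse of the per-vCPU event-channel mask, and after unmasking
 * the code must re-check for events that arrived while masked and force a
 * callback if any are pending. A standalone model of that logic; the types,
 * names and the missing barrier/preemption handling are simplifications, not
 * the kernel's interfaces:
 */
#include <stdio.h>

#define X86_EFLAGS_IF 0x200UL

struct fake_vcpu_info {
	unsigned char evtchn_upcall_mask;     /* 1 = events masked */
	unsigned char evtchn_upcall_pending;  /* 1 = event arrived while masked */
};

static void deliver_pending_events(void)
{
	/* stands in for xen_force_evtchn_callback() in the removed code */
	puts("hypercall: deliver pending event-channel upcalls");
}

static void restore_fl_model(struct fake_vcpu_info *vcpu, unsigned long flags)
{
	/* EFLAGS.IF set => unmask (mask = 0); IF clear => mask (mask = 1) */
	vcpu->evtchn_upcall_mask = !(flags & X86_EFLAGS_IF);

	/* unmask first, then check, so events raised in between are not lost */
	if (!vcpu->evtchn_upcall_mask && vcpu->evtchn_upcall_pending)
		deliver_pending_events();
}

int main(void)
{
	struct fake_vcpu_info v = { .evtchn_upcall_mask = 1,
				    .evtchn_upcall_pending = 1 };
	restore_fl_model(&v, X86_EFLAGS_IF);  /* "restore" with IF set */
	return 0;
}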