diff options
author | Tony Lindgren <tony@atomide.com> | 2015-09-14 22:42:11 +0200 |
---|---|---|
committer | Tony Lindgren <tony@atomide.com> | 2015-09-14 22:42:11 +0200 |
commit | 7168e947291f0ead07e5638b4599fb7845288b69 (patch) | |
tree | b39cdf02f6d2282eeb91696d4a1c7ddb6afb0e38 /arch/x86 | |
parent | ARM: omap2plus_defconfig: Enable MUSB DMA support (diff) | |
parent | ARM: dts: Fixup model name for HP t410 dts (diff) | |
download | linux-7168e947291f0ead07e5638b4599fb7845288b69.tar.xz linux-7168e947291f0ead07e5638b4599fb7845288b69.zip |
Merge branch 'fixes-rc1' into omap-for-v4.3/fixes
Diffstat (limited to 'arch/x86')
82 files changed, 2439 insertions, 1061 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 48f7433dac6f..7aef2d52daa0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -27,7 +27,8 @@ config X86 select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_PMEM_API + select ARCH_HAS_PMEM_API if X86_64 + select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_SG_CHAIN select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI @@ -41,6 +42,7 @@ config X86 select ARCH_USE_CMPXCHG_LOCKREF if X86_64 select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS + select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_IPC_PARSE_VERSION if X86_32 @@ -1449,10 +1451,14 @@ config ILLEGAL_POINTER_VALUE source "mm/Kconfig" +config X86_PMEM_LEGACY_DEVICE + bool + config X86_PMEM_LEGACY - bool "Support non-standard NVDIMMs and ADR protected memory" + tristate "Support non-standard NVDIMMs and ADR protected memory" depends on PHYS_ADDR_T_64BIT depends on BLK_DEV + select X86_PMEM_LEGACY_DEVICE select LIBNVDIMM help Treat memory marked using the non-standard e820 type of 12 as used @@ -1748,6 +1754,7 @@ source kernel/Kconfig.hz config KEXEC bool "kexec system call" + select KEXEC_CORE ---help--- kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot @@ -1764,8 +1771,8 @@ config KEXEC config KEXEC_FILE bool "kexec file based system call" + select KEXEC_CORE select BUILD_BIN2C - depends on KEXEC depends on X86_64 depends on CRYPTO=y depends on CRYPTO_SHA256=y diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index f63797942bb5..79dac1758e7c 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -448,7 +448,8 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, #endif debug_putstr("\nDecompressing Linux... "); - decompress(input_data, input_len, NULL, NULL, output, NULL, error); + __decompress(input_data, input_len, NULL, NULL, output, output_len, + NULL, error); parse_elf(output); /* * 32-bit always performs relocations. 64-bit relocations are only diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 16ef02596db2..2d6b309c8e9a 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -414,7 +414,7 @@ xloadflags: # define XLF23 0 #endif -#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC) +#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC_CORE) # define XLF4 XLF_EFI_KEXEC #else # define XLF4 0 diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 962297d244b3..cb5b3ab5beec 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -208,7 +208,6 @@ CONFIG_AGP_AMD64=y CONFIG_AGP_INTEL=y CONFIG_DRM=y CONFIG_DRM_I915=y -CONFIG_DRM_I915_KMS=y CONFIG_FB_MODE_HELPERS=y CONFIG_FB_TILEBLITTING=y CONFIG_FB_EFI=y diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 64d7cf1b50e1..440df0c7a2ee 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -294,6 +294,7 @@ static struct ahash_alg ghash_async_alg = { .cra_name = "ghash", .cra_driver_name = "ghash-clmulni", .cra_priority = 400, + .cra_ctxsize = sizeof(struct ghash_async_ctx), .cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC, .cra_blocksize = GHASH_BLOCK_SIZE, .cra_type = &crypto_ahash_type, diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 25e3cf1cd8fd..7663c455b9f6 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -380,3 +380,5 @@ 371 i386 recvfrom sys_recvfrom compat_sys_recvfrom 372 i386 recvmsg sys_recvmsg compat_sys_recvmsg 373 i386 shutdown sys_shutdown +374 i386 userfaultfd sys_userfaultfd +375 i386 membarrier sys_membarrier diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 9ef32d5f1b19..278842fdf1f6 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -329,6 +329,8 @@ 320 common kexec_file_load sys_kexec_file_load 321 common bpf sys_bpf 322 64 execveat stub_execveat +323 common userfaultfd sys_userfaultfd +324 common membarrier sys_membarrier # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 26a46f44e298..b160c0c6baed 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -277,7 +277,7 @@ static const char *gate_vma_name(struct vm_area_struct *vma) { return "[vsyscall]"; } -static struct vm_operations_struct gate_vma_ops = { +static const struct vm_operations_struct gate_vma_ops = { .name = gate_vma_name, }; static struct vm_area_struct gate_vma = { diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index e9168955c42f..fb52aa644aab 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -182,6 +182,21 @@ static inline int atomic_xchg(atomic_t *v, int new) return xchg(&v->counter, new); } +#define ATOMIC_OP(op) \ +static inline void atomic_##op(int i, atomic_t *v) \ +{ \ + asm volatile(LOCK_PREFIX #op"l %1,%0" \ + : "+m" (v->counter) \ + : "ir" (i) \ + : "memory"); \ +} + +ATOMIC_OP(and) +ATOMIC_OP(or) +ATOMIC_OP(xor) + +#undef ATOMIC_OP + /** * __atomic_add_unless - add unless the number is already a given value * @v: pointer of type atomic_t @@ -219,16 +234,6 @@ static __always_inline short int atomic_inc_short(short int *v) return *v; } -/* These are x86-specific, used by some header files */ -#define atomic_clear_mask(mask, addr) \ - asm volatile(LOCK_PREFIX "andl %0,%1" \ - : : "r" (~(mask)), "m" (*(addr)) : "memory") - -#define atomic_set_mask(mask, addr) \ - asm volatile(LOCK_PREFIX "orl %0,%1" \ - : : "r" ((unsigned)(mask)), "m" (*(addr)) \ - : "memory") - #ifdef CONFIG_X86_32 # include <asm/atomic64_32.h> #else diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index b154de75c90c..a11c30b77fb5 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -313,4 +313,18 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v) #undef alternative_atomic64 #undef __alternative_atomic64 +#define ATOMIC64_OP(op, c_op) \ +static inline void atomic64_##op(long long i, atomic64_t *v) \ +{ \ + long long old, c = 0; \ + while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ + c = old; \ +} + +ATOMIC64_OP(and, &) +ATOMIC64_OP(or, |) +ATOMIC64_OP(xor, ^) + +#undef ATOMIC64_OP + #endif /* _ASM_X86_ATOMIC64_32_H */ diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index b965f9e03f2a..50e33eff58de 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -220,4 +220,19 @@ static inline long atomic64_dec_if_positive(atomic64_t *v) return dec; } +#define ATOMIC64_OP(op) \ +static inline void atomic64_##op(long i, atomic64_t *v) \ +{ \ + asm volatile(LOCK_PREFIX #op"q %1,%0" \ + : "+m" (v->counter) \ + : "er" (i) \ + : "memory"); \ +} + +ATOMIC64_OP(and) +ATOMIC64_OP(or) +ATOMIC64_OP(xor) + +#undef ATOMIC64_OP + #endif /* _ASM_X86_ATOMIC64_64_H */ diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 818cb8788225..0681d2532527 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -57,12 +57,12 @@ do { \ compiletime_assert_atomic_type(*p); \ smp_mb(); \ - ACCESS_ONCE(*p) = (v); \ + WRITE_ONCE(*p, v); \ } while (0) #define smp_load_acquire(p) \ ({ \ - typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + typeof(*p) ___p1 = READ_ONCE(*p); \ compiletime_assert_atomic_type(*p); \ smp_mb(); \ ___p1; \ @@ -74,12 +74,12 @@ do { \ do { \ compiletime_assert_atomic_type(*p); \ barrier(); \ - ACCESS_ONCE(*p) = (v); \ + WRITE_ONCE(*p, v); \ } while (0) #define smp_load_acquire(p) \ ({ \ - typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + typeof(*p) ___p1 = READ_ONCE(*p); \ compiletime_assert_atomic_type(*p); \ barrier(); \ ___p1; \ diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 9bf3ea14b9f0..e63aa38e85fb 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -89,6 +89,8 @@ int set_pages_rw(struct page *page, int numpages); void clflush_cache_range(void *addr, unsigned int size); +#define mmio_flush_range(addr, size) clflush_cache_range(addr, size) + #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void); extern const int rodata_test_data; @@ -109,75 +111,4 @@ static inline int rodata_test(void) } #endif -#ifdef ARCH_HAS_NOCACHE_UACCESS - -/** - * arch_memcpy_to_pmem - copy data to persistent memory - * @dst: destination buffer for the copy - * @src: source buffer for the copy - * @n: length of the copy in bytes - * - * Copy data to persistent memory media via non-temporal stores so that - * a subsequent arch_wmb_pmem() can flush cpu and memory controller - * write buffers to guarantee durability. - */ -static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, - size_t n) -{ - int unwritten; - - /* - * We are copying between two kernel buffers, if - * __copy_from_user_inatomic_nocache() returns an error (page - * fault) we would have already reported a general protection fault - * before the WARN+BUG. - */ - unwritten = __copy_from_user_inatomic_nocache((void __force *) dst, - (void __user *) src, n); - if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n", - __func__, dst, src, unwritten)) - BUG(); -} - -/** - * arch_wmb_pmem - synchronize writes to persistent memory - * - * After a series of arch_memcpy_to_pmem() operations this drains data - * from cpu write buffers and any platform (memory controller) buffers - * to ensure that written data is durable on persistent memory media. - */ -static inline void arch_wmb_pmem(void) -{ - /* - * wmb() to 'sfence' all previous writes such that they are - * architecturally visible to 'pcommit'. Note, that we've - * already arranged for pmem writes to avoid the cache via - * arch_memcpy_to_pmem(). - */ - wmb(); - pcommit_sfence(); -} - -static inline bool __arch_has_wmb_pmem(void) -{ -#ifdef CONFIG_X86_64 - /* - * We require that wmb() be an 'sfence', that is only guaranteed on - * 64-bit builds - */ - return static_cpu_has(X86_FEATURE_PCOMMIT); -#else - return false; -#endif -} -#else /* ARCH_HAS_NOCACHE_UACCESS i.e. ARCH=um */ -extern void arch_memcpy_to_pmem(void __pmem *dst, const void *src, size_t n); -extern void arch_wmb_pmem(void); - -static inline bool __arch_has_wmb_pmem(void) -{ - return false; -} -#endif - #endif /* _ASM_X86_CACHEFLUSH_H */ diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 1f5b7287d1ad..953b7263f844 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -12,7 +12,6 @@ #include <linux/dma-attrs.h> #include <asm/io.h> #include <asm/swiotlb.h> -#include <asm-generic/dma-coherent.h> #include <linux/dma-contiguous.h> #ifdef CONFIG_ISA @@ -41,24 +40,13 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) #endif } -#include <asm-generic/dma-mapping-common.h> - -/* Make sure we keep the same behaviour */ -static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - debug_dma_mapping_error(dev, dma_addr); - if (ops->mapping_error) - return ops->mapping_error(dev, dma_addr); - - return (dma_addr == DMA_ERROR_CODE); -} - -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) +bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp); +#define arch_dma_alloc_attrs arch_dma_alloc_attrs +#define HAVE_ARCH_DMA_SUPPORTED 1 extern int dma_supported(struct device *hwdev, u64 mask); -extern int dma_set_mask(struct device *dev, u64 mask); + +#include <asm-generic/dma-mapping-common.h> extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag, @@ -125,16 +113,4 @@ static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp) return gfp; } -#define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL) - -void * -dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, struct dma_attrs *attrs); - -#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL) - -void dma_free_attrs(struct device *dev, size_t size, - void *vaddr, dma_addr_t bus, - struct dma_attrs *attrs); - #endif diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index f45acad3c4b6..24938852db30 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -3,9 +3,9 @@ #ifdef CONFIG_FUNCTION_TRACER #ifdef CC_USING_FENTRY -# define MCOUNT_ADDR ((long)(__fentry__)) +# define MCOUNT_ADDR ((unsigned long)(__fentry__)) #else -# define MCOUNT_ADDR ((long)(mcount)) +# define MCOUNT_ADDR ((unsigned long)(mcount)) #endif #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 6615032e19c8..1e3408e88604 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -182,10 +182,10 @@ extern char irq_entries_start[]; #define trace_irq_entries_start irq_entries_start #endif -#define VECTOR_UNDEFINED (-1) -#define VECTOR_RETRIGGERED (-2) +#define VECTOR_UNUSED NULL +#define VECTOR_RETRIGGERED ((void *)~0UL) -typedef int vector_irq_t[NR_VECTORS]; +typedef struct irq_desc* vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); #endif /* !ASSEMBLY_ */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 7cfc085b6879..de25aad07853 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -250,12 +250,6 @@ static inline void flush_write_buffers(void) #endif } -static inline void __pmem *arch_memremap_pmem(resource_size_t offset, - unsigned long size) -{ - return (void __force __pmem *) ioremap_cache(offset, size); -} - #endif /* __KERNEL__ */ extern void native_io_delay(void); diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 8008d06581c7..881b4768644a 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -36,7 +36,9 @@ extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)); extern void (*x86_platform_ipi_callback)(void); extern void native_init_IRQ(void); -extern bool handle_irq(unsigned irq, struct pt_regs *regs); + +struct irq_desc; +extern bool handle_irq(struct irq_desc *desc, struct pt_regs *regs); extern __visible unsigned int do_IRQ(struct pt_regs *regs); diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index a4c1cf7e93f8..5daeca3d0f9e 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -16,15 +16,32 @@ # define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC #endif -static __always_inline bool arch_static_branch(struct static_key *key) +static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { asm_volatile_goto("1:" ".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t" ".pushsection __jump_table, \"aw\" \n\t" _ASM_ALIGN "\n\t" - _ASM_PTR "1b, %l[l_yes], %c0 \n\t" + _ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t" ".popsection \n\t" - : : "i" (key) : : l_yes); + : : "i" (key), "i" (branch) : : l_yes); + + return false; +l_yes: + return true; +} + +static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) +{ + asm_volatile_goto("1:" + ".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t" + "2:\n\t" + ".pushsection __jump_table, \"aw\" \n\t" + _ASM_ALIGN "\n\t" + _ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t" + ".popsection \n\t" + : : "i" (key), "i" (branch) : : l_yes); + return false; l_yes: return true; diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index 32ce71375b21..b130d59406fb 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -29,7 +29,7 @@ extern void show_trace(struct task_struct *t, struct pt_regs *regs, extern void __show_regs(struct pt_regs *regs, int all); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE extern int in_crash_kexec; #else /* no crash dump is ever in progress if no crash kernel can be kexec'd */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index fcd17c1fc0c6..c1c0a1c14344 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -184,6 +184,12 @@ #define MSR_PP1_ENERGY_STATUS 0x00000641 #define MSR_PP1_POLICY 0x00000642 +#define MSR_CONFIG_TDP_NOMINAL 0x00000648 +#define MSR_CONFIG_TDP_LEVEL_1 0x00000649 +#define MSR_CONFIG_TDP_LEVEL_2 0x0000064A +#define MSR_CONFIG_TDP_CONTROL 0x0000064B +#define MSR_TURBO_ACTIVATION_RATIO 0x0000064C + #define MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658 #define MSR_PKG_ANY_CORE_C0_RES 0x00000659 #define MSR_PKG_ANY_GFXE_C0_RES 0x0000065A diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h new file mode 100644 index 000000000000..d8ce3ec816ab --- /dev/null +++ b/arch/x86/include/asm/pmem.h @@ -0,0 +1,153 @@ +/* + * Copyright(c) 2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __ASM_X86_PMEM_H__ +#define __ASM_X86_PMEM_H__ + +#include <linux/uaccess.h> +#include <asm/cacheflush.h> +#include <asm/cpufeature.h> +#include <asm/special_insns.h> + +#ifdef CONFIG_ARCH_HAS_PMEM_API +/** + * arch_memcpy_to_pmem - copy data to persistent memory + * @dst: destination buffer for the copy + * @src: source buffer for the copy + * @n: length of the copy in bytes + * + * Copy data to persistent memory media via non-temporal stores so that + * a subsequent arch_wmb_pmem() can flush cpu and memory controller + * write buffers to guarantee durability. + */ +static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, + size_t n) +{ + int unwritten; + + /* + * We are copying between two kernel buffers, if + * __copy_from_user_inatomic_nocache() returns an error (page + * fault) we would have already reported a general protection fault + * before the WARN+BUG. + */ + unwritten = __copy_from_user_inatomic_nocache((void __force *) dst, + (void __user *) src, n); + if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n", + __func__, dst, src, unwritten)) + BUG(); +} + +/** + * arch_wmb_pmem - synchronize writes to persistent memory + * + * After a series of arch_memcpy_to_pmem() operations this drains data + * from cpu write buffers and any platform (memory controller) buffers + * to ensure that written data is durable on persistent memory media. + */ +static inline void arch_wmb_pmem(void) +{ + /* + * wmb() to 'sfence' all previous writes such that they are + * architecturally visible to 'pcommit'. Note, that we've + * already arranged for pmem writes to avoid the cache via + * arch_memcpy_to_pmem(). + */ + wmb(); + pcommit_sfence(); +} + +/** + * __arch_wb_cache_pmem - write back a cache range with CLWB + * @vaddr: virtual start address + * @size: number of bytes to write back + * + * Write back a cache range using the CLWB (cache line write back) + * instruction. This function requires explicit ordering with an + * arch_wmb_pmem() call. This API is internal to the x86 PMEM implementation. + */ +static inline void __arch_wb_cache_pmem(void *vaddr, size_t size) +{ + u16 x86_clflush_size = boot_cpu_data.x86_clflush_size; + unsigned long clflush_mask = x86_clflush_size - 1; + void *vend = vaddr + size; + void *p; + + for (p = (void *)((unsigned long)vaddr & ~clflush_mask); + p < vend; p += x86_clflush_size) + clwb(p); +} + +/* + * copy_from_iter_nocache() on x86 only uses non-temporal stores for iovec + * iterators, so for other types (bvec & kvec) we must do a cache write-back. + */ +static inline bool __iter_needs_pmem_wb(struct iov_iter *i) +{ + return iter_is_iovec(i) == false; +} + +/** + * arch_copy_from_iter_pmem - copy data from an iterator to PMEM + * @addr: PMEM destination address + * @bytes: number of bytes to copy + * @i: iterator with source data + * + * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'. + * This function requires explicit ordering with an arch_wmb_pmem() call. + */ +static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes, + struct iov_iter *i) +{ + void *vaddr = (void __force *)addr; + size_t len; + + /* TODO: skip the write-back by always using non-temporal stores */ + len = copy_from_iter_nocache(vaddr, bytes, i); + + if (__iter_needs_pmem_wb(i)) + __arch_wb_cache_pmem(vaddr, bytes); + + return len; +} + +/** + * arch_clear_pmem - zero a PMEM memory range + * @addr: virtual start address + * @size: number of bytes to zero + * + * Write zeros into the memory range starting at 'addr' for 'size' bytes. + * This function requires explicit ordering with an arch_wmb_pmem() call. + */ +static inline void arch_clear_pmem(void __pmem *addr, size_t size) +{ + void *vaddr = (void __force *)addr; + + /* TODO: implement the zeroing via non-temporal writes */ + if (size == PAGE_SIZE && ((unsigned long)vaddr & ~PAGE_MASK) == 0) + clear_page(vaddr); + else + memset(vaddr, 0, size); + + __arch_wb_cache_pmem(vaddr, size); +} + +static inline bool __arch_has_wmb_pmem(void) +{ + /* + * We require that wmb() be an 'sfence', that is only guaranteed on + * 64-bit builds + */ + return static_cpu_has(X86_FEATURE_PCOMMIT); +} +#endif /* CONFIG_ARCH_HAS_PMEM_API */ +#endif /* __ASM_X86_PMEM_H__ */ diff --git a/arch/x86/include/asm/qrwlock.h b/arch/x86/include/asm/qrwlock.h index ae0e241e228b..c537cbb038a7 100644 --- a/arch/x86/include/asm/qrwlock.h +++ b/arch/x86/include/asm/qrwlock.h @@ -2,16 +2,6 @@ #define _ASM_X86_QRWLOCK_H #include <asm-generic/qrwlock_types.h> - -#ifndef CONFIG_X86_PPRO_FENCE -#define queue_write_unlock queue_write_unlock -static inline void queue_write_unlock(struct qrwlock *lock) -{ - barrier(); - ACCESS_ONCE(*(u8 *)&lock->cnts) = 0; -} -#endif - #include <asm-generic/qrwlock.h> #endif /* _ASM_X86_QRWLOCK_H */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index cd791948b286..6df2029405a3 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -261,6 +261,12 @@ static inline void reset_lazy_tlbstate(void) #endif /* SMP */ +/* Not inlined due to inc_irq_stat not being defined yet */ +#define flush_tlb_local() { \ + inc_irq_stat(irq_tlb_count); \ + local_flush_tlb(); \ +} + #ifndef CONFIG_PARAVIRT #define flush_tlb_others(mask, mm, start, end) \ native_flush_tlb_others(mask, mm, start, end) diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h index 608a79d5a466..e6911caf5bbf 100644 --- a/arch/x86/include/asm/xen/events.h +++ b/arch/x86/include/asm/xen/events.h @@ -20,4 +20,15 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) /* No need for a barrier -- XCHG is a barrier on x86. */ #define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) +extern int xen_have_vector_callback; + +/* + * Events delivered via platform PCI interrupts are always + * routed to vcpu 0 and hence cannot be rebound. + */ +static inline bool xen_support_evtchn_rebind(void) +{ + return (!xen_hvm_domain() || xen_have_vector_callback); +} + #endif /* _ASM_X86_XEN_EVENTS_H */ diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index ca08a27b90b3..83aea8055119 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -465,6 +465,12 @@ HYPERVISOR_tmem_op( return _hypercall1(int, tmem_op, op); } +static inline int +HYPERVISOR_xenpmu_op(unsigned int op, void *arg) +{ + return _hypercall2(int, xenpmu_op, op, arg); +} + static inline void MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) { diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index 3400dbaec3c3..62ca03ef5c65 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -3,12 +3,38 @@ * * Guest OS interface to x86 Xen. * - * Copyright (c) 2004, K A Fraser + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2004-2006, K A Fraser */ #ifndef _ASM_X86_XEN_INTERFACE_H #define _ASM_X86_XEN_INTERFACE_H +/* + * XEN_GUEST_HANDLE represents a guest pointer, when passed as a field + * in a struct in memory. + * XEN_GUEST_HANDLE_PARAM represent a guest pointer, when passed as an + * hypercall argument. + * XEN_GUEST_HANDLE_PARAM and XEN_GUEST_HANDLE are the same on X86 but + * they might not be on other architectures. + */ #ifdef __XEN__ #define __DEFINE_GUEST_HANDLE(name, type) \ typedef struct { type *p; } __guest_handle_ ## name @@ -88,13 +114,16 @@ DEFINE_GUEST_HANDLE(xen_ulong_t); * start of the GDT because some stupid OSes export hard-coded selector values * in their ABI. These hard-coded values are always near the start of the GDT, * so Xen places itself out of the way, at the far end of the GDT. + * + * NB The LDT is set using the MMUEXT_SET_LDT op of HYPERVISOR_mmuext_op */ #define FIRST_RESERVED_GDT_PAGE 14 #define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096) #define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8) /* - * Send an array of these to HYPERVISOR_set_trap_table() + * Send an array of these to HYPERVISOR_set_trap_table(). + * Terminate the array with a sentinel entry, with traps[].address==0. * The privilege level specifies which modes may enter a trap via a software * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate * privilege levels as follows: @@ -118,10 +147,41 @@ struct trap_info { DEFINE_GUEST_HANDLE_STRUCT(trap_info); struct arch_shared_info { - unsigned long max_pfn; /* max pfn that appears in table */ - /* Frame containing list of mfns containing list of mfns containing p2m. */ - unsigned long pfn_to_mfn_frame_list_list; - unsigned long nmi_reason; + /* + * Number of valid entries in the p2m table(s) anchored at + * pfn_to_mfn_frame_list_list and/or p2m_vaddr. + */ + unsigned long max_pfn; + /* + * Frame containing list of mfns containing list of mfns containing p2m. + * A value of 0 indicates it has not yet been set up, ~0 indicates it + * has been set to invalid e.g. due to the p2m being too large for the + * 3-level p2m tree. In this case the linear mapper p2m list anchored + * at p2m_vaddr is to be used. + */ + xen_pfn_t pfn_to_mfn_frame_list_list; + unsigned long nmi_reason; + /* + * Following three fields are valid if p2m_cr3 contains a value + * different from 0. + * p2m_cr3 is the root of the address space where p2m_vaddr is valid. + * p2m_cr3 is in the same format as a cr3 value in the vcpu register + * state and holds the folded machine frame number (via xen_pfn_to_cr3) + * of a L3 or L4 page table. + * p2m_vaddr holds the virtual address of the linear p2m list. All + * entries in the range [0...max_pfn[ are accessible via this pointer. + * p2m_generation will be incremented by the guest before and after each + * change of the mappings of the p2m list. p2m_generation starts at 0 + * and a value with the least significant bit set indicates that a + * mapping update is in progress. This allows guest external software + * (e.g. in Dom0) to verify that read mappings are consistent and + * whether they have changed since the last check. + * Modifying a p2m element in the linear p2m list is allowed via an + * atomic write only. + */ + unsigned long p2m_cr3; /* cr3 value of the p2m address space */ + unsigned long p2m_vaddr; /* virtual address of the p2m list */ + unsigned long p2m_generation; /* generation count of p2m mapping */ }; #endif /* !__ASSEMBLY__ */ @@ -137,13 +197,31 @@ struct arch_shared_info { /* * The following is all CPU context. Note that the fpu_ctxt block is filled * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used. + * + * Also note that when calling DOMCTL_setvcpucontext and VCPU_initialise + * for HVM and PVH guests, not all information in this structure is updated: + * + * - For HVM guests, the structures read include: fpu_ctxt (if + * VGCT_I387_VALID is set), flags, user_regs, debugreg[*] + * + * - PVH guests are the same as HVM guests, but additionally use ctrlreg[3] to + * set cr3. All other fields not used should be set to 0. */ struct vcpu_guest_context { /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */ struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */ -#define VGCF_I387_VALID (1<<0) -#define VGCF_HVM_GUEST (1<<1) -#define VGCF_IN_KERNEL (1<<2) +#define VGCF_I387_VALID (1<<0) +#define VGCF_IN_KERNEL (1<<2) +#define _VGCF_i387_valid 0 +#define VGCF_i387_valid (1<<_VGCF_i387_valid) +#define _VGCF_in_kernel 2 +#define VGCF_in_kernel (1<<_VGCF_in_kernel) +#define _VGCF_failsafe_disables_events 3 +#define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events) +#define _VGCF_syscall_disables_events 4 +#define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events) +#define _VGCF_online 5 +#define VGCF_online (1<<_VGCF_online) unsigned long flags; /* VGCF_* flags */ struct cpu_user_regs user_regs; /* User-level CPU registers */ struct trap_info trap_ctxt[256]; /* Virtual IDT */ @@ -172,6 +250,129 @@ struct vcpu_guest_context { #endif }; DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context); + +/* AMD PMU registers and structures */ +struct xen_pmu_amd_ctxt { + /* + * Offsets to counter and control MSRs (relative to xen_pmu_arch.c.amd). + * For PV(H) guests these fields are RO. + */ + uint32_t counters; + uint32_t ctrls; + + /* Counter MSRs */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + uint64_t regs[]; +#elif defined(__GNUC__) + uint64_t regs[0]; +#endif +}; + +/* Intel PMU registers and structures */ +struct xen_pmu_cntr_pair { + uint64_t counter; + uint64_t control; +}; + +struct xen_pmu_intel_ctxt { + /* + * Offsets to fixed and architectural counter MSRs (relative to + * xen_pmu_arch.c.intel). + * For PV(H) guests these fields are RO. + */ + uint32_t fixed_counters; + uint32_t arch_counters; + + /* PMU registers */ + uint64_t global_ctrl; + uint64_t global_ovf_ctrl; + uint64_t global_status; + uint64_t fixed_ctrl; + uint64_t ds_area; + uint64_t pebs_enable; + uint64_t debugctl; + + /* Fixed and architectural counter MSRs */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + uint64_t regs[]; +#elif defined(__GNUC__) + uint64_t regs[0]; +#endif +}; + +/* Sampled domain's registers */ +struct xen_pmu_regs { + uint64_t ip; + uint64_t sp; + uint64_t flags; + uint16_t cs; + uint16_t ss; + uint8_t cpl; + uint8_t pad[3]; +}; + +/* PMU flags */ +#define PMU_CACHED (1<<0) /* PMU MSRs are cached in the context */ +#define PMU_SAMPLE_USER (1<<1) /* Sample is from user or kernel mode */ +#define PMU_SAMPLE_REAL (1<<2) /* Sample is from realmode */ +#define PMU_SAMPLE_PV (1<<3) /* Sample from a PV guest */ + +/* + * Architecture-specific information describing state of the processor at + * the time of PMU interrupt. + * Fields of this structure marked as RW for guest should only be written by + * the guest when PMU_CACHED bit in pmu_flags is set (which is done by the + * hypervisor during PMU interrupt). Hypervisor will read updated data in + * XENPMU_flush hypercall and clear PMU_CACHED bit. + */ +struct xen_pmu_arch { + union { + /* + * Processor's registers at the time of interrupt. + * WO for hypervisor, RO for guests. + */ + struct xen_pmu_regs regs; + /* + * Padding for adding new registers to xen_pmu_regs in + * the future + */ +#define XENPMU_REGS_PAD_SZ 64 + uint8_t pad[XENPMU_REGS_PAD_SZ]; + } r; + + /* WO for hypervisor, RO for guest */ + uint64_t pmu_flags; + + /* + * APIC LVTPC register. + * RW for both hypervisor and guest. + * Only APIC_LVT_MASKED bit is loaded by the hypervisor into hardware + * during XENPMU_flush or XENPMU_lvtpc_set. + */ + union { + uint32_t lapic_lvtpc; + uint64_t pad; + } l; + + /* + * Vendor-specific PMU registers. + * RW for both hypervisor and guest (see exceptions above). + * Guest's updates to this field are verified and then loaded by the + * hypervisor into hardware during XENPMU_flush + */ + union { + struct xen_pmu_amd_ctxt amd; + struct xen_pmu_intel_ctxt intel; + + /* + * Padding for contexts (fixed parts only, does not include + * MSR banks that are specified by offsets) + */ +#define XENPMU_CTXT_PAD_SZ 128 + uint8_t pad[XENPMU_CTXT_PAD_SZ]; + } c; +}; + #endif /* !__ASSEMBLY__ */ /* diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index c44a5d53e464..0679e11d2cf7 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -35,9 +35,7 @@ typedef struct xpaddr { #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) #define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT) -/* Maximum amount of memory we can handle in a domain in pages */ -#define MAX_DOMAIN_PAGES \ - ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) +#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) extern unsigned long *machine_to_phys_mapping; extern unsigned long machine_to_phys_nr; @@ -48,8 +46,8 @@ extern unsigned long xen_max_p2m_pfn; extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); -extern unsigned long set_phys_range_identity(unsigned long pfn_s, - unsigned long pfn_e); +extern unsigned long __init set_phys_range_identity(unsigned long pfn_s, + unsigned long pfn_e); extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, @@ -103,6 +101,11 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn) { unsigned long mfn; + /* + * Some x86 code are still using pfn_to_mfn instead of + * pfn_to_mfn. This will have to be removed when we figured + * out which call. + */ if (xen_feature(XENFEAT_auto_translated_physmap)) return pfn; @@ -149,6 +152,11 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) { unsigned long pfn; + /* + * Some x86 code are still using mfn_to_pfn instead of + * gfn_to_pfn. This will have to be removed when we figure + * out which call. + */ if (xen_feature(XENFEAT_auto_translated_physmap)) return mfn; @@ -178,6 +186,27 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine) return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset); } +/* Pseudo-physical <-> Guest conversion */ +static inline unsigned long pfn_to_gfn(unsigned long pfn) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return pfn; + else + return pfn_to_mfn(pfn); +} + +static inline unsigned long gfn_to_pfn(unsigned long gfn) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return gfn; + else + return mfn_to_pfn(gfn); +} + +/* Pseudo-physical <-> Bus conversion */ +#define pfn_to_bfn(pfn) pfn_to_gfn(pfn) +#define bfn_to_pfn(bfn) gfn_to_pfn(bfn) + /* * We detect special mappings in one of two ways: * 1. If the MFN is an I/O page then Xen will set the m2p entry @@ -198,7 +227,7 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine) * require. In all the cases we care about, the FOREIGN_FRAME bit is * masked (e.g., pfn_to_mfn()) so behaviour there is correct. */ -static inline unsigned long mfn_to_local_pfn(unsigned long mfn) +static inline unsigned long bfn_to_local_pfn(unsigned long mfn) { unsigned long pfn; @@ -217,6 +246,10 @@ static inline unsigned long mfn_to_local_pfn(unsigned long mfn) #define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v))) #define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) +/* VIRT <-> GUEST conversion */ +#define virt_to_gfn(v) (pfn_to_gfn(virt_to_pfn(v))) +#define gfn_to_virt(g) (__va(gfn_to_pfn(g) << PAGE_SHIFT)) + static inline unsigned long pte_mfn(pte_t pte) { return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT; @@ -264,7 +297,7 @@ void make_lowmem_page_readwrite(void *vaddr); static inline bool xen_arch_need_swiotlb(struct device *dev, unsigned long pfn, - unsigned long mfn) + unsigned long bfn) { return false; } diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h index 0f457e6eab18..9dafe59cf6e2 100644 --- a/arch/x86/include/uapi/asm/e820.h +++ b/arch/x86/include/uapi/asm/e820.h @@ -37,7 +37,7 @@ /* * This is a non-standardized way to represent ADR or NVDIMM regions that * persist over a reboot. The kernel will ignore their special capabilities - * unless the CONFIG_X86_PMEM_LEGACY=y option is set. + * unless the CONFIG_X86_PMEM_LEGACY option is set. * * ( Note that older platforms also used 6 for the same type of memory, * but newer versions switched to 12 as 6 was assigned differently. Some diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 3c3622176340..b1b78ffe01d0 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -71,8 +71,8 @@ obj-$(CONFIG_LIVEPATCH) += livepatch.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o -obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o -obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o +obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o +obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-y += kprobes/ @@ -94,7 +94,7 @@ obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o -obj-$(CONFIG_X86_PMEM_LEGACY) += pmem.o +obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 75e8bad53798..ded848c20e05 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -445,6 +445,7 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK; mp_override_legacy_irq(bus_irq, polarity, trigger, gsi); + acpi_penalize_sci_irq(bus_irq, trigger, polarity); /* * stash over-ride to indicate we've been here diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 5aba9220a5ac..3ca3e46aa405 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -462,40 +462,40 @@ static int lapic_next_deadline(unsigned long delta, return 0; } -/* - * Setup the lapic timer in periodic or oneshot mode - */ -static void lapic_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt) +static int lapic_timer_shutdown(struct clock_event_device *evt) { - unsigned long flags; unsigned int v; /* Lapic used as dummy for broadcast ? */ if (evt->features & CLOCK_EVT_FEAT_DUMMY) - return; + return 0; - local_irq_save(flags); + v = apic_read(APIC_LVTT); + v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write(APIC_LVTT, v); + apic_write(APIC_TMICT, 0); + return 0; +} - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - case CLOCK_EVT_MODE_ONESHOT: - __setup_APIC_LVTT(lapic_timer_frequency, - mode != CLOCK_EVT_MODE_PERIODIC, 1); - break; - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - v = apic_read(APIC_LVTT); - v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write(APIC_LVTT, v); - apic_write(APIC_TMICT, 0); - break; - case CLOCK_EVT_MODE_RESUME: - /* Nothing to do here */ - break; - } +static inline int +lapic_timer_set_periodic_oneshot(struct clock_event_device *evt, bool oneshot) +{ + /* Lapic used as dummy for broadcast ? */ + if (evt->features & CLOCK_EVT_FEAT_DUMMY) + return 0; - local_irq_restore(flags); + __setup_APIC_LVTT(lapic_timer_frequency, oneshot, 1); + return 0; +} + +static int lapic_timer_set_periodic(struct clock_event_device *evt) +{ + return lapic_timer_set_periodic_oneshot(evt, false); +} + +static int lapic_timer_set_oneshot(struct clock_event_device *evt) +{ + return lapic_timer_set_periodic_oneshot(evt, true); } /* @@ -513,15 +513,18 @@ static void lapic_timer_broadcast(const struct cpumask *mask) * The local apic timer can be used for any function which is CPU local. */ static struct clock_event_device lapic_clockevent = { - .name = "lapic", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT - | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, - .shift = 32, - .set_mode = lapic_timer_setup, - .set_next_event = lapic_next_event, - .broadcast = lapic_timer_broadcast, - .rating = 100, - .irq = -1, + .name = "lapic", + .features = CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP + | CLOCK_EVT_FEAT_DUMMY, + .shift = 32, + .set_state_shutdown = lapic_timer_shutdown, + .set_state_periodic = lapic_timer_set_periodic, + .set_state_oneshot = lapic_timer_set_oneshot, + .set_next_event = lapic_next_event, + .broadcast = lapic_timer_broadcast, + .rating = 100, + .irq = -1, }; static DEFINE_PER_CPU(struct clock_event_device, lapic_events); @@ -778,7 +781,7 @@ static int __init calibrate_APIC_clock(void) * Setup the apic timer manually */ levt->event_handler = lapic_cal_handler; - lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt); + lapic_timer_set_periodic(levt); lapic_cal_loops = -1; /* Let the interrupts run */ @@ -788,7 +791,8 @@ static int __init calibrate_APIC_clock(void) cpu_relax(); /* Stop the lapic timer */ - lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); + local_irq_disable(); + lapic_timer_shutdown(levt); /* Jiffies delta */ deltaj = lapic_cal_j2 - lapic_cal_j1; @@ -799,8 +803,8 @@ static int __init calibrate_APIC_clock(void) apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); else levt->features |= CLOCK_EVT_FEAT_DUMMY; - } else - local_irq_enable(); + } + local_irq_enable(); if (levt->features & CLOCK_EVT_FEAT_DUMMY) { pr_warning("APIC timer disabled due to verification failure\n"); @@ -878,7 +882,7 @@ static void local_apic_timer_interrupt(void) if (!evt->event_handler) { pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu); /* Switch it off */ - lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); + lapic_timer_shutdown(evt); return; } diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 6873ab925d00..045e424fb368 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -28,146 +28,21 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh) #endif #ifdef arch_trigger_all_cpu_backtrace -/* For reliability, we're prepared to waste bits here. */ -static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; -static cpumask_t printtrace_mask; - -#define NMI_BUF_SIZE 4096 - -struct nmi_seq_buf { - unsigned char buffer[NMI_BUF_SIZE]; - struct seq_buf seq; -}; - -/* Safe printing in NMI context */ -static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); - -/* "in progress" flag of arch_trigger_all_cpu_backtrace */ -static unsigned long backtrace_flag; - -static void print_seq_line(struct nmi_seq_buf *s, int start, int end) +static void nmi_raise_cpu_backtrace(cpumask_t *mask) { - const char *buf = s->buffer + start; - - printk("%.*s", (end - start) + 1, buf); + apic->send_IPI_mask(mask, NMI_VECTOR); } void arch_trigger_all_cpu_backtrace(bool include_self) { - struct nmi_seq_buf *s; - int len; - int cpu; - int i; - int this_cpu = get_cpu(); - - if (test_and_set_bit(0, &backtrace_flag)) { - /* - * If there is already a trigger_all_cpu_backtrace() in progress - * (backtrace_flag == 1), don't output double cpu dump infos. - */ - put_cpu(); - return; - } - - cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); - if (!include_self) - cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask)); - - cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask)); - /* - * Set up per_cpu seq_buf buffers that the NMIs running on the other - * CPUs will write to. - */ - for_each_cpu(cpu, to_cpumask(backtrace_mask)) { - s = &per_cpu(nmi_print_seq, cpu); - seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE); - } - - if (!cpumask_empty(to_cpumask(backtrace_mask))) { - pr_info("sending NMI to %s CPUs:\n", - (include_self ? "all" : "other")); - apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR); - } - - /* Wait for up to 10 seconds for all CPUs to do the backtrace */ - for (i = 0; i < 10 * 1000; i++) { - if (cpumask_empty(to_cpumask(backtrace_mask))) - break; - mdelay(1); - touch_softlockup_watchdog(); - } - - /* - * Now that all the NMIs have triggered, we can dump out their - * back traces safely to the console. - */ - for_each_cpu(cpu, &printtrace_mask) { - int last_i = 0; - - s = &per_cpu(nmi_print_seq, cpu); - len = seq_buf_used(&s->seq); - if (!len) - continue; - - /* Print line by line. */ - for (i = 0; i < len; i++) { - if (s->buffer[i] == '\n') { - print_seq_line(s, last_i, i); - last_i = i + 1; - } - } - /* Check if there was a partial line. */ - if (last_i < len) { - print_seq_line(s, last_i, len - 1); - pr_cont("\n"); - } - } - - clear_bit(0, &backtrace_flag); - smp_mb__after_atomic(); - put_cpu(); -} - -/* - * It is not safe to call printk() directly from NMI handlers. - * It may be fine if the NMI detected a lock up and we have no choice - * but to do so, but doing a NMI on all other CPUs to get a back trace - * can be done with a sysrq-l. We don't want that to lock up, which - * can happen if the NMI interrupts a printk in progress. - * - * Instead, we redirect the vprintk() to this nmi_vprintk() that writes - * the content into a per cpu seq_buf buffer. Then when the NMIs are - * all done, we can safely dump the contents of the seq_buf to a printk() - * from a non NMI context. - */ -static int nmi_vprintk(const char *fmt, va_list args) -{ - struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); - unsigned int len = seq_buf_used(&s->seq); - - seq_buf_vprintf(&s->seq, fmt, args); - return seq_buf_used(&s->seq) - len; + nmi_trigger_all_cpu_backtrace(include_self, nmi_raise_cpu_backtrace); } static int arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) { - int cpu; - - cpu = smp_processor_id(); - - if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - printk_func_t printk_func_save = this_cpu_read(printk_func); - - /* Replace printk to write into the NMI seq */ - this_cpu_write(printk_func, nmi_vprintk); - printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); - show_regs(regs); - this_cpu_write(printk_func, printk_func_save); - - cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + if (nmi_cpu_backtrace(regs)) return NMI_HANDLED; - } return NMI_DONE; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 206052e55517..38a76f826530 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2541,7 +2541,7 @@ void __init setup_ioapic_dest(void) * Honour affinities which have been set in early boot */ if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata)) - mask = idata->affinity; + mask = irq_data_get_affinity_mask(idata); else mask = apic->target_cpus(); diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 1a9d735e09c6..5f1feb6854af 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -264,7 +264,7 @@ static inline int hpet_dev_id(struct irq_domain *domain) static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) { - hpet_msi_write(data->handler_data, msg); + hpet_msi_write(irq_data_get_irq_handler_data(data), msg); } static struct irq_chip hpet_msi_controller = { diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 2683f36e4e0a..1bbd0fe2c806 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -169,8 +169,7 @@ next: goto next; for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) { - if (per_cpu(vector_irq, new_cpu)[vector] > - VECTOR_UNDEFINED) + if (!IS_ERR_OR_NULL(per_cpu(vector_irq, new_cpu)[vector])) goto next; } /* Found one! */ @@ -182,7 +181,7 @@ next: cpumask_intersects(d->old_domain, cpu_online_mask); } for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) - per_cpu(vector_irq, new_cpu)[vector] = irq; + per_cpu(vector_irq, new_cpu)[vector] = irq_to_desc(irq); d->cfg.vector = vector; cpumask_copy(d->domain, vector_cpumask); err = 0; @@ -224,15 +223,16 @@ static int assign_irq_vector_policy(int irq, int node, static void clear_irq_vector(int irq, struct apic_chip_data *data) { - int cpu, vector; + struct irq_desc *desc; unsigned long flags; + int cpu, vector; raw_spin_lock_irqsave(&vector_lock, flags); BUG_ON(!data->cfg.vector); vector = data->cfg.vector; for_each_cpu_and(cpu, data->domain, cpu_online_mask) - per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; data->cfg.vector = 0; cpumask_clear(data->domain); @@ -242,12 +242,13 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data) return; } + desc = irq_to_desc(irq); for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) { for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - if (per_cpu(vector_irq, cpu)[vector] != irq) + if (per_cpu(vector_irq, cpu)[vector] != desc) continue; - per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; break; } } @@ -296,7 +297,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, struct irq_alloc_info *info = arg; struct apic_chip_data *data; struct irq_data *irq_data; - int i, err; + int i, err, node; if (disable_apic) return -ENXIO; @@ -308,12 +309,13 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, for (i = 0; i < nr_irqs; i++) { irq_data = irq_domain_get_irq_data(domain, virq + i); BUG_ON(!irq_data); + node = irq_data_get_node(irq_data); #ifdef CONFIG_X86_IO_APIC if (virq + i < nr_legacy_irqs() && legacy_irq_data[virq + i]) data = legacy_irq_data[virq + i]; else #endif - data = alloc_apic_chip_data(irq_data->node); + data = alloc_apic_chip_data(node); if (!data) { err = -ENOMEM; goto error; @@ -322,8 +324,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, irq_data->chip = &lapic_controller; irq_data->chip_data = data; irq_data->hwirq = virq + i; - err = assign_irq_vector_policy(virq + i, irq_data->node, data, - info); + err = assign_irq_vector_policy(virq + i, node, data, info); if (err) goto error; } @@ -403,32 +404,32 @@ int __init arch_early_irq_init(void) return arch_early_ioapic_init(); } +/* Initialize vector_irq on a new cpu */ static void __setup_vector_irq(int cpu) { - /* Initialize vector_irq on a new cpu */ - int irq, vector; struct apic_chip_data *data; + struct irq_desc *desc; + int irq, vector; /* Mark the inuse vectors */ - for_each_active_irq(irq) { - data = apic_chip_data(irq_get_irq_data(irq)); - if (!data) - continue; + for_each_irq_desc(irq, desc) { + struct irq_data *idata = irq_desc_get_irq_data(desc); - if (!cpumask_test_cpu(cpu, data->domain)) + data = apic_chip_data(idata); + if (!data || !cpumask_test_cpu(cpu, data->domain)) continue; vector = data->cfg.vector; - per_cpu(vector_irq, cpu)[vector] = irq; + per_cpu(vector_irq, cpu)[vector] = desc; } /* Mark the free vectors */ for (vector = 0; vector < NR_VECTORS; ++vector) { - irq = per_cpu(vector_irq, cpu)[vector]; - if (irq <= VECTOR_UNDEFINED) + desc = per_cpu(vector_irq, cpu)[vector]; + if (IS_ERR_OR_NULL(desc)) continue; - data = apic_chip_data(irq_get_irq_data(irq)); + data = apic_chip_data(irq_desc_get_irq_data(desc)); if (!cpumask_test_cpu(cpu, data->domain)) - per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; } } @@ -448,7 +449,7 @@ void setup_vector_irq(int cpu) * legacy vector to irq mapping: */ for (irq = 0; irq < nr_legacy_irqs(); irq++) - per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq; + per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq_to_desc(irq); __setup_vector_irq(cpu); } @@ -490,7 +491,8 @@ static int apic_set_affinity(struct irq_data *irq_data, if (err) { struct irq_data *top = irq_get_irq_data(irq); - if (assign_irq_vector(irq, data, top->affinity)) + if (assign_irq_vector(irq, data, + irq_data_get_affinity_mask(top))) pr_err("Failed to recover vector for irq %d\n", irq); return err; } @@ -538,27 +540,30 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) entering_ack_irq(); + /* Prevent vectors vanishing under us */ + raw_spin_lock(&vector_lock); + me = smp_processor_id(); for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - int irq; - unsigned int irr; - struct irq_desc *desc; struct apic_chip_data *data; + struct irq_desc *desc; + unsigned int irr; - irq = __this_cpu_read(vector_irq[vector]); - - if (irq <= VECTOR_UNDEFINED) + retry: + desc = __this_cpu_read(vector_irq[vector]); + if (IS_ERR_OR_NULL(desc)) continue; - desc = irq_to_desc(irq); - if (!desc) - continue; + if (!raw_spin_trylock(&desc->lock)) { + raw_spin_unlock(&vector_lock); + cpu_relax(); + raw_spin_lock(&vector_lock); + goto retry; + } - data = apic_chip_data(&desc->irq_data); + data = apic_chip_data(irq_desc_get_irq_data(desc)); if (!data) - continue; - - raw_spin_lock(&desc->lock); + goto unlock; /* * Check if the irq migration is in progress. If so, we @@ -583,11 +588,13 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); goto unlock; } - __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED); + __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); unlock: raw_spin_unlock(&desc->lock); } + raw_spin_unlock(&vector_lock); + exiting_irq(); } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 3f124d553c5a..cd9b6d0b10bf 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -12,7 +12,7 @@ #include <linux/init.h> #include <linux/slab.h> #include <linux/export.h> -#include <linux/watchdog.h> +#include <linux/nmi.h> #include <asm/cpufeature.h> #include <asm/hardirq.h> @@ -3627,7 +3627,10 @@ static __init int fixup_ht_bug(void) return 0; } - watchdog_nmi_disable_all(); + if (lockup_detector_suspend() != 0) { + pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n"); + return 0; + } x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED); @@ -3635,7 +3638,7 @@ static __init int fixup_ht_bug(void) x86_pmu.commit_scheduling = NULL; x86_pmu.stop_scheduling = NULL; - watchdog_nmi_enable_all(); + lockup_detector_resume(); get_online_cpus(); diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index f75c5908c7a6..88b4da373081 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -226,22 +226,7 @@ static void hpet_reserve_platform_timers(unsigned int id) { } */ static unsigned long hpet_freq; -static void hpet_legacy_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt); -static int hpet_legacy_next_event(unsigned long delta, - struct clock_event_device *evt); - -/* - * The hpet clock event device - */ -static struct clock_event_device hpet_clockevent = { - .name = "hpet", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .set_mode = hpet_legacy_set_mode, - .set_next_event = hpet_legacy_next_event, - .irq = 0, - .rating = 50, -}; +static struct clock_event_device hpet_clockevent; static void hpet_stop_counter(void) { @@ -306,64 +291,74 @@ static void hpet_legacy_clockevent_register(void) printk(KERN_DEBUG "hpet clockevent registered\n"); } -static void hpet_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt, int timer) +static int hpet_set_periodic(struct clock_event_device *evt, int timer) { unsigned int cfg, cmp, now; uint64_t delta; - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - hpet_stop_counter(); - delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; - delta >>= evt->shift; - now = hpet_readl(HPET_COUNTER); - cmp = now + (unsigned int) delta; - cfg = hpet_readl(HPET_Tn_CFG(timer)); - cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | - HPET_TN_SETVAL | HPET_TN_32BIT; - hpet_writel(cfg, HPET_Tn_CFG(timer)); - hpet_writel(cmp, HPET_Tn_CMP(timer)); - udelay(1); - /* - * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL - * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL - * bit is automatically cleared after the first write. - * (See AMD-8111 HyperTransport I/O Hub Data Sheet, - * Publication # 24674) - */ - hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer)); - hpet_start_counter(); - hpet_print_config(); - break; + hpet_stop_counter(); + delta = ((uint64_t)(NSEC_PER_SEC / HZ)) * evt->mult; + delta >>= evt->shift; + now = hpet_readl(HPET_COUNTER); + cmp = now + (unsigned int)delta; + cfg = hpet_readl(HPET_Tn_CFG(timer)); + cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | + HPET_TN_32BIT; + hpet_writel(cfg, HPET_Tn_CFG(timer)); + hpet_writel(cmp, HPET_Tn_CMP(timer)); + udelay(1); + /* + * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL + * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL + * bit is automatically cleared after the first write. + * (See AMD-8111 HyperTransport I/O Hub Data Sheet, + * Publication # 24674) + */ + hpet_writel((unsigned int)delta, HPET_Tn_CMP(timer)); + hpet_start_counter(); + hpet_print_config(); - case CLOCK_EVT_MODE_ONESHOT: - cfg = hpet_readl(HPET_Tn_CFG(timer)); - cfg &= ~HPET_TN_PERIODIC; - cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; - hpet_writel(cfg, HPET_Tn_CFG(timer)); - break; + return 0; +} - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - cfg = hpet_readl(HPET_Tn_CFG(timer)); - cfg &= ~HPET_TN_ENABLE; - hpet_writel(cfg, HPET_Tn_CFG(timer)); - break; +static int hpet_set_oneshot(struct clock_event_device *evt, int timer) +{ + unsigned int cfg; - case CLOCK_EVT_MODE_RESUME: - if (timer == 0) { - hpet_enable_legacy_int(); - } else { - struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - irq_domain_activate_irq(irq_get_irq_data(hdev->irq)); - disable_irq(hdev->irq); - irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); - enable_irq(hdev->irq); - } - hpet_print_config(); - break; + cfg = hpet_readl(HPET_Tn_CFG(timer)); + cfg &= ~HPET_TN_PERIODIC; + cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; + hpet_writel(cfg, HPET_Tn_CFG(timer)); + + return 0; +} + +static int hpet_shutdown(struct clock_event_device *evt, int timer) +{ + unsigned int cfg; + + cfg = hpet_readl(HPET_Tn_CFG(timer)); + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_Tn_CFG(timer)); + + return 0; +} + +static int hpet_resume(struct clock_event_device *evt, int timer) +{ + if (!timer) { + hpet_enable_legacy_int(); + } else { + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + + irq_domain_activate_irq(irq_get_irq_data(hdev->irq)); + disable_irq(hdev->irq); + irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); + enable_irq(hdev->irq); } + hpet_print_config(); + + return 0; } static int hpet_next_event(unsigned long delta, @@ -403,10 +398,24 @@ static int hpet_next_event(unsigned long delta, return res < HPET_MIN_CYCLES ? -ETIME : 0; } -static void hpet_legacy_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) +static int hpet_legacy_shutdown(struct clock_event_device *evt) +{ + return hpet_shutdown(evt, 0); +} + +static int hpet_legacy_set_oneshot(struct clock_event_device *evt) +{ + return hpet_set_oneshot(evt, 0); +} + +static int hpet_legacy_set_periodic(struct clock_event_device *evt) { - hpet_set_mode(mode, evt, 0); + return hpet_set_periodic(evt, 0); +} + +static int hpet_legacy_resume(struct clock_event_device *evt) +{ + return hpet_resume(evt, 0); } static int hpet_legacy_next_event(unsigned long delta, @@ -416,6 +425,22 @@ static int hpet_legacy_next_event(unsigned long delta, } /* + * The hpet clock event device + */ +static struct clock_event_device hpet_clockevent = { + .name = "hpet", + .features = CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_ONESHOT, + .set_state_periodic = hpet_legacy_set_periodic, + .set_state_oneshot = hpet_legacy_set_oneshot, + .set_state_shutdown = hpet_legacy_shutdown, + .tick_resume = hpet_legacy_resume, + .set_next_event = hpet_legacy_next_event, + .irq = 0, + .rating = 50, +}; + +/* * HPET MSI Support */ #ifdef CONFIG_PCI_MSI @@ -426,7 +451,7 @@ static struct irq_domain *hpet_domain; void hpet_msi_unmask(struct irq_data *data) { - struct hpet_dev *hdev = data->handler_data; + struct hpet_dev *hdev = irq_data_get_irq_handler_data(data); unsigned int cfg; /* unmask it */ @@ -437,7 +462,7 @@ void hpet_msi_unmask(struct irq_data *data) void hpet_msi_mask(struct irq_data *data) { - struct hpet_dev *hdev = data->handler_data; + struct hpet_dev *hdev = irq_data_get_irq_handler_data(data); unsigned int cfg; /* mask it */ @@ -459,11 +484,32 @@ void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg) msg->address_hi = 0; } -static void hpet_msi_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) +static int hpet_msi_shutdown(struct clock_event_device *evt) +{ + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + + return hpet_shutdown(evt, hdev->num); +} + +static int hpet_msi_set_oneshot(struct clock_event_device *evt) +{ + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + + return hpet_set_oneshot(evt, hdev->num); +} + +static int hpet_msi_set_periodic(struct clock_event_device *evt) { struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - hpet_set_mode(mode, evt, hdev->num); + + return hpet_set_periodic(evt, hdev->num); +} + +static int hpet_msi_resume(struct clock_event_device *evt) +{ + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + + return hpet_resume(evt, hdev->num); } static int hpet_msi_next_event(unsigned long delta, @@ -523,10 +569,14 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) evt->rating = 110; evt->features = CLOCK_EVT_FEAT_ONESHOT; - if (hdev->flags & HPET_DEV_PERI_CAP) + if (hdev->flags & HPET_DEV_PERI_CAP) { evt->features |= CLOCK_EVT_FEAT_PERIODIC; + evt->set_state_periodic = hpet_msi_set_periodic; + } - evt->set_mode = hpet_msi_set_mode; + evt->set_state_shutdown = hpet_msi_shutdown; + evt->set_state_oneshot = hpet_msi_set_oneshot; + evt->tick_resume = hpet_msi_resume; evt->set_next_event = hpet_msi_next_event; evt->cpumask = cpumask_of(hdev->cpu); diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index f2b96de3c7c1..efb82f07b29c 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -34,7 +34,7 @@ static int __init init_pit_clocksource(void) * - when local APIC timer is active (PIT is switched off) */ if (num_possible_cpus() > 1 || is_hpet_enabled() || - i8253_clockevent.mode != CLOCK_EVT_MODE_PERIODIC) + !clockevent_state_periodic(&i8253_clockevent)) return 0; return clocksource_i8253_init(); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index ae00b355114d..f8062aaf5df9 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -214,10 +214,9 @@ u64 arch_irq_stat(void) __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); - + struct irq_desc * desc; /* high bit used in ret_from_ code */ unsigned vector = ~regs->orig_ax; - unsigned irq; /* * NB: Unlike exception entries, IRQ entries do not reliably @@ -236,17 +235,17 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) /* entering_irq() tells RCU that we're not quiescent. Check it. */ RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); - irq = __this_cpu_read(vector_irq[vector]); + desc = __this_cpu_read(vector_irq[vector]); - if (!handle_irq(irq, regs)) { + if (!handle_irq(desc, regs)) { ack_APIC_irq(); - if (irq != VECTOR_RETRIGGERED) { - pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n", + if (desc != VECTOR_RETRIGGERED) { + pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n", __func__, smp_processor_id(), - vector, irq); + vector); } else { - __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED); + __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); } } @@ -348,10 +347,10 @@ static struct cpumask affinity_new, online_new; */ int check_irq_vectors_for_cpu_disable(void) { - int irq, cpu; unsigned int this_cpu, vector, this_count, count; struct irq_desc *desc; struct irq_data *data; + int cpu; this_cpu = smp_processor_id(); cpumask_copy(&online_new, cpu_online_mask); @@ -359,47 +358,43 @@ int check_irq_vectors_for_cpu_disable(void) this_count = 0; for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - irq = __this_cpu_read(vector_irq[vector]); - if (irq >= 0) { - desc = irq_to_desc(irq); - if (!desc) - continue; - - /* - * Protect against concurrent action removal, - * affinity changes etc. - */ - raw_spin_lock(&desc->lock); - data = irq_desc_get_irq_data(desc); - cpumask_copy(&affinity_new, data->affinity); - cpumask_clear_cpu(this_cpu, &affinity_new); - - /* Do not count inactive or per-cpu irqs. */ - if (!irq_has_action(irq) || irqd_is_per_cpu(data)) { - raw_spin_unlock(&desc->lock); - continue; - } + desc = __this_cpu_read(vector_irq[vector]); + if (IS_ERR_OR_NULL(desc)) + continue; + /* + * Protect against concurrent action removal, affinity + * changes etc. + */ + raw_spin_lock(&desc->lock); + data = irq_desc_get_irq_data(desc); + cpumask_copy(&affinity_new, + irq_data_get_affinity_mask(data)); + cpumask_clear_cpu(this_cpu, &affinity_new); + /* Do not count inactive or per-cpu irqs. */ + if (!irq_desc_has_action(desc) || irqd_is_per_cpu(data)) { raw_spin_unlock(&desc->lock); - /* - * A single irq may be mapped to multiple - * cpu's vector_irq[] (for example IOAPIC cluster - * mode). In this case we have two - * possibilities: - * - * 1) the resulting affinity mask is empty; that is - * this the down'd cpu is the last cpu in the irq's - * affinity mask, or - * - * 2) the resulting affinity mask is no longer - * a subset of the online cpus but the affinity - * mask is not zero; that is the down'd cpu is the - * last online cpu in a user set affinity mask. - */ - if (cpumask_empty(&affinity_new) || - !cpumask_subset(&affinity_new, &online_new)) - this_count++; + continue; } + + raw_spin_unlock(&desc->lock); + /* + * A single irq may be mapped to multiple cpu's + * vector_irq[] (for example IOAPIC cluster mode). In + * this case we have two possibilities: + * + * 1) the resulting affinity mask is empty; that is + * this the down'd cpu is the last cpu in the irq's + * affinity mask, or + * + * 2) the resulting affinity mask is no longer a + * subset of the online cpus but the affinity mask is + * not zero; that is the down'd cpu is the last online + * cpu in a user set affinity mask. + */ + if (cpumask_empty(&affinity_new) || + !cpumask_subset(&affinity_new, &online_new)) + this_count++; } count = 0; @@ -418,8 +413,8 @@ int check_irq_vectors_for_cpu_disable(void) for (vector = FIRST_EXTERNAL_VECTOR; vector < first_system_vector; vector++) { if (!test_bit(vector, used_vectors) && - per_cpu(vector_irq, cpu)[vector] < 0) - count++; + IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) + count++; } } @@ -455,7 +450,7 @@ void fixup_irqs(void) raw_spin_lock(&desc->lock); data = irq_desc_get_irq_data(desc); - affinity = data->affinity; + affinity = irq_data_get_affinity_mask(data); if (!irq_has_action(irq) || irqd_is_per_cpu(data) || cpumask_subset(affinity, cpu_online_mask)) { raw_spin_unlock(&desc->lock); @@ -523,14 +518,13 @@ void fixup_irqs(void) for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { unsigned int irr; - if (__this_cpu_read(vector_irq[vector]) <= VECTOR_UNDEFINED) + if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector]))) continue; irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); if (irr & (1 << (vector % 32))) { - irq = __this_cpu_read(vector_irq[vector]); + desc = __this_cpu_read(vector_irq[vector]); - desc = irq_to_desc(irq); raw_spin_lock(&desc->lock); data = irq_desc_get_irq_data(desc); chip = irq_data_get_irq_chip(data); @@ -541,7 +535,7 @@ void fixup_irqs(void) raw_spin_unlock(&desc->lock); } if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED) - __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED); + __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); } } #endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index cd74f5978ab9..c80cf6699678 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -148,21 +148,21 @@ void do_softirq_own_stack(void) call_on_stack(__do_softirq, isp); } -bool handle_irq(unsigned irq, struct pt_regs *regs) +bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) { - struct irq_desc *desc; + unsigned int irq; int overflow; overflow = check_stack_overflow(); - desc = irq_to_desc(irq); - if (unlikely(!desc)) + if (IS_ERR_OR_NULL(desc)) return false; + irq = irq_desc_get_irq(desc); if (user_mode(regs) || !execute_on_irq_stack(overflow, desc, irq)) { if (unlikely(overflow)) print_stack_overflow(); - desc->handle_irq(irq, desc); + generic_handle_irq_desc(irq, desc); } return true; diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index bc4604e500a3..ff16ccb918f2 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -68,16 +68,13 @@ static inline void stack_overflow_check(struct pt_regs *regs) #endif } -bool handle_irq(unsigned irq, struct pt_regs *regs) +bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) { - struct irq_desc *desc; - stack_overflow_check(regs); - desc = irq_to_desc(irq); - if (unlikely(!desc)) + if (unlikely(IS_ERR_OR_NULL(desc))) return false; - generic_handle_irq_desc(irq, desc); + generic_handle_irq_desc(irq_desc_get_irq(desc), desc); return true; } diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index a3a5e158ed69..1423ab1b0312 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -52,7 +52,7 @@ static struct irqaction irq2 = { }; DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... NR_VECTORS - 1] = VECTOR_UNDEFINED, + [0 ... NR_VECTORS - 1] = VECTOR_UNUSED, }; int vector_used_by_percpu_irq(unsigned int vector) @@ -60,7 +60,7 @@ int vector_used_by_percpu_irq(unsigned int vector) int cpu; for_each_online_cpu(cpu) { - if (per_cpu(vector_irq, cpu)[vector] > VECTOR_UNDEFINED) + if (!IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) return 1; } @@ -94,7 +94,7 @@ void __init init_IRQ(void) * irq's migrate etc. */ for (i = 0; i < nr_legacy_irqs(); i++) - per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = i; + per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i); x86_init.irqs.intr_init(); } diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 26d5a55a2736..e565e0e4d216 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -45,7 +45,7 @@ static void __jump_label_transform(struct jump_entry *entry, const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; - if (type == JUMP_LABEL_ENABLE) { + if (type == JUMP_LABEL_JMP) { if (init) { /* * Jump label is enabled for the first time. diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 961e51e9c6f6..0f8a6bbaaa44 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -533,7 +533,9 @@ static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len) int ret; ret = verify_pefile_signature(kernel, kernel_len, - system_trusted_keyring, &trusted); + system_trusted_keyring, + VERIFYING_KEXEC_PE_SIGNATURE, + &trusted); if (ret < 0) return ret; if (!trusted) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 49487b488061..2c7aafa70702 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -200,7 +200,7 @@ static void kvm_setup_secondary_clock(void) * kind of shutdown from our side, we unregister the clock by writting anything * that does not have the 'enable' bit set in the msr */ -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE static void kvm_crash_shutdown(struct pt_regs *regs) { native_write_msr(msr_kvm_system_time, 0, 0); @@ -259,7 +259,7 @@ void __init kvmclock_init(void) x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; machine_ops.shutdown = kvm_shutdown; -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE machine_ops.crash_shutdown = kvm_crash_shutdown; #endif kvm_get_preset_lpj(); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 353972c1946c..84b8ef82a159 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -58,17 +58,6 @@ EXPORT_SYMBOL(x86_dma_fallback_dev); /* Number of entries preallocated for DMA-API debugging */ #define PREALLOC_DMA_DEBUG_ENTRIES 65536 -int dma_set_mask(struct device *dev, u64 mask) -{ - if (!dev->dma_mask || !dma_supported(dev, mask)) - return -EIO; - - *dev->dma_mask = mask; - - return 0; -} -EXPORT_SYMBOL(dma_set_mask); - void __init pci_iommu_alloc(void) { struct iommu_table_entry *p; @@ -140,50 +129,19 @@ void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr, free_pages((unsigned long)vaddr, get_order(size)); } -void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, struct dma_attrs *attrs) +bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp) { - struct dma_map_ops *ops = get_dma_ops(dev); - void *memory; - - gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); - - if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) - return memory; - - if (!dev) - dev = &x86_dma_fallback_dev; - - if (!is_device_dma_capable(dev)) - return NULL; - - if (!ops->alloc) - return NULL; - - memory = ops->alloc(dev, size, dma_handle, - dma_alloc_coherent_gfp_flags(dev, gfp), attrs); - debug_dma_alloc_coherent(dev, size, *dma_handle, memory); - - return memory; -} -EXPORT_SYMBOL(dma_alloc_attrs); - -void dma_free_attrs(struct device *dev, size_t size, - void *vaddr, dma_addr_t bus, - struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - WARN_ON(irqs_disabled()); /* for portability */ + *gfp = dma_alloc_coherent_gfp_flags(*dev, *gfp); + *gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); - if (dma_release_from_coherent(dev, get_order(size), vaddr)) - return; + if (!*dev) + *dev = &x86_dma_fallback_dev; + if (!is_device_dma_capable(*dev)) + return false; + return true; - debug_dma_free_coherent(dev, size, vaddr, bus); - if (ops->free) - ops->free(dev, size, vaddr, bus, attrs); } -EXPORT_SYMBOL(dma_free_attrs); +EXPORT_SYMBOL(arch_dma_alloc_attrs); /* * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 64f90f53bb85..4f00b63d7ff3 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -3,80 +3,17 @@ * Copyright (c) 2015, Intel Corporation. */ #include <linux/platform_device.h> -#include <linux/libnvdimm.h> #include <linux/module.h> -#include <asm/e820.h> - -static void e820_pmem_release(struct device *dev) -{ - struct nvdimm_bus *nvdimm_bus = dev->platform_data; - - if (nvdimm_bus) - nvdimm_bus_unregister(nvdimm_bus); -} - -static struct platform_device e820_pmem = { - .name = "e820_pmem", - .id = -1, - .dev = { - .release = e820_pmem_release, - }, -}; - -static const struct attribute_group *e820_pmem_attribute_groups[] = { - &nvdimm_bus_attribute_group, - NULL, -}; - -static const struct attribute_group *e820_pmem_region_attribute_groups[] = { - &nd_region_attribute_group, - &nd_device_attribute_group, - NULL, -}; static __init int register_e820_pmem(void) { - static struct nvdimm_bus_descriptor nd_desc; - struct device *dev = &e820_pmem.dev; - struct nvdimm_bus *nvdimm_bus; - int rc, i; - - rc = platform_device_register(&e820_pmem); - if (rc) - return rc; - - nd_desc.attr_groups = e820_pmem_attribute_groups; - nd_desc.provider_name = "e820"; - nvdimm_bus = nvdimm_bus_register(dev, &nd_desc); - if (!nvdimm_bus) - goto err; - dev->platform_data = nvdimm_bus; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - struct resource res = { - .flags = IORESOURCE_MEM, - .start = ei->addr, - .end = ei->addr + ei->size - 1, - }; - struct nd_region_desc ndr_desc; - - if (ei->type != E820_PRAM) - continue; - - memset(&ndr_desc, 0, sizeof(ndr_desc)); - ndr_desc.res = &res; - ndr_desc.attr_groups = e820_pmem_region_attribute_groups; - ndr_desc.numa_node = NUMA_NO_NODE; - if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) - goto err; - } - - return 0; - - err: - dev_err(dev, "failed to register legacy persistent memory ranges\n"); - platform_device_unregister(&e820_pmem); - return -ENXIO; + struct platform_device *pdev; + + /* + * See drivers/nvdimm/e820.c for the implementation, this is + * simply here to trigger the module to load on demand. + */ + pdev = platform_device_alloc("e820_pmem", -1); + return platform_device_add(pdev); } device_initcall(register_e820_pmem); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 86db4bcd7ce5..02693dd9a079 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -673,7 +673,7 @@ struct machine_ops machine_ops = { .emergency_restart = native_machine_emergency_restart, .restart = native_machine_restart, .halt = native_machine_halt, -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE .crash_shutdown = native_machine_crash_shutdown, #endif }; @@ -703,7 +703,7 @@ void machine_halt(void) machine_ops.halt(); } -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE void machine_crash_shutdown(struct pt_regs *regs) { machine_ops.crash_shutdown(regs); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b143c2d04420..fdb7f2a2d328 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -317,15 +317,12 @@ static u64 __init get_ramdisk_size(void) return ramdisk_size; } -#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) static void __init relocate_initrd(void) { /* Assume only end is not page aligned */ u64 ramdisk_image = get_ramdisk_image(); u64 ramdisk_size = get_ramdisk_size(); u64 area_size = PAGE_ALIGN(ramdisk_size); - unsigned long slop, clen, mapaddr; - char *p, *q; /* We need to move the initrd down into directly mapped mem */ relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), @@ -343,25 +340,8 @@ static void __init relocate_initrd(void) printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1); - q = (char *)initrd_start; - - /* Copy the initrd */ - while (ramdisk_size) { - slop = ramdisk_image & ~PAGE_MASK; - clen = ramdisk_size; - if (clen > MAX_MAP_CHUNK-slop) - clen = MAX_MAP_CHUNK-slop; - mapaddr = ramdisk_image & PAGE_MASK; - p = early_memremap(mapaddr, clen+slop); - memcpy(q, p+slop, clen); - early_memunmap(p, clen+slop); - q += clen; - ramdisk_image += clen; - ramdisk_size -= clen; - } + copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size); - ramdisk_image = get_ramdisk_image(); - ramdisk_size = get_ramdisk_size(); printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" " [mem %#010llx-%#010llx]\n", ramdisk_image, ramdisk_image + ramdisk_size - 1, @@ -498,7 +478,7 @@ static void __init memblock_x86_reserve_range_setup_data(void) * --------- Crashkernel reservation ------------------------------ */ -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE /* * Keep the crash kernel below this limit. On 32 bits earlier kernels diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 79055cf2c497..c8d52cb4cb6e 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -38,7 +38,7 @@ static int __read_mostly tsc_unstable; erroneous rdtsc usage on !cpu_has_tsc processors */ static int __read_mostly tsc_disabled = -1; -static struct static_key __use_tsc = STATIC_KEY_INIT; +static DEFINE_STATIC_KEY_FALSE(__use_tsc); int tsc_clocksource_reliable; @@ -274,7 +274,12 @@ done: */ u64 native_sched_clock(void) { - u64 tsc_now; + if (static_branch_likely(&__use_tsc)) { + u64 tsc_now = rdtsc(); + + /* return the value in ns */ + return cycles_2_ns(tsc_now); + } /* * Fall back to jiffies if there's no TSC available: @@ -284,16 +289,9 @@ u64 native_sched_clock(void) * very important for it to be as fast as the platform * can achieve it. ) */ - if (!static_key_false(&__use_tsc)) { - /* No locking but a rare wrong value is not a big deal: */ - return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); - } - - /* read the Time Stamp Counter: */ - tsc_now = rdtsc(); - /* return the value in ns */ - return cycles_2_ns(tsc_now); + /* No locking but a rare wrong value is not a big deal: */ + return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); } /* @@ -1212,7 +1210,7 @@ void __init tsc_init(void) /* now allow native_sched_clock() to use rdtsc */ tsc_disabled = 0; - static_key_slow_inc(&__use_tsc); + static_branch_enable(&__use_tsc); if (!no_sched_irq_time) enable_sched_clock_irqtime(); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 00bf300fd846..74e4bf11f562 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -364,7 +364,7 @@ INIT_PER_CPU(irq_stack_union); #endif /* CONFIG_X86_32 */ -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE #include <asm/kexec.h> . = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e7a4fde5d631..b372a7557c16 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -650,6 +650,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, u16 sel; la = seg_base(ctxt, addr.seg) + addr.ea; + *linear = la; *max_size = 0; switch (mode) { case X86EMUL_MODE_PROT64: @@ -693,7 +694,6 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, } if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0)) return emulate_gp(ctxt, 0); - *linear = la; return X86EMUL_CONTINUE; bad: if (addr.seg == VCPU_SREG_SS) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index fb16a8ea3dee..69088a1ba509 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3309,13 +3309,14 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) walk_shadow_page_lockless_begin(vcpu); - for (shadow_walk_init(&iterator, vcpu, addr), root = iterator.level; + for (shadow_walk_init(&iterator, vcpu, addr), + leaf = root = iterator.level; shadow_walk_okay(&iterator); __shadow_walk_next(&iterator, spte)) { - leaf = iterator.level; spte = mmu_spte_get_lockless(iterator.sptep); sptes[leaf - 1] = spte; + leaf--; if (!is_shadow_present_pte(spte)) break; @@ -3329,7 +3330,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) if (reserved) { pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n", __func__, addr); - while (root >= leaf) { + while (root > leaf) { pr_err("------ spte 0x%llx level %d.\n", sptes[root - 1], root); root--; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4a4eec30cc08..d01986832afc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1264,7 +1264,7 @@ static void vmcs_load(struct vmcs *vmcs) vmcs, phys_addr); } -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE /* * This bitmap is used to indicate whether the vmclear * operation is enabled on all cpus. All disabled by @@ -1302,7 +1302,7 @@ static void crash_vmclear_local_loaded_vmcss(void) #else static inline void crash_enable_local_vmclear(int cpu) { } static inline void crash_disable_local_vmclear(int cpu) { } -#endif /* CONFIG_KEXEC */ +#endif /* CONFIG_KEXEC_CORE */ static void __loaded_vmcs_clear(void *arg) { @@ -3150,7 +3150,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) struct page *pages; struct vmcs *vmcs; - pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order); + pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); if (!pages) return NULL; vmcs = page_address(pages); @@ -10411,7 +10411,7 @@ static int __init vmx_init(void) if (r) return r; -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE rcu_assign_pointer(crash_vmclear_loaded_vmcss, crash_vmclear_local_loaded_vmcss); #endif @@ -10421,7 +10421,7 @@ static int __init vmx_init(void) static void __exit vmx_exit(void) { -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); synchronize_rcu(); #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1e7e76e14e89..a60bdbccff51 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5943,6 +5943,7 @@ static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n) put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg)); } +#ifdef CONFIG_X86_64 static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) { struct kvm_segment seg; @@ -5958,6 +5959,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) put_smstate(u32, buf, offset + 4, seg.limit); put_smstate(u64, buf, offset + 8, seg.base); } +#endif static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf) { diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 433e5a7dd37f..161804de124a 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -835,16 +835,46 @@ static struct irq_chip lguest_irq_controller = { .irq_unmask = enable_lguest_irq, }; +/* + * Interrupt descriptors are allocated as-needed, but low-numbered ones are + * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it + * tells us the irq is already used: other errors (ie. ENOMEM) we take + * seriously. + */ +static int lguest_setup_irq(unsigned int irq) +{ + struct irq_desc *desc; + int err; + + /* Returns -ve error or vector number. */ + err = irq_alloc_desc_at(irq, 0); + if (err < 0 && err != -EEXIST) + return err; + + /* + * Tell the Linux infrastructure that the interrupt is + * controlled by our level-based lguest interrupt controller. + */ + irq_set_chip_and_handler_name(irq, &lguest_irq_controller, + handle_level_irq, "level"); + + /* Some systems map "vectors" to interrupts weirdly. Not us! */ + desc = irq_to_desc(irq); + __this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc); + return 0; +} + static int lguest_enable_irq(struct pci_dev *dev) { + int err; u8 line = 0; /* We literally use the PCI interrupt line as the irq number. */ pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line); - irq_set_chip_and_handler_name(line, &lguest_irq_controller, - handle_level_irq, "level"); - dev->irq = line; - return 0; + err = lguest_setup_irq(line); + if (!err) + dev->irq = line; + return err; } /* We don't do hotplug PCI, so this shouldn't be called. */ @@ -855,17 +885,13 @@ static void lguest_disable_irq(struct pci_dev *dev) /* * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware - * interrupt (except 128, which is used for system calls), and then tells the - * Linux infrastructure that each interrupt is controlled by our level-based - * lguest interrupt controller. + * interrupt (except 128, which is used for system calls). */ static void __init lguest_init_IRQ(void) { unsigned int i; for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) { - /* Some systems map "vectors" to interrupts weirdly. Not us! */ - __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR); if (i != IA32_SYSCALL_VECTOR) set_intr_gate(i, irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR)); @@ -879,26 +905,6 @@ static void __init lguest_init_IRQ(void) } /* - * Interrupt descriptors are allocated as-needed, but low-numbered ones are - * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it - * tells us the irq is already used: other errors (ie. ENOMEM) we take - * seriously. - */ -int lguest_setup_irq(unsigned int irq) -{ - int err; - - /* Returns -ve error or vector number. */ - err = irq_alloc_desc_at(irq, 0); - if (err < 0 && err != -EEXIST) - return err; - - irq_set_chip_and_handler_name(irq, &lguest_irq_controller, - handle_level_irq, "level"); - return 0; -} - -/* * Time. * * It would be far better for everyone if the Guest had its own clock, but @@ -1028,7 +1034,8 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) static void lguest_time_init(void) { /* Set up the timer interrupt (0) to go to our simple timer routine */ - lguest_setup_irq(0); + if (lguest_setup_irq(0) != 0) + panic("Could not set up timer irq"); irq_set_handler(0, lguest_time_irq); clocksource_register_hz(&lguest_clock, NSEC_PER_SEC); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 68aec42545c2..7562f42914b4 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -823,11 +823,11 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { struct pglist_data *pgdata = NODE_DATA(nid); struct zone *zone = pgdata->node_zones + - zone_for_memory(nid, start, size, ZONE_HIGHMEM); + zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3fba623e3ba5..30564e2752d3 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -687,11 +687,11 @@ static void update_end_of_memory_vars(u64 start, u64 size) * Memory is added always to NORMAL zone. This means you will never get * additional DMA/DMA32 memory. */ -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { struct pglist_data *pgdat = NODE_DATA(nid); struct zone *zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL); + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index db1b0bc5017c..134948b0926f 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -42,58 +42,21 @@ static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm) */ static unsigned long mpx_mmap(unsigned long len) { - unsigned long ret; - unsigned long addr, pgoff; struct mm_struct *mm = current->mm; - vm_flags_t vm_flags; - struct vm_area_struct *vma; + unsigned long addr, populate; /* Only bounds table can be allocated here */ if (len != mpx_bt_size_bytes(mm)) return -EINVAL; down_write(&mm->mmap_sem); - - /* Too many mappings? */ - if (mm->map_count > sysctl_max_map_count) { - ret = -ENOMEM; - goto out; - } - - /* Obtain the address to map to. we verify (or select) it and ensure - * that it represents a valid section of the address space. - */ - addr = get_unmapped_area(NULL, 0, len, 0, MAP_ANONYMOUS | MAP_PRIVATE); - if (addr & ~PAGE_MASK) { - ret = addr; - goto out; - } - - vm_flags = VM_READ | VM_WRITE | VM_MPX | - mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; - - /* Set pgoff according to addr for anon_vma */ - pgoff = addr >> PAGE_SHIFT; - - ret = mmap_region(NULL, addr, len, vm_flags, pgoff); - if (IS_ERR_VALUE(ret)) - goto out; - - vma = find_vma(mm, ret); - if (!vma) { - ret = -ENOMEM; - goto out; - } - - if (vm_flags & VM_LOCKED) { - up_write(&mm->mmap_sem); - mm_populate(ret, len); - return ret; - } - -out: + addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate); up_write(&mm->mmap_sem); - return ret; + if (populate) + mm_populate(addr, populate); + + return addr; } enum reg_type { diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 4053bb58bf92..c3b3f653ed0c 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -246,8 +246,10 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) bi->start = max(bi->start, low); bi->end = min(bi->end, high); - /* and there's no empty block */ - if (bi->start >= bi->end) + /* and there's no empty or non-exist block */ + if (bi->start >= bi->end || + !memblock_overlaps_region(&memblock.memory, + bi->start, bi->end - bi->start)) numa_remove_memblk_from(i--, mi); } diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 90b924acd982..8ddb5d0d66fb 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -140,6 +140,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, info.flush_end = end; count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); + trace_tlb_flush(TLB_REMOTE_SEND_IPI, end - start); if (is_uv_system()) { unsigned int cpu; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index be2e7a2b10d7..70efcd0940f9 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -246,7 +246,7 @@ static void emit_prologue(u8 **pprog) * goto out; * if (++tail_call_cnt > MAX_TAIL_CALL_CNT) * goto out; - * prog = array->prog[index]; + * prog = array->ptrs[index]; * if (prog == NULL) * goto out; * goto *(prog->bpf_func + prologue_size); @@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog) EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */ - /* prog = array->prog[index]; */ + /* prog = array->ptrs[index]; */ EMIT4_off32(0x48, 0x8D, 0x84, 0xD6, /* lea rax, [rsi + rdx * 8 + offsetof(...)] */ - offsetof(struct bpf_array, prog)); + offsetof(struct bpf_array, ptrs)); EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */ /* if (prog == NULL) @@ -315,6 +315,26 @@ static void emit_bpf_tail_call(u8 **pprog) *pprog = prog; } + +static void emit_load_skb_data_hlen(u8 **pprog) +{ + u8 *prog = *pprog; + int cnt = 0; + + /* r9d = skb->len - skb->data_len (headlen) + * r10 = skb->data + */ + /* mov %r9d, off32(%rdi) */ + EMIT3_off32(0x44, 0x8b, 0x8f, offsetof(struct sk_buff, len)); + + /* sub %r9d, off32(%rdi) */ + EMIT3_off32(0x44, 0x2b, 0x8f, offsetof(struct sk_buff, data_len)); + + /* mov %r10, off32(%rdi) */ + EMIT3_off32(0x4c, 0x8b, 0x97, offsetof(struct sk_buff, data)); + *pprog = prog; +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { @@ -329,36 +349,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, emit_prologue(&prog); - if (seen_ld_abs) { - /* r9d : skb->len - skb->data_len (headlen) - * r10 : skb->data - */ - if (is_imm8(offsetof(struct sk_buff, len))) - /* mov %r9d, off8(%rdi) */ - EMIT4(0x44, 0x8b, 0x4f, - offsetof(struct sk_buff, len)); - else - /* mov %r9d, off32(%rdi) */ - EMIT3_off32(0x44, 0x8b, 0x8f, - offsetof(struct sk_buff, len)); - - if (is_imm8(offsetof(struct sk_buff, data_len))) - /* sub %r9d, off8(%rdi) */ - EMIT4(0x44, 0x2b, 0x4f, - offsetof(struct sk_buff, data_len)); - else - EMIT3_off32(0x44, 0x2b, 0x8f, - offsetof(struct sk_buff, data_len)); - - if (is_imm8(offsetof(struct sk_buff, data))) - /* mov %r10, off8(%rdi) */ - EMIT4(0x4c, 0x8b, 0x57, - offsetof(struct sk_buff, data)); - else - /* mov %r10, off32(%rdi) */ - EMIT3_off32(0x4c, 0x8b, 0x97, - offsetof(struct sk_buff, data)); - } + if (seen_ld_abs) + emit_load_skb_data_hlen(&prog); for (i = 0; i < insn_cnt; i++, insn++) { const s32 imm32 = insn->imm; @@ -367,6 +359,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 b1 = 0, b2 = 0, b3 = 0; s64 jmp_offset; u8 jmp_cond; + bool reload_skb_data; int ilen; u8 *func; @@ -818,12 +811,18 @@ xadd: if (is_imm8(insn->off)) func = (u8 *) __bpf_call_base + imm32; jmp_offset = func - (image + addrs[i]); if (seen_ld_abs) { - EMIT2(0x41, 0x52); /* push %r10 */ - EMIT2(0x41, 0x51); /* push %r9 */ - /* need to adjust jmp offset, since - * pop %r9, pop %r10 take 4 bytes after call insn - */ - jmp_offset += 4; + reload_skb_data = bpf_helper_changes_skb_data(func); + if (reload_skb_data) { + EMIT1(0x57); /* push %rdi */ + jmp_offset += 22; /* pop, mov, sub, mov */ + } else { + EMIT2(0x41, 0x52); /* push %r10 */ + EMIT2(0x41, 0x51); /* push %r9 */ + /* need to adjust jmp offset, since + * pop %r9, pop %r10 take 4 bytes after call insn + */ + jmp_offset += 4; + } } if (!imm32 || !is_simm32(jmp_offset)) { pr_err("unsupported bpf func %d addr %p image %p\n", @@ -832,8 +831,13 @@ xadd: if (is_imm8(insn->off)) } EMIT1_off32(0xE8, jmp_offset); if (seen_ld_abs) { - EMIT2(0x41, 0x59); /* pop %r9 */ - EMIT2(0x41, 0x5A); /* pop %r10 */ + if (reload_skb_data) { + EMIT1(0x5F); /* pop %rdi */ + emit_load_skb_data_hlen(&prog); + } else { + EMIT2(0x41, 0x59); /* pop %r9 */ + EMIT2(0x41, 0x5A); /* pop %r10 */ + } } break; @@ -1099,7 +1103,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog) } if (bpf_jit_enable > 1) - bpf_jit_dump(prog->len, proglen, 0, image); + bpf_jit_dump(prog->len, proglen, pass + 1, image); if (image) { bpf_flush_icache(header, image + proglen); diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index d22f4b5bbc04..ff31ab464213 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -179,7 +179,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (ret) goto error; i = 0; - list_for_each_entry(msidesc, &dev->msi_list, list) { + for_each_pci_msi_entry(msidesc, dev) { irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], (type == PCI_CAP_ID_MSI) ? nvec : 1, (type == PCI_CAP_ID_MSIX) ? @@ -230,7 +230,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (type == PCI_CAP_ID_MSI && nvec > 1) return 1; - list_for_each_entry(msidesc, &dev->msi_list, list) { + for_each_pci_msi_entry(msidesc, dev) { __pci_read_msi_msg(msidesc, &msg); pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) | ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff); @@ -274,7 +274,7 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) int ret = 0; struct msi_desc *msidesc; - list_for_each_entry(msidesc, &dev->msi_list, list) { + for_each_pci_msi_entry(msidesc, dev) { struct physdev_map_pirq map_irq; domid_t domid; @@ -386,7 +386,7 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev) { struct msi_desc *msidesc; - msidesc = list_entry(dev->msi_list.next, struct msi_desc, list); + msidesc = first_pci_msi_entry(dev); if (msidesc->msi_attrib.is_msix) xen_pci_frontend_disable_msix(dev); else diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index e4308fe6afe8..1db84c0758b7 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -650,7 +650,7 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md) static void __init save_runtime_map(void) { -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE efi_memory_desc_t *md; void *tmp, *p, *q = NULL; int count = 0; @@ -748,7 +748,7 @@ static void * __init efi_map_regions(int *count, int *pg_shift) static void __init kexec_enter_virtual_mode(void) { -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE efi_memory_desc_t *md; void *p; diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 8570abe68be1..e1c24631afbb 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -89,7 +89,7 @@ static int uv_domain_alloc(struct irq_domain *domain, unsigned int virq, return -EINVAL; chip_data = kmalloc_node(sizeof(*chip_data), GFP_KERNEL, - irq_data->node); + irq_data_get_node(irq_data)); if (!chip_data) return -ENOMEM; diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index 020c101c255f..5c9f63fa6abf 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -492,7 +492,7 @@ static void uv_nmi_touch_watchdogs(void) touch_nmi_watchdog(); } -#if defined(CONFIG_KEXEC) +#if defined(CONFIG_KEXEC_CORE) static atomic_t uv_nmi_kexec_failed; static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) { @@ -519,13 +519,13 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) uv_nmi_sync_exit(0); } -#else /* !CONFIG_KEXEC */ +#else /* !CONFIG_KEXEC_CORE */ static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) { if (master) pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n"); } -#endif /* !CONFIG_KEXEC */ +#endif /* !CONFIG_KEXEC_CORE */ #ifdef CONFIG_KGDB #ifdef CONFIG_KGDB_KDB diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 484145368a24..c7b15f3e2cf3 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -7,6 +7,7 @@ config XEN depends on PARAVIRT select PARAVIRT_CLOCK select XEN_HAVE_PVMMU + select XEN_HAVE_VPMU depends on X86_64 || (X86_32 && X86_PAE) depends on X86_LOCAL_APIC && X86_TSC help @@ -23,14 +24,18 @@ config XEN_PVHVM def_bool y depends on XEN && PCI && X86_LOCAL_APIC -config XEN_MAX_DOMAIN_MEMORY - int - default 500 if X86_64 - default 64 if X86_32 - depends on XEN - help - This only affects the sizing of some bss arrays, the unused - portions of which are freed. +config XEN_512GB + bool "Limit Xen pv-domain memory to 512GB" + depends on XEN && X86_64 + default y + help + Limit paravirtualized user domains to 512GB of RAM. + + The Xen tools and crash dump analysis tools might not support + pv-domains with more than 512 GB of RAM. This option controls the + default setting of the kernel to use only up to 512 GB or more. + It is always possible to change the default via specifying the + boot parameter "xen_512gb_limit". config XEN_SAVE_RESTORE bool diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 4b6e29ac0968..e47e52787d32 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -13,7 +13,7 @@ CFLAGS_mmu.o := $(nostackp) obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ grant-table.o suspend.o platform-pci-unplug.o \ - p2m.o apic.o + p2m.o apic.o pmu.o obj-$(CONFIG_EVENT_TRACING) += trace.o diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 70e060ad879a..acda713ab5be 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -7,6 +7,7 @@ #include <xen/xen.h> #include <xen/interface/physdev.h> #include "xen-ops.h" +#include "pmu.h" #include "smp.h" static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) @@ -72,6 +73,11 @@ static u32 xen_apic_read(u32 reg) static void xen_apic_write(u32 reg, u32 val) { + if (reg == APIC_LVTPC) { + (void)pmu_apic_update(reg); + return; + } + /* Warn to see if there's any stray references */ WARN(1,"register: %x, value: %x\n", reg, val); } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d9cfa452da9d..30d12afe52ed 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -84,6 +84,7 @@ #include "mmu.h" #include "smp.h" #include "multicalls.h" +#include "pmu.h" EXPORT_SYMBOL_GPL(hypercall_page); @@ -1010,8 +1011,7 @@ static void xen_write_cr0(unsigned long cr0) static void xen_write_cr4(unsigned long cr4) { - cr4 &= ~X86_CR4_PGE; - cr4 &= ~X86_CR4_PSE; + cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE); native_write_cr4(cr4); } @@ -1030,6 +1030,9 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err) { u64 val; + if (pmu_msr_read(msr, &val, err)) + return val; + val = native_read_msr_safe(msr, err); switch (msr) { case MSR_IA32_APICBASE: @@ -1076,7 +1079,8 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) Xen console noise. */ default: - ret = native_write_msr_safe(msr, low, high); + if (!pmu_msr_write(msr, low, high, &ret)) + ret = native_write_msr_safe(msr, low, high); } return ret; @@ -1215,7 +1219,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .read_msr = xen_read_msr_safe, .write_msr = xen_write_msr_safe, - .read_pmc = native_read_pmc, + .read_pmc = xen_read_pmc, .iret = xen_iret, #ifdef CONFIG_X86_64 @@ -1264,6 +1268,10 @@ static const struct pv_apic_ops xen_apic_ops __initconst = { static void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; + int cpu; + + for_each_online_cpu(cpu) + xen_pmu_finish(cpu); if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) BUG(); @@ -1607,7 +1615,9 @@ asmlinkage __visible void __init xen_start_kernel(void) early_boot_irqs_disabled = true; xen_raw_console_write("mapping kernel into physical memory\n"); - xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages); + xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, + xen_start_info->nr_pages); + xen_reserve_special_pages(); /* * Modify the cache mode translation tables to match Xen's PAT diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index dd151b2045b0..9c479fe40459 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -116,6 +116,7 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ +static phys_addr_t xen_pt_base, xen_pt_size __initdata; /* * Just beyond the highest usermode address. STACK_TOP_MAX has a @@ -1093,6 +1094,16 @@ static void xen_exit_mmap(struct mm_struct *mm) static void xen_post_allocator_init(void); +static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn) +{ + struct mmuext_op op; + + op.cmd = cmd; + op.arg1.mfn = pfn_to_mfn(pfn); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); +} + #ifdef CONFIG_X86_64 static void __init xen_cleanhighmap(unsigned long vaddr, unsigned long vaddr_end) @@ -1114,6 +1125,83 @@ static void __init xen_cleanhighmap(unsigned long vaddr, xen_mc_flush(); } +/* + * Make a page range writeable and free it. + */ +static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size) +{ + void *vaddr = __va(paddr); + void *vaddr_end = vaddr + size; + + for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) + make_lowmem_page_readwrite(vaddr); + + memblock_free(paddr, size); +} + +static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin) +{ + unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK; + + if (unpin) + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa)); + ClearPagePinned(virt_to_page(__va(pa))); + xen_free_ro_pages(pa, PAGE_SIZE); +} + +/* + * Since it is well isolated we can (and since it is perhaps large we should) + * also free the page tables mapping the initial P->M table. + */ +static void __init xen_cleanmfnmap(unsigned long vaddr) +{ + unsigned long va = vaddr & PMD_MASK; + unsigned long pa; + pgd_t *pgd = pgd_offset_k(va); + pud_t *pud_page = pud_offset(pgd, 0); + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned int i; + bool unpin; + + unpin = (vaddr == 2 * PGDIR_SIZE); + set_pgd(pgd, __pgd(0)); + do { + pud = pud_page + pud_index(va); + if (pud_none(*pud)) { + va += PUD_SIZE; + } else if (pud_large(*pud)) { + pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, PUD_SIZE); + va += PUD_SIZE; + } else { + pmd = pmd_offset(pud, va); + if (pmd_large(*pmd)) { + pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, PMD_SIZE); + } else if (!pmd_none(*pmd)) { + pte = pte_offset_kernel(pmd, va); + set_pmd(pmd, __pmd(0)); + for (i = 0; i < PTRS_PER_PTE; ++i) { + if (pte_none(pte[i])) + break; + pa = pte_pfn(pte[i]) << PAGE_SHIFT; + xen_free_ro_pages(pa, PAGE_SIZE); + } + xen_cleanmfnmap_free_pgtbl(pte, unpin); + } + va += PMD_SIZE; + if (pmd_index(va)) + continue; + set_pud(pud, __pud(0)); + xen_cleanmfnmap_free_pgtbl(pmd, unpin); + } + + } while (pud_index(va) || pmd_index(va)); + xen_cleanmfnmap_free_pgtbl(pud_page, unpin); +} + static void __init xen_pagetable_p2m_free(void) { unsigned long size; @@ -1128,18 +1216,31 @@ static void __init xen_pagetable_p2m_free(void) /* using __ka address and sticking INVALID_P2M_ENTRY! */ memset((void *)xen_start_info->mfn_list, 0xff, size); - /* We should be in __ka space. */ - BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); addr = xen_start_info->mfn_list; - /* We roundup to the PMD, which means that if anybody at this stage is - * using the __ka address of xen_start_info or xen_start_info->shared_info - * they are in going to crash. Fortunatly we have already revectored - * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ + /* + * We could be in __ka space. + * We roundup to the PMD, which means that if anybody at this stage is + * using the __ka address of xen_start_info or + * xen_start_info->shared_info they are in going to crash. Fortunatly + * we have already revectored in xen_setup_kernel_pagetable and in + * xen_setup_shared_info. + */ size = roundup(size, PMD_SIZE); - xen_cleanhighmap(addr, addr + size); - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - memblock_free(__pa(xen_start_info->mfn_list), size); + if (addr >= __START_KERNEL_map) { + xen_cleanhighmap(addr, addr + size); + size = PAGE_ALIGN(xen_start_info->nr_pages * + sizeof(unsigned long)); + memblock_free(__pa(addr), size); + } else { + xen_cleanmfnmap(addr); + } +} + +static void __init xen_pagetable_cleanhighmap(void) +{ + unsigned long size; + unsigned long addr; /* At this stage, cleanup_highmap has already cleaned __ka space * from _brk_limit way up to the max_pfn_mapped (which is the end of @@ -1172,6 +1273,8 @@ static void __init xen_pagetable_p2m_setup(void) #ifdef CONFIG_X86_64 xen_pagetable_p2m_free(); + + xen_pagetable_cleanhighmap(); #endif /* And revector! Bye bye old array */ xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; @@ -1461,6 +1564,24 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) #else /* CONFIG_X86_64 */ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) { + unsigned long pfn; + + if (xen_feature(XENFEAT_writable_page_tables) || + xen_feature(XENFEAT_auto_translated_physmap) || + xen_start_info->mfn_list >= __START_KERNEL_map) + return pte; + + /* + * Pages belonging to the initial p2m list mapped outside the default + * address range must be mapped read-only. This region contains the + * page tables for mapping the p2m list, too, and page tables MUST be + * mapped read-only. + */ + pfn = pte_pfn(pte); + if (pfn >= xen_start_info->first_p2m_pfn && + pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames) + pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW); + return pte; } #endif /* CONFIG_X86_64 */ @@ -1489,15 +1610,6 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) native_set_pte(ptep, pte); } -static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn) -{ - struct mmuext_op op; - op.cmd = cmd; - op.arg1.mfn = pfn_to_mfn(pfn); - if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) - BUG(); -} - /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) @@ -1815,7 +1927,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) * mappings. Considering that on Xen after the kernel mappings we * have the mappings of some pages that don't exist in pfn space, we * set max_pfn_mapped to the last real pfn mapped. */ - max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); + if (xen_start_info->mfn_list < __START_KERNEL_map) + max_pfn_mapped = xen_start_info->first_p2m_pfn; + else + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); pt_base = PFN_DOWN(__pa(xen_start_info->pt_base)); pt_end = pt_base + xen_start_info->nr_pt_frames; @@ -1855,6 +1970,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Graft it onto L4[511][510] */ copy_page(level2_kernel_pgt, l2); + /* Copy the initial P->M table mappings if necessary. */ + i = pgd_index(xen_start_info->mfn_list); + if (i && i < pgd_index(__START_KERNEL_map)) + init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i]; + if (!xen_feature(XENFEAT_auto_translated_physmap)) { /* Make pagetable pieces RO */ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); @@ -1894,10 +2014,192 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) check_pt_base(&pt_base, &pt_end, addr[i]); /* Our (by three pages) smaller Xen pagetable that we are using */ - memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE); + xen_pt_base = PFN_PHYS(pt_base); + xen_pt_size = (pt_end - pt_base) * PAGE_SIZE; + memblock_reserve(xen_pt_base, xen_pt_size); + /* Revector the xen_start_info */ xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); } + +/* + * Read a value from a physical address. + */ +static unsigned long __init xen_read_phys_ulong(phys_addr_t addr) +{ + unsigned long *vaddr; + unsigned long val; + + vaddr = early_memremap_ro(addr, sizeof(val)); + val = *vaddr; + early_memunmap(vaddr, sizeof(val)); + return val; +} + +/* + * Translate a virtual address to a physical one without relying on mapped + * page tables. + */ +static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) +{ + phys_addr_t pa; + pgd_t pgd; + pud_t pud; + pmd_t pmd; + pte_t pte; + + pa = read_cr3(); + pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) * + sizeof(pgd))); + if (!pgd_present(pgd)) + return 0; + + pa = pgd_val(pgd) & PTE_PFN_MASK; + pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) * + sizeof(pud))); + if (!pud_present(pud)) + return 0; + pa = pud_pfn(pud) << PAGE_SHIFT; + if (pud_large(pud)) + return pa + (vaddr & ~PUD_MASK); + + pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) * + sizeof(pmd))); + if (!pmd_present(pmd)) + return 0; + pa = pmd_pfn(pmd) << PAGE_SHIFT; + if (pmd_large(pmd)) + return pa + (vaddr & ~PMD_MASK); + + pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) * + sizeof(pte))); + if (!pte_present(pte)) + return 0; + pa = pte_pfn(pte) << PAGE_SHIFT; + + return pa | (vaddr & ~PAGE_MASK); +} + +/* + * Find a new area for the hypervisor supplied p2m list and relocate the p2m to + * this area. + */ +void __init xen_relocate_p2m(void) +{ + phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys; + unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end; + int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud; + pte_t *pt; + pmd_t *pmd; + pud_t *pud; + pgd_t *pgd; + unsigned long *new_p2m; + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT; + n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT; + n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT; + n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; + n_frames = n_pte + n_pt + n_pmd + n_pud; + + new_area = xen_find_free_area(PFN_PHYS(n_frames)); + if (!new_area) { + xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n"); + BUG(); + } + + /* + * Setup the page tables for addressing the new p2m list. + * We have asked the hypervisor to map the p2m list at the user address + * PUD_SIZE. It may have done so, or it may have used a kernel space + * address depending on the Xen version. + * To avoid any possible virtual address collision, just use + * 2 * PUD_SIZE for the new area. + */ + pud_phys = new_area; + pmd_phys = pud_phys + PFN_PHYS(n_pud); + pt_phys = pmd_phys + PFN_PHYS(n_pmd); + p2m_pfn = PFN_DOWN(pt_phys) + n_pt; + + pgd = __va(read_cr3()); + new_p2m = (unsigned long *)(2 * PGDIR_SIZE); + for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { + pud = early_memremap(pud_phys, PAGE_SIZE); + clear_page(pud); + for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); + idx_pmd++) { + pmd = early_memremap(pmd_phys, PAGE_SIZE); + clear_page(pmd); + for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); + idx_pt++) { + pt = early_memremap(pt_phys, PAGE_SIZE); + clear_page(pt); + for (idx_pte = 0; + idx_pte < min(n_pte, PTRS_PER_PTE); + idx_pte++) { + set_pte(pt + idx_pte, + pfn_pte(p2m_pfn, PAGE_KERNEL)); + p2m_pfn++; + } + n_pte -= PTRS_PER_PTE; + early_memunmap(pt, PAGE_SIZE); + make_lowmem_page_readonly(__va(pt_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, + PFN_DOWN(pt_phys)); + set_pmd(pmd + idx_pt, + __pmd(_PAGE_TABLE | pt_phys)); + pt_phys += PAGE_SIZE; + } + n_pt -= PTRS_PER_PMD; + early_memunmap(pmd, PAGE_SIZE); + make_lowmem_page_readonly(__va(pmd_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, + PFN_DOWN(pmd_phys)); + set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); + pmd_phys += PAGE_SIZE; + } + n_pmd -= PTRS_PER_PUD; + early_memunmap(pud, PAGE_SIZE); + make_lowmem_page_readonly(__va(pud_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); + set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); + pud_phys += PAGE_SIZE; + } + + /* Now copy the old p2m info to the new area. */ + memcpy(new_p2m, xen_p2m_addr, size); + xen_p2m_addr = new_p2m; + + /* Release the old p2m list and set new list info. */ + p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list)); + BUG_ON(!p2m_pfn); + p2m_pfn_end = p2m_pfn + PFN_DOWN(size); + + if (xen_start_info->mfn_list < __START_KERNEL_map) { + pfn = xen_start_info->first_p2m_pfn; + pfn_end = xen_start_info->first_p2m_pfn + + xen_start_info->nr_p2m_frames; + set_pgd(pgd + 1, __pgd(0)); + } else { + pfn = p2m_pfn; + pfn_end = p2m_pfn_end; + } + + memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn)); + while (pfn < pfn_end) { + if (pfn == p2m_pfn) { + pfn = p2m_pfn_end; + continue; + } + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); + pfn++; + } + + xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; + xen_start_info->first_p2m_pfn = PFN_DOWN(new_area); + xen_start_info->nr_p2m_frames = n_frames; +} + #else /* !CONFIG_X86_64 */ static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); @@ -1938,18 +2240,41 @@ static void __init xen_write_cr3_init(unsigned long cr3) pv_mmu_ops.write_cr3 = &xen_write_cr3; } +/* + * For 32 bit domains xen_start_info->pt_base is the pgd address which might be + * not the first page table in the page table pool. + * Iterate through the initial page tables to find the real page table base. + */ +static phys_addr_t xen_find_pt_base(pmd_t *pmd) +{ + phys_addr_t pt_base, paddr; + unsigned pmdidx; + + pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd)); + + for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) + if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) { + paddr = m2p(pmd[pmdidx].pmd); + pt_base = min(pt_base, paddr); + } + + return pt_base; +} + void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pmd_t *kernel_pmd; + kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); + + xen_pt_base = xen_find_pt_base(kernel_pmd); + xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE; + initial_kernel_pmd = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); - max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + - xen_start_info->nr_pt_frames * PAGE_SIZE + - 512*1024); + max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024); - kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); copy_page(initial_kernel_pmd, kernel_pmd); xen_map_identity_early(initial_kernel_pmd, max_pfn); @@ -1968,11 +2293,33 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) PFN_DOWN(__pa(initial_page_table))); xen_write_cr3(__pa(initial_page_table)); - memblock_reserve(__pa(xen_start_info->pt_base), - xen_start_info->nr_pt_frames * PAGE_SIZE); + memblock_reserve(xen_pt_base, xen_pt_size); } #endif /* CONFIG_X86_64 */ +void __init xen_reserve_special_pages(void) +{ + phys_addr_t paddr; + + memblock_reserve(__pa(xen_start_info), PAGE_SIZE); + if (xen_start_info->store_mfn) { + paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn)); + memblock_reserve(paddr, PAGE_SIZE); + } + if (!xen_initial_domain()) { + paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn)); + memblock_reserve(paddr, PAGE_SIZE); + } +} + +void __init xen_pt_check_e820(void) +{ + if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) { + xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n"); + BUG(); + } +} + static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) @@ -2465,9 +2812,9 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, return 0; } -static int do_remap_mfn(struct vm_area_struct *vma, +static int do_remap_gfn(struct vm_area_struct *vma, unsigned long addr, - xen_pfn_t *mfn, int nr, + xen_pfn_t *gfn, int nr, int *err_ptr, pgprot_t prot, unsigned domid, struct page **pages) @@ -2483,14 +2830,14 @@ static int do_remap_mfn(struct vm_area_struct *vma, if (xen_feature(XENFEAT_auto_translated_physmap)) { #ifdef CONFIG_XEN_PVH /* We need to update the local page tables and the xen HAP */ - return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr, + return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr, prot, domid, pages); #else return -EINVAL; #endif } - rmd.mfn = mfn; + rmd.mfn = gfn; rmd.prot = prot; /* We use the err_ptr to indicate if there we are doing a contigious * mapping or a discontigious mapping. */ @@ -2518,8 +2865,8 @@ static int do_remap_mfn(struct vm_area_struct *vma, batch_left, &done, domid); /* - * @err_ptr may be the same buffer as @mfn, so - * only clear it after each chunk of @mfn is + * @err_ptr may be the same buffer as @gfn, so + * only clear it after each chunk of @gfn is * used. */ if (err_ptr) { @@ -2549,19 +2896,19 @@ out: return err < 0 ? err : mapped; } -int xen_remap_domain_mfn_range(struct vm_area_struct *vma, +int xen_remap_domain_gfn_range(struct vm_area_struct *vma, unsigned long addr, - xen_pfn_t mfn, int nr, + xen_pfn_t gfn, int nr, pgprot_t prot, unsigned domid, struct page **pages) { - return do_remap_mfn(vma, addr, &mfn, nr, NULL, prot, domid, pages); + return do_remap_gfn(vma, addr, &gfn, nr, NULL, prot, domid, pages); } -EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); +EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_range); -int xen_remap_domain_mfn_array(struct vm_area_struct *vma, +int xen_remap_domain_gfn_array(struct vm_area_struct *vma, unsigned long addr, - xen_pfn_t *mfn, int nr, + xen_pfn_t *gfn, int nr, int *err_ptr, pgprot_t prot, unsigned domid, struct page **pages) { @@ -2570,13 +2917,13 @@ int xen_remap_domain_mfn_array(struct vm_area_struct *vma, * cause of "wrong memory was mapped in". */ BUG_ON(err_ptr == NULL); - return do_remap_mfn(vma, addr, mfn, nr, err_ptr, prot, domid, pages); + return do_remap_gfn(vma, addr, gfn, nr, err_ptr, prot, domid, pages); } -EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array); +EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array); /* Returns: 0 success */ -int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, +int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, int numpgs, struct page **pages) { if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) @@ -2588,4 +2935,4 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, return -EINVAL; #endif } -EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range); +EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 8b7f18e200aa..bfc08b13044b 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -79,10 +79,14 @@ #include <xen/balloon.h> #include <xen/grant_table.h> -#include "p2m.h" #include "multicalls.h" #include "xen-ops.h" +#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) +#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) + +#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) + #define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE) unsigned long *xen_p2m_addr __read_mostly; @@ -199,7 +203,8 @@ void __ref xen_build_mfn_list_list(void) unsigned int level, topidx, mididx; unsigned long *mid_mfn_p; - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (xen_feature(XENFEAT_auto_translated_physmap) || + xen_start_info->flags & SIF_VIRT_P2M_4TOOLS) return; /* Pre-initialize p2m_top_mfn to be completely missing */ @@ -260,9 +265,16 @@ void xen_setup_mfn_list_list(void) BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = - virt_to_mfn(p2m_top_mfn); + if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS) + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = ~0UL; + else + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = + virt_to_mfn(p2m_top_mfn); HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; + HYPERVISOR_shared_info->arch.p2m_generation = 0; + HYPERVISOR_shared_info->arch.p2m_vaddr = (unsigned long)xen_p2m_addr; + HYPERVISOR_shared_info->arch.p2m_cr3 = + xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); } /* Set up p2m_top to point to the domain-builder provided p2m pages */ @@ -478,8 +490,12 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg) ptechk = lookup_address(vaddr, &level); if (ptechk == pte_pg) { + HYPERVISOR_shared_info->arch.p2m_generation++; + wmb(); /* Tools are synchronizing via p2m_generation. */ set_pmd(pmdp, __pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE)); + wmb(); /* Tools are synchronizing via p2m_generation. */ + HYPERVISOR_shared_info->arch.p2m_generation++; pte_newpg[i] = NULL; } @@ -505,7 +521,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg) */ static bool alloc_p2m(unsigned long pfn) { - unsigned topidx, mididx; + unsigned topidx; unsigned long *top_mfn_p, *mid_mfn; pte_t *ptep, *pte_pg; unsigned int level; @@ -513,9 +529,6 @@ static bool alloc_p2m(unsigned long pfn) unsigned long addr = (unsigned long)(xen_p2m_addr + pfn); unsigned long p2m_pfn; - topidx = p2m_top_index(pfn); - mididx = p2m_mid_index(pfn); - ptep = lookup_address(addr, &level); BUG_ON(!ptep || level != PG_LEVEL_4K); pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1)); @@ -527,7 +540,8 @@ static bool alloc_p2m(unsigned long pfn) return false; } - if (p2m_top_mfn) { + if (p2m_top_mfn && pfn < MAX_P2M_PFN) { + topidx = p2m_top_index(pfn); top_mfn_p = &p2m_top_mfn[topidx]; mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]); @@ -577,10 +591,14 @@ static bool alloc_p2m(unsigned long pfn) spin_lock_irqsave(&p2m_update_lock, flags); if (pte_pfn(*ptep) == p2m_pfn) { + HYPERVISOR_shared_info->arch.p2m_generation++; + wmb(); /* Tools are synchronizing via p2m_generation. */ set_pte(ptep, pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL)); + wmb(); /* Tools are synchronizing via p2m_generation. */ + HYPERVISOR_shared_info->arch.p2m_generation++; if (mid_mfn) - mid_mfn[mididx] = virt_to_mfn(p2m); + mid_mfn[p2m_mid_index(pfn)] = virt_to_mfn(p2m); p2m = NULL; } @@ -630,6 +648,11 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) return true; } + /* + * The interface requires atomic updates on p2m elements. + * xen_safe_write_ulong() is using __put_user which does an atomic + * store via asm(). + */ if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn))) return true; diff --git a/arch/x86/xen/p2m.h b/arch/x86/xen/p2m.h deleted file mode 100644 index ad8aee24ab72..000000000000 --- a/arch/x86/xen/p2m.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _XEN_P2M_H -#define _XEN_P2M_H - -#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) -#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) - -#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) - -#define MAX_REMAP_RANGES 10 - -extern unsigned long __init set_phys_range_identity(unsigned long pfn_s, - unsigned long pfn_e); - -#endif /* _XEN_P2M_H */ diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index a8261716d58d..9586ff32810c 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -68,7 +68,7 @@ static int check_platform_magic(void) return 0; } -bool xen_has_pv_devices() +bool xen_has_pv_devices(void) { if (!xen_domain()) return false; diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c new file mode 100644 index 000000000000..724a08740a04 --- /dev/null +++ b/arch/x86/xen/pmu.c @@ -0,0 +1,570 @@ +#include <linux/types.h> +#include <linux/interrupt.h> + +#include <asm/xen/hypercall.h> +#include <xen/page.h> +#include <xen/interface/xen.h> +#include <xen/interface/vcpu.h> +#include <xen/interface/xenpmu.h> + +#include "xen-ops.h" +#include "pmu.h" + +/* x86_pmu.handle_irq definition */ +#include "../kernel/cpu/perf_event.h" + +#define XENPMU_IRQ_PROCESSING 1 +struct xenpmu { + /* Shared page between hypervisor and domain */ + struct xen_pmu_data *xenpmu_data; + + uint8_t flags; +}; +static DEFINE_PER_CPU(struct xenpmu, xenpmu_shared); +#define get_xenpmu_data() (this_cpu_ptr(&xenpmu_shared)->xenpmu_data) +#define get_xenpmu_flags() (this_cpu_ptr(&xenpmu_shared)->flags) + +/* Macro for computing address of a PMU MSR bank */ +#define field_offset(ctxt, field) ((void *)((uintptr_t)ctxt + \ + (uintptr_t)ctxt->field)) + +/* AMD PMU */ +#define F15H_NUM_COUNTERS 6 +#define F10H_NUM_COUNTERS 4 + +static __read_mostly uint32_t amd_counters_base; +static __read_mostly uint32_t amd_ctrls_base; +static __read_mostly int amd_msr_step; +static __read_mostly int k7_counters_mirrored; +static __read_mostly int amd_num_counters; + +/* Intel PMU */ +#define MSR_TYPE_COUNTER 0 +#define MSR_TYPE_CTRL 1 +#define MSR_TYPE_GLOBAL 2 +#define MSR_TYPE_ARCH_COUNTER 3 +#define MSR_TYPE_ARCH_CTRL 4 + +/* Number of general pmu registers (CPUID.EAX[0xa].EAX[8..15]) */ +#define PMU_GENERAL_NR_SHIFT 8 +#define PMU_GENERAL_NR_BITS 8 +#define PMU_GENERAL_NR_MASK (((1 << PMU_GENERAL_NR_BITS) - 1) \ + << PMU_GENERAL_NR_SHIFT) + +/* Number of fixed pmu registers (CPUID.EDX[0xa].EDX[0..4]) */ +#define PMU_FIXED_NR_SHIFT 0 +#define PMU_FIXED_NR_BITS 5 +#define PMU_FIXED_NR_MASK (((1 << PMU_FIXED_NR_BITS) - 1) \ + << PMU_FIXED_NR_SHIFT) + +/* Alias registers (0x4c1) for full-width writes to PMCs */ +#define MSR_PMC_ALIAS_MASK (~(MSR_IA32_PERFCTR0 ^ MSR_IA32_PMC0)) + +#define INTEL_PMC_TYPE_SHIFT 30 + +static __read_mostly int intel_num_arch_counters, intel_num_fixed_counters; + + +static void xen_pmu_arch_init(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + + switch (boot_cpu_data.x86) { + case 0x15: + amd_num_counters = F15H_NUM_COUNTERS; + amd_counters_base = MSR_F15H_PERF_CTR; + amd_ctrls_base = MSR_F15H_PERF_CTL; + amd_msr_step = 2; + k7_counters_mirrored = 1; + break; + case 0x10: + case 0x12: + case 0x14: + case 0x16: + default: + amd_num_counters = F10H_NUM_COUNTERS; + amd_counters_base = MSR_K7_PERFCTR0; + amd_ctrls_base = MSR_K7_EVNTSEL0; + amd_msr_step = 1; + k7_counters_mirrored = 0; + break; + } + } else { + uint32_t eax, ebx, ecx, edx; + + cpuid(0xa, &eax, &ebx, &ecx, &edx); + + intel_num_arch_counters = (eax & PMU_GENERAL_NR_MASK) >> + PMU_GENERAL_NR_SHIFT; + intel_num_fixed_counters = (edx & PMU_FIXED_NR_MASK) >> + PMU_FIXED_NR_SHIFT; + } +} + +static inline uint32_t get_fam15h_addr(u32 addr) +{ + switch (addr) { + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: + return MSR_F15H_PERF_CTR + (addr - MSR_K7_PERFCTR0); + case MSR_K7_EVNTSEL0: + case MSR_K7_EVNTSEL1: + case MSR_K7_EVNTSEL2: + case MSR_K7_EVNTSEL3: + return MSR_F15H_PERF_CTL + (addr - MSR_K7_EVNTSEL0); + default: + break; + } + + return addr; +} + +static inline bool is_amd_pmu_msr(unsigned int msr) +{ + if ((msr >= MSR_F15H_PERF_CTL && + msr < MSR_F15H_PERF_CTR + (amd_num_counters * 2)) || + (msr >= MSR_K7_EVNTSEL0 && + msr < MSR_K7_PERFCTR0 + amd_num_counters)) + return true; + + return false; +} + +static int is_intel_pmu_msr(u32 msr_index, int *type, int *index) +{ + u32 msr_index_pmc; + + switch (msr_index) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + case MSR_IA32_DS_AREA: + case MSR_IA32_PEBS_ENABLE: + *type = MSR_TYPE_CTRL; + return true; + + case MSR_CORE_PERF_GLOBAL_CTRL: + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + *type = MSR_TYPE_GLOBAL; + return true; + + default: + + if ((msr_index >= MSR_CORE_PERF_FIXED_CTR0) && + (msr_index < MSR_CORE_PERF_FIXED_CTR0 + + intel_num_fixed_counters)) { + *index = msr_index - MSR_CORE_PERF_FIXED_CTR0; + *type = MSR_TYPE_COUNTER; + return true; + } + + if ((msr_index >= MSR_P6_EVNTSEL0) && + (msr_index < MSR_P6_EVNTSEL0 + intel_num_arch_counters)) { + *index = msr_index - MSR_P6_EVNTSEL0; + *type = MSR_TYPE_ARCH_CTRL; + return true; + } + + msr_index_pmc = msr_index & MSR_PMC_ALIAS_MASK; + if ((msr_index_pmc >= MSR_IA32_PERFCTR0) && + (msr_index_pmc < MSR_IA32_PERFCTR0 + + intel_num_arch_counters)) { + *type = MSR_TYPE_ARCH_COUNTER; + *index = msr_index_pmc - MSR_IA32_PERFCTR0; + return true; + } + return false; + } +} + +static bool xen_intel_pmu_emulate(unsigned int msr, u64 *val, int type, + int index, bool is_read) +{ + uint64_t *reg = NULL; + struct xen_pmu_intel_ctxt *ctxt; + uint64_t *fix_counters; + struct xen_pmu_cntr_pair *arch_cntr_pair; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + + if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) + return false; + + ctxt = &xenpmu_data->pmu.c.intel; + + switch (msr) { + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + reg = &ctxt->global_ovf_ctrl; + break; + case MSR_CORE_PERF_GLOBAL_STATUS: + reg = &ctxt->global_status; + break; + case MSR_CORE_PERF_GLOBAL_CTRL: + reg = &ctxt->global_ctrl; + break; + case MSR_CORE_PERF_FIXED_CTR_CTRL: + reg = &ctxt->fixed_ctrl; + break; + default: + switch (type) { + case MSR_TYPE_COUNTER: + fix_counters = field_offset(ctxt, fixed_counters); + reg = &fix_counters[index]; + break; + case MSR_TYPE_ARCH_COUNTER: + arch_cntr_pair = field_offset(ctxt, arch_counters); + reg = &arch_cntr_pair[index].counter; + break; + case MSR_TYPE_ARCH_CTRL: + arch_cntr_pair = field_offset(ctxt, arch_counters); + reg = &arch_cntr_pair[index].control; + break; + default: + return false; + } + } + + if (reg) { + if (is_read) + *val = *reg; + else { + *reg = *val; + + if (msr == MSR_CORE_PERF_GLOBAL_OVF_CTRL) + ctxt->global_status &= (~(*val)); + } + return true; + } + + return false; +} + +static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read) +{ + uint64_t *reg = NULL; + int i, off = 0; + struct xen_pmu_amd_ctxt *ctxt; + uint64_t *counter_regs, *ctrl_regs; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) + return false; + + if (k7_counters_mirrored && + ((msr >= MSR_K7_EVNTSEL0) && (msr <= MSR_K7_PERFCTR3))) + msr = get_fam15h_addr(msr); + + ctxt = &xenpmu_data->pmu.c.amd; + for (i = 0; i < amd_num_counters; i++) { + if (msr == amd_ctrls_base + off) { + ctrl_regs = field_offset(ctxt, ctrls); + reg = &ctrl_regs[i]; + break; + } else if (msr == amd_counters_base + off) { + counter_regs = field_offset(ctxt, counters); + reg = &counter_regs[i]; + break; + } + off += amd_msr_step; + } + + if (reg) { + if (is_read) + *val = *reg; + else + *reg = *val; + + return true; + } + return false; +} + +bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (is_amd_pmu_msr(msr)) { + if (!xen_amd_pmu_emulate(msr, val, 1)) + *val = native_read_msr_safe(msr, err); + return true; + } + } else { + int type, index; + + if (is_intel_pmu_msr(msr, &type, &index)) { + if (!xen_intel_pmu_emulate(msr, val, type, index, 1)) + *val = native_read_msr_safe(msr, err); + return true; + } + } + + return false; +} + +bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err) +{ + uint64_t val = ((uint64_t)high << 32) | low; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (is_amd_pmu_msr(msr)) { + if (!xen_amd_pmu_emulate(msr, &val, 0)) + *err = native_write_msr_safe(msr, low, high); + return true; + } + } else { + int type, index; + + if (is_intel_pmu_msr(msr, &type, &index)) { + if (!xen_intel_pmu_emulate(msr, &val, type, index, 0)) + *err = native_write_msr_safe(msr, low, high); + return true; + } + } + + return false; +} + +static unsigned long long xen_amd_read_pmc(int counter) +{ + struct xen_pmu_amd_ctxt *ctxt; + uint64_t *counter_regs; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) { + uint32_t msr; + int err; + + msr = amd_counters_base + (counter * amd_msr_step); + return native_read_msr_safe(msr, &err); + } + + ctxt = &xenpmu_data->pmu.c.amd; + counter_regs = field_offset(ctxt, counters); + return counter_regs[counter]; +} + +static unsigned long long xen_intel_read_pmc(int counter) +{ + struct xen_pmu_intel_ctxt *ctxt; + uint64_t *fixed_counters; + struct xen_pmu_cntr_pair *arch_cntr_pair; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) { + uint32_t msr; + int err; + + if (counter & (1 << INTEL_PMC_TYPE_SHIFT)) + msr = MSR_CORE_PERF_FIXED_CTR0 + (counter & 0xffff); + else + msr = MSR_IA32_PERFCTR0 + counter; + + return native_read_msr_safe(msr, &err); + } + + ctxt = &xenpmu_data->pmu.c.intel; + if (counter & (1 << INTEL_PMC_TYPE_SHIFT)) { + fixed_counters = field_offset(ctxt, fixed_counters); + return fixed_counters[counter & 0xffff]; + } + + arch_cntr_pair = field_offset(ctxt, arch_counters); + return arch_cntr_pair[counter].counter; +} + +unsigned long long xen_read_pmc(int counter) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return xen_amd_read_pmc(counter); + else + return xen_intel_read_pmc(counter); +} + +int pmu_apic_update(uint32_t val) +{ + int ret; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return -EINVAL; + } + + xenpmu_data->pmu.l.lapic_lvtpc = val; + + if (get_xenpmu_flags() & XENPMU_IRQ_PROCESSING) + return 0; + + ret = HYPERVISOR_xenpmu_op(XENPMU_lvtpc_set, NULL); + + return ret; +} + +/* perf callbacks */ +static int xen_is_in_guest(void) +{ + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return 0; + } + + if (!xen_initial_domain() || (xenpmu_data->domain_id >= DOMID_SELF)) + return 0; + + return 1; +} + +static int xen_is_user_mode(void) +{ + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return 0; + } + + if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV) + return (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER); + else + return !!(xenpmu_data->pmu.r.regs.cpl & 3); +} + +static unsigned long xen_get_guest_ip(void) +{ + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return 0; + } + + return xenpmu_data->pmu.r.regs.ip; +} + +static struct perf_guest_info_callbacks xen_guest_cbs = { + .is_in_guest = xen_is_in_guest, + .is_user_mode = xen_is_user_mode, + .get_guest_ip = xen_get_guest_ip, +}; + +/* Convert registers from Xen's format to Linux' */ +static void xen_convert_regs(const struct xen_pmu_regs *xen_regs, + struct pt_regs *regs, uint64_t pmu_flags) +{ + regs->ip = xen_regs->ip; + regs->cs = xen_regs->cs; + regs->sp = xen_regs->sp; + + if (pmu_flags & PMU_SAMPLE_PV) { + if (pmu_flags & PMU_SAMPLE_USER) + regs->cs |= 3; + else + regs->cs &= ~3; + } else { + if (xen_regs->cpl) + regs->cs |= 3; + else + regs->cs &= ~3; + } +} + +irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id) +{ + int err, ret = IRQ_NONE; + struct pt_regs regs; + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return ret; + } + + this_cpu_ptr(&xenpmu_shared)->flags = + xenpmu_flags | XENPMU_IRQ_PROCESSING; + xen_convert_regs(&xenpmu_data->pmu.r.regs, ®s, + xenpmu_data->pmu.pmu_flags); + if (x86_pmu.handle_irq(®s)) + ret = IRQ_HANDLED; + + /* Write out cached context to HW */ + err = HYPERVISOR_xenpmu_op(XENPMU_flush, NULL); + this_cpu_ptr(&xenpmu_shared)->flags = xenpmu_flags; + if (err) { + pr_warn_once("%s: failed hypercall, err: %d\n", __func__, err); + return IRQ_NONE; + } + + return ret; +} + +bool is_xen_pmu(int cpu) +{ + return (get_xenpmu_data() != NULL); +} + +void xen_pmu_init(int cpu) +{ + int err; + struct xen_pmu_params xp; + unsigned long pfn; + struct xen_pmu_data *xenpmu_data; + + BUILD_BUG_ON(sizeof(struct xen_pmu_data) > PAGE_SIZE); + + if (xen_hvm_domain()) + return; + + xenpmu_data = (struct xen_pmu_data *)get_zeroed_page(GFP_KERNEL); + if (!xenpmu_data) { + pr_err("VPMU init: No memory\n"); + return; + } + pfn = virt_to_pfn(xenpmu_data); + + xp.val = pfn_to_mfn(pfn); + xp.vcpu = cpu; + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + err = HYPERVISOR_xenpmu_op(XENPMU_init, &xp); + if (err) + goto fail; + + per_cpu(xenpmu_shared, cpu).xenpmu_data = xenpmu_data; + per_cpu(xenpmu_shared, cpu).flags = 0; + + if (cpu == 0) { + perf_register_guest_info_callbacks(&xen_guest_cbs); + xen_pmu_arch_init(); + } + + return; + +fail: + pr_warn_once("Could not initialize VPMU for cpu %d, error %d\n", + cpu, err); + free_pages((unsigned long)xenpmu_data, 0); +} + +void xen_pmu_finish(int cpu) +{ + struct xen_pmu_params xp; + + if (xen_hvm_domain()) + return; + + xp.vcpu = cpu; + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + + (void)HYPERVISOR_xenpmu_op(XENPMU_finish, &xp); + + free_pages((unsigned long)per_cpu(xenpmu_shared, cpu).xenpmu_data, 0); + per_cpu(xenpmu_shared, cpu).xenpmu_data = NULL; +} diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h new file mode 100644 index 000000000000..af5f0ad94078 --- /dev/null +++ b/arch/x86/xen/pmu.h @@ -0,0 +1,15 @@ +#ifndef __XEN_PMU_H +#define __XEN_PMU_H + +#include <xen/interface/xenpmu.h> + +irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id); +void xen_pmu_init(int cpu); +void xen_pmu_finish(int cpu); +bool is_xen_pmu(int cpu); +bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err); +bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err); +int pmu_apic_update(uint32_t reg); +unsigned long long xen_read_pmc(int counter); + +#endif /* __XEN_PMU_H */ diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 55f388ef481a..f5ef6746d47a 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -27,17 +27,23 @@ #include <xen/interface/memory.h> #include <xen/interface/physdev.h> #include <xen/features.h> +#include <xen/hvc-console.h> #include "xen-ops.h" #include "vdso.h" -#include "p2m.h" #include "mmu.h" +#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024) + /* Amount of extra memory space we add to the e820 ranges */ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; /* Number of pages released from the initial allocation. */ unsigned long xen_released_pages; +/* E820 map used during setting up memory. */ +static struct e820entry xen_e820_map[E820MAX] __initdata; +static u32 xen_e820_map_entries __initdata; + /* * Buffer used to remap identity mapped pages. We only need the virtual space. * The physical page behind this address is remapped as needed to different @@ -64,62 +70,89 @@ static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY; */ #define EXTRA_MEM_RATIO (10) -static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size) +static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB); + +static void __init xen_parse_512gb(void) +{ + bool val = false; + char *arg; + + arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit"); + if (!arg) + return; + + arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit="); + if (!arg) + val = true; + else if (strtobool(arg + strlen("xen_512gb_limit="), &val)) + return; + + xen_512gb_limit = val; +} + +static void __init xen_add_extra_mem(unsigned long start_pfn, + unsigned long n_pfns) { int i; + /* + * No need to check for zero size, should happen rarely and will only + * write a new entry regarded to be unused due to zero size. + */ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { /* Add new region. */ - if (xen_extra_mem[i].size == 0) { - xen_extra_mem[i].start = start; - xen_extra_mem[i].size = size; + if (xen_extra_mem[i].n_pfns == 0) { + xen_extra_mem[i].start_pfn = start_pfn; + xen_extra_mem[i].n_pfns = n_pfns; break; } /* Append to existing region. */ - if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) { - xen_extra_mem[i].size += size; + if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns == + start_pfn) { + xen_extra_mem[i].n_pfns += n_pfns; break; } } if (i == XEN_EXTRA_MEM_MAX_REGIONS) printk(KERN_WARNING "Warning: not enough extra memory regions\n"); - memblock_reserve(start, size); + memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); } -static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size) +static void __init xen_del_extra_mem(unsigned long start_pfn, + unsigned long n_pfns) { int i; - phys_addr_t start_r, size_r; + unsigned long start_r, size_r; for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - start_r = xen_extra_mem[i].start; - size_r = xen_extra_mem[i].size; + start_r = xen_extra_mem[i].start_pfn; + size_r = xen_extra_mem[i].n_pfns; /* Start of region. */ - if (start_r == start) { - BUG_ON(size > size_r); - xen_extra_mem[i].start += size; - xen_extra_mem[i].size -= size; + if (start_r == start_pfn) { + BUG_ON(n_pfns > size_r); + xen_extra_mem[i].start_pfn += n_pfns; + xen_extra_mem[i].n_pfns -= n_pfns; break; } /* End of region. */ - if (start_r + size_r == start + size) { - BUG_ON(size > size_r); - xen_extra_mem[i].size -= size; + if (start_r + size_r == start_pfn + n_pfns) { + BUG_ON(n_pfns > size_r); + xen_extra_mem[i].n_pfns -= n_pfns; break; } /* Mid of region. */ - if (start > start_r && start < start_r + size_r) { - BUG_ON(start + size > start_r + size_r); - xen_extra_mem[i].size = start - start_r; + if (start_pfn > start_r && start_pfn < start_r + size_r) { + BUG_ON(start_pfn + n_pfns > start_r + size_r); + xen_extra_mem[i].n_pfns = start_pfn - start_r; /* Calling memblock_reserve() again is okay. */ - xen_add_extra_mem(start + size, start_r + size_r - - (start + size)); + xen_add_extra_mem(start_pfn + n_pfns, start_r + size_r - + (start_pfn + n_pfns)); break; } } - memblock_free(start, size); + memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); } /* @@ -130,11 +163,10 @@ static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size) unsigned long __ref xen_chk_extra_mem(unsigned long pfn) { int i; - phys_addr_t addr = PFN_PHYS(pfn); for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - if (addr >= xen_extra_mem[i].start && - addr < xen_extra_mem[i].start + xen_extra_mem[i].size) + if (pfn >= xen_extra_mem[i].start_pfn && + pfn < xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns) return INVALID_P2M_ENTRY; } @@ -150,10 +182,10 @@ void __init xen_inv_extra_mem(void) int i; for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - if (!xen_extra_mem[i].size) + if (!xen_extra_mem[i].n_pfns) continue; - pfn_s = PFN_DOWN(xen_extra_mem[i].start); - pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size); + pfn_s = xen_extra_mem[i].start_pfn; + pfn_e = pfn_s + xen_extra_mem[i].n_pfns; for (pfn = pfn_s; pfn < pfn_e; pfn++) set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } @@ -164,15 +196,13 @@ void __init xen_inv_extra_mem(void) * This function updates min_pfn with the pfn found and returns * the size of that range or zero if not found. */ -static unsigned long __init xen_find_pfn_range( - const struct e820entry *list, size_t map_size, - unsigned long *min_pfn) +static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn) { - const struct e820entry *entry; + const struct e820entry *entry = xen_e820_map; unsigned int i; unsigned long done = 0; - for (i = 0, entry = list; i < map_size; i++, entry++) { + for (i = 0; i < xen_e820_map_entries; i++, entry++) { unsigned long s_pfn; unsigned long e_pfn; @@ -221,7 +251,7 @@ static int __init xen_free_mfn(unsigned long mfn) * as a fallback if the remapping fails. */ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, - unsigned long end_pfn, unsigned long nr_pages, unsigned long *released) + unsigned long end_pfn, unsigned long nr_pages) { unsigned long pfn, end; int ret; @@ -241,7 +271,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret); if (ret == 1) { - (*released)++; + xen_released_pages++; if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY)) break; } else @@ -356,9 +386,8 @@ static void __init xen_do_set_identity_and_remap_chunk( * to Xen and not remapped. */ static unsigned long __init xen_set_identity_and_remap_chunk( - const struct e820entry *list, size_t map_size, unsigned long start_pfn, - unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn, - unsigned long *released, unsigned long *remapped) + unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, + unsigned long remap_pfn) { unsigned long pfn; unsigned long i = 0; @@ -379,12 +408,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk( if (cur_pfn + size > nr_pages) size = nr_pages - cur_pfn; - remap_range_size = xen_find_pfn_range(list, map_size, - &remap_pfn); + remap_range_size = xen_find_pfn_range(&remap_pfn); if (!remap_range_size) { pr_warning("Unable to find available pfn range, not remapping identity pages\n"); xen_set_identity_and_release_chunk(cur_pfn, - cur_pfn + left, nr_pages, released); + cur_pfn + left, nr_pages); break; } /* Adjust size to fit in current e820 RAM region */ @@ -396,7 +424,6 @@ static unsigned long __init xen_set_identity_and_remap_chunk( /* Update variables to reflect new mappings. */ i += size; remap_pfn += size; - *remapped += size; } /* @@ -411,15 +438,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk( return remap_pfn; } -static void __init xen_set_identity_and_remap( - const struct e820entry *list, size_t map_size, unsigned long nr_pages, - unsigned long *released, unsigned long *remapped) +static void __init xen_set_identity_and_remap(unsigned long nr_pages) { phys_addr_t start = 0; unsigned long last_pfn = nr_pages; - const struct e820entry *entry; - unsigned long num_released = 0; - unsigned long num_remapped = 0; + const struct e820entry *entry = xen_e820_map; int i; /* @@ -433,9 +456,9 @@ static void __init xen_set_identity_and_remap( * example) the DMI tables in a reserved region that begins on * a non-page boundary. */ - for (i = 0, entry = list; i < map_size; i++, entry++) { + for (i = 0; i < xen_e820_map_entries; i++, entry++) { phys_addr_t end = entry->addr + entry->size; - if (entry->type == E820_RAM || i == map_size - 1) { + if (entry->type == E820_RAM || i == xen_e820_map_entries - 1) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_UP(end); @@ -444,17 +467,13 @@ static void __init xen_set_identity_and_remap( if (start_pfn < end_pfn) last_pfn = xen_set_identity_and_remap_chunk( - list, map_size, start_pfn, - end_pfn, nr_pages, last_pfn, - &num_released, &num_remapped); + start_pfn, end_pfn, nr_pages, + last_pfn); start = end; } } - *released = num_released; - *remapped = num_remapped; - - pr_info("Released %ld page(s)\n", num_released); + pr_info("Released %ld page(s)\n", xen_released_pages); } /* @@ -494,7 +513,7 @@ void __init xen_remap_memory(void) } else if (pfn_s + len == xen_remap_buf.target_pfn) { len += xen_remap_buf.size; } else { - xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len)); + xen_del_extra_mem(pfn_s, len); pfn_s = xen_remap_buf.target_pfn; len = xen_remap_buf.size; } @@ -504,19 +523,36 @@ void __init xen_remap_memory(void) } if (pfn_s != ~0UL && len) - xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len)); + xen_del_extra_mem(pfn_s, len); set_pte_mfn(buf, mfn_save, PAGE_KERNEL); pr_info("Remapped %ld page(s)\n", remapped); } +static unsigned long __init xen_get_pages_limit(void) +{ + unsigned long limit; + +#ifdef CONFIG_X86_32 + limit = GB(64) / PAGE_SIZE; +#else + limit = MAXMEM / PAGE_SIZE; + if (!xen_initial_domain() && xen_512gb_limit) + limit = GB(512) / PAGE_SIZE; +#endif + return limit; +} + static unsigned long __init xen_get_max_pages(void) { - unsigned long max_pages = MAX_DOMAIN_PAGES; + unsigned long max_pages, limit; domid_t domid = DOMID_SELF; int ret; + limit = xen_get_pages_limit(); + max_pages = limit; + /* * For the initial domain we use the maximum reservation as * the maximum page. @@ -532,7 +568,7 @@ static unsigned long __init xen_get_max_pages(void) max_pages = ret; } - return min(max_pages, MAX_DOMAIN_PAGES); + return min(max_pages, limit); } static void __init xen_align_and_add_e820_region(phys_addr_t start, @@ -549,39 +585,188 @@ static void __init xen_align_and_add_e820_region(phys_addr_t start, e820_add_region(start, end - start, type); } -static void __init xen_ignore_unusable(struct e820entry *list, size_t map_size) +static void __init xen_ignore_unusable(void) { - struct e820entry *entry; + struct e820entry *entry = xen_e820_map; unsigned int i; - for (i = 0, entry = list; i < map_size; i++, entry++) { + for (i = 0; i < xen_e820_map_entries; i++, entry++) { if (entry->type == E820_UNUSABLE) entry->type = E820_RAM; } } +static unsigned long __init xen_count_remap_pages(unsigned long max_pfn) +{ + unsigned long extra = 0; + unsigned long start_pfn, end_pfn; + const struct e820entry *entry = xen_e820_map; + int i; + + end_pfn = 0; + for (i = 0; i < xen_e820_map_entries; i++, entry++) { + start_pfn = PFN_DOWN(entry->addr); + /* Adjacent regions on non-page boundaries handling! */ + end_pfn = min(end_pfn, start_pfn); + + if (start_pfn >= max_pfn) + return extra + max_pfn - end_pfn; + + /* Add any holes in map to result. */ + extra += start_pfn - end_pfn; + + end_pfn = PFN_UP(entry->addr + entry->size); + end_pfn = min(end_pfn, max_pfn); + + if (entry->type != E820_RAM) + extra += end_pfn - start_pfn; + } + + return extra; +} + +bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) +{ + struct e820entry *entry; + unsigned mapcnt; + phys_addr_t end; + + if (!size) + return false; + + end = start + size; + entry = xen_e820_map; + + for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++) { + if (entry->type == E820_RAM && entry->addr <= start && + (entry->addr + entry->size) >= end) + return false; + + entry++; + } + + return true; +} + +/* + * Find a free area in physical memory not yet reserved and compliant with + * E820 map. + * Used to relocate pre-allocated areas like initrd or p2m list which are in + * conflict with the to be used E820 map. + * In case no area is found, return 0. Otherwise return the physical address + * of the area which is already reserved for convenience. + */ +phys_addr_t __init xen_find_free_area(phys_addr_t size) +{ + unsigned mapcnt; + phys_addr_t addr, start; + struct e820entry *entry = xen_e820_map; + + for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++, entry++) { + if (entry->type != E820_RAM || entry->size < size) + continue; + start = entry->addr; + for (addr = start; addr < start + size; addr += PAGE_SIZE) { + if (!memblock_is_reserved(addr)) + continue; + start = addr + PAGE_SIZE; + if (start + size > entry->addr + entry->size) + break; + } + if (addr >= start + size) { + memblock_reserve(start, size); + return start; + } + } + + return 0; +} + +/* + * Like memcpy, but with physical addresses for dest and src. + */ +static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src, + phys_addr_t n) +{ + phys_addr_t dest_off, src_off, dest_len, src_len, len; + void *from, *to; + + while (n) { + dest_off = dest & ~PAGE_MASK; + src_off = src & ~PAGE_MASK; + dest_len = n; + if (dest_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off) + dest_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off; + src_len = n; + if (src_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off) + src_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off; + len = min(dest_len, src_len); + to = early_memremap(dest - dest_off, dest_len + dest_off); + from = early_memremap(src - src_off, src_len + src_off); + memcpy(to, from, len); + early_memunmap(to, dest_len + dest_off); + early_memunmap(from, src_len + src_off); + n -= len; + dest += len; + src += len; + } +} + +/* + * Reserve Xen mfn_list. + */ +static void __init xen_reserve_xen_mfnlist(void) +{ + phys_addr_t start, size; + + if (xen_start_info->mfn_list >= __START_KERNEL_map) { + start = __pa(xen_start_info->mfn_list); + size = PFN_ALIGN(xen_start_info->nr_pages * + sizeof(unsigned long)); + } else { + start = PFN_PHYS(xen_start_info->first_p2m_pfn); + size = PFN_PHYS(xen_start_info->nr_p2m_frames); + } + + if (!xen_is_e820_reserved(start, size)) { + memblock_reserve(start, size); + return; + } + +#ifdef CONFIG_X86_32 + /* + * Relocating the p2m on 32 bit system to an arbitrary virtual address + * is not supported, so just give up. + */ + xen_raw_console_write("Xen hypervisor allocated p2m list conflicts with E820 map\n"); + BUG(); +#else + xen_relocate_p2m(); +#endif +} + /** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ char * __init xen_memory_setup(void) { - static struct e820entry map[E820MAX] __initdata; - - unsigned long max_pfn = xen_start_info->nr_pages; - phys_addr_t mem_end; + unsigned long max_pfn, pfn_s, n_pfns; + phys_addr_t mem_end, addr, size, chunk_size; + u32 type; int rc; struct xen_memory_map memmap; unsigned long max_pages; unsigned long extra_pages = 0; - unsigned long remapped_pages; int i; int op; - max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); + xen_parse_512gb(); + max_pfn = xen_get_pages_limit(); + max_pfn = min(max_pfn, xen_start_info->nr_pages); mem_end = PFN_PHYS(max_pfn); memmap.nr_entries = E820MAX; - set_xen_guest_handle(memmap.buffer, map); + set_xen_guest_handle(memmap.buffer, xen_e820_map); op = xen_initial_domain() ? XENMEM_machine_memory_map : @@ -590,15 +775,16 @@ char * __init xen_memory_setup(void) if (rc == -ENOSYS) { BUG_ON(xen_initial_domain()); memmap.nr_entries = 1; - map[0].addr = 0ULL; - map[0].size = mem_end; + xen_e820_map[0].addr = 0ULL; + xen_e820_map[0].size = mem_end; /* 8MB slack (to balance backend allocations). */ - map[0].size += 8ULL << 20; - map[0].type = E820_RAM; + xen_e820_map[0].size += 8ULL << 20; + xen_e820_map[0].type = E820_RAM; rc = 0; } BUG_ON(rc); BUG_ON(memmap.nr_entries == 0); + xen_e820_map_entries = memmap.nr_entries; /* * Xen won't allow a 1:1 mapping to be created to UNUSABLE @@ -609,24 +795,19 @@ char * __init xen_memory_setup(void) * a patch in the future. */ if (xen_initial_domain()) - xen_ignore_unusable(map, memmap.nr_entries); + xen_ignore_unusable(); /* Make sure the Xen-supplied memory map is well-ordered. */ - sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries); + sanitize_e820_map(xen_e820_map, xen_e820_map_entries, + &xen_e820_map_entries); max_pages = xen_get_max_pages(); - if (max_pages > max_pfn) - extra_pages += max_pages - max_pfn; - /* - * Set identity map on non-RAM pages and prepare remapping the - * underlying RAM. - */ - xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn, - &xen_released_pages, &remapped_pages); + /* How many extra pages do we need due to remapping? */ + max_pages += xen_count_remap_pages(max_pfn); - extra_pages += xen_released_pages; - extra_pages += remapped_pages; + if (max_pages > max_pfn) + extra_pages += max_pages - max_pfn; /* * Clamp the amount of extra memory to a EXTRA_MEM_RATIO @@ -635,46 +816,54 @@ char * __init xen_memory_setup(void) * is limited to the max size of lowmem, so that it doesn't * get completely filled. * + * Make sure we have no memory above max_pages, as this area + * isn't handled by the p2m management. + * * In principle there could be a problem in lowmem systems if * the initial memory is also very large with respect to * lowmem, but we won't try to deal with that here. */ - extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), - extra_pages); + extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), + extra_pages, max_pages - max_pfn); i = 0; - while (i < memmap.nr_entries) { - phys_addr_t addr = map[i].addr; - phys_addr_t size = map[i].size; - u32 type = map[i].type; + addr = xen_e820_map[0].addr; + size = xen_e820_map[0].size; + while (i < xen_e820_map_entries) { + chunk_size = size; + type = xen_e820_map[i].type; if (type == E820_RAM) { if (addr < mem_end) { - size = min(size, mem_end - addr); + chunk_size = min(size, mem_end - addr); } else if (extra_pages) { - size = min(size, PFN_PHYS(extra_pages)); - extra_pages -= PFN_DOWN(size); - xen_add_extra_mem(addr, size); - xen_max_p2m_pfn = PFN_DOWN(addr + size); + chunk_size = min(size, PFN_PHYS(extra_pages)); + pfn_s = PFN_UP(addr); + n_pfns = PFN_DOWN(addr + chunk_size) - pfn_s; + extra_pages -= n_pfns; + xen_add_extra_mem(pfn_s, n_pfns); + xen_max_p2m_pfn = pfn_s + n_pfns; } else type = E820_UNUSABLE; } - xen_align_and_add_e820_region(addr, size, type); + xen_align_and_add_e820_region(addr, chunk_size, type); - map[i].addr += size; - map[i].size -= size; - if (map[i].size == 0) + addr += chunk_size; + size -= chunk_size; + if (size == 0) { i++; + if (i < xen_e820_map_entries) { + addr = xen_e820_map[i].addr; + size = xen_e820_map[i].size; + } + } } /* * Set the rest as identity mapped, in case PCI BARs are * located here. - * - * PFNs above MAX_P2M_PFN are considered identity mapped as - * well. */ - set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul); + set_phys_range_identity(addr / PAGE_SIZE, ~0ul); /* * In domU, the ISA region is normal, usable memory, but we @@ -684,34 +873,53 @@ char * __init xen_memory_setup(void) e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + /* - * Reserve Xen bits: - * - mfn_list - * - xen_start_info - * See comment above "struct start_info" in <xen/interface/xen.h> - * We tried to make the the memblock_reserve more selective so - * that it would be clear what region is reserved. Sadly we ran - * in the problem wherein on a 64-bit hypervisor with a 32-bit - * initial domain, the pt_base has the cr3 value which is not - * neccessarily where the pagetable starts! As Jan put it: " - * Actually, the adjustment turns out to be correct: The page - * tables for a 32-on-64 dom0 get allocated in the order "first L1", - * "first L2", "first L3", so the offset to the page table base is - * indeed 2. When reading xen/include/public/xen.h's comment - * very strictly, this is not a violation (since there nothing is said - * that the first thing in the page table space is pointed to by - * pt_base; I admit that this seems to be implied though, namely - * do I think that it is implied that the page table space is the - * range [pt_base, pt_base + nt_pt_frames), whereas that - * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames), - * which - without a priori knowledge - the kernel would have - * difficulty to figure out)." - so lets just fall back to the - * easy way and reserve the whole region. + * Check whether the kernel itself conflicts with the target E820 map. + * Failing now is better than running into weird problems later due + * to relocating (and even reusing) pages with kernel text or data. */ - memblock_reserve(__pa(xen_start_info->mfn_list), - xen_start_info->pt_base - xen_start_info->mfn_list); + if (xen_is_e820_reserved(__pa_symbol(_text), + __pa_symbol(__bss_stop) - __pa_symbol(_text))) { + xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n"); + BUG(); + } - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + /* + * Check for a conflict of the hypervisor supplied page tables with + * the target E820 map. + */ + xen_pt_check_e820(); + + xen_reserve_xen_mfnlist(); + + /* Check for a conflict of the initrd with the target E820 map. */ + if (xen_is_e820_reserved(boot_params.hdr.ramdisk_image, + boot_params.hdr.ramdisk_size)) { + phys_addr_t new_area, start, size; + + new_area = xen_find_free_area(boot_params.hdr.ramdisk_size); + if (!new_area) { + xen_raw_console_write("Can't find new memory area for initrd needed due to E820 map conflict\n"); + BUG(); + } + + start = boot_params.hdr.ramdisk_image; + size = boot_params.hdr.ramdisk_size; + xen_phys_memcpy(new_area, start, size); + pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n", + start, start + size, new_area, new_area + size); + memblock_free(start, size); + boot_params.hdr.ramdisk_image = new_area; + boot_params.ext_ramdisk_image = new_area >> 32; + } + + /* + * Set identity map on non-RAM pages and prepare remapping the + * underlying RAM. + */ + xen_set_identity_and_remap(max_pfn); return "Xen"; } @@ -721,26 +929,30 @@ char * __init xen_memory_setup(void) */ char * __init xen_auto_xlated_memory_setup(void) { - static struct e820entry map[E820MAX] __initdata; - struct xen_memory_map memmap; int i; int rc; memmap.nr_entries = E820MAX; - set_xen_guest_handle(memmap.buffer, map); + set_xen_guest_handle(memmap.buffer, xen_e820_map); rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); if (rc < 0) panic("No memory map (%d)\n", rc); - sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries); + xen_e820_map_entries = memmap.nr_entries; + + sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map), + &xen_e820_map_entries); - for (i = 0; i < memmap.nr_entries; i++) - e820_add_region(map[i].addr, map[i].size, map[i].type); + for (i = 0; i < xen_e820_map_entries; i++) + e820_add_region(xen_e820_map[i].addr, xen_e820_map[i].size, + xen_e820_map[i].type); - memblock_reserve(__pa(xen_start_info->mfn_list), - xen_start_info->pt_base - xen_start_info->mfn_list); + /* Remove p2m info, it is not needed. */ + xen_start_info->mfn_list = 0; + xen_start_info->first_p2m_pfn = 0; + xen_start_info->nr_p2m_frames = 0; return "Xen"; } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 86484384492e..3f4ebf0261f2 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -26,6 +26,7 @@ #include <xen/interface/xen.h> #include <xen/interface/vcpu.h> +#include <xen/interface/xenpmu.h> #include <asm/xen/interface.h> #include <asm/xen/hypercall.h> @@ -38,6 +39,7 @@ #include "xen-ops.h" #include "mmu.h" #include "smp.h" +#include "pmu.h" cpumask_var_t xen_cpu_initialized_map; @@ -50,6 +52,7 @@ static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 }; +static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 }; static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); @@ -148,11 +151,18 @@ static void xen_smp_intr_free(unsigned int cpu) kfree(per_cpu(xen_irq_work, cpu).name); per_cpu(xen_irq_work, cpu).name = NULL; } + + if (per_cpu(xen_pmu_irq, cpu).irq >= 0) { + unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL); + per_cpu(xen_pmu_irq, cpu).irq = -1; + kfree(per_cpu(xen_pmu_irq, cpu).name); + per_cpu(xen_pmu_irq, cpu).name = NULL; + } }; static int xen_smp_intr_init(unsigned int cpu) { int rc; - char *resched_name, *callfunc_name, *debug_name; + char *resched_name, *callfunc_name, *debug_name, *pmu_name; resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, @@ -218,6 +228,18 @@ static int xen_smp_intr_init(unsigned int cpu) per_cpu(xen_irq_work, cpu).irq = rc; per_cpu(xen_irq_work, cpu).name = callfunc_name; + if (is_xen_pmu(cpu)) { + pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu); + rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu, + xen_pmu_irq_handler, + IRQF_PERCPU|IRQF_NOBALANCING, + pmu_name, NULL); + if (rc < 0) + goto fail; + per_cpu(xen_pmu_irq, cpu).irq = rc; + per_cpu(xen_pmu_irq, cpu).name = pmu_name; + } + return 0; fail: @@ -335,6 +357,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) } set_cpu_sibling_map(0); + xen_pmu_init(0); + if (xen_smp_intr_init(0)) BUG(); @@ -429,7 +453,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) } #endif ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); - ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); + ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir)); if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt)) BUG(); @@ -462,6 +486,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) if (rc) return rc; + xen_pmu_init(cpu); + rc = xen_smp_intr_init(cpu); if (rc) return rc; @@ -503,6 +529,7 @@ static void xen_cpu_die(unsigned int cpu) xen_smp_intr_free(cpu); xen_uninit_lock_cpu(cpu); xen_teardown_timer(cpu); + xen_pmu_finish(cpu); } } diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 53b4c0811f4f..feddabdab448 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -11,6 +11,7 @@ #include "xen-ops.h" #include "mmu.h" +#include "pmu.h" static void xen_pv_pre_suspend(void) { @@ -67,16 +68,26 @@ static void xen_pv_post_suspend(int suspend_cancelled) void xen_arch_pre_suspend(void) { - if (xen_pv_domain()) - xen_pv_pre_suspend(); + int cpu; + + for_each_online_cpu(cpu) + xen_pmu_finish(cpu); + + if (xen_pv_domain()) + xen_pv_pre_suspend(); } void xen_arch_post_suspend(int cancelled) { - if (xen_pv_domain()) - xen_pv_post_suspend(cancelled); - else - xen_hvm_post_suspend(cancelled); + int cpu; + + if (xen_pv_domain()) + xen_pv_post_suspend(cancelled); + else + xen_hvm_post_suspend(cancelled); + + for_each_online_cpu(cpu) + xen_pmu_init(cpu); } static void xen_vcpu_notify_restore(void *data) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 8afdfccf6086..b65f59a358a2 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -104,6 +104,8 @@ ENTRY(hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET) #else ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map) + /* Map the p2m table to a 512GB-aligned user address. */ + ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad PGDIR_SIZE) #endif ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 2292721b1d10..1399423f3418 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -35,13 +35,20 @@ void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_reserve_top(void); +void __init xen_reserve_special_pages(void); +void __init xen_pt_check_e820(void); void xen_mm_pin_all(void); void xen_mm_unpin_all(void); +#ifdef CONFIG_X86_64 +void __init xen_relocate_p2m(void); +#endif +bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size); unsigned long __ref xen_chk_extra_mem(unsigned long pfn); void __init xen_inv_extra_mem(void); void __init xen_remap_memory(void); +phys_addr_t __init xen_find_free_area(phys_addr_t size); char * __init xen_memory_setup(void); char * xen_auto_xlated_memory_setup(void); void __init xen_arch_setup(void); |