summary | refs | log | tree | commit | diff | stats
path: root/arch/x86
diff options
context:
space:
mode:
author: Tony Lindgren <tony@atomide.com> 2015-09-14 22:42:11 +0200
committer: Tony Lindgren <tony@atomide.com> 2015-09-14 22:42:11 +0200
commit: 7168e947291f0ead07e5638b4599fb7845288b69 (patch)
tree: b39cdf02f6d2282eeb91696d4a1c7ddb6afb0e38 /arch/x86
parent: ARM: omap2plus_defconfig: Enable MUSB DMA support (diff)
parent: ARM: dts: Fixup model name for HP t410 dts (diff)
download: linux-7168e947291f0ead07e5638b4599fb7845288b69.tar.xz
linux-7168e947291f0ead07e5638b4599fb7845288b69.zip
Merge branch 'fixes-rc1' into omap-for-v4.3/fixes
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig13
-rw-r--r--arch/x86/boot/compressed/misc.c3
-rw-r--r--arch/x86/boot/header.S2
-rw-r--r--arch/x86/configs/x86_64_defconfig1
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c1
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl2
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl2
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c2
-rw-r--r--arch/x86/include/asm/atomic.h25
-rw-r--r--arch/x86/include/asm/atomic64_32.h14
-rw-r--r--arch/x86/include/asm/atomic64_64.h15
-rw-r--r--arch/x86/include/asm/barrier.h8
-rw-r--r--arch/x86/include/asm/cacheflush.h73
-rw-r--r--arch/x86/include/asm/dma-mapping.h34
-rw-r--r--arch/x86/include/asm/ftrace.h4
-rw-r--r--arch/x86/include/asm/hw_irq.h6
-rw-r--r--arch/x86/include/asm/io.h6
-rw-r--r--arch/x86/include/asm/irq.h4
-rw-r--r--arch/x86/include/asm/jump_label.h23
-rw-r--r--arch/x86/include/asm/kdebug.h2
-rw-r--r--arch/x86/include/asm/msr-index.h6
-rw-r--r--arch/x86/include/asm/pmem.h153
-rw-r--r--arch/x86/include/asm/qrwlock.h10
-rw-r--r--arch/x86/include/asm/tlbflush.h6
-rw-r--r--arch/x86/include/asm/xen/events.h11
-rw-r--r--arch/x86/include/asm/xen/hypercall.h6
-rw-r--r--arch/x86/include/asm/xen/interface.h219
-rw-r--r--arch/x86/include/asm/xen/page.h47
-rw-r--r--arch/x86/include/uapi/asm/e820.h2
-rw-r--r--arch/x86/kernel/Makefile6
-rw-r--r--arch/x86/kernel/acpi/boot.c1
-rw-r--r--arch/x86/kernel/apic/apic.c84
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c133
-rw-r--r--arch/x86/kernel/apic/io_apic.c2
-rw-r--r--arch/x86/kernel/apic/msi.c2
-rw-r--r--arch/x86/kernel/apic/vector.c85
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c9
-rw-r--r--arch/x86/kernel/hpet.c202
-rw-r--r--arch/x86/kernel/i8253.c2
-rw-r--r--arch/x86/kernel/irq.c102
-rw-r--r--arch/x86/kernel/irq_32.c10
-rw-r--r--arch/x86/kernel/irq_64.c9
-rw-r--r--arch/x86/kernel/irqinit.c6
-rw-r--r--arch/x86/kernel/jump_label.c2
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c4
-rw-r--r--arch/x86/kernel/kvmclock.c4
-rw-r--r--arch/x86/kernel/pci-dma.c60
-rw-r--r--arch/x86/kernel/pmem.c79
-rw-r--r--arch/x86/kernel/reboot.c4
-rw-r--r--arch/x86/kernel/setup.c24
-rw-r--r--arch/x86/kernel/tsc.c22
-rw-r--r--arch/x86/kernel/vmlinux.lds.S2
-rw-r--r--arch/x86/kvm/emulate.c2
-rw-r--r--arch/x86/kvm/mmu.c7
-rw-r--r--arch/x86/kvm/vmx.c10
-rw-r--r--arch/x86/kvm/x86.c2
-rw-r--r--arch/x86/lguest/boot.c67
-rw-r--r--arch/x86/mm/init_32.c4
-rw-r--r--arch/x86/mm/init_64.c4
-rw-r--r--arch/x86/mm/mpx.c51
-rw-r--r--arch/x86/mm/numa.c6
-rw-r--r--arch/x86/mm/tlb.c1
-rw-r--r--arch/x86/net/bpf_jit_comp.c88
-rw-r--r--arch/x86/pci/xen.c8
-rw-r--r--arch/x86/platform/efi/efi.c4
-rw-r--r--arch/x86/platform/uv/uv_irq.c2
-rw-r--r--arch/x86/platform/uv/uv_nmi.c6
-rw-r--r--arch/x86/xen/Kconfig21
-rw-r--r--arch/x86/xen/Makefile2
-rw-r--r--arch/x86/xen/apic.c6
-rw-r--r--arch/x86/xen/enlighten.c20
-rw-r--r--arch/x86/xen/mmu.c431
-rw-r--r--arch/x86/xen/p2m.c43
-rw-r--r--arch/x86/xen/p2m.h15
-rw-r--r--arch/x86/xen/platform-pci-unplug.c2
-rw-r--r--arch/x86/xen/pmu.c570
-rw-r--r--arch/x86/xen/pmu.h15
-rw-r--r--arch/x86/xen/setup.c496
-rw-r--r--arch/x86/xen/smp.c31
-rw-r--r--arch/x86/xen/suspend.c23
-rw-r--r--arch/x86/xen/xen-head.S2
-rw-r--r--arch/x86/xen/xen-ops.h7
82 files changed, 2439 insertions, 1061 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 48f7433dac6f..7aef2d52daa0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -27,7 +27,8 @@ config X86
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_GCOV_PROFILE_ALL
- select ARCH_HAS_PMEM_API
+ select ARCH_HAS_PMEM_API if X86_64
+ select ARCH_HAS_MMIO_FLUSH
select ARCH_HAS_SG_CHAIN
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
@@ -41,6 +42,7 @@ config X86
select ARCH_USE_CMPXCHG_LOCKREF if X86_64
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
+ select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANT_IPC_PARSE_VERSION if X86_32
@@ -1449,10 +1451,14 @@ config ILLEGAL_POINTER_VALUE
source "mm/Kconfig"
+config X86_PMEM_LEGACY_DEVICE
+ bool
+
config X86_PMEM_LEGACY
- bool "Support non-standard NVDIMMs and ADR protected memory"
+ tristate "Support non-standard NVDIMMs and ADR protected memory"
depends on PHYS_ADDR_T_64BIT
depends on BLK_DEV
+ select X86_PMEM_LEGACY_DEVICE
select LIBNVDIMM
help
Treat memory marked using the non-standard e820 type of 12 as used
@@ -1748,6 +1754,7 @@ source kernel/Kconfig.hz
config KEXEC
bool "kexec system call"
+ select KEXEC_CORE
---help---
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
@@ -1764,8 +1771,8 @@ config KEXEC
config KEXEC_FILE
bool "kexec file based system call"
+ select KEXEC_CORE
select BUILD_BIN2C
- depends on KEXEC
depends on X86_64
depends on CRYPTO=y
depends on CRYPTO_SHA256=y
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index f63797942bb5..79dac1758e7c 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -448,7 +448,8 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
#endif
debug_putstr("\nDecompressing Linux... ");
- decompress(input_data, input_len, NULL, NULL, output, NULL, error);
+ __decompress(input_data, input_len, NULL, NULL, output, output_len,
+ NULL, error);
parse_elf(output);
/*
* 32-bit always performs relocations. 64-bit relocations are only
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 16ef02596db2..2d6b309c8e9a 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -414,7 +414,7 @@ xloadflags:
# define XLF23 0
#endif
-#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC)
+#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC_CORE)
# define XLF4 XLF_EFI_KEXEC
#else
# define XLF4 0
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 962297d244b3..cb5b3ab5beec 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -208,7 +208,6 @@ CONFIG_AGP_AMD64=y
CONFIG_AGP_INTEL=y
CONFIG_DRM=y
CONFIG_DRM_I915=y
-CONFIG_DRM_I915_KMS=y
CONFIG_FB_MODE_HELPERS=y
CONFIG_FB_TILEBLITTING=y
CONFIG_FB_EFI=y
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 64d7cf1b50e1..440df0c7a2ee 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -294,6 +294,7 @@ static struct ahash_alg ghash_async_alg = {
.cra_name = "ghash",
.cra_driver_name = "ghash-clmulni",
.cra_priority = 400,
+ .cra_ctxsize = sizeof(struct ghash_async_ctx),
.cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC,
.cra_blocksize = GHASH_BLOCK_SIZE,
.cra_type = &crypto_ahash_type,
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 25e3cf1cd8fd..7663c455b9f6 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -380,3 +380,5 @@
371 i386 recvfrom sys_recvfrom compat_sys_recvfrom
372 i386 recvmsg sys_recvmsg compat_sys_recvmsg
373 i386 shutdown sys_shutdown
+374 i386 userfaultfd sys_userfaultfd
+375 i386 membarrier sys_membarrier
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 9ef32d5f1b19..278842fdf1f6 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -329,6 +329,8 @@
320 common kexec_file_load sys_kexec_file_load
321 common bpf sys_bpf
322 64 execveat stub_execveat
+323 common userfaultfd sys_userfaultfd
+324 common membarrier sys_membarrier
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 26a46f44e298..b160c0c6baed 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -277,7 +277,7 @@ static const char *gate_vma_name(struct vm_area_struct *vma)
{
return "[vsyscall]";
}
-static struct vm_operations_struct gate_vma_ops = {
+static const struct vm_operations_struct gate_vma_ops = {
.name = gate_vma_name,
};
static struct vm_area_struct gate_vma = {
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index e9168955c42f..fb52aa644aab 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -182,6 +182,21 @@ static inline int atomic_xchg(atomic_t *v, int new)
return xchg(&v->counter, new);
}
+/*
+ * ATOMIC_OP(op) - generate "static inline void atomic_<op>(int i, atomic_t *v)".
+ *
+ * The generated function emits a single LOCK_PREFIX-ed "<op>l" instruction
+ * whose source operand is @i (immediate or register) and whose destination
+ * is the memory operand v->counter. Nothing is returned. The "memory"
+ * clobber additionally stops the compiler from moving memory accesses
+ * across the asm statement.
+ */
+#define ATOMIC_OP(op) \
+static inline void atomic_##op(int i, atomic_t *v) \
+{ \
+ asm volatile(LOCK_PREFIX #op"l %1,%0" \
+ : "+m" (v->counter) \
+ : "ir" (i) \
+ : "memory"); \
+}
+
+/* Instantiate atomic_and(), atomic_or() and atomic_xor(). */
+ATOMIC_OP(and)
+ATOMIC_OP(or)
+ATOMIC_OP(xor)
+
+/* The generator is only needed for the three instantiations above. */
+#undef ATOMIC_OP
+
/**
* __atomic_add_unless - add unless the number is already a given value
* @v: pointer of type atomic_t
@@ -219,16 +234,6 @@ static __always_inline short int atomic_inc_short(short int *v)
return *v;
}
-/* These are x86-specific, used by some header files */
-#define atomic_clear_mask(mask, addr) \
- asm volatile(LOCK_PREFIX "andl %0,%1" \
- : : "r" (~(mask)), "m" (*(addr)) : "memory")
-
-#define atomic_set_mask(mask, addr) \
- asm volatile(LOCK_PREFIX "orl %0,%1" \
- : : "r" ((unsigned)(mask)), "m" (*(addr)) \
- : "memory")
-
#ifdef CONFIG_X86_32
# include <asm/atomic64_32.h>
#else
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index b154de75c90c..a11c30b77fb5 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -313,4 +313,18 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v)
#undef alternative_atomic64
#undef __alternative_atomic64
+/*
+ * ATOMIC64_OP(op, c_op) - generate atomic64_<op>(i, v) as a
+ * compare-and-exchange retry loop.
+ *
+ * Start from a guessed current value of 0 and try to swap it for
+ * "guess c_op i". Whenever atomic64_cmpxchg() hands back something other
+ * than the guess, adopt that value as the next guess and try again.
+ */
+#define ATOMIC64_OP(op, c_op) \
+static inline void atomic64_##op(long long i, atomic64_t *v) \
+{ \
+	long long expected = 0; \
+	for (;;) { \
+		long long seen = atomic64_cmpxchg(v, expected, expected c_op i); \
+		if (seen == expected) \
+			break; \
+		expected = seen; \
+	} \
+}
+
+/* atomic64_and(), atomic64_or(), atomic64_xor() */
+ATOMIC64_OP(and, &)
+ATOMIC64_OP(or, |)
+ATOMIC64_OP(xor, ^)
+
+#undef ATOMIC64_OP
+
#endif /* _ASM_X86_ATOMIC64_32_H */
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index b965f9e03f2a..50e33eff58de 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -220,4 +220,19 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
return dec;
}
+/*
+ * ATOMIC64_OP(op) - generate "static inline void atomic64_<op>(long i, atomic64_t *v)".
+ *
+ * The generated function emits a single LOCK_PREFIX-ed "<op>q" instruction
+ * with @i as the source operand and the memory operand v->counter as the
+ * destination. Nothing is returned. The "memory" clobber additionally
+ * stops the compiler from moving memory accesses across the asm statement.
+ */
+#define ATOMIC64_OP(op) \
+static inline void atomic64_##op(long i, atomic64_t *v) \
+{ \
+ asm volatile(LOCK_PREFIX #op"q %1,%0" \
+ : "+m" (v->counter) \
+ : "er" (i) \
+ : "memory"); \
+}
+
+/* Instantiate atomic64_and(), atomic64_or() and atomic64_xor(). */
+ATOMIC64_OP(and)
+ATOMIC64_OP(or)
+ATOMIC64_OP(xor)
+
+/* The generator is only needed for the three instantiations above. */
+#undef ATOMIC64_OP
+
#endif /* _ASM_X86_ATOMIC64_64_H */
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 818cb8788225..0681d2532527 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -57,12 +57,12 @@
do { \
compiletime_assert_atomic_type(*p); \
smp_mb(); \
- ACCESS_ONCE(*p) = (v); \
+ WRITE_ONCE(*p, v); \
} while (0)
#define smp_load_acquire(p) \
({ \
- typeof(*p) ___p1 = ACCESS_ONCE(*p); \
+ typeof(*p) ___p1 = READ_ONCE(*p); \
compiletime_assert_atomic_type(*p); \
smp_mb(); \
___p1; \
@@ -74,12 +74,12 @@ do { \
do { \
compiletime_assert_atomic_type(*p); \
barrier(); \
- ACCESS_ONCE(*p) = (v); \
+ WRITE_ONCE(*p, v); \
} while (0)
#define smp_load_acquire(p) \
({ \
- typeof(*p) ___p1 = ACCESS_ONCE(*p); \
+ typeof(*p) ___p1 = READ_ONCE(*p); \
compiletime_assert_atomic_type(*p); \
barrier(); \
___p1; \
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 9bf3ea14b9f0..e63aa38e85fb 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -89,6 +89,8 @@ int set_pages_rw(struct page *page, int numpages);
void clflush_cache_range(void *addr, unsigned int size);
+#define mmio_flush_range(addr, size) clflush_cache_range(addr, size)
+
#ifdef CONFIG_DEBUG_RODATA
void mark_rodata_ro(void);
extern const int rodata_test_data;
@@ -109,75 +111,4 @@ static inline int rodata_test(void)
}
#endif
-#ifdef ARCH_HAS_NOCACHE_UACCESS
-
-/**
- * arch_memcpy_to_pmem - copy data to persistent memory
- * @dst: destination buffer for the copy
- * @src: source buffer for the copy
- * @n: length of the copy in bytes
- *
- * Copy data to persistent memory media via non-temporal stores so that
- * a subsequent arch_wmb_pmem() can flush cpu and memory controller
- * write buffers to guarantee durability.
- */
-static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
- size_t n)
-{
- int unwritten;
-
- /*
- * We are copying between two kernel buffers, if
- * __copy_from_user_inatomic_nocache() returns an error (page
- * fault) we would have already reported a general protection fault
- * before the WARN+BUG.
- */
- unwritten = __copy_from_user_inatomic_nocache((void __force *) dst,
- (void __user *) src, n);
- if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n",
- __func__, dst, src, unwritten))
- BUG();
-}
-
-/**
- * arch_wmb_pmem - synchronize writes to persistent memory
- *
- * After a series of arch_memcpy_to_pmem() operations this drains data
- * from cpu write buffers and any platform (memory controller) buffers
- * to ensure that written data is durable on persistent memory media.
- */
-static inline void arch_wmb_pmem(void)
-{
- /*
- * wmb() to 'sfence' all previous writes such that they are
- * architecturally visible to 'pcommit'. Note, that we've
- * already arranged for pmem writes to avoid the cache via
- * arch_memcpy_to_pmem().
- */
- wmb();
- pcommit_sfence();
-}
-
-static inline bool __arch_has_wmb_pmem(void)
-{
-#ifdef CONFIG_X86_64
- /*
- * We require that wmb() be an 'sfence', that is only guaranteed on
- * 64-bit builds
- */
- return static_cpu_has(X86_FEATURE_PCOMMIT);
-#else
- return false;
-#endif
-}
-#else /* ARCH_HAS_NOCACHE_UACCESS i.e. ARCH=um */
-extern void arch_memcpy_to_pmem(void __pmem *dst, const void *src, size_t n);
-extern void arch_wmb_pmem(void);
-
-static inline bool __arch_has_wmb_pmem(void)
-{
- return false;
-}
-#endif
-
#endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 1f5b7287d1ad..953b7263f844 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -12,7 +12,6 @@
#include <linux/dma-attrs.h>
#include <asm/io.h>
#include <asm/swiotlb.h>
-#include <asm-generic/dma-coherent.h>
#include <linux/dma-contiguous.h>
#ifdef CONFIG_ISA
@@ -41,24 +40,13 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
#endif
}
-#include <asm-generic/dma-mapping-common.h>
-
-/* Make sure we keep the same behaviour */
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
- struct dma_map_ops *ops = get_dma_ops(dev);
- debug_dma_mapping_error(dev, dma_addr);
- if (ops->mapping_error)
- return ops->mapping_error(dev, dma_addr);
-
- return (dma_addr == DMA_ERROR_CODE);
-}
-
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
+bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp);
+#define arch_dma_alloc_attrs arch_dma_alloc_attrs
+#define HAVE_ARCH_DMA_SUPPORTED 1
extern int dma_supported(struct device *hwdev, u64 mask);
-extern int dma_set_mask(struct device *dev, u64 mask);
+
+#include <asm-generic/dma-mapping-common.h>
extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_addr, gfp_t flag,
@@ -125,16 +113,4 @@ static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp)
return gfp;
}
-#define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL)
-
-void *
-dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
- gfp_t gfp, struct dma_attrs *attrs);
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-void dma_free_attrs(struct device *dev, size_t size,
- void *vaddr, dma_addr_t bus,
- struct dma_attrs *attrs);
-
#endif
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index f45acad3c4b6..24938852db30 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -3,9 +3,9 @@
#ifdef CONFIG_FUNCTION_TRACER
#ifdef CC_USING_FENTRY
-# define MCOUNT_ADDR ((long)(__fentry__))
+# define MCOUNT_ADDR ((unsigned long)(__fentry__))
#else
-# define MCOUNT_ADDR ((long)(mcount))
+# define MCOUNT_ADDR ((unsigned long)(mcount))
#endif
#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 6615032e19c8..1e3408e88604 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -182,10 +182,10 @@ extern char irq_entries_start[];
#define trace_irq_entries_start irq_entries_start
#endif
-#define VECTOR_UNDEFINED (-1)
-#define VECTOR_RETRIGGERED (-2)
+#define VECTOR_UNUSED NULL
+#define VECTOR_RETRIGGERED ((void *)~0UL)
-typedef int vector_irq_t[NR_VECTORS];
+typedef struct irq_desc* vector_irq_t[NR_VECTORS];
DECLARE_PER_CPU(vector_irq_t, vector_irq);
#endif /* !ASSEMBLY_ */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 7cfc085b6879..de25aad07853 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -250,12 +250,6 @@ static inline void flush_write_buffers(void)
#endif
}
-static inline void __pmem *arch_memremap_pmem(resource_size_t offset,
- unsigned long size)
-{
- return (void __force __pmem *) ioremap_cache(offset, size);
-}
-
#endif /* __KERNEL__ */
extern void native_io_delay(void);
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 8008d06581c7..881b4768644a 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -36,7 +36,9 @@ extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void));
extern void (*x86_platform_ipi_callback)(void);
extern void native_init_IRQ(void);
-extern bool handle_irq(unsigned irq, struct pt_regs *regs);
+
+struct irq_desc;
+extern bool handle_irq(struct irq_desc *desc, struct pt_regs *regs);
extern __visible unsigned int do_IRQ(struct pt_regs *regs);
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index a4c1cf7e93f8..5daeca3d0f9e 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -16,15 +16,32 @@
# define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC
#endif
-static __always_inline bool arch_static_branch(struct static_key *key)
+static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
{
asm_volatile_goto("1:"
".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t"
".pushsection __jump_table, \"aw\" \n\t"
_ASM_ALIGN "\n\t"
- _ASM_PTR "1b, %l[l_yes], %c0 \n\t"
+ _ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t"
".popsection \n\t"
- : : "i" (key) : : l_yes);
+ : : "i" (key), "i" (branch) : : l_yes);
+
+ return false;
+l_yes:
+ return true;
+}
+
+static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
+{
+ asm_volatile_goto("1:"
+ ".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t"
+ "2:\n\t"
+ ".pushsection __jump_table, \"aw\" \n\t"
+ _ASM_ALIGN "\n\t"
+ _ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t"
+ ".popsection \n\t"
+ : : "i" (key), "i" (branch) : : l_yes);
+
return false;
l_yes:
return true;
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index 32ce71375b21..b130d59406fb 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -29,7 +29,7 @@ extern void show_trace(struct task_struct *t, struct pt_regs *regs,
extern void __show_regs(struct pt_regs *regs, int all);
extern unsigned long oops_begin(void);
extern void oops_end(unsigned long, struct pt_regs *, int signr);
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
extern int in_crash_kexec;
#else
/* no crash dump is ever in progress if no crash kernel can be kexec'd */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index fcd17c1fc0c6..c1c0a1c14344 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -184,6 +184,12 @@
#define MSR_PP1_ENERGY_STATUS 0x00000641
#define MSR_PP1_POLICY 0x00000642
+#define MSR_CONFIG_TDP_NOMINAL 0x00000648
+#define MSR_CONFIG_TDP_LEVEL_1 0x00000649
+#define MSR_CONFIG_TDP_LEVEL_2 0x0000064A
+#define MSR_CONFIG_TDP_CONTROL 0x0000064B
+#define MSR_TURBO_ACTIVATION_RATIO 0x0000064C
+
#define MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658
#define MSR_PKG_ANY_CORE_C0_RES 0x00000659
#define MSR_PKG_ANY_GFXE_C0_RES 0x0000065A
diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
new file mode 100644
index 000000000000..d8ce3ec816ab
--- /dev/null
+++ b/arch/x86/include/asm/pmem.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef __ASM_X86_PMEM_H__
+#define __ASM_X86_PMEM_H__
+
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/cpufeature.h>
+#include <asm/special_insns.h>
+
+#ifdef CONFIG_ARCH_HAS_PMEM_API
+/**
+ * arch_memcpy_to_pmem - copy data to persistent memory
+ * @dst: destination buffer for the copy
+ * @src: source buffer for the copy
+ * @n: length of the copy in bytes
+ *
+ * Copy data to persistent memory media via non-temporal stores so that
+ * a subsequent arch_wmb_pmem() can flush cpu and memory controller
+ * write buffers to guarantee durability.
+ *
+ * There is no return value. A non-zero "unwritten" count from the
+ * underlying copy is treated as fatal: it triggers WARN() with the
+ * addresses involved and then BUG().
+ */
+static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
+ size_t n)
+{
+ int unwritten;
+
+ /*
+ * We are copying between two kernel buffers. If
+ * __copy_from_user_inatomic_nocache() returns an error (page
+ * fault), we would have already reported a general protection fault
+ * before the WARN+BUG.
+ */
+ unwritten = __copy_from_user_inatomic_nocache((void __force *) dst,
+ (void __user *) src, n);
+ if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n",
+ __func__, dst, src, unwritten))
+ BUG();
+}
+
+/**
+ * arch_wmb_pmem - synchronize writes to persistent memory
+ *
+ * After a series of arch_memcpy_to_pmem() operations this drains data
+ * from cpu write buffers and any platform (memory controller) buffers
+ * to ensure that written data is durable on persistent memory media.
+ *
+ * The two steps below are order-dependent: wmb() must come before
+ * pcommit_sfence().
+ */
+static inline void arch_wmb_pmem(void)
+{
+ /*
+ * wmb() to 'sfence' all previous writes such that they are
+ * architecturally visible to 'pcommit'. Note that we've
+ * already arranged for pmem writes to avoid the cache via
+ * arch_memcpy_to_pmem().
+ */
+ wmb();
+ pcommit_sfence();
+}
+
+/**
+ * __arch_wb_cache_pmem - write back a cache range with CLWB
+ * @vaddr: virtual start address
+ * @size: number of bytes to write back
+ *
+ * Write back a cache range using the CLWB (cache line write back)
+ * instruction. This function requires explicit ordering with an
+ * arch_wmb_pmem() call. This API is internal to the x86 PMEM implementation.
+ *
+ * @vaddr does not have to be cache-line aligned: the walk starts at
+ * @vaddr rounded down by boot_cpu_data.x86_clflush_size and continues
+ * in steps of that size while the current address is below @vaddr + @size.
+ */
+static inline void __arch_wb_cache_pmem(void *vaddr, size_t size)
+{
+ u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
+ /* NOTE(review): mask arithmetic assumes the line size is a power of two. */
+ unsigned long clflush_mask = x86_clflush_size - 1;
+ void *vend = vaddr + size;
+ void *p;
+
+ /* Start at the rounded-down address and issue one clwb per step. */
+ for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
+ p < vend; p += x86_clflush_size)
+ clwb(p);
+}
+
+/*
+ * On x86, copy_from_iter_nocache() uses non-temporal stores only for
+ * iovec iterators. Any other iterator type (bvec, kvec) is copied through
+ * the cache, so the caller has to follow up with a cache write-back.
+ */
+static inline bool __iter_needs_pmem_wb(struct iov_iter *i)
+{
+	return !iter_is_iovec(i);
+}
+
+/**
+ * arch_copy_from_iter_pmem - copy data from an iterator to PMEM
+ * @addr: PMEM destination address
+ * @bytes: number of bytes to copy
+ * @i: iterator with source data
+ *
+ * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'.
+ * This function requires explicit ordering with an arch_wmb_pmem() call.
+ *
+ * Returns the value returned by copy_from_iter_nocache().
+ */
+static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes,
+ struct iov_iter *i)
+{
+ void *vaddr = (void __force *)addr;
+ size_t len;
+
+ /* TODO: skip the write-back by always using non-temporal stores */
+ len = copy_from_iter_nocache(vaddr, bytes, i);
+
+ /*
+ * NOTE(review): the write-back covers the requested 'bytes', not the
+ * 'len' reported by the copy above.
+ */
+ if (__iter_needs_pmem_wb(i))
+ __arch_wb_cache_pmem(vaddr, bytes);
+
+ return len;
+}
+
+/**
+ * arch_clear_pmem - zero a PMEM memory range
+ * @addr: virtual start address
+ * @size: number of bytes to zero
+ *
+ * Fill 'size' bytes starting at 'addr' with zeros and then write the
+ * affected cache range back. A range that is exactly one page long and
+ * starts on a page boundary is cleared with clear_page(); every other
+ * range is cleared with memset().
+ * This function requires explicit ordering with an arch_wmb_pmem() call.
+ */
+static inline void arch_clear_pmem(void __pmem *addr, size_t size)
+{
+	void *vaddr = (void __force *)addr;
+	bool page_aligned = !((unsigned long)vaddr & ~PAGE_MASK);
+
+	/* TODO: implement the zeroing via non-temporal writes */
+	if (page_aligned && size == PAGE_SIZE)
+		clear_page(vaddr);
+	else
+		memset(vaddr, 0, size);
+
+	__arch_wb_cache_pmem(vaddr, size);
+}
+
+/*
+ * __arch_has_wmb_pmem - returns static_cpu_has(X86_FEATURE_PCOMMIT).
+ */
+static inline bool __arch_has_wmb_pmem(void)
+{
+ /*
+ * We require that wmb() be an 'sfence', which is only guaranteed on
+ * 64-bit builds.
+ */
+ return static_cpu_has(X86_FEATURE_PCOMMIT);
+}
+#endif /* CONFIG_ARCH_HAS_PMEM_API */
+#endif /* __ASM_X86_PMEM_H__ */
diff --git a/arch/x86/include/asm/qrwlock.h b/arch/x86/include/asm/qrwlock.h
index ae0e241e228b..c537cbb038a7 100644
--- a/arch/x86/include/asm/qrwlock.h
+++ b/arch/x86/include/asm/qrwlock.h
@@ -2,16 +2,6 @@
#define _ASM_X86_QRWLOCK_H
#include <asm-generic/qrwlock_types.h>
-
-#ifndef CONFIG_X86_PPRO_FENCE
-#define queue_write_unlock queue_write_unlock
-static inline void queue_write_unlock(struct qrwlock *lock)
-{
- barrier();
- ACCESS_ONCE(*(u8 *)&lock->cnts) = 0;
-}
-#endif
-
#include <asm-generic/qrwlock.h>
#endif /* _ASM_X86_QRWLOCK_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index cd791948b286..6df2029405a3 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -261,6 +261,12 @@ static inline void reset_lazy_tlbstate(void)
#endif /* SMP */
+/*
+ * flush_tlb_local - bump the irq_tlb_count statistic, then call
+ * local_flush_tlb().
+ *
+ * Not inlined due to inc_irq_stat not being defined yet.
+ *
+ * The body is wrapped in do { } while (0) so that "flush_tlb_local();"
+ * expands to exactly one statement and stays valid as the body of an
+ * unbraced if/else.
+ */
+#define flush_tlb_local() do { \
+	inc_irq_stat(irq_tlb_count); \
+	local_flush_tlb(); \
+} while (0)
+
#ifndef CONFIG_PARAVIRT
#define flush_tlb_others(mask, mm, start, end) \
native_flush_tlb_others(mask, mm, start, end)
diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
index 608a79d5a466..e6911caf5bbf 100644
--- a/arch/x86/include/asm/xen/events.h
+++ b/arch/x86/include/asm/xen/events.h
@@ -20,4 +20,15 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
/* No need for a barrier -- XCHG is a barrier on x86. */
#define xchg_xen_ulong(ptr, val) xchg((ptr), (val))
+extern int xen_have_vector_callback;
+
+/*
+ * Events delivered via platform PCI interrupts are always routed to
+ * vcpu 0 and hence cannot be rebound. Rebinding is therefore reported as
+ * supported when this is not an HVM domain, or when
+ * xen_have_vector_callback is non-zero.
+ */
+static inline bool xen_support_evtchn_rebind(void)
+{
+	if (!xen_hvm_domain())
+		return true;
+
+	return xen_have_vector_callback != 0;
+}
+
#endif /* _ASM_X86_XEN_EVENTS_H */
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index ca08a27b90b3..83aea8055119 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -465,6 +465,12 @@ HYPERVISOR_tmem_op(
return _hypercall1(int, tmem_op, op);
}
+/*
+ * HYPERVISOR_xenpmu_op - issue the xenpmu_op hypercall
+ * @op:  operation code, passed through unchanged
+ * @arg: argument pointer, passed through unchanged
+ *
+ * Returns the int result of _hypercall2().
+ * NOTE(review): the valid @op values and the layout @arg must point to
+ * are not visible here — confirm against the Xen PMU interface header.
+ */
+static inline int
+HYPERVISOR_xenpmu_op(unsigned int op, void *arg)
+{
+ return _hypercall2(int, xenpmu_op, op, arg);
+}
+
static inline void
MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
{
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index 3400dbaec3c3..62ca03ef5c65 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -3,12 +3,38 @@
*
* Guest OS interface to x86 Xen.
*
- * Copyright (c) 2004, K A Fraser
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2004-2006, K A Fraser
*/
#ifndef _ASM_X86_XEN_INTERFACE_H
#define _ASM_X86_XEN_INTERFACE_H
+/*
+ * XEN_GUEST_HANDLE represents a guest pointer, when passed as a field
+ * in a struct in memory.
+ * XEN_GUEST_HANDLE_PARAM represents a guest pointer, when passed as a
+ * hypercall argument.
+ * XEN_GUEST_HANDLE_PARAM and XEN_GUEST_HANDLE are the same on X86 but
+ * they might not be on other architectures.
+ */
#ifdef __XEN__
#define __DEFINE_GUEST_HANDLE(name, type) \
typedef struct { type *p; } __guest_handle_ ## name
@@ -88,13 +114,16 @@ DEFINE_GUEST_HANDLE(xen_ulong_t);
* start of the GDT because some stupid OSes export hard-coded selector values
* in their ABI. These hard-coded values are always near the start of the GDT,
* so Xen places itself out of the way, at the far end of the GDT.
+ *
+ * NB The LDT is set using the MMUEXT_SET_LDT op of HYPERVISOR_mmuext_op
*/
#define FIRST_RESERVED_GDT_PAGE 14
#define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096)
#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
/*
- * Send an array of these to HYPERVISOR_set_trap_table()
+ * Send an array of these to HYPERVISOR_set_trap_table().
+ * Terminate the array with a sentinel entry, with traps[].address==0.
* The privilege level specifies which modes may enter a trap via a software
* interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
* privilege levels as follows:
@@ -118,10 +147,41 @@ struct trap_info {
DEFINE_GUEST_HANDLE_STRUCT(trap_info);
struct arch_shared_info {
- unsigned long max_pfn; /* max pfn that appears in table */
- /* Frame containing list of mfns containing list of mfns containing p2m. */
- unsigned long pfn_to_mfn_frame_list_list;
- unsigned long nmi_reason;
+ /*
+ * Number of valid entries in the p2m table(s) anchored at
+ * pfn_to_mfn_frame_list_list and/or p2m_vaddr.
+ */
+ unsigned long max_pfn;
+ /*
+ * Frame containing list of mfns containing list of mfns containing p2m.
+ * A value of 0 indicates it has not yet been set up, ~0 indicates it
+ * has been set to invalid e.g. due to the p2m being too large for the
+ * 3-level p2m tree. In this case the linear mapper p2m list anchored
+ * at p2m_vaddr is to be used.
+ */
+ xen_pfn_t pfn_to_mfn_frame_list_list;
+ unsigned long nmi_reason;
+ /*
+ * Following three fields are valid if p2m_cr3 contains a value
+ * different from 0.
+ * p2m_cr3 is the root of the address space where p2m_vaddr is valid.
+ * p2m_cr3 is in the same format as a cr3 value in the vcpu register
+ * state and holds the folded machine frame number (via xen_pfn_to_cr3)
+ * of a L3 or L4 page table.
+ * p2m_vaddr holds the virtual address of the linear p2m list. All
+ * entries in the range [0...max_pfn[ are accessible via this pointer.
+ * p2m_generation will be incremented by the guest before and after each
+ * change of the mappings of the p2m list. p2m_generation starts at 0
+ * and a value with the least significant bit set indicates that a
+ * mapping update is in progress. This allows guest external software
+ * (e.g. in Dom0) to verify that read mappings are consistent and
+ * whether they have changed since the last check.
+ * Modifying a p2m element in the linear p2m list is allowed via an
+ * atomic write only.
+ */
+ unsigned long p2m_cr3; /* cr3 value of the p2m address space */
+ unsigned long p2m_vaddr; /* virtual address of the p2m list */
+ unsigned long p2m_generation; /* generation count of p2m mapping */
};
#endif /* !__ASSEMBLY__ */
@@ -137,13 +197,31 @@ struct arch_shared_info {
/*
* The following is all CPU context. Note that the fpu_ctxt block is filled
* in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
+ *
+ * Also note that when calling DOMCTL_setvcpucontext and VCPU_initialise
+ * for HVM and PVH guests, not all information in this structure is updated:
+ *
+ * - For HVM guests, the structures read include: fpu_ctxt (if
+ * VGCF_I387_VALID is set), flags, user_regs, debugreg[*]
+ *
+ * - PVH guests are the same as HVM guests, but additionally use ctrlreg[3] to
+ * set cr3. All other fields not used should be set to 0.
*/
struct vcpu_guest_context {
/* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */
-#define VGCF_I387_VALID (1<<0)
-#define VGCF_HVM_GUEST (1<<1)
-#define VGCF_IN_KERNEL (1<<2)
+#define VGCF_I387_VALID (1<<0)
+#define VGCF_IN_KERNEL (1<<2)
+#define _VGCF_i387_valid 0
+#define VGCF_i387_valid (1<<_VGCF_i387_valid)
+#define _VGCF_in_kernel 2
+#define VGCF_in_kernel (1<<_VGCF_in_kernel)
+#define _VGCF_failsafe_disables_events 3
+#define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events)
+#define _VGCF_syscall_disables_events 4
+#define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events)
+#define _VGCF_online 5
+#define VGCF_online (1<<_VGCF_online)
unsigned long flags; /* VGCF_* flags */
struct cpu_user_regs user_regs; /* User-level CPU registers */
struct trap_info trap_ctxt[256]; /* Virtual IDT */
@@ -172,6 +250,129 @@ struct vcpu_guest_context {
#endif
};
DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
+
+/* AMD PMU registers and structures */
+struct xen_pmu_amd_ctxt {
+ /*
+ * Offsets to counter and control MSRs (relative to xen_pmu_arch.c.amd).
+ * For PV(H) guests these fields are RO.
+ */
+ uint32_t counters;
+ uint32_t ctrls;
+
+ /* Counter MSRs */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+ uint64_t regs[];
+#elif defined(__GNUC__)
+ uint64_t regs[0];
+#endif
+};
+
+/* Intel PMU registers and structures */
+struct xen_pmu_cntr_pair {
+ uint64_t counter;
+ uint64_t control;
+};
+
+struct xen_pmu_intel_ctxt {
+ /*
+ * Offsets to fixed and architectural counter MSRs (relative to
+ * xen_pmu_arch.c.intel).
+ * For PV(H) guests these fields are RO.
+ */
+ uint32_t fixed_counters;
+ uint32_t arch_counters;
+
+ /* PMU registers */
+ uint64_t global_ctrl;
+ uint64_t global_ovf_ctrl;
+ uint64_t global_status;
+ uint64_t fixed_ctrl;
+ uint64_t ds_area;
+ uint64_t pebs_enable;
+ uint64_t debugctl;
+
+ /* Fixed and architectural counter MSRs */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+ uint64_t regs[];
+#elif defined(__GNUC__)
+ uint64_t regs[0];
+#endif
+};
+
+/* Sampled domain's registers */
+struct xen_pmu_regs {
+ uint64_t ip;
+ uint64_t sp;
+ uint64_t flags;
+ uint16_t cs;
+ uint16_t ss;
+ uint8_t cpl;
+ uint8_t pad[3];
+};
+
+/* PMU flags */
+#define PMU_CACHED (1<<0) /* PMU MSRs are cached in the context */
+#define PMU_SAMPLE_USER (1<<1) /* Sample is from user or kernel mode */
+#define PMU_SAMPLE_REAL (1<<2) /* Sample is from realmode */
+#define PMU_SAMPLE_PV (1<<3) /* Sample from a PV guest */
+
+/*
+ * Architecture-specific information describing state of the processor at
+ * the time of PMU interrupt.
+ * Fields of this structure marked as RW for guest should only be written by
+ * the guest when PMU_CACHED bit in pmu_flags is set (which is done by the
+ * hypervisor during PMU interrupt). Hypervisor will read updated data in
+ * XENPMU_flush hypercall and clear PMU_CACHED bit.
+ */
+struct xen_pmu_arch {
+ union {
+ /*
+ * Processor's registers at the time of interrupt.
+ * WO for hypervisor, RO for guests.
+ */
+ struct xen_pmu_regs regs;
+ /*
+ * Padding for adding new registers to xen_pmu_regs in
+ * the future
+ */
+#define XENPMU_REGS_PAD_SZ 64
+ uint8_t pad[XENPMU_REGS_PAD_SZ];
+ } r;
+
+ /* WO for hypervisor, RO for guest */
+ uint64_t pmu_flags;
+
+ /*
+ * APIC LVTPC register.
+ * RW for both hypervisor and guest.
+ * Only APIC_LVT_MASKED bit is loaded by the hypervisor into hardware
+ * during XENPMU_flush or XENPMU_lvtpc_set.
+ */
+ union {
+ uint32_t lapic_lvtpc;
+ uint64_t pad;
+ } l;
+
+ /*
+ * Vendor-specific PMU registers.
+ * RW for both hypervisor and guest (see exceptions above).
+ * Guest's updates to this field are verified and then loaded by the
+ * hypervisor into hardware during XENPMU_flush
+ */
+ union {
+ struct xen_pmu_amd_ctxt amd;
+ struct xen_pmu_intel_ctxt intel;
+
+ /*
+ * Padding for contexts (fixed parts only, does not include
+ * MSR banks that are specified by offsets)
+ */
+#define XENPMU_CTXT_PAD_SZ 128
+ uint8_t pad[XENPMU_CTXT_PAD_SZ];
+ } c;
+};
+
#endif /* !__ASSEMBLY__ */
/*
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index c44a5d53e464..0679e11d2cf7 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -35,9 +35,7 @@ typedef struct xpaddr {
#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
#define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT)
-/* Maximum amount of memory we can handle in a domain in pages */
-#define MAX_DOMAIN_PAGES \
- ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
+#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
extern unsigned long *machine_to_phys_mapping;
extern unsigned long machine_to_phys_nr;
@@ -48,8 +46,8 @@ extern unsigned long xen_max_p2m_pfn;
extern unsigned long get_phys_to_machine(unsigned long pfn);
extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
-extern unsigned long set_phys_range_identity(unsigned long pfn_s,
- unsigned long pfn_e);
+extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
+ unsigned long pfn_e);
extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
struct gnttab_map_grant_ref *kmap_ops,
@@ -103,6 +101,11 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn)
{
unsigned long mfn;
+ /*
+ * Some x86 code is still using pfn_to_mfn instead of
+ * pfn_to_gfn. This will have to be removed when we figure
+ * out which call.
+ */
if (xen_feature(XENFEAT_auto_translated_physmap))
return pfn;
@@ -149,6 +152,11 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
{
unsigned long pfn;
+ /*
+ * Some x86 code is still using mfn_to_pfn instead of
+ * gfn_to_pfn. This will have to be removed when we figure
+ * out which call.
+ */
if (xen_feature(XENFEAT_auto_translated_physmap))
return mfn;
@@ -178,6 +186,27 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)
return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset);
}
+/* Pseudo-physical <-> Guest conversion */
+static inline unsigned long pfn_to_gfn(unsigned long pfn)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return pfn;
+ else
+ return pfn_to_mfn(pfn);
+}
+
+static inline unsigned long gfn_to_pfn(unsigned long gfn)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return gfn;
+ else
+ return mfn_to_pfn(gfn);
+}
+
+/* Pseudo-physical <-> Bus conversion */
+#define pfn_to_bfn(pfn) pfn_to_gfn(pfn)
+#define bfn_to_pfn(bfn) gfn_to_pfn(bfn)
+
/*
* We detect special mappings in one of two ways:
* 1. If the MFN is an I/O page then Xen will set the m2p entry
@@ -198,7 +227,7 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)
* require. In all the cases we care about, the FOREIGN_FRAME bit is
* masked (e.g., pfn_to_mfn()) so behaviour there is correct.
*/
-static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
+static inline unsigned long bfn_to_local_pfn(unsigned long mfn)
{
unsigned long pfn;
@@ -217,6 +246,10 @@ static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
#define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v)))
#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
+/* VIRT <-> GUEST conversion */
+#define virt_to_gfn(v) (pfn_to_gfn(virt_to_pfn(v)))
+#define gfn_to_virt(g) (__va(gfn_to_pfn(g) << PAGE_SHIFT))
+
static inline unsigned long pte_mfn(pte_t pte)
{
return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT;
@@ -264,7 +297,7 @@ void make_lowmem_page_readwrite(void *vaddr);
static inline bool xen_arch_need_swiotlb(struct device *dev,
unsigned long pfn,
- unsigned long mfn)
+ unsigned long bfn)
{
return false;
}
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
index 0f457e6eab18..9dafe59cf6e2 100644
--- a/arch/x86/include/uapi/asm/e820.h
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -37,7 +37,7 @@
/*
* This is a non-standardized way to represent ADR or NVDIMM regions that
* persist over a reboot. The kernel will ignore their special capabilities
- * unless the CONFIG_X86_PMEM_LEGACY=y option is set.
+ * unless the CONFIG_X86_PMEM_LEGACY option is set.
*
* ( Note that older platforms also used 6 for the same type of memory,
* but newer versions switched to 12 as 6 was assigned differently. Some
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 3c3622176340..b1b78ffe01d0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -71,8 +71,8 @@ obj-$(CONFIG_LIVEPATCH) += livepatch.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
obj-$(CONFIG_X86_TSC) += trace_clock.o
-obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
-obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
+obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o
+obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o
obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
obj-y += kprobes/
@@ -94,7 +94,7 @@ obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
-obj-$(CONFIG_X86_PMEM_LEGACY) += pmem.o
+obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 75e8bad53798..ded848c20e05 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -445,6 +445,7 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger,
polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK;
mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
+ acpi_penalize_sci_irq(bus_irq, trigger, polarity);
/*
* stash over-ride to indicate we've been here
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 5aba9220a5ac..3ca3e46aa405 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -462,40 +462,40 @@ static int lapic_next_deadline(unsigned long delta,
return 0;
}
-/*
- * Setup the lapic timer in periodic or oneshot mode
- */
-static void lapic_timer_setup(enum clock_event_mode mode,
- struct clock_event_device *evt)
+static int lapic_timer_shutdown(struct clock_event_device *evt)
{
- unsigned long flags;
unsigned int v;
/* Lapic used as dummy for broadcast ? */
if (evt->features & CLOCK_EVT_FEAT_DUMMY)
- return;
+ return 0;
- local_irq_save(flags);
+ v = apic_read(APIC_LVTT);
+ v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+ apic_write(APIC_LVTT, v);
+ apic_write(APIC_TMICT, 0);
+ return 0;
+}
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- case CLOCK_EVT_MODE_ONESHOT:
- __setup_APIC_LVTT(lapic_timer_frequency,
- mode != CLOCK_EVT_MODE_PERIODIC, 1);
- break;
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- v = apic_read(APIC_LVTT);
- v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
- apic_write(APIC_LVTT, v);
- apic_write(APIC_TMICT, 0);
- break;
- case CLOCK_EVT_MODE_RESUME:
- /* Nothing to do here */
- break;
- }
+static inline int
+lapic_timer_set_periodic_oneshot(struct clock_event_device *evt, bool oneshot)
+{
+ /* Lapic used as dummy for broadcast ? */
+ if (evt->features & CLOCK_EVT_FEAT_DUMMY)
+ return 0;
- local_irq_restore(flags);
+ __setup_APIC_LVTT(lapic_timer_frequency, oneshot, 1);
+ return 0;
+}
+
+static int lapic_timer_set_periodic(struct clock_event_device *evt)
+{
+ return lapic_timer_set_periodic_oneshot(evt, false);
+}
+
+static int lapic_timer_set_oneshot(struct clock_event_device *evt)
+{
+ return lapic_timer_set_periodic_oneshot(evt, true);
}
/*
@@ -513,15 +513,18 @@ static void lapic_timer_broadcast(const struct cpumask *mask)
* The local apic timer can be used for any function which is CPU local.
*/
static struct clock_event_device lapic_clockevent = {
- .name = "lapic",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
- | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
- .shift = 32,
- .set_mode = lapic_timer_setup,
- .set_next_event = lapic_next_event,
- .broadcast = lapic_timer_broadcast,
- .rating = 100,
- .irq = -1,
+ .name = "lapic",
+ .features = CLOCK_EVT_FEAT_PERIODIC |
+ CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP
+ | CLOCK_EVT_FEAT_DUMMY,
+ .shift = 32,
+ .set_state_shutdown = lapic_timer_shutdown,
+ .set_state_periodic = lapic_timer_set_periodic,
+ .set_state_oneshot = lapic_timer_set_oneshot,
+ .set_next_event = lapic_next_event,
+ .broadcast = lapic_timer_broadcast,
+ .rating = 100,
+ .irq = -1,
};
static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
@@ -778,7 +781,7 @@ static int __init calibrate_APIC_clock(void)
* Setup the apic timer manually
*/
levt->event_handler = lapic_cal_handler;
- lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt);
+ lapic_timer_set_periodic(levt);
lapic_cal_loops = -1;
/* Let the interrupts run */
@@ -788,7 +791,8 @@ static int __init calibrate_APIC_clock(void)
cpu_relax();
/* Stop the lapic timer */
- lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt);
+ local_irq_disable();
+ lapic_timer_shutdown(levt);
/* Jiffies delta */
deltaj = lapic_cal_j2 - lapic_cal_j1;
@@ -799,8 +803,8 @@ static int __init calibrate_APIC_clock(void)
apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
else
levt->features |= CLOCK_EVT_FEAT_DUMMY;
- } else
- local_irq_enable();
+ }
+ local_irq_enable();
if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
pr_warning("APIC timer disabled due to verification failure\n");
@@ -878,7 +882,7 @@ static void local_apic_timer_interrupt(void)
if (!evt->event_handler) {
pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu);
/* Switch it off */
- lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
+ lapic_timer_shutdown(evt);
return;
}
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 6873ab925d00..045e424fb368 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -28,146 +28,21 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh)
#endif
#ifdef arch_trigger_all_cpu_backtrace
-/* For reliability, we're prepared to waste bits here. */
-static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-static cpumask_t printtrace_mask;
-
-#define NMI_BUF_SIZE 4096
-
-struct nmi_seq_buf {
- unsigned char buffer[NMI_BUF_SIZE];
- struct seq_buf seq;
-};
-
-/* Safe printing in NMI context */
-static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
-
-/* "in progress" flag of arch_trigger_all_cpu_backtrace */
-static unsigned long backtrace_flag;
-
-static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
+static void nmi_raise_cpu_backtrace(cpumask_t *mask)
{
- const char *buf = s->buffer + start;
-
- printk("%.*s", (end - start) + 1, buf);
+ apic->send_IPI_mask(mask, NMI_VECTOR);
}
void arch_trigger_all_cpu_backtrace(bool include_self)
{
- struct nmi_seq_buf *s;
- int len;
- int cpu;
- int i;
- int this_cpu = get_cpu();
-
- if (test_and_set_bit(0, &backtrace_flag)) {
- /*
- * If there is already a trigger_all_cpu_backtrace() in progress
- * (backtrace_flag == 1), don't output double cpu dump infos.
- */
- put_cpu();
- return;
- }
-
- cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
- if (!include_self)
- cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));
-
- cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask));
- /*
- * Set up per_cpu seq_buf buffers that the NMIs running on the other
- * CPUs will write to.
- */
- for_each_cpu(cpu, to_cpumask(backtrace_mask)) {
- s = &per_cpu(nmi_print_seq, cpu);
- seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE);
- }
-
- if (!cpumask_empty(to_cpumask(backtrace_mask))) {
- pr_info("sending NMI to %s CPUs:\n",
- (include_self ? "all" : "other"));
- apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR);
- }
-
- /* Wait for up to 10 seconds for all CPUs to do the backtrace */
- for (i = 0; i < 10 * 1000; i++) {
- if (cpumask_empty(to_cpumask(backtrace_mask)))
- break;
- mdelay(1);
- touch_softlockup_watchdog();
- }
-
- /*
- * Now that all the NMIs have triggered, we can dump out their
- * back traces safely to the console.
- */
- for_each_cpu(cpu, &printtrace_mask) {
- int last_i = 0;
-
- s = &per_cpu(nmi_print_seq, cpu);
- len = seq_buf_used(&s->seq);
- if (!len)
- continue;
-
- /* Print line by line. */
- for (i = 0; i < len; i++) {
- if (s->buffer[i] == '\n') {
- print_seq_line(s, last_i, i);
- last_i = i + 1;
- }
- }
- /* Check if there was a partial line. */
- if (last_i < len) {
- print_seq_line(s, last_i, len - 1);
- pr_cont("\n");
- }
- }
-
- clear_bit(0, &backtrace_flag);
- smp_mb__after_atomic();
- put_cpu();
-}
-
-/*
- * It is not safe to call printk() directly from NMI handlers.
- * It may be fine if the NMI detected a lock up and we have no choice
- * but to do so, but doing a NMI on all other CPUs to get a back trace
- * can be done with a sysrq-l. We don't want that to lock up, which
- * can happen if the NMI interrupts a printk in progress.
- *
- * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
- * the content into a per cpu seq_buf buffer. Then when the NMIs are
- * all done, we can safely dump the contents of the seq_buf to a printk()
- * from a non NMI context.
- */
-static int nmi_vprintk(const char *fmt, va_list args)
-{
- struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
- unsigned int len = seq_buf_used(&s->seq);
-
- seq_buf_vprintf(&s->seq, fmt, args);
- return seq_buf_used(&s->seq) - len;
+ nmi_trigger_all_cpu_backtrace(include_self, nmi_raise_cpu_backtrace);
}
static int
arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
{
- int cpu;
-
- cpu = smp_processor_id();
-
- if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
- printk_func_t printk_func_save = this_cpu_read(printk_func);
-
- /* Replace printk to write into the NMI seq */
- this_cpu_write(printk_func, nmi_vprintk);
- printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
- show_regs(regs);
- this_cpu_write(printk_func, printk_func_save);
-
- cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
+ if (nmi_cpu_backtrace(regs))
return NMI_HANDLED;
- }
return NMI_DONE;
}
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 206052e55517..38a76f826530 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2541,7 +2541,7 @@ void __init setup_ioapic_dest(void)
* Honour affinities which have been set in early boot
*/
if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata))
- mask = idata->affinity;
+ mask = irq_data_get_affinity_mask(idata);
else
mask = apic->target_cpus();
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 1a9d735e09c6..5f1feb6854af 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -264,7 +264,7 @@ static inline int hpet_dev_id(struct irq_domain *domain)
static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
{
- hpet_msi_write(data->handler_data, msg);
+ hpet_msi_write(irq_data_get_irq_handler_data(data), msg);
}
static struct irq_chip hpet_msi_controller = {
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 2683f36e4e0a..1bbd0fe2c806 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -169,8 +169,7 @@ next:
goto next;
for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) {
- if (per_cpu(vector_irq, new_cpu)[vector] >
- VECTOR_UNDEFINED)
+ if (!IS_ERR_OR_NULL(per_cpu(vector_irq, new_cpu)[vector]))
goto next;
}
/* Found one! */
@@ -182,7 +181,7 @@ next:
cpumask_intersects(d->old_domain, cpu_online_mask);
}
for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask)
- per_cpu(vector_irq, new_cpu)[vector] = irq;
+ per_cpu(vector_irq, new_cpu)[vector] = irq_to_desc(irq);
d->cfg.vector = vector;
cpumask_copy(d->domain, vector_cpumask);
err = 0;
@@ -224,15 +223,16 @@ static int assign_irq_vector_policy(int irq, int node,
static void clear_irq_vector(int irq, struct apic_chip_data *data)
{
- int cpu, vector;
+ struct irq_desc *desc;
unsigned long flags;
+ int cpu, vector;
raw_spin_lock_irqsave(&vector_lock, flags);
BUG_ON(!data->cfg.vector);
vector = data->cfg.vector;
for_each_cpu_and(cpu, data->domain, cpu_online_mask)
- per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
data->cfg.vector = 0;
cpumask_clear(data->domain);
@@ -242,12 +242,13 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data)
return;
}
+ desc = irq_to_desc(irq);
for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) {
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
vector++) {
- if (per_cpu(vector_irq, cpu)[vector] != irq)
+ if (per_cpu(vector_irq, cpu)[vector] != desc)
continue;
- per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
break;
}
}
@@ -296,7 +297,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
struct irq_alloc_info *info = arg;
struct apic_chip_data *data;
struct irq_data *irq_data;
- int i, err;
+ int i, err, node;
if (disable_apic)
return -ENXIO;
@@ -308,12 +309,13 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
for (i = 0; i < nr_irqs; i++) {
irq_data = irq_domain_get_irq_data(domain, virq + i);
BUG_ON(!irq_data);
+ node = irq_data_get_node(irq_data);
#ifdef CONFIG_X86_IO_APIC
if (virq + i < nr_legacy_irqs() && legacy_irq_data[virq + i])
data = legacy_irq_data[virq + i];
else
#endif
- data = alloc_apic_chip_data(irq_data->node);
+ data = alloc_apic_chip_data(node);
if (!data) {
err = -ENOMEM;
goto error;
@@ -322,8 +324,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
irq_data->chip = &lapic_controller;
irq_data->chip_data = data;
irq_data->hwirq = virq + i;
- err = assign_irq_vector_policy(virq + i, irq_data->node, data,
- info);
+ err = assign_irq_vector_policy(virq + i, node, data, info);
if (err)
goto error;
}
@@ -403,32 +404,32 @@ int __init arch_early_irq_init(void)
return arch_early_ioapic_init();
}
+/* Initialize vector_irq on a new cpu */
static void __setup_vector_irq(int cpu)
{
- /* Initialize vector_irq on a new cpu */
- int irq, vector;
struct apic_chip_data *data;
+ struct irq_desc *desc;
+ int irq, vector;
/* Mark the inuse vectors */
- for_each_active_irq(irq) {
- data = apic_chip_data(irq_get_irq_data(irq));
- if (!data)
- continue;
+ for_each_irq_desc(irq, desc) {
+ struct irq_data *idata = irq_desc_get_irq_data(desc);
- if (!cpumask_test_cpu(cpu, data->domain))
+ data = apic_chip_data(idata);
+ if (!data || !cpumask_test_cpu(cpu, data->domain))
continue;
vector = data->cfg.vector;
- per_cpu(vector_irq, cpu)[vector] = irq;
+ per_cpu(vector_irq, cpu)[vector] = desc;
}
/* Mark the free vectors */
for (vector = 0; vector < NR_VECTORS; ++vector) {
- irq = per_cpu(vector_irq, cpu)[vector];
- if (irq <= VECTOR_UNDEFINED)
+ desc = per_cpu(vector_irq, cpu)[vector];
+ if (IS_ERR_OR_NULL(desc))
continue;
- data = apic_chip_data(irq_get_irq_data(irq));
+ data = apic_chip_data(irq_desc_get_irq_data(desc));
if (!cpumask_test_cpu(cpu, data->domain))
- per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
}
}
@@ -448,7 +449,7 @@ void setup_vector_irq(int cpu)
* legacy vector to irq mapping:
*/
for (irq = 0; irq < nr_legacy_irqs(); irq++)
- per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq;
+ per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq_to_desc(irq);
__setup_vector_irq(cpu);
}
@@ -490,7 +491,8 @@ static int apic_set_affinity(struct irq_data *irq_data,
if (err) {
struct irq_data *top = irq_get_irq_data(irq);
- if (assign_irq_vector(irq, data, top->affinity))
+ if (assign_irq_vector(irq, data,
+ irq_data_get_affinity_mask(top)))
pr_err("Failed to recover vector for irq %d\n", irq);
return err;
}
@@ -538,27 +540,30 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
entering_ack_irq();
+ /* Prevent vectors vanishing under us */
+ raw_spin_lock(&vector_lock);
+
me = smp_processor_id();
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
- int irq;
- unsigned int irr;
- struct irq_desc *desc;
struct apic_chip_data *data;
+ struct irq_desc *desc;
+ unsigned int irr;
- irq = __this_cpu_read(vector_irq[vector]);
-
- if (irq <= VECTOR_UNDEFINED)
+ retry:
+ desc = __this_cpu_read(vector_irq[vector]);
+ if (IS_ERR_OR_NULL(desc))
continue;
- desc = irq_to_desc(irq);
- if (!desc)
- continue;
+ if (!raw_spin_trylock(&desc->lock)) {
+ raw_spin_unlock(&vector_lock);
+ cpu_relax();
+ raw_spin_lock(&vector_lock);
+ goto retry;
+ }
- data = apic_chip_data(&desc->irq_data);
+ data = apic_chip_data(irq_desc_get_irq_data(desc));
if (!data)
- continue;
-
- raw_spin_lock(&desc->lock);
+ goto unlock;
/*
* Check if the irq migration is in progress. If so, we
@@ -583,11 +588,13 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
goto unlock;
}
- __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
+ __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
unlock:
raw_spin_unlock(&desc->lock);
}
+ raw_spin_unlock(&vector_lock);
+
exiting_irq();
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 3f124d553c5a..cd9b6d0b10bf 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -12,7 +12,7 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/export.h>
-#include <linux/watchdog.h>
+#include <linux/nmi.h>
#include <asm/cpufeature.h>
#include <asm/hardirq.h>
@@ -3627,7 +3627,10 @@ static __init int fixup_ht_bug(void)
return 0;
}
- watchdog_nmi_disable_all();
+ if (lockup_detector_suspend() != 0) {
+ pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
+ return 0;
+ }
x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
@@ -3635,7 +3638,7 @@ static __init int fixup_ht_bug(void)
x86_pmu.commit_scheduling = NULL;
x86_pmu.stop_scheduling = NULL;
- watchdog_nmi_enable_all();
+ lockup_detector_resume();
get_online_cpus();
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index f75c5908c7a6..88b4da373081 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -226,22 +226,7 @@ static void hpet_reserve_platform_timers(unsigned int id) { }
*/
static unsigned long hpet_freq;
-static void hpet_legacy_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt);
-static int hpet_legacy_next_event(unsigned long delta,
- struct clock_event_device *evt);
-
-/*
- * The hpet clock event device
- */
-static struct clock_event_device hpet_clockevent = {
- .name = "hpet",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
- .set_mode = hpet_legacy_set_mode,
- .set_next_event = hpet_legacy_next_event,
- .irq = 0,
- .rating = 50,
-};
+static struct clock_event_device hpet_clockevent;
static void hpet_stop_counter(void)
{
@@ -306,64 +291,74 @@ static void hpet_legacy_clockevent_register(void)
printk(KERN_DEBUG "hpet clockevent registered\n");
}
-static void hpet_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt, int timer)
+static int hpet_set_periodic(struct clock_event_device *evt, int timer)
{
unsigned int cfg, cmp, now;
uint64_t delta;
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- hpet_stop_counter();
- delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
- delta >>= evt->shift;
- now = hpet_readl(HPET_COUNTER);
- cmp = now + (unsigned int) delta;
- cfg = hpet_readl(HPET_Tn_CFG(timer));
- cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
- HPET_TN_SETVAL | HPET_TN_32BIT;
- hpet_writel(cfg, HPET_Tn_CFG(timer));
- hpet_writel(cmp, HPET_Tn_CMP(timer));
- udelay(1);
- /*
- * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL
- * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL
- * bit is automatically cleared after the first write.
- * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
- * Publication # 24674)
- */
- hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer));
- hpet_start_counter();
- hpet_print_config();
- break;
+ hpet_stop_counter();
+ delta = ((uint64_t)(NSEC_PER_SEC / HZ)) * evt->mult;
+ delta >>= evt->shift;
+ now = hpet_readl(HPET_COUNTER);
+ cmp = now + (unsigned int)delta;
+ cfg = hpet_readl(HPET_Tn_CFG(timer));
+ cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
+ HPET_TN_32BIT;
+ hpet_writel(cfg, HPET_Tn_CFG(timer));
+ hpet_writel(cmp, HPET_Tn_CMP(timer));
+ udelay(1);
+ /*
+ * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL
+ * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL
+ * bit is automatically cleared after the first write.
+ * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
+ * Publication # 24674)
+ */
+ hpet_writel((unsigned int)delta, HPET_Tn_CMP(timer));
+ hpet_start_counter();
+ hpet_print_config();
- case CLOCK_EVT_MODE_ONESHOT:
- cfg = hpet_readl(HPET_Tn_CFG(timer));
- cfg &= ~HPET_TN_PERIODIC;
- cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
- hpet_writel(cfg, HPET_Tn_CFG(timer));
- break;
+ return 0;
+}
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- cfg = hpet_readl(HPET_Tn_CFG(timer));
- cfg &= ~HPET_TN_ENABLE;
- hpet_writel(cfg, HPET_Tn_CFG(timer));
- break;
+static int hpet_set_oneshot(struct clock_event_device *evt, int timer)
+{
+ unsigned int cfg;
- case CLOCK_EVT_MODE_RESUME:
- if (timer == 0) {
- hpet_enable_legacy_int();
- } else {
- struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
- irq_domain_activate_irq(irq_get_irq_data(hdev->irq));
- disable_irq(hdev->irq);
- irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
- enable_irq(hdev->irq);
- }
- hpet_print_config();
- break;
+ cfg = hpet_readl(HPET_Tn_CFG(timer));
+ cfg &= ~HPET_TN_PERIODIC;
+ cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
+ hpet_writel(cfg, HPET_Tn_CFG(timer));
+
+ return 0;
+}
+
+static int hpet_shutdown(struct clock_event_device *evt, int timer)
+{
+ unsigned int cfg;
+
+ cfg = hpet_readl(HPET_Tn_CFG(timer));
+ cfg &= ~HPET_TN_ENABLE;
+ hpet_writel(cfg, HPET_Tn_CFG(timer));
+
+ return 0;
+}
+
+static int hpet_resume(struct clock_event_device *evt, int timer)
+{
+ if (!timer) {
+ hpet_enable_legacy_int();
+ } else {
+ struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+
+ irq_domain_activate_irq(irq_get_irq_data(hdev->irq));
+ disable_irq(hdev->irq);
+ irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
+ enable_irq(hdev->irq);
}
+ hpet_print_config();
+
+ return 0;
}
static int hpet_next_event(unsigned long delta,
@@ -403,10 +398,24 @@ static int hpet_next_event(unsigned long delta,
return res < HPET_MIN_CYCLES ? -ETIME : 0;
}
-static void hpet_legacy_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
+static int hpet_legacy_shutdown(struct clock_event_device *evt)
+{
+ return hpet_shutdown(evt, 0);
+}
+
+static int hpet_legacy_set_oneshot(struct clock_event_device *evt)
+{
+ return hpet_set_oneshot(evt, 0);
+}
+
+static int hpet_legacy_set_periodic(struct clock_event_device *evt)
{
- hpet_set_mode(mode, evt, 0);
+ return hpet_set_periodic(evt, 0);
+}
+
+static int hpet_legacy_resume(struct clock_event_device *evt)
+{
+ return hpet_resume(evt, 0);
}
static int hpet_legacy_next_event(unsigned long delta,
@@ -416,6 +425,22 @@ static int hpet_legacy_next_event(unsigned long delta,
}
/*
+ * The hpet clock event device
+ */
+static struct clock_event_device hpet_clockevent = {
+ .name = "hpet",
+ .features = CLOCK_EVT_FEAT_PERIODIC |
+ CLOCK_EVT_FEAT_ONESHOT,
+ .set_state_periodic = hpet_legacy_set_periodic,
+ .set_state_oneshot = hpet_legacy_set_oneshot,
+ .set_state_shutdown = hpet_legacy_shutdown,
+ .tick_resume = hpet_legacy_resume,
+ .set_next_event = hpet_legacy_next_event,
+ .irq = 0,
+ .rating = 50,
+};
+
+/*
* HPET MSI Support
*/
#ifdef CONFIG_PCI_MSI
@@ -426,7 +451,7 @@ static struct irq_domain *hpet_domain;
void hpet_msi_unmask(struct irq_data *data)
{
- struct hpet_dev *hdev = data->handler_data;
+ struct hpet_dev *hdev = irq_data_get_irq_handler_data(data);
unsigned int cfg;
/* unmask it */
@@ -437,7 +462,7 @@ void hpet_msi_unmask(struct irq_data *data)
void hpet_msi_mask(struct irq_data *data)
{
- struct hpet_dev *hdev = data->handler_data;
+ struct hpet_dev *hdev = irq_data_get_irq_handler_data(data);
unsigned int cfg;
/* mask it */
@@ -459,11 +484,32 @@ void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg)
msg->address_hi = 0;
}
-static void hpet_msi_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
+static int hpet_msi_shutdown(struct clock_event_device *evt)
+{
+ struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+
+ return hpet_shutdown(evt, hdev->num);
+}
+
+static int hpet_msi_set_oneshot(struct clock_event_device *evt)
+{
+ struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+
+ return hpet_set_oneshot(evt, hdev->num);
+}
+
+static int hpet_msi_set_periodic(struct clock_event_device *evt)
{
struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
- hpet_set_mode(mode, evt, hdev->num);
+
+ return hpet_set_periodic(evt, hdev->num);
+}
+
+static int hpet_msi_resume(struct clock_event_device *evt)
+{
+ struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+
+ return hpet_resume(evt, hdev->num);
}
static int hpet_msi_next_event(unsigned long delta,
@@ -523,10 +569,14 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
evt->rating = 110;
evt->features = CLOCK_EVT_FEAT_ONESHOT;
- if (hdev->flags & HPET_DEV_PERI_CAP)
+ if (hdev->flags & HPET_DEV_PERI_CAP) {
evt->features |= CLOCK_EVT_FEAT_PERIODIC;
+ evt->set_state_periodic = hpet_msi_set_periodic;
+ }
- evt->set_mode = hpet_msi_set_mode;
+ evt->set_state_shutdown = hpet_msi_shutdown;
+ evt->set_state_oneshot = hpet_msi_set_oneshot;
+ evt->tick_resume = hpet_msi_resume;
evt->set_next_event = hpet_msi_next_event;
evt->cpumask = cpumask_of(hdev->cpu);
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index f2b96de3c7c1..efb82f07b29c 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -34,7 +34,7 @@ static int __init init_pit_clocksource(void)
* - when local APIC timer is active (PIT is switched off)
*/
if (num_possible_cpus() > 1 || is_hpet_enabled() ||
- i8253_clockevent.mode != CLOCK_EVT_MODE_PERIODIC)
+ !clockevent_state_periodic(&i8253_clockevent))
return 0;
return clocksource_i8253_init();
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index ae00b355114d..f8062aaf5df9 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -214,10 +214,9 @@ u64 arch_irq_stat(void)
__visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
-
+ struct irq_desc * desc;
/* high bit used in ret_from_ code */
unsigned vector = ~regs->orig_ax;
- unsigned irq;
/*
* NB: Unlike exception entries, IRQ entries do not reliably
@@ -236,17 +235,17 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
/* entering_irq() tells RCU that we're not quiescent. Check it. */
RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
- irq = __this_cpu_read(vector_irq[vector]);
+ desc = __this_cpu_read(vector_irq[vector]);
- if (!handle_irq(irq, regs)) {
+ if (!handle_irq(desc, regs)) {
ack_APIC_irq();
- if (irq != VECTOR_RETRIGGERED) {
- pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n",
+ if (desc != VECTOR_RETRIGGERED) {
+ pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n",
__func__, smp_processor_id(),
- vector, irq);
+ vector);
} else {
- __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
+ __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
}
}
@@ -348,10 +347,10 @@ static struct cpumask affinity_new, online_new;
*/
int check_irq_vectors_for_cpu_disable(void)
{
- int irq, cpu;
unsigned int this_cpu, vector, this_count, count;
struct irq_desc *desc;
struct irq_data *data;
+ int cpu;
this_cpu = smp_processor_id();
cpumask_copy(&online_new, cpu_online_mask);
@@ -359,47 +358,43 @@ int check_irq_vectors_for_cpu_disable(void)
this_count = 0;
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
- irq = __this_cpu_read(vector_irq[vector]);
- if (irq >= 0) {
- desc = irq_to_desc(irq);
- if (!desc)
- continue;
-
- /*
- * Protect against concurrent action removal,
- * affinity changes etc.
- */
- raw_spin_lock(&desc->lock);
- data = irq_desc_get_irq_data(desc);
- cpumask_copy(&affinity_new, data->affinity);
- cpumask_clear_cpu(this_cpu, &affinity_new);
-
- /* Do not count inactive or per-cpu irqs. */
- if (!irq_has_action(irq) || irqd_is_per_cpu(data)) {
- raw_spin_unlock(&desc->lock);
- continue;
- }
+ desc = __this_cpu_read(vector_irq[vector]);
+ if (IS_ERR_OR_NULL(desc))
+ continue;
+ /*
+ * Protect against concurrent action removal, affinity
+ * changes etc.
+ */
+ raw_spin_lock(&desc->lock);
+ data = irq_desc_get_irq_data(desc);
+ cpumask_copy(&affinity_new,
+ irq_data_get_affinity_mask(data));
+ cpumask_clear_cpu(this_cpu, &affinity_new);
+ /* Do not count inactive or per-cpu irqs. */
+ if (!irq_desc_has_action(desc) || irqd_is_per_cpu(data)) {
raw_spin_unlock(&desc->lock);
- /*
- * A single irq may be mapped to multiple
- * cpu's vector_irq[] (for example IOAPIC cluster
- * mode). In this case we have two
- * possibilities:
- *
- * 1) the resulting affinity mask is empty; that is
- * this the down'd cpu is the last cpu in the irq's
- * affinity mask, or
- *
- * 2) the resulting affinity mask is no longer
- * a subset of the online cpus but the affinity
- * mask is not zero; that is the down'd cpu is the
- * last online cpu in a user set affinity mask.
- */
- if (cpumask_empty(&affinity_new) ||
- !cpumask_subset(&affinity_new, &online_new))
- this_count++;
+ continue;
}
+
+ raw_spin_unlock(&desc->lock);
+ /*
+ * A single irq may be mapped to multiple cpu's
+ * vector_irq[] (for example IOAPIC cluster mode). In
+ * this case we have two possibilities:
+ *
+ * 1) the resulting affinity mask is empty; that is
+ * the down'd cpu is the last cpu in the irq's
+ * affinity mask, or
+ *
+ * 2) the resulting affinity mask is no longer a
+ * subset of the online cpus but the affinity mask is
+ * not zero; that is the down'd cpu is the last online
+ * cpu in a user set affinity mask.
+ */
+ if (cpumask_empty(&affinity_new) ||
+ !cpumask_subset(&affinity_new, &online_new))
+ this_count++;
}
count = 0;
@@ -418,8 +413,8 @@ int check_irq_vectors_for_cpu_disable(void)
for (vector = FIRST_EXTERNAL_VECTOR;
vector < first_system_vector; vector++) {
if (!test_bit(vector, used_vectors) &&
- per_cpu(vector_irq, cpu)[vector] < 0)
- count++;
+ IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector]))
+ count++;
}
}
@@ -455,7 +450,7 @@ void fixup_irqs(void)
raw_spin_lock(&desc->lock);
data = irq_desc_get_irq_data(desc);
- affinity = data->affinity;
+ affinity = irq_data_get_affinity_mask(data);
if (!irq_has_action(irq) || irqd_is_per_cpu(data) ||
cpumask_subset(affinity, cpu_online_mask)) {
raw_spin_unlock(&desc->lock);
@@ -523,14 +518,13 @@ void fixup_irqs(void)
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
unsigned int irr;
- if (__this_cpu_read(vector_irq[vector]) <= VECTOR_UNDEFINED)
+ if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector])))
continue;
irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
if (irr & (1 << (vector % 32))) {
- irq = __this_cpu_read(vector_irq[vector]);
+ desc = __this_cpu_read(vector_irq[vector]);
- desc = irq_to_desc(irq);
raw_spin_lock(&desc->lock);
data = irq_desc_get_irq_data(desc);
chip = irq_data_get_irq_chip(data);
@@ -541,7 +535,7 @@ void fixup_irqs(void)
raw_spin_unlock(&desc->lock);
}
if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED)
- __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
+ __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
}
}
#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index cd74f5978ab9..c80cf6699678 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -148,21 +148,21 @@ void do_softirq_own_stack(void)
call_on_stack(__do_softirq, isp);
}
-bool handle_irq(unsigned irq, struct pt_regs *regs)
+bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
{
- struct irq_desc *desc;
+ unsigned int irq;
int overflow;
overflow = check_stack_overflow();
- desc = irq_to_desc(irq);
- if (unlikely(!desc))
+ if (IS_ERR_OR_NULL(desc))
return false;
+ irq = irq_desc_get_irq(desc);
if (user_mode(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
if (unlikely(overflow))
print_stack_overflow();
- desc->handle_irq(irq, desc);
+ generic_handle_irq_desc(irq, desc);
}
return true;
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index bc4604e500a3..ff16ccb918f2 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -68,16 +68,13 @@ static inline void stack_overflow_check(struct pt_regs *regs)
#endif
}
-bool handle_irq(unsigned irq, struct pt_regs *regs)
+bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
{
- struct irq_desc *desc;
-
stack_overflow_check(regs);
- desc = irq_to_desc(irq);
- if (unlikely(!desc))
+ if (unlikely(IS_ERR_OR_NULL(desc)))
return false;
- generic_handle_irq_desc(irq, desc);
+ generic_handle_irq_desc(irq_desc_get_irq(desc), desc);
return true;
}
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index a3a5e158ed69..1423ab1b0312 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -52,7 +52,7 @@ static struct irqaction irq2 = {
};
DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
- [0 ... NR_VECTORS - 1] = VECTOR_UNDEFINED,
+ [0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
};
int vector_used_by_percpu_irq(unsigned int vector)
@@ -60,7 +60,7 @@ int vector_used_by_percpu_irq(unsigned int vector)
int cpu;
for_each_online_cpu(cpu) {
- if (per_cpu(vector_irq, cpu)[vector] > VECTOR_UNDEFINED)
+ if (!IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector]))
return 1;
}
@@ -94,7 +94,7 @@ void __init init_IRQ(void)
* irq's migrate etc.
*/
for (i = 0; i < nr_legacy_irqs(); i++)
- per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = i;
+ per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i);
x86_init.irqs.intr_init();
}
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 26d5a55a2736..e565e0e4d216 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -45,7 +45,7 @@ static void __jump_label_transform(struct jump_entry *entry,
const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
- if (type == JUMP_LABEL_ENABLE) {
+ if (type == JUMP_LABEL_JMP) {
if (init) {
/*
* Jump label is enabled for the first time.
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 961e51e9c6f6..0f8a6bbaaa44 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -533,7 +533,9 @@ static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len)
int ret;
ret = verify_pefile_signature(kernel, kernel_len,
- system_trusted_keyring, &trusted);
+ system_trusted_keyring,
+ VERIFYING_KEXEC_PE_SIGNATURE,
+ &trusted);
if (ret < 0)
return ret;
if (!trusted)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 49487b488061..2c7aafa70702 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -200,7 +200,7 @@ static void kvm_setup_secondary_clock(void)
* kind of shutdown from our side, we unregister the clock by writing anything
* that does not have the 'enable' bit set in the msr
*/
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
static void kvm_crash_shutdown(struct pt_regs *regs)
{
native_write_msr(msr_kvm_system_time, 0, 0);
@@ -259,7 +259,7 @@ void __init kvmclock_init(void)
x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
machine_ops.shutdown = kvm_shutdown;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
kvm_get_preset_lpj();
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 353972c1946c..84b8ef82a159 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -58,17 +58,6 @@ EXPORT_SYMBOL(x86_dma_fallback_dev);
/* Number of entries preallocated for DMA-API debugging */
#define PREALLOC_DMA_DEBUG_ENTRIES 65536
-int dma_set_mask(struct device *dev, u64 mask)
-{
- if (!dev->dma_mask || !dma_supported(dev, mask))
- return -EIO;
-
- *dev->dma_mask = mask;
-
- return 0;
-}
-EXPORT_SYMBOL(dma_set_mask);
-
void __init pci_iommu_alloc(void)
{
struct iommu_table_entry *p;
@@ -140,50 +129,19 @@ void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr,
free_pages((unsigned long)vaddr, get_order(size));
}
-void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
- gfp_t gfp, struct dma_attrs *attrs)
+bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp)
{
- struct dma_map_ops *ops = get_dma_ops(dev);
- void *memory;
-
- gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
-
- if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
- return memory;
-
- if (!dev)
- dev = &x86_dma_fallback_dev;
-
- if (!is_device_dma_capable(dev))
- return NULL;
-
- if (!ops->alloc)
- return NULL;
-
- memory = ops->alloc(dev, size, dma_handle,
- dma_alloc_coherent_gfp_flags(dev, gfp), attrs);
- debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
-
- return memory;
-}
-EXPORT_SYMBOL(dma_alloc_attrs);
-
-void dma_free_attrs(struct device *dev, size_t size,
- void *vaddr, dma_addr_t bus,
- struct dma_attrs *attrs)
-{
- struct dma_map_ops *ops = get_dma_ops(dev);
-
- WARN_ON(irqs_disabled()); /* for portability */
+ *gfp = dma_alloc_coherent_gfp_flags(*dev, *gfp);
+ *gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
- if (dma_release_from_coherent(dev, get_order(size), vaddr))
- return;
+ if (!*dev)
+ *dev = &x86_dma_fallback_dev;
+ if (!is_device_dma_capable(*dev))
+ return false;
+ return true;
- debug_dma_free_coherent(dev, size, vaddr, bus);
- if (ops->free)
- ops->free(dev, size, vaddr, bus, attrs);
}
-EXPORT_SYMBOL(dma_free_attrs);
+EXPORT_SYMBOL(arch_dma_alloc_attrs);
/*
* See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
index 64f90f53bb85..4f00b63d7ff3 100644
--- a/arch/x86/kernel/pmem.c
+++ b/arch/x86/kernel/pmem.c
@@ -3,80 +3,17 @@
* Copyright (c) 2015, Intel Corporation.
*/
#include <linux/platform_device.h>
-#include <linux/libnvdimm.h>
#include <linux/module.h>
-#include <asm/e820.h>
-
-static void e820_pmem_release(struct device *dev)
-{
- struct nvdimm_bus *nvdimm_bus = dev->platform_data;
-
- if (nvdimm_bus)
- nvdimm_bus_unregister(nvdimm_bus);
-}
-
-static struct platform_device e820_pmem = {
- .name = "e820_pmem",
- .id = -1,
- .dev = {
- .release = e820_pmem_release,
- },
-};
-
-static const struct attribute_group *e820_pmem_attribute_groups[] = {
- &nvdimm_bus_attribute_group,
- NULL,
-};
-
-static const struct attribute_group *e820_pmem_region_attribute_groups[] = {
- &nd_region_attribute_group,
- &nd_device_attribute_group,
- NULL,
-};
static __init int register_e820_pmem(void)
{
- static struct nvdimm_bus_descriptor nd_desc;
- struct device *dev = &e820_pmem.dev;
- struct nvdimm_bus *nvdimm_bus;
- int rc, i;
-
- rc = platform_device_register(&e820_pmem);
- if (rc)
- return rc;
-
- nd_desc.attr_groups = e820_pmem_attribute_groups;
- nd_desc.provider_name = "e820";
- nvdimm_bus = nvdimm_bus_register(dev, &nd_desc);
- if (!nvdimm_bus)
- goto err;
- dev->platform_data = nvdimm_bus;
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- struct resource res = {
- .flags = IORESOURCE_MEM,
- .start = ei->addr,
- .end = ei->addr + ei->size - 1,
- };
- struct nd_region_desc ndr_desc;
-
- if (ei->type != E820_PRAM)
- continue;
-
- memset(&ndr_desc, 0, sizeof(ndr_desc));
- ndr_desc.res = &res;
- ndr_desc.attr_groups = e820_pmem_region_attribute_groups;
- ndr_desc.numa_node = NUMA_NO_NODE;
- if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
- goto err;
- }
-
- return 0;
-
- err:
- dev_err(dev, "failed to register legacy persistent memory ranges\n");
- platform_device_unregister(&e820_pmem);
- return -ENXIO;
+ struct platform_device *pdev;
+
+ /*
+ * See drivers/nvdimm/e820.c for the implementation, this is
+ * simply here to trigger the module to load on demand.
+ */
+ pdev = platform_device_alloc("e820_pmem", -1);
+ return platform_device_add(pdev);
}
device_initcall(register_e820_pmem);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 86db4bcd7ce5..02693dd9a079 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -673,7 +673,7 @@ struct machine_ops machine_ops = {
.emergency_restart = native_machine_emergency_restart,
.restart = native_machine_restart,
.halt = native_machine_halt,
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
.crash_shutdown = native_machine_crash_shutdown,
#endif
};
@@ -703,7 +703,7 @@ void machine_halt(void)
machine_ops.halt();
}
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
void machine_crash_shutdown(struct pt_regs *regs)
{
machine_ops.crash_shutdown(regs);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b143c2d04420..fdb7f2a2d328 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -317,15 +317,12 @@ static u64 __init get_ramdisk_size(void)
return ramdisk_size;
}
-#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
static void __init relocate_initrd(void)
{
/* Assume only end is not page aligned */
u64 ramdisk_image = get_ramdisk_image();
u64 ramdisk_size = get_ramdisk_size();
u64 area_size = PAGE_ALIGN(ramdisk_size);
- unsigned long slop, clen, mapaddr;
- char *p, *q;
/* We need to move the initrd down into directly mapped mem */
relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
@@ -343,25 +340,8 @@ static void __init relocate_initrd(void)
printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
- q = (char *)initrd_start;
-
- /* Copy the initrd */
- while (ramdisk_size) {
- slop = ramdisk_image & ~PAGE_MASK;
- clen = ramdisk_size;
- if (clen > MAX_MAP_CHUNK-slop)
- clen = MAX_MAP_CHUNK-slop;
- mapaddr = ramdisk_image & PAGE_MASK;
- p = early_memremap(mapaddr, clen+slop);
- memcpy(q, p+slop, clen);
- early_memunmap(p, clen+slop);
- q += clen;
- ramdisk_image += clen;
- ramdisk_size -= clen;
- }
+ copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size);
- ramdisk_image = get_ramdisk_image();
- ramdisk_size = get_ramdisk_size();
printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
" [mem %#010llx-%#010llx]\n",
ramdisk_image, ramdisk_image + ramdisk_size - 1,
@@ -498,7 +478,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
* --------- Crashkernel reservation ------------------------------
*/
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
/*
* Keep the crash kernel below this limit. On 32 bits earlier kernels
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 79055cf2c497..c8d52cb4cb6e 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -38,7 +38,7 @@ static int __read_mostly tsc_unstable;
erroneous rdtsc usage on !cpu_has_tsc processors */
static int __read_mostly tsc_disabled = -1;
-static struct static_key __use_tsc = STATIC_KEY_INIT;
+static DEFINE_STATIC_KEY_FALSE(__use_tsc);
int tsc_clocksource_reliable;
@@ -274,7 +274,12 @@ done:
*/
u64 native_sched_clock(void)
{
- u64 tsc_now;
+ if (static_branch_likely(&__use_tsc)) {
+ u64 tsc_now = rdtsc();
+
+ /* return the value in ns */
+ return cycles_2_ns(tsc_now);
+ }
/*
* Fall back to jiffies if there's no TSC available:
@@ -284,16 +289,9 @@ u64 native_sched_clock(void)
* very important for it to be as fast as the platform
* can achieve it. )
*/
- if (!static_key_false(&__use_tsc)) {
- /* No locking but a rare wrong value is not a big deal: */
- return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
- }
-
- /* read the Time Stamp Counter: */
- tsc_now = rdtsc();
- /* return the value in ns */
- return cycles_2_ns(tsc_now);
+ /* No locking but a rare wrong value is not a big deal: */
+ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
}
/*
@@ -1212,7 +1210,7 @@ void __init tsc_init(void)
/* now allow native_sched_clock() to use rdtsc */
tsc_disabled = 0;
- static_key_slow_inc(&__use_tsc);
+ static_branch_enable(&__use_tsc);
if (!no_sched_irq_time)
enable_sched_clock_irqtime();
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 00bf300fd846..74e4bf11f562 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -364,7 +364,7 @@ INIT_PER_CPU(irq_stack_union);
#endif /* CONFIG_X86_32 */
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
#include <asm/kexec.h>
. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index e7a4fde5d631..b372a7557c16 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -650,6 +650,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
u16 sel;
la = seg_base(ctxt, addr.seg) + addr.ea;
+ *linear = la;
*max_size = 0;
switch (mode) {
case X86EMUL_MODE_PROT64:
@@ -693,7 +694,6 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
}
if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0))
return emulate_gp(ctxt, 0);
- *linear = la;
return X86EMUL_CONTINUE;
bad:
if (addr.seg == VCPU_SREG_SS)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index fb16a8ea3dee..69088a1ba509 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3309,13 +3309,14 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
walk_shadow_page_lockless_begin(vcpu);
- for (shadow_walk_init(&iterator, vcpu, addr), root = iterator.level;
+ for (shadow_walk_init(&iterator, vcpu, addr),
+ leaf = root = iterator.level;
shadow_walk_okay(&iterator);
__shadow_walk_next(&iterator, spte)) {
- leaf = iterator.level;
spte = mmu_spte_get_lockless(iterator.sptep);
sptes[leaf - 1] = spte;
+ leaf--;
if (!is_shadow_present_pte(spte))
break;
@@ -3329,7 +3330,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
if (reserved) {
pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
__func__, addr);
- while (root >= leaf) {
+ while (root > leaf) {
pr_err("------ spte 0x%llx level %d.\n",
sptes[root - 1], root);
root--;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4a4eec30cc08..d01986832afc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1264,7 +1264,7 @@ static void vmcs_load(struct vmcs *vmcs)
vmcs, phys_addr);
}
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
/*
* This bitmap is used to indicate whether the vmclear
* operation is enabled on all cpus. All disabled by
@@ -1302,7 +1302,7 @@ static void crash_vmclear_local_loaded_vmcss(void)
#else
static inline void crash_enable_local_vmclear(int cpu) { }
static inline void crash_disable_local_vmclear(int cpu) { }
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
static void __loaded_vmcs_clear(void *arg)
{
@@ -3150,7 +3150,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
struct page *pages;
struct vmcs *vmcs;
- pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
+ pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
if (!pages)
return NULL;
vmcs = page_address(pages);
@@ -10411,7 +10411,7 @@ static int __init vmx_init(void)
if (r)
return r;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
rcu_assign_pointer(crash_vmclear_loaded_vmcss,
crash_vmclear_local_loaded_vmcss);
#endif
@@ -10421,7 +10421,7 @@ static int __init vmx_init(void)
static void __exit vmx_exit(void)
{
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
synchronize_rcu();
#endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1e7e76e14e89..a60bdbccff51 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5943,6 +5943,7 @@ static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg));
}
+#ifdef CONFIG_X86_64
static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
{
struct kvm_segment seg;
@@ -5958,6 +5959,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
put_smstate(u32, buf, offset + 4, seg.limit);
put_smstate(u64, buf, offset + 8, seg.base);
}
+#endif
static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
{
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 433e5a7dd37f..161804de124a 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -835,16 +835,46 @@ static struct irq_chip lguest_irq_controller = {
.irq_unmask = enable_lguest_irq,
};
+/*
+ * Interrupt descriptors are allocated as-needed, but low-numbered ones are
+ * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it
+ * tells us the irq is already used: other errors (e.g. ENOMEM) we take
+ * seriously.
+ */
+static int lguest_setup_irq(unsigned int irq)
+{
+ struct irq_desc *desc;
+ int err;
+
+ /* Returns -ve error or vector number. */
+ err = irq_alloc_desc_at(irq, 0);
+ if (err < 0 && err != -EEXIST)
+ return err;
+
+ /*
+ * Tell the Linux infrastructure that the interrupt is
+ * controlled by our level-based lguest interrupt controller.
+ */
+ irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
+ handle_level_irq, "level");
+
+ /* Some systems map "vectors" to interrupts weirdly. Not us! */
+ desc = irq_to_desc(irq);
+ __this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc);
+ return 0;
+}
+
static int lguest_enable_irq(struct pci_dev *dev)
{
+ int err;
u8 line = 0;
/* We literally use the PCI interrupt line as the irq number. */
pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line);
- irq_set_chip_and_handler_name(line, &lguest_irq_controller,
- handle_level_irq, "level");
- dev->irq = line;
- return 0;
+ err = lguest_setup_irq(line);
+ if (!err)
+ dev->irq = line;
+ return err;
}
/* We don't do hotplug PCI, so this shouldn't be called. */
@@ -855,17 +885,13 @@ static void lguest_disable_irq(struct pci_dev *dev)
/*
* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
- * interrupt (except 128, which is used for system calls), and then tells the
- * Linux infrastructure that each interrupt is controlled by our level-based
- * lguest interrupt controller.
+ * interrupt (except 128, which is used for system calls).
*/
static void __init lguest_init_IRQ(void)
{
unsigned int i;
for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) {
- /* Some systems map "vectors" to interrupts weirdly. Not us! */
- __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR);
if (i != IA32_SYSCALL_VECTOR)
set_intr_gate(i, irq_entries_start +
8 * (i - FIRST_EXTERNAL_VECTOR));
@@ -879,26 +905,6 @@ static void __init lguest_init_IRQ(void)
}
/*
- * Interrupt descriptors are allocated as-needed, but low-numbered ones are
- * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it
- * tells us the irq is already used: other errors (ie. ENOMEM) we take
- * seriously.
- */
-int lguest_setup_irq(unsigned int irq)
-{
- int err;
-
- /* Returns -ve error or vector number. */
- err = irq_alloc_desc_at(irq, 0);
- if (err < 0 && err != -EEXIST)
- return err;
-
- irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
- handle_level_irq, "level");
- return 0;
-}
-
-/*
* Time.
*
* It would be far better for everyone if the Guest had its own clock, but
@@ -1028,7 +1034,8 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
static void lguest_time_init(void)
{
/* Set up the timer interrupt (0) to go to our simple timer routine */
- lguest_setup_irq(0);
+ if (lguest_setup_irq(0) != 0)
+ panic("Could not set up timer irq");
irq_set_handler(0, lguest_time_irq);
clocksource_register_hz(&lguest_clock, NSEC_PER_SEC);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 68aec42545c2..7562f42914b4 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -823,11 +823,11 @@ void __init mem_init(void)
}
#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
{
struct pglist_data *pgdata = NODE_DATA(nid);
struct zone *zone = pgdata->node_zones +
- zone_for_memory(nid, start, size, ZONE_HIGHMEM);
+ zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device);
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3fba623e3ba5..30564e2752d3 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -687,11 +687,11 @@ static void update_end_of_memory_vars(u64 start, u64 size)
* Memory is added always to NORMAL zone. This means you will never get
* additional DMA/DMA32 memory.
*/
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
{
struct pglist_data *pgdat = NODE_DATA(nid);
struct zone *zone = pgdat->node_zones +
- zone_for_memory(nid, start, size, ZONE_NORMAL);
+ zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index db1b0bc5017c..134948b0926f 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -42,58 +42,21 @@ static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm)
*/
static unsigned long mpx_mmap(unsigned long len)
{
- unsigned long ret;
- unsigned long addr, pgoff;
struct mm_struct *mm = current->mm;
- vm_flags_t vm_flags;
- struct vm_area_struct *vma;
+ unsigned long addr, populate;
/* Only bounds table can be allocated here */
if (len != mpx_bt_size_bytes(mm))
return -EINVAL;
down_write(&mm->mmap_sem);
-
- /* Too many mappings? */
- if (mm->map_count > sysctl_max_map_count) {
- ret = -ENOMEM;
- goto out;
- }
-
- /* Obtain the address to map to. we verify (or select) it and ensure
- * that it represents a valid section of the address space.
- */
- addr = get_unmapped_area(NULL, 0, len, 0, MAP_ANONYMOUS | MAP_PRIVATE);
- if (addr & ~PAGE_MASK) {
- ret = addr;
- goto out;
- }
-
- vm_flags = VM_READ | VM_WRITE | VM_MPX |
- mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
-
- /* Set pgoff according to addr for anon_vma */
- pgoff = addr >> PAGE_SHIFT;
-
- ret = mmap_region(NULL, addr, len, vm_flags, pgoff);
- if (IS_ERR_VALUE(ret))
- goto out;
-
- vma = find_vma(mm, ret);
- if (!vma) {
- ret = -ENOMEM;
- goto out;
- }
-
- if (vm_flags & VM_LOCKED) {
- up_write(&mm->mmap_sem);
- mm_populate(ret, len);
- return ret;
- }
-
-out:
+ addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate);
up_write(&mm->mmap_sem);
- return ret;
+ if (populate)
+ mm_populate(addr, populate);
+
+ return addr;
}
enum reg_type {
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 4053bb58bf92..c3b3f653ed0c 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -246,8 +246,10 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
bi->start = max(bi->start, low);
bi->end = min(bi->end, high);
- /* and there's no empty block */
- if (bi->start >= bi->end)
+ /* and there's no empty or non-existent block */
+ if (bi->start >= bi->end ||
+ !memblock_overlaps_region(&memblock.memory,
+ bi->start, bi->end - bi->start))
numa_remove_memblk_from(i--, mi);
}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 90b924acd982..8ddb5d0d66fb 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -140,6 +140,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
info.flush_end = end;
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
+ trace_tlb_flush(TLB_REMOTE_SEND_IPI, end - start);
if (is_uv_system()) {
unsigned int cpu;
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index be2e7a2b10d7..70efcd0940f9 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -246,7 +246,7 @@ static void emit_prologue(u8 **pprog)
* goto out;
* if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
* goto out;
- * prog = array->prog[index];
+ * prog = array->ptrs[index];
* if (prog == NULL)
* goto out;
* goto *(prog->bpf_func + prologue_size);
@@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog)
EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */
- /* prog = array->prog[index]; */
+ /* prog = array->ptrs[index]; */
EMIT4_off32(0x48, 0x8D, 0x84, 0xD6, /* lea rax, [rsi + rdx * 8 + offsetof(...)] */
- offsetof(struct bpf_array, prog));
+ offsetof(struct bpf_array, ptrs));
EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */
/* if (prog == NULL)
@@ -315,6 +315,26 @@ static void emit_bpf_tail_call(u8 **pprog)
*pprog = prog;
}
+
+static void emit_load_skb_data_hlen(u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ /* r9d = skb->len - skb->data_len (headlen)
+ * r10 = skb->data
+ */
+ /* mov %r9d, off32(%rdi) */
+ EMIT3_off32(0x44, 0x8b, 0x8f, offsetof(struct sk_buff, len));
+
+ /* sub %r9d, off32(%rdi) */
+ EMIT3_off32(0x44, 0x2b, 0x8f, offsetof(struct sk_buff, data_len));
+
+ /* mov %r10, off32(%rdi) */
+ EMIT3_off32(0x4c, 0x8b, 0x97, offsetof(struct sk_buff, data));
+ *pprog = prog;
+}
+
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
int oldproglen, struct jit_context *ctx)
{
@@ -329,36 +349,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
emit_prologue(&prog);
- if (seen_ld_abs) {
- /* r9d : skb->len - skb->data_len (headlen)
- * r10 : skb->data
- */
- if (is_imm8(offsetof(struct sk_buff, len)))
- /* mov %r9d, off8(%rdi) */
- EMIT4(0x44, 0x8b, 0x4f,
- offsetof(struct sk_buff, len));
- else
- /* mov %r9d, off32(%rdi) */
- EMIT3_off32(0x44, 0x8b, 0x8f,
- offsetof(struct sk_buff, len));
-
- if (is_imm8(offsetof(struct sk_buff, data_len)))
- /* sub %r9d, off8(%rdi) */
- EMIT4(0x44, 0x2b, 0x4f,
- offsetof(struct sk_buff, data_len));
- else
- EMIT3_off32(0x44, 0x2b, 0x8f,
- offsetof(struct sk_buff, data_len));
-
- if (is_imm8(offsetof(struct sk_buff, data)))
- /* mov %r10, off8(%rdi) */
- EMIT4(0x4c, 0x8b, 0x57,
- offsetof(struct sk_buff, data));
- else
- /* mov %r10, off32(%rdi) */
- EMIT3_off32(0x4c, 0x8b, 0x97,
- offsetof(struct sk_buff, data));
- }
+ if (seen_ld_abs)
+ emit_load_skb_data_hlen(&prog);
for (i = 0; i < insn_cnt; i++, insn++) {
const s32 imm32 = insn->imm;
@@ -367,6 +359,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
u8 b1 = 0, b2 = 0, b3 = 0;
s64 jmp_offset;
u8 jmp_cond;
+ bool reload_skb_data;
int ilen;
u8 *func;
@@ -818,12 +811,18 @@ xadd: if (is_imm8(insn->off))
func = (u8 *) __bpf_call_base + imm32;
jmp_offset = func - (image + addrs[i]);
if (seen_ld_abs) {
- EMIT2(0x41, 0x52); /* push %r10 */
- EMIT2(0x41, 0x51); /* push %r9 */
- /* need to adjust jmp offset, since
- * pop %r9, pop %r10 take 4 bytes after call insn
- */
- jmp_offset += 4;
+ reload_skb_data = bpf_helper_changes_skb_data(func);
+ if (reload_skb_data) {
+ EMIT1(0x57); /* push %rdi */
+ jmp_offset += 22; /* pop, mov, sub, mov */
+ } else {
+ EMIT2(0x41, 0x52); /* push %r10 */
+ EMIT2(0x41, 0x51); /* push %r9 */
+ /* need to adjust jmp offset, since
+ * pop %r9, pop %r10 take 4 bytes after call insn
+ */
+ jmp_offset += 4;
+ }
}
if (!imm32 || !is_simm32(jmp_offset)) {
pr_err("unsupported bpf func %d addr %p image %p\n",
@@ -832,8 +831,13 @@ xadd: if (is_imm8(insn->off))
}
EMIT1_off32(0xE8, jmp_offset);
if (seen_ld_abs) {
- EMIT2(0x41, 0x59); /* pop %r9 */
- EMIT2(0x41, 0x5A); /* pop %r10 */
+ if (reload_skb_data) {
+ EMIT1(0x5F); /* pop %rdi */
+ emit_load_skb_data_hlen(&prog);
+ } else {
+ EMIT2(0x41, 0x59); /* pop %r9 */
+ EMIT2(0x41, 0x5A); /* pop %r10 */
+ }
}
break;
@@ -1099,7 +1103,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
}
if (bpf_jit_enable > 1)
- bpf_jit_dump(prog->len, proglen, 0, image);
+ bpf_jit_dump(prog->len, proglen, pass + 1, image);
if (image) {
bpf_flush_icache(header, image + proglen);
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index d22f4b5bbc04..ff31ab464213 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -179,7 +179,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
if (ret)
goto error;
i = 0;
- list_for_each_entry(msidesc, &dev->msi_list, list) {
+ for_each_pci_msi_entry(msidesc, dev) {
irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i],
(type == PCI_CAP_ID_MSI) ? nvec : 1,
(type == PCI_CAP_ID_MSIX) ?
@@ -230,7 +230,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
if (type == PCI_CAP_ID_MSI && nvec > 1)
return 1;
- list_for_each_entry(msidesc, &dev->msi_list, list) {
+ for_each_pci_msi_entry(msidesc, dev) {
__pci_read_msi_msg(msidesc, &msg);
pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) |
((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff);
@@ -274,7 +274,7 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
int ret = 0;
struct msi_desc *msidesc;
- list_for_each_entry(msidesc, &dev->msi_list, list) {
+ for_each_pci_msi_entry(msidesc, dev) {
struct physdev_map_pirq map_irq;
domid_t domid;
@@ -386,7 +386,7 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev)
{
struct msi_desc *msidesc;
- msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
+ msidesc = first_pci_msi_entry(dev);
if (msidesc->msi_attrib.is_msix)
xen_pci_frontend_disable_msix(dev);
else
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index e4308fe6afe8..1db84c0758b7 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -650,7 +650,7 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md)
static void __init save_runtime_map(void)
{
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
efi_memory_desc_t *md;
void *tmp, *p, *q = NULL;
int count = 0;
@@ -748,7 +748,7 @@ static void * __init efi_map_regions(int *count, int *pg_shift)
static void __init kexec_enter_virtual_mode(void)
{
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
efi_memory_desc_t *md;
void *p;
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 8570abe68be1..e1c24631afbb 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -89,7 +89,7 @@ static int uv_domain_alloc(struct irq_domain *domain, unsigned int virq,
return -EINVAL;
chip_data = kmalloc_node(sizeof(*chip_data), GFP_KERNEL,
- irq_data->node);
+ irq_data_get_node(irq_data));
if (!chip_data)
return -ENOMEM;
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index 020c101c255f..5c9f63fa6abf 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -492,7 +492,7 @@ static void uv_nmi_touch_watchdogs(void)
touch_nmi_watchdog();
}
-#if defined(CONFIG_KEXEC)
+#if defined(CONFIG_KEXEC_CORE)
static atomic_t uv_nmi_kexec_failed;
static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
{
@@ -519,13 +519,13 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
uv_nmi_sync_exit(0);
}
-#else /* !CONFIG_KEXEC */
+#else /* !CONFIG_KEXEC_CORE */
static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
{
if (master)
pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n");
}
-#endif /* !CONFIG_KEXEC */
+#endif /* !CONFIG_KEXEC_CORE */
#ifdef CONFIG_KGDB
#ifdef CONFIG_KGDB_KDB
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 484145368a24..c7b15f3e2cf3 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -7,6 +7,7 @@ config XEN
depends on PARAVIRT
select PARAVIRT_CLOCK
select XEN_HAVE_PVMMU
+ select XEN_HAVE_VPMU
depends on X86_64 || (X86_32 && X86_PAE)
depends on X86_LOCAL_APIC && X86_TSC
help
@@ -23,14 +24,18 @@ config XEN_PVHVM
def_bool y
depends on XEN && PCI && X86_LOCAL_APIC
-config XEN_MAX_DOMAIN_MEMORY
- int
- default 500 if X86_64
- default 64 if X86_32
- depends on XEN
- help
- This only affects the sizing of some bss arrays, the unused
- portions of which are freed.
+config XEN_512GB
+ bool "Limit Xen pv-domain memory to 512GB"
+ depends on XEN && X86_64
+ default y
+ help
+ Limit paravirtualized user domains to 512GB of RAM.
+
+ The Xen tools and crash dump analysis tools might not support
+ pv-domains with more than 512 GB of RAM. This option controls whether
+ the kernel by default limits itself to 512 GB or may use more.
+ It is always possible to change the default via specifying the
+ boot parameter "xen_512gb_limit".
config XEN_SAVE_RESTORE
bool
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 4b6e29ac0968..e47e52787d32 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -13,7 +13,7 @@ CFLAGS_mmu.o := $(nostackp)
obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
grant-table.o suspend.o platform-pci-unplug.o \
- p2m.o apic.o
+ p2m.o apic.o pmu.o
obj-$(CONFIG_EVENT_TRACING) += trace.o
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 70e060ad879a..acda713ab5be 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -7,6 +7,7 @@
#include <xen/xen.h>
#include <xen/interface/physdev.h>
#include "xen-ops.h"
+#include "pmu.h"
#include "smp.h"
static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
@@ -72,6 +73,11 @@ static u32 xen_apic_read(u32 reg)
static void xen_apic_write(u32 reg, u32 val)
{
+ if (reg == APIC_LVTPC) {
+ (void)pmu_apic_update(reg);
+ return;
+ }
+
/* Warn to see if there's any stray references */
WARN(1,"register: %x, value: %x\n", reg, val);
}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index d9cfa452da9d..30d12afe52ed 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -84,6 +84,7 @@
#include "mmu.h"
#include "smp.h"
#include "multicalls.h"
+#include "pmu.h"
EXPORT_SYMBOL_GPL(hypercall_page);
@@ -1010,8 +1011,7 @@ static void xen_write_cr0(unsigned long cr0)
static void xen_write_cr4(unsigned long cr4)
{
- cr4 &= ~X86_CR4_PGE;
- cr4 &= ~X86_CR4_PSE;
+ cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE);
native_write_cr4(cr4);
}
@@ -1030,6 +1030,9 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err)
{
u64 val;
+ if (pmu_msr_read(msr, &val, err))
+ return val;
+
val = native_read_msr_safe(msr, err);
switch (msr) {
case MSR_IA32_APICBASE:
@@ -1076,7 +1079,8 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
Xen console noise. */
default:
- ret = native_write_msr_safe(msr, low, high);
+ if (!pmu_msr_write(msr, low, high, &ret))
+ ret = native_write_msr_safe(msr, low, high);
}
return ret;
@@ -1215,7 +1219,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.read_msr = xen_read_msr_safe,
.write_msr = xen_write_msr_safe,
- .read_pmc = native_read_pmc,
+ .read_pmc = xen_read_pmc,
.iret = xen_iret,
#ifdef CONFIG_X86_64
@@ -1264,6 +1268,10 @@ static const struct pv_apic_ops xen_apic_ops __initconst = {
static void xen_reboot(int reason)
{
struct sched_shutdown r = { .reason = reason };
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ xen_pmu_finish(cpu);
if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
BUG();
@@ -1607,7 +1615,9 @@ asmlinkage __visible void __init xen_start_kernel(void)
early_boot_irqs_disabled = true;
xen_raw_console_write("mapping kernel into physical memory\n");
- xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages);
+ xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base,
+ xen_start_info->nr_pages);
+ xen_reserve_special_pages();
/*
* Modify the cache mode translation tables to match Xen's PAT
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index dd151b2045b0..9c479fe40459 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -116,6 +116,7 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
+static phys_addr_t xen_pt_base, xen_pt_size __initdata;
/*
* Just beyond the highest usermode address. STACK_TOP_MAX has a
@@ -1093,6 +1094,16 @@ static void xen_exit_mmap(struct mm_struct *mm)
static void xen_post_allocator_init(void);
+static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+ struct mmuext_op op;
+
+ op.cmd = cmd;
+ op.arg1.mfn = pfn_to_mfn(pfn);
+ if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+ BUG();
+}
+
#ifdef CONFIG_X86_64
static void __init xen_cleanhighmap(unsigned long vaddr,
unsigned long vaddr_end)
@@ -1114,6 +1125,83 @@ static void __init xen_cleanhighmap(unsigned long vaddr,
xen_mc_flush();
}
+/*
+ * Make a page range writeable and free it.
+ */
+static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
+{
+ void *vaddr = __va(paddr);
+ void *vaddr_end = vaddr + size;
+
+ for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
+ make_lowmem_page_readwrite(vaddr);
+
+ memblock_free(paddr, size);
+}
+
+static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
+{
+ unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK;
+
+ if (unpin)
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa));
+ ClearPagePinned(virt_to_page(__va(pa)));
+ xen_free_ro_pages(pa, PAGE_SIZE);
+}
+
+/*
+ * Since it is well isolated we can (and since it is perhaps large we should)
+ * also free the page tables mapping the initial P->M table.
+ */
+static void __init xen_cleanmfnmap(unsigned long vaddr)
+{
+ unsigned long va = vaddr & PMD_MASK;
+ unsigned long pa;
+ pgd_t *pgd = pgd_offset_k(va);
+ pud_t *pud_page = pud_offset(pgd, 0);
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned int i;
+ bool unpin;
+
+ unpin = (vaddr == 2 * PGDIR_SIZE);
+ set_pgd(pgd, __pgd(0));
+ do {
+ pud = pud_page + pud_index(va);
+ if (pud_none(*pud)) {
+ va += PUD_SIZE;
+ } else if (pud_large(*pud)) {
+ pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
+ xen_free_ro_pages(pa, PUD_SIZE);
+ va += PUD_SIZE;
+ } else {
+ pmd = pmd_offset(pud, va);
+ if (pmd_large(*pmd)) {
+ pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
+ xen_free_ro_pages(pa, PMD_SIZE);
+ } else if (!pmd_none(*pmd)) {
+ pte = pte_offset_kernel(pmd, va);
+ set_pmd(pmd, __pmd(0));
+ for (i = 0; i < PTRS_PER_PTE; ++i) {
+ if (pte_none(pte[i]))
+ break;
+ pa = pte_pfn(pte[i]) << PAGE_SHIFT;
+ xen_free_ro_pages(pa, PAGE_SIZE);
+ }
+ xen_cleanmfnmap_free_pgtbl(pte, unpin);
+ }
+ va += PMD_SIZE;
+ if (pmd_index(va))
+ continue;
+ set_pud(pud, __pud(0));
+ xen_cleanmfnmap_free_pgtbl(pmd, unpin);
+ }
+
+ } while (pud_index(va) || pmd_index(va));
+ xen_cleanmfnmap_free_pgtbl(pud_page, unpin);
+}
+
static void __init xen_pagetable_p2m_free(void)
{
unsigned long size;
@@ -1128,18 +1216,31 @@ static void __init xen_pagetable_p2m_free(void)
/* using __ka address and sticking INVALID_P2M_ENTRY! */
memset((void *)xen_start_info->mfn_list, 0xff, size);
- /* We should be in __ka space. */
- BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
addr = xen_start_info->mfn_list;
- /* We roundup to the PMD, which means that if anybody at this stage is
- * using the __ka address of xen_start_info or xen_start_info->shared_info
- * they are in going to crash. Fortunatly we have already revectored
- * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */
+ /*
+ * We could be in __ka space.
+ * We roundup to the PMD, which means that if anybody at this stage is
+ * using the __ka address of xen_start_info or
+ * xen_start_info->shared_info they are going to crash. Fortunately
+ * we have already revectored in xen_setup_kernel_pagetable and in
+ * xen_setup_shared_info.
+ */
size = roundup(size, PMD_SIZE);
- xen_cleanhighmap(addr, addr + size);
- size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
- memblock_free(__pa(xen_start_info->mfn_list), size);
+ if (addr >= __START_KERNEL_map) {
+ xen_cleanhighmap(addr, addr + size);
+ size = PAGE_ALIGN(xen_start_info->nr_pages *
+ sizeof(unsigned long));
+ memblock_free(__pa(addr), size);
+ } else {
+ xen_cleanmfnmap(addr);
+ }
+}
+
+static void __init xen_pagetable_cleanhighmap(void)
+{
+ unsigned long size;
+ unsigned long addr;
/* At this stage, cleanup_highmap has already cleaned __ka space
* from _brk_limit way up to the max_pfn_mapped (which is the end of
@@ -1172,6 +1273,8 @@ static void __init xen_pagetable_p2m_setup(void)
#ifdef CONFIG_X86_64
xen_pagetable_p2m_free();
+
+ xen_pagetable_cleanhighmap();
#endif
/* And revector! Bye bye old array */
xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
@@ -1461,6 +1564,24 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
#else /* CONFIG_X86_64 */
static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
{
+ unsigned long pfn;
+
+ if (xen_feature(XENFEAT_writable_page_tables) ||
+ xen_feature(XENFEAT_auto_translated_physmap) ||
+ xen_start_info->mfn_list >= __START_KERNEL_map)
+ return pte;
+
+ /*
+ * Pages belonging to the initial p2m list mapped outside the default
+ * address range must be mapped read-only. This region contains the
+ * page tables for mapping the p2m list, too, and page tables MUST be
+ * mapped read-only.
+ */
+ pfn = pte_pfn(pte);
+ if (pfn >= xen_start_info->first_p2m_pfn &&
+ pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
+ pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW);
+
return pte;
}
#endif /* CONFIG_X86_64 */
@@ -1489,15 +1610,6 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
native_set_pte(ptep, pte);
}
-static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
-{
- struct mmuext_op op;
- op.cmd = cmd;
- op.arg1.mfn = pfn_to_mfn(pfn);
- if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
- BUG();
-}
-
/* Early in boot, while setting up the initial pagetable, assume
everything is pinned. */
static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
@@ -1815,7 +1927,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
* mappings. Considering that on Xen after the kernel mappings we
* have the mappings of some pages that don't exist in pfn space, we
* set max_pfn_mapped to the last real pfn mapped. */
- max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
+ if (xen_start_info->mfn_list < __START_KERNEL_map)
+ max_pfn_mapped = xen_start_info->first_p2m_pfn;
+ else
+ max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
pt_end = pt_base + xen_start_info->nr_pt_frames;
@@ -1855,6 +1970,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
/* Graft it onto L4[511][510] */
copy_page(level2_kernel_pgt, l2);
+ /* Copy the initial P->M table mappings if necessary. */
+ i = pgd_index(xen_start_info->mfn_list);
+ if (i && i < pgd_index(__START_KERNEL_map))
+ init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
+
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
/* Make pagetable pieces RO */
set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
@@ -1894,10 +2014,192 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
check_pt_base(&pt_base, &pt_end, addr[i]);
/* Our (by three pages) smaller Xen pagetable that we are using */
- memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE);
+ xen_pt_base = PFN_PHYS(pt_base);
+ xen_pt_size = (pt_end - pt_base) * PAGE_SIZE;
+ memblock_reserve(xen_pt_base, xen_pt_size);
+
/* Revector the xen_start_info */
xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
}
+
+/*
+ * Read a value from a physical address.
+ */
+static unsigned long __init xen_read_phys_ulong(phys_addr_t addr)
+{
+ unsigned long *vaddr;
+ unsigned long val;
+
+ vaddr = early_memremap_ro(addr, sizeof(val));
+ val = *vaddr;
+ early_memunmap(vaddr, sizeof(val));
+ return val;
+}
+
+/*
+ * Translate a virtual address to a physical one without relying on mapped
+ * page tables.
+ */
+static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
+{
+ phys_addr_t pa;
+ pgd_t pgd;
+ pud_t pud;
+ pmd_t pmd;
+ pte_t pte;
+
+ pa = read_cr3();
+ pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
+ sizeof(pgd)));
+ if (!pgd_present(pgd))
+ return 0;
+
+ pa = pgd_val(pgd) & PTE_PFN_MASK;
+ pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) *
+ sizeof(pud)));
+ if (!pud_present(pud))
+ return 0;
+ pa = pud_pfn(pud) << PAGE_SHIFT;
+ if (pud_large(pud))
+ return pa + (vaddr & ~PUD_MASK);
+
+ pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
+ sizeof(pmd)));
+ if (!pmd_present(pmd))
+ return 0;
+ pa = pmd_pfn(pmd) << PAGE_SHIFT;
+ if (pmd_large(pmd))
+ return pa + (vaddr & ~PMD_MASK);
+
+ pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) *
+ sizeof(pte)));
+ if (!pte_present(pte))
+ return 0;
+ pa = pte_pfn(pte) << PAGE_SHIFT;
+
+ return pa | (vaddr & ~PAGE_MASK);
+}
+
+/*
+ * Find a new area for the hypervisor supplied p2m list and relocate the p2m to
+ * this area.
+ */
+void __init xen_relocate_p2m(void)
+{
+ phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys;
+ unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
+ int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud;
+ pte_t *pt;
+ pmd_t *pmd;
+ pud_t *pud;
+ pgd_t *pgd;
+ unsigned long *new_p2m;
+
+ size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
+ n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
+ n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
+ n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
+ n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
+ n_frames = n_pte + n_pt + n_pmd + n_pud;
+
+ new_area = xen_find_free_area(PFN_PHYS(n_frames));
+ if (!new_area) {
+ xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n");
+ BUG();
+ }
+
+ /*
+ * Setup the page tables for addressing the new p2m list.
+ * We have asked the hypervisor to map the p2m list at the user address
+ * PGDIR_SIZE. It may have done so, or it may have used a kernel space
+ * address depending on the Xen version.
+ * To avoid any possible virtual address collision, just use
+ * 2 * PGDIR_SIZE for the new area.
+ */
+ pud_phys = new_area;
+ pmd_phys = pud_phys + PFN_PHYS(n_pud);
+ pt_phys = pmd_phys + PFN_PHYS(n_pmd);
+ p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
+
+ pgd = __va(read_cr3());
+ new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
+ for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
+ pud = early_memremap(pud_phys, PAGE_SIZE);
+ clear_page(pud);
+ for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
+ idx_pmd++) {
+ pmd = early_memremap(pmd_phys, PAGE_SIZE);
+ clear_page(pmd);
+ for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
+ idx_pt++) {
+ pt = early_memremap(pt_phys, PAGE_SIZE);
+ clear_page(pt);
+ for (idx_pte = 0;
+ idx_pte < min(n_pte, PTRS_PER_PTE);
+ idx_pte++) {
+ set_pte(pt + idx_pte,
+ pfn_pte(p2m_pfn, PAGE_KERNEL));
+ p2m_pfn++;
+ }
+ n_pte -= PTRS_PER_PTE;
+ early_memunmap(pt, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pt_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
+ PFN_DOWN(pt_phys));
+ set_pmd(pmd + idx_pt,
+ __pmd(_PAGE_TABLE | pt_phys));
+ pt_phys += PAGE_SIZE;
+ }
+ n_pt -= PTRS_PER_PMD;
+ early_memunmap(pmd, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pmd_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
+ PFN_DOWN(pmd_phys));
+ set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
+ pmd_phys += PAGE_SIZE;
+ }
+ n_pmd -= PTRS_PER_PUD;
+ early_memunmap(pud, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pud_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
+ set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
+ pud_phys += PAGE_SIZE;
+ }
+
+ /* Now copy the old p2m info to the new area. */
+ memcpy(new_p2m, xen_p2m_addr, size);
+ xen_p2m_addr = new_p2m;
+
+ /* Release the old p2m list and set new list info. */
+ p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list));
+ BUG_ON(!p2m_pfn);
+ p2m_pfn_end = p2m_pfn + PFN_DOWN(size);
+
+ if (xen_start_info->mfn_list < __START_KERNEL_map) {
+ pfn = xen_start_info->first_p2m_pfn;
+ pfn_end = xen_start_info->first_p2m_pfn +
+ xen_start_info->nr_p2m_frames;
+ set_pgd(pgd + 1, __pgd(0));
+ } else {
+ pfn = p2m_pfn;
+ pfn_end = p2m_pfn_end;
+ }
+
+ memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
+ while (pfn < pfn_end) {
+ if (pfn == p2m_pfn) {
+ pfn = p2m_pfn_end;
+ continue;
+ }
+ make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+ pfn++;
+ }
+
+ xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
+ xen_start_info->first_p2m_pfn = PFN_DOWN(new_area);
+ xen_start_info->nr_p2m_frames = n_frames;
+}
+
#else /* !CONFIG_X86_64 */
static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
@@ -1938,18 +2240,41 @@ static void __init xen_write_cr3_init(unsigned long cr3)
pv_mmu_ops.write_cr3 = &xen_write_cr3;
}
+/*
+ * For 32 bit domains xen_start_info->pt_base is the pgd address, which might
+ * not be the first page table in the page table pool.
+ * Iterate through the initial page tables to find the real page table base.
+ */
+static phys_addr_t xen_find_pt_base(pmd_t *pmd)
+{
+ phys_addr_t pt_base, paddr;
+ unsigned pmdidx;
+
+ pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd));
+
+ for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++)
+ if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) {
+ paddr = m2p(pmd[pmdidx].pmd);
+ pt_base = min(pt_base, paddr);
+ }
+
+ return pt_base;
+}
+
void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
{
pmd_t *kernel_pmd;
+ kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
+
+ xen_pt_base = xen_find_pt_base(kernel_pmd);
+ xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE;
+
initial_kernel_pmd =
extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
- max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
- xen_start_info->nr_pt_frames * PAGE_SIZE +
- 512*1024);
+ max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024);
- kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
copy_page(initial_kernel_pmd, kernel_pmd);
xen_map_identity_early(initial_kernel_pmd, max_pfn);
@@ -1968,11 +2293,33 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
PFN_DOWN(__pa(initial_page_table)));
xen_write_cr3(__pa(initial_page_table));
- memblock_reserve(__pa(xen_start_info->pt_base),
- xen_start_info->nr_pt_frames * PAGE_SIZE);
+ memblock_reserve(xen_pt_base, xen_pt_size);
}
#endif /* CONFIG_X86_64 */
+void __init xen_reserve_special_pages(void)
+{
+ phys_addr_t paddr;
+
+ memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
+ if (xen_start_info->store_mfn) {
+ paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn));
+ memblock_reserve(paddr, PAGE_SIZE);
+ }
+ if (!xen_initial_domain()) {
+ paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn));
+ memblock_reserve(paddr, PAGE_SIZE);
+ }
+}
+
+void __init xen_pt_check_e820(void)
+{
+ if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) {
+ xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n");
+ BUG();
+ }
+}
+
static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
@@ -2465,9 +2812,9 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
return 0;
}
-static int do_remap_mfn(struct vm_area_struct *vma,
+static int do_remap_gfn(struct vm_area_struct *vma,
unsigned long addr,
- xen_pfn_t *mfn, int nr,
+ xen_pfn_t *gfn, int nr,
int *err_ptr, pgprot_t prot,
unsigned domid,
struct page **pages)
@@ -2483,14 +2830,14 @@ static int do_remap_mfn(struct vm_area_struct *vma,
if (xen_feature(XENFEAT_auto_translated_physmap)) {
#ifdef CONFIG_XEN_PVH
/* We need to update the local page tables and the xen HAP */
- return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr,
+ return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr,
prot, domid, pages);
#else
return -EINVAL;
#endif
}
- rmd.mfn = mfn;
+ rmd.mfn = gfn;
rmd.prot = prot;
/* We use the err_ptr to indicate if there we are doing a contigious
* mapping or a discontigious mapping. */
@@ -2518,8 +2865,8 @@ static int do_remap_mfn(struct vm_area_struct *vma,
batch_left, &done, domid);
/*
- * @err_ptr may be the same buffer as @mfn, so
- * only clear it after each chunk of @mfn is
+ * @err_ptr may be the same buffer as @gfn, so
+ * only clear it after each chunk of @gfn is
* used.
*/
if (err_ptr) {
@@ -2549,19 +2896,19 @@ out:
return err < 0 ? err : mapped;
}
-int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
+int xen_remap_domain_gfn_range(struct vm_area_struct *vma,
unsigned long addr,
- xen_pfn_t mfn, int nr,
+ xen_pfn_t gfn, int nr,
pgprot_t prot, unsigned domid,
struct page **pages)
{
- return do_remap_mfn(vma, addr, &mfn, nr, NULL, prot, domid, pages);
+ return do_remap_gfn(vma, addr, &gfn, nr, NULL, prot, domid, pages);
}
-EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
+EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_range);
-int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
+int xen_remap_domain_gfn_array(struct vm_area_struct *vma,
unsigned long addr,
- xen_pfn_t *mfn, int nr,
+ xen_pfn_t *gfn, int nr,
int *err_ptr, pgprot_t prot,
unsigned domid, struct page **pages)
{
@@ -2570,13 +2917,13 @@ int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
* cause of "wrong memory was mapped in".
*/
BUG_ON(err_ptr == NULL);
- return do_remap_mfn(vma, addr, mfn, nr, err_ptr, prot, domid, pages);
+ return do_remap_gfn(vma, addr, gfn, nr, err_ptr, prot, domid, pages);
}
-EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array);
+EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array);
/* Returns: 0 success */
-int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
+int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
int numpgs, struct page **pages)
{
if (!pages || !xen_feature(XENFEAT_auto_translated_physmap))
@@ -2588,4 +2935,4 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
return -EINVAL;
#endif
}
-EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range);
+EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range);
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 8b7f18e200aa..bfc08b13044b 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -79,10 +79,14 @@
#include <xen/balloon.h>
#include <xen/grant_table.h>
-#include "p2m.h"
#include "multicalls.h"
#include "xen-ops.h"
+#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
+#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
+
+#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
+
#define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE)
unsigned long *xen_p2m_addr __read_mostly;
@@ -199,7 +203,8 @@ void __ref xen_build_mfn_list_list(void)
unsigned int level, topidx, mididx;
unsigned long *mid_mfn_p;
- if (xen_feature(XENFEAT_auto_translated_physmap))
+ if (xen_feature(XENFEAT_auto_translated_physmap) ||
+ xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
return;
/* Pre-initialize p2m_top_mfn to be completely missing */
@@ -260,9 +265,16 @@ void xen_setup_mfn_list_list(void)
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
- HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
- virt_to_mfn(p2m_top_mfn);
+ if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = ~0UL;
+ else
+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+ virt_to_mfn(p2m_top_mfn);
HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
+ HYPERVISOR_shared_info->arch.p2m_generation = 0;
+ HYPERVISOR_shared_info->arch.p2m_vaddr = (unsigned long)xen_p2m_addr;
+ HYPERVISOR_shared_info->arch.p2m_cr3 =
+ xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
}
/* Set up p2m_top to point to the domain-builder provided p2m pages */
@@ -478,8 +490,12 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
ptechk = lookup_address(vaddr, &level);
if (ptechk == pte_pg) {
+ HYPERVISOR_shared_info->arch.p2m_generation++;
+ wmb(); /* Tools are synchronizing via p2m_generation. */
set_pmd(pmdp,
__pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE));
+ wmb(); /* Tools are synchronizing via p2m_generation. */
+ HYPERVISOR_shared_info->arch.p2m_generation++;
pte_newpg[i] = NULL;
}
@@ -505,7 +521,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
*/
static bool alloc_p2m(unsigned long pfn)
{
- unsigned topidx, mididx;
+ unsigned topidx;
unsigned long *top_mfn_p, *mid_mfn;
pte_t *ptep, *pte_pg;
unsigned int level;
@@ -513,9 +529,6 @@ static bool alloc_p2m(unsigned long pfn)
unsigned long addr = (unsigned long)(xen_p2m_addr + pfn);
unsigned long p2m_pfn;
- topidx = p2m_top_index(pfn);
- mididx = p2m_mid_index(pfn);
-
ptep = lookup_address(addr, &level);
BUG_ON(!ptep || level != PG_LEVEL_4K);
pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
@@ -527,7 +540,8 @@ static bool alloc_p2m(unsigned long pfn)
return false;
}
- if (p2m_top_mfn) {
+ if (p2m_top_mfn && pfn < MAX_P2M_PFN) {
+ topidx = p2m_top_index(pfn);
top_mfn_p = &p2m_top_mfn[topidx];
mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]);
@@ -577,10 +591,14 @@ static bool alloc_p2m(unsigned long pfn)
spin_lock_irqsave(&p2m_update_lock, flags);
if (pte_pfn(*ptep) == p2m_pfn) {
+ HYPERVISOR_shared_info->arch.p2m_generation++;
+ wmb(); /* Tools are synchronizing via p2m_generation. */
set_pte(ptep,
pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL));
+ wmb(); /* Tools are synchronizing via p2m_generation. */
+ HYPERVISOR_shared_info->arch.p2m_generation++;
if (mid_mfn)
- mid_mfn[mididx] = virt_to_mfn(p2m);
+ mid_mfn[p2m_mid_index(pfn)] = virt_to_mfn(p2m);
p2m = NULL;
}
@@ -630,6 +648,11 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
return true;
}
+ /*
+ * The interface requires atomic updates on p2m elements.
+ * xen_safe_write_ulong() is using __put_user which does an atomic
+ * store via asm().
+ */
if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn)))
return true;
diff --git a/arch/x86/xen/p2m.h b/arch/x86/xen/p2m.h
deleted file mode 100644
index ad8aee24ab72..000000000000
--- a/arch/x86/xen/p2m.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef _XEN_P2M_H
-#define _XEN_P2M_H
-
-#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
-#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
-#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
-
-#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
-
-#define MAX_REMAP_RANGES 10
-
-extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
- unsigned long pfn_e);
-
-#endif /* _XEN_P2M_H */
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index a8261716d58d..9586ff32810c 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -68,7 +68,7 @@ static int check_platform_magic(void)
return 0;
}
-bool xen_has_pv_devices()
+bool xen_has_pv_devices(void)
{
if (!xen_domain())
return false;
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
new file mode 100644
index 000000000000..724a08740a04
--- /dev/null
+++ b/arch/x86/xen/pmu.c
@@ -0,0 +1,570 @@
+#include <linux/types.h>
+#include <linux/interrupt.h>
+
+#include <asm/xen/hypercall.h>
+#include <xen/page.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+#include <xen/interface/xenpmu.h>
+
+#include "xen-ops.h"
+#include "pmu.h"
+
+/* x86_pmu.handle_irq definition */
+#include "../kernel/cpu/perf_event.h"
+
+/* Bit in xenpmu.flags: set by xen_pmu_irq_handler() while it is running. */
+#define XENPMU_IRQ_PROCESSING 1
+/* Per-CPU VPMU state. */
+struct xenpmu {
+ /* Shared page between hypervisor and domain */
+ struct xen_pmu_data *xenpmu_data;
+
+ /* XENPMU_* flag bits, see XENPMU_IRQ_PROCESSING */
+ uint8_t flags;
+};
+static DEFINE_PER_CPU(struct xenpmu, xenpmu_shared);
+/* Accessors for the xenpmu_shared entry of the CPU executing the code. */
+#define get_xenpmu_data() (this_cpu_ptr(&xenpmu_shared)->xenpmu_data)
+#define get_xenpmu_flags() (this_cpu_ptr(&xenpmu_shared)->flags)
+
+/*
+ * Macro for computing address of a PMU MSR bank.
+ *
+ * @ctxt->@field holds the bank's byte offset relative to @ctxt itself; the
+ * result is @ctxt's address plus that offset, as a void pointer.
+ * @ctxt is evaluated twice, so it must not have side effects. It is
+ * parenthesized so that any pointer expression (e.g. &obj) expands correctly.
+ */
+#define field_offset(ctxt, field) ((void *)((uintptr_t)(ctxt) + \
+					    (uintptr_t)(ctxt)->field))
+
+/* AMD PMU */
+#define F15H_NUM_COUNTERS 6
+#define F10H_NUM_COUNTERS 4
+
+/* AMD register layout, filled in by xen_pmu_arch_init() from the CPU family */
+static __read_mostly uint32_t amd_counters_base;
+static __read_mostly uint32_t amd_ctrls_base;
+static __read_mostly int amd_msr_step;
+static __read_mostly int k7_counters_mirrored;
+static __read_mostly int amd_num_counters;
+
+/* Intel PMU */
+/* MSR classes reported through the 'type' output of is_intel_pmu_msr() */
+#define MSR_TYPE_COUNTER 0
+#define MSR_TYPE_CTRL 1
+#define MSR_TYPE_GLOBAL 2
+#define MSR_TYPE_ARCH_COUNTER 3
+#define MSR_TYPE_ARCH_CTRL 4
+
+/* Number of general pmu registers (CPUID.EAX[0xa].EAX[8..15]) */
+#define PMU_GENERAL_NR_SHIFT 8
+#define PMU_GENERAL_NR_BITS 8
+#define PMU_GENERAL_NR_MASK (((1 << PMU_GENERAL_NR_BITS) - 1) \
+ << PMU_GENERAL_NR_SHIFT)
+
+/* Number of fixed pmu registers (CPUID.EDX[0xa].EDX[0..4]) */
+#define PMU_FIXED_NR_SHIFT 0
+#define PMU_FIXED_NR_BITS 5
+#define PMU_FIXED_NR_MASK (((1 << PMU_FIXED_NR_BITS) - 1) \
+ << PMU_FIXED_NR_SHIFT)
+
+/* Alias registers (0x4c1) for full-width writes to PMCs */
+#define MSR_PMC_ALIAS_MASK (~(MSR_IA32_PERFCTR0 ^ MSR_IA32_PMC0))
+
+/*
+ * Bit of the counter number given to xen_intel_read_pmc() that selects a
+ * fixed counter instead of an architectural one.
+ */
+#define INTEL_PMC_TYPE_SHIFT 30
+
+/* Counter counts read from CPUID leaf 0xa by xen_pmu_arch_init() */
+static __read_mostly int intel_num_arch_counters, intel_num_fixed_counters;
+
+
+/*
+ * Record the PMU register layout of the boot CPU.
+ *
+ * On AMD the counter/control MSR bases, their stride and the number of
+ * counters are chosen by CPU family; on every other vendor the number of
+ * architectural and fixed counters is taken from CPUID leaf 0xa.
+ */
+static void xen_pmu_arch_init(void)
+{
+	uint32_t eax, ebx, ecx, edx;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
+		cpuid(0xa, &eax, &ebx, &ecx, &edx);
+
+		intel_num_arch_counters =
+			(eax & PMU_GENERAL_NR_MASK) >> PMU_GENERAL_NR_SHIFT;
+		intel_num_fixed_counters =
+			(edx & PMU_FIXED_NR_MASK) >> PMU_FIXED_NR_SHIFT;
+		return;
+	}
+
+	if (boot_cpu_data.x86 == 0x15) {
+		amd_num_counters = F15H_NUM_COUNTERS;
+		amd_counters_base = MSR_F15H_PERF_CTR;
+		amd_ctrls_base = MSR_F15H_PERF_CTL;
+		amd_msr_step = 2;
+		k7_counters_mirrored = 1;
+	} else {
+		/* Families 0x10, 0x12, 0x14, 0x16 and any other family */
+		amd_num_counters = F10H_NUM_COUNTERS;
+		amd_counters_base = MSR_K7_PERFCTR0;
+		amd_ctrls_base = MSR_K7_EVNTSEL0;
+		amd_msr_step = 1;
+		k7_counters_mirrored = 0;
+	}
+}
+
+/*
+ * Translate a legacy K7 performance MSR address into the matching family
+ * 15h address. Any other address is returned unchanged.
+ *
+ * The K7 counter and event-select banks are each contiguous, whereas the
+ * family 15h registers are interleaved (consecutive counters are two MSRs
+ * apart, matching amd_msr_step == 2), so the K7 index has to be doubled.
+ */
+static inline uint32_t get_fam15h_addr(u32 addr)
+{
+	switch (addr) {
+	case MSR_K7_PERFCTR0:
+	case MSR_K7_PERFCTR1:
+	case MSR_K7_PERFCTR2:
+	case MSR_K7_PERFCTR3:
+		return MSR_F15H_PERF_CTR + 2 * (addr - MSR_K7_PERFCTR0);
+	case MSR_K7_EVNTSEL0:
+	case MSR_K7_EVNTSEL1:
+	case MSR_K7_EVNTSEL2:
+	case MSR_K7_EVNTSEL3:
+		return MSR_F15H_PERF_CTL + 2 * (addr - MSR_K7_EVNTSEL0);
+	default:
+		break;
+	}
+
+	return addr;
+}
+
+/*
+ * Tell whether @msr falls into one of the two AMD PMU MSR windows: the
+ * family 15h window or the legacy K7 window, both sized by
+ * amd_num_counters.
+ */
+static inline bool is_amd_pmu_msr(unsigned int msr)
+{
+	const bool in_f15h_window =
+		msr >= MSR_F15H_PERF_CTL &&
+		msr < MSR_F15H_PERF_CTR + (amd_num_counters * 2);
+	const bool in_k7_window =
+		msr >= MSR_K7_EVNTSEL0 &&
+		msr < MSR_K7_PERFCTR0 + amd_num_counters;
+
+	return in_f15h_window || in_k7_window;
+}
+
+/*
+ * Classify @msr_index as an Intel PMU MSR.
+ *
+ * Returns true and stores the MSR_TYPE_* class in @type when the MSR belongs
+ * to the PMU, false otherwise. @index is written only for counter-like
+ * registers (fixed counters, event selects, architectural counters); it is
+ * left untouched for the control and global registers.
+ */
+static int is_intel_pmu_msr(u32 msr_index, int *type, int *index)
+{
+	u32 pmc;
+
+	switch (msr_index) {
+	case MSR_CORE_PERF_FIXED_CTR_CTRL:
+	case MSR_IA32_DS_AREA:
+	case MSR_IA32_PEBS_ENABLE:
+		*type = MSR_TYPE_CTRL;
+		return true;
+
+	case MSR_CORE_PERF_GLOBAL_CTRL:
+	case MSR_CORE_PERF_GLOBAL_STATUS:
+	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+		*type = MSR_TYPE_GLOBAL;
+		return true;
+
+	default:
+		break;
+	}
+
+	/* Fixed-function counters */
+	if (msr_index >= MSR_CORE_PERF_FIXED_CTR0 &&
+	    msr_index < MSR_CORE_PERF_FIXED_CTR0 + intel_num_fixed_counters) {
+		*index = msr_index - MSR_CORE_PERF_FIXED_CTR0;
+		*type = MSR_TYPE_COUNTER;
+		return true;
+	}
+
+	/* Event select registers */
+	if (msr_index >= MSR_P6_EVNTSEL0 &&
+	    msr_index < MSR_P6_EVNTSEL0 + intel_num_arch_counters) {
+		*index = msr_index - MSR_P6_EVNTSEL0;
+		*type = MSR_TYPE_ARCH_CTRL;
+		return true;
+	}
+
+	/* Architectural counters, reached directly or through the alias range */
+	pmc = msr_index & MSR_PMC_ALIAS_MASK;
+	if (pmc >= MSR_IA32_PERFCTR0 &&
+	    pmc < MSR_IA32_PERFCTR0 + intel_num_arch_counters) {
+		*type = MSR_TYPE_ARCH_COUNTER;
+		*index = pmc - MSR_IA32_PERFCTR0;
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Serve an Intel PMU MSR access from the context cached in this CPU's
+ * shared page instead of touching the MSR.
+ *
+ * @msr: MSR being accessed
+ * @val: value to write, or where the value read is stored
+ * @type, @index: classification produced by is_intel_pmu_msr()
+ * @is_read: true for a read, false for a write
+ *
+ * Returns true if the access was handled here. Returns false when there is
+ * no shared page, when XENPMU_IRQ_PROCESSING is not set, or when the MSR has
+ * no slot in the cached context; pmu_msr_read()/pmu_msr_write() then fall
+ * back to a native MSR access.
+ */
+static bool xen_intel_pmu_emulate(unsigned int msr, u64 *val, int type,
+ int index, bool is_read)
+{
+ uint64_t *reg = NULL;
+ struct xen_pmu_intel_ctxt *ctxt;
+ uint64_t *fix_counters;
+ struct xen_pmu_cntr_pair *arch_cntr_pair;
+ struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+ uint8_t xenpmu_flags = get_xenpmu_flags();
+
+
+ if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING))
+ return false;
+
+ ctxt = &xenpmu_data->pmu.c.intel;
+
+ /* Registers with a dedicated field first, then the indexed banks. */
+ switch (msr) {
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ reg = &ctxt->global_ovf_ctrl;
+ break;
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ reg = &ctxt->global_status;
+ break;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ reg = &ctxt->global_ctrl;
+ break;
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ reg = &ctxt->fixed_ctrl;
+ break;
+ default:
+ switch (type) {
+ case MSR_TYPE_COUNTER:
+ fix_counters = field_offset(ctxt, fixed_counters);
+ reg = &fix_counters[index];
+ break;
+ case MSR_TYPE_ARCH_COUNTER:
+ arch_cntr_pair = field_offset(ctxt, arch_counters);
+ reg = &arch_cntr_pair[index].counter;
+ break;
+ case MSR_TYPE_ARCH_CTRL:
+ arch_cntr_pair = field_offset(ctxt, arch_counters);
+ reg = &arch_cntr_pair[index].control;
+ break;
+ default:
+ return false;
+ }
+ }
+
+ if (reg) {
+ if (is_read)
+ *val = *reg;
+ else {
+ *reg = *val;
+
+ /* A write to OVF_CTRL also clears the written bits in global_status. */
+ if (msr == MSR_CORE_PERF_GLOBAL_OVF_CTRL)
+ ctxt->global_status &= (~(*val));
+ }
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Serve an AMD PMU MSR access from the context cached in this CPU's shared
+ * page instead of touching the MSR.
+ *
+ * @msr: MSR being accessed
+ * @val: value to write, or where the value read is stored
+ * @is_read: true for a read, false for a write
+ *
+ * Returns true if the access was handled here. Returns false when there is
+ * no shared page, when XENPMU_IRQ_PROCESSING is not set, or when @msr matches
+ * no counter or control register; the callers then fall back to a native
+ * MSR access.
+ */
+static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read)
+{
+ uint64_t *reg = NULL;
+ int i, off = 0;
+ struct xen_pmu_amd_ctxt *ctxt;
+ uint64_t *counter_regs, *ctrl_regs;
+ struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+ uint8_t xenpmu_flags = get_xenpmu_flags();
+
+ if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING))
+ return false;
+
+ /* Legacy K7 addresses are first translated when they are mirrored. */
+ if (k7_counters_mirrored &&
+ ((msr >= MSR_K7_EVNTSEL0) && (msr <= MSR_K7_PERFCTR3)))
+ msr = get_fam15h_addr(msr);
+
+ ctxt = &xenpmu_data->pmu.c.amd;
+ /* Walk the register pairs; 'off' advances by the MSR stride per counter. */
+ for (i = 0; i < amd_num_counters; i++) {
+ if (msr == amd_ctrls_base + off) {
+ ctrl_regs = field_offset(ctxt, ctrls);
+ reg = &ctrl_regs[i];
+ break;
+ } else if (msr == amd_counters_base + off) {
+ counter_regs = field_offset(ctxt, counters);
+ reg = &counter_regs[i];
+ break;
+ }
+ off += amd_msr_step;
+ }
+
+ if (reg) {
+ if (is_read)
+ *val = *reg;
+ else
+ *reg = *val;
+
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Read a PMU MSR on behalf of the caller.
+ *
+ * Returns false if @msr is not a PMU MSR for the boot CPU's vendor, leaving
+ * @val and @err untouched. Otherwise returns true with @val filled in, either
+ * from the cached context or, if emulation declined, from a native safe read
+ * (which also sets @err).
+ */
+bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err)
+{
+	bool emulated;
+	int type, index;
+
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+		if (!is_amd_pmu_msr(msr))
+			return false;
+		emulated = xen_amd_pmu_emulate(msr, val, 1);
+	} else {
+		if (!is_intel_pmu_msr(msr, &type, &index))
+			return false;
+		emulated = xen_intel_pmu_emulate(msr, val, type, index, 1);
+	}
+
+	if (!emulated)
+		*val = native_read_msr_safe(msr, err);
+
+	return true;
+}
+
+/*
+ * Write a PMU MSR on behalf of the caller; the value is @high:@low.
+ *
+ * Returns false if @msr is not a PMU MSR for the boot CPU's vendor, leaving
+ * @err untouched. Otherwise returns true after storing the value in the
+ * cached context or, if emulation declined, after a native safe write whose
+ * result is stored in @err.
+ */
+bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err)
+{
+	uint64_t val = ((uint64_t)high << 32) | low;
+	bool emulated;
+	int type, index;
+
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+		if (!is_amd_pmu_msr(msr))
+			return false;
+		emulated = xen_amd_pmu_emulate(msr, &val, 0);
+	} else {
+		if (!is_intel_pmu_msr(msr, &type, &index))
+			return false;
+		emulated = xen_intel_pmu_emulate(msr, &val, type, index, 0);
+	}
+
+	if (!emulated)
+		*err = native_write_msr_safe(msr, low, high);
+
+	return true;
+}
+
+/*
+ * Read AMD performance counter @counter: from the cached context while
+ * XENPMU_IRQ_PROCESSING is set, otherwise from the MSR itself (the error
+ * code of the safe read is discarded).
+ */
+static unsigned long long xen_amd_read_pmc(int counter)
+{
+	struct xen_pmu_data *data = get_xenpmu_data();
+	uint8_t flags = get_xenpmu_flags();
+	struct xen_pmu_amd_ctxt *ctxt;
+	uint64_t *bank;
+
+	if (data && (flags & XENPMU_IRQ_PROCESSING)) {
+		ctxt = &data->pmu.c.amd;
+		bank = field_offset(ctxt, counters);
+		return bank[counter];
+	} else {
+		uint32_t msr = amd_counters_base + (counter * amd_msr_step);
+		int err;
+
+		return native_read_msr_safe(msr, &err);
+	}
+}
+
+/*
+ * Read Intel performance counter @counter. Bit INTEL_PMC_TYPE_SHIFT of
+ * @counter selects a fixed counter (indexed by the low 16 bits); otherwise
+ * @counter is the index of an architectural counter.
+ *
+ * The value comes from the cached context while XENPMU_IRQ_PROCESSING is
+ * set, otherwise from the MSR itself (the error code of the safe read is
+ * discarded).
+ */
+static unsigned long long xen_intel_read_pmc(int counter)
+{
+	struct xen_pmu_data *data = get_xenpmu_data();
+	uint8_t flags = get_xenpmu_flags();
+	struct xen_pmu_intel_ctxt *ctxt;
+	struct xen_pmu_cntr_pair *pairs;
+	uint64_t *fixed;
+
+	if (!data || !(flags & XENPMU_IRQ_PROCESSING)) {
+		uint32_t msr;
+		int err;
+
+		if (counter & (1 << INTEL_PMC_TYPE_SHIFT))
+			msr = MSR_CORE_PERF_FIXED_CTR0 + (counter & 0xffff);
+		else
+			msr = MSR_IA32_PERFCTR0 + counter;
+
+		return native_read_msr_safe(msr, &err);
+	}
+
+	ctxt = &data->pmu.c.intel;
+
+	if (!(counter & (1 << INTEL_PMC_TYPE_SHIFT))) {
+		pairs = field_offset(ctxt, arch_counters);
+		return pairs[counter].counter;
+	}
+
+	fixed = field_offset(ctxt, fixed_counters);
+	return fixed[counter & 0xffff];
+}
+
+/* Read performance counter @counter via the vendor-specific helper. */
+unsigned long long xen_read_pmc(int counter)
+{
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+		return xen_intel_read_pmc(counter);
+
+	return xen_amd_read_pmc(counter);
+}
+
+/*
+ * Store @val as the LVTPC value in this CPU's shared page and, unless the
+ * PMU interrupt is currently being processed, ask the hypervisor to apply
+ * it.
+ *
+ * Returns -EINVAL if the shared page is not set up, 0 if the hypercall was
+ * skipped, otherwise the hypercall's result.
+ */
+int pmu_apic_update(uint32_t val)
+{
+	struct xen_pmu_data *data = get_xenpmu_data();
+
+	if (!data) {
+		pr_warn_once("%s: pmudata not initialized\n", __func__);
+		return -EINVAL;
+	}
+
+	data->pmu.l.lapic_lvtpc = val;
+
+	if (get_xenpmu_flags() & XENPMU_IRQ_PROCESSING)
+		return 0;
+
+	return HYPERVISOR_xenpmu_op(XENPMU_lvtpc_set, NULL);
+}
+
+/* perf callbacks */
+/*
+ * perf callback: returns 1 only when running as the initial domain and the
+ * domain id recorded in the shared page is below DOMID_SELF; 0 otherwise,
+ * including when the shared page is not set up.
+ */
+static int xen_is_in_guest(void)
+{
+	const struct xen_pmu_data *data = get_xenpmu_data();
+
+	if (!data) {
+		pr_warn_once("%s: pmudata not initialized\n", __func__);
+		return 0;
+	}
+
+	return xen_initial_domain() && data->domain_id < DOMID_SELF;
+}
+
+/*
+ * perf callback: non-zero if the sample in the shared page was taken in
+ * user mode. For PV samples the PMU_SAMPLE_USER flag decides; otherwise the
+ * low two bits of the recorded cpl do. Returns 0 when the shared page is
+ * not set up.
+ */
+static int xen_is_user_mode(void)
+{
+	const struct xen_pmu_data *data = get_xenpmu_data();
+
+	if (!data) {
+		pr_warn_once("%s: pmudata not initialized\n", __func__);
+		return 0;
+	}
+
+	if (!(data->pmu.pmu_flags & PMU_SAMPLE_PV))
+		return !!(data->pmu.r.regs.cpl & 3);
+
+	return (data->pmu.pmu_flags & PMU_SAMPLE_USER);
+}
+
+/*
+ * perf callback: instruction pointer recorded in the shared page, or 0 when
+ * the shared page is not set up.
+ */
+static unsigned long xen_get_guest_ip(void)
+{
+	const struct xen_pmu_data *data = get_xenpmu_data();
+
+	if (data)
+		return data->pmu.r.regs.ip;
+
+	pr_warn_once("%s: pmudata not initialized\n", __func__);
+	return 0;
+}
+
+/* Callback table handed to perf by xen_pmu_init() when called for cpu 0. */
+static struct perf_guest_info_callbacks xen_guest_cbs = {
+ .is_in_guest = xen_is_in_guest,
+ .is_user_mode = xen_is_user_mode,
+ .get_guest_ip = xen_get_guest_ip,
+};
+
+/*
+ * Convert registers from Xen's format to Linux'.
+ *
+ * Copies ip, cs and sp, then forces the low two bits of cs to 3 for a
+ * user-mode sample and to 0 otherwise. For PV samples the mode comes from
+ * PMU_SAMPLE_USER in @pmu_flags, otherwise from a non-zero cpl.
+ */
+static void xen_convert_regs(const struct xen_pmu_regs *xen_regs,
+			     struct pt_regs *regs, uint64_t pmu_flags)
+{
+	bool user;
+
+	regs->ip = xen_regs->ip;
+	regs->cs = xen_regs->cs;
+	regs->sp = xen_regs->sp;
+
+	if (pmu_flags & PMU_SAMPLE_PV)
+		user = (pmu_flags & PMU_SAMPLE_USER) != 0;
+	else
+		user = xen_regs->cpl != 0;
+
+	if (user)
+		regs->cs |= 3;
+	else
+		regs->cs &= ~3;
+}
+
+/*
+ * PMU interrupt handler.
+ *
+ * Marks this CPU as processing a PMU interrupt (so MSR accesses are served
+ * from the cached context), runs the regular x86 PMU handler on registers
+ * converted from the shared page, then asks the hypervisor to flush the
+ * cached context and restores the previous flags.
+ *
+ * Returns IRQ_HANDLED if the PMU handler claimed the interrupt and the flush
+ * succeeded, IRQ_NONE otherwise (including when the shared page is not set
+ * up).
+ */
+irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id)
+{
+	int err, ret = IRQ_NONE;
+	/*
+	 * xen_convert_regs() fills in only ip, cs and sp; zero everything else
+	 * so the PMU handler never sees stale stack contents.
+	 */
+	struct pt_regs regs = {0};
+	const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+	uint8_t xenpmu_flags = get_xenpmu_flags();
+
+	if (!xenpmu_data) {
+		pr_warn_once("%s: pmudata not initialized\n", __func__);
+		return ret;
+	}
+
+	this_cpu_ptr(&xenpmu_shared)->flags =
+		xenpmu_flags | XENPMU_IRQ_PROCESSING;
+	xen_convert_regs(&xenpmu_data->pmu.r.regs, &regs,
+			 xenpmu_data->pmu.pmu_flags);
+	if (x86_pmu.handle_irq(&regs))
+		ret = IRQ_HANDLED;
+
+	/* Write out cached context to HW */
+	err = HYPERVISOR_xenpmu_op(XENPMU_flush, NULL);
+	/* Restore the flags as they were on entry. */
+	this_cpu_ptr(&xenpmu_shared)->flags = xenpmu_flags;
+	if (err) {
+		pr_warn_once("%s: failed hypercall, err: %d\n", __func__, err);
+		return IRQ_NONE;
+	}
+
+	return ret;
+}
+
+/*
+ * Report whether the VPMU shared page has been set up for @cpu.
+ *
+ * The check uses @cpu's own xenpmu_shared slot rather than the slot of the
+ * CPU executing this code, so the result is correct when one CPU asks about
+ * another.
+ */
+bool is_xen_pmu(int cpu)
+{
+	return per_cpu(xenpmu_shared, cpu).xenpmu_data != NULL;
+}
+
+/*
+ * Set up the VPMU for @cpu: allocate a zeroed page, register its MFN with
+ * the hypervisor via XENPMU_init and, on success, publish the page in
+ * @cpu's xenpmu_shared slot with cleared flags. For cpu 0 the perf guest
+ * callbacks are registered and the PMU layout is detected as well.
+ *
+ * Does nothing on HVM domains. If the allocation or the hypercall fails,
+ * the per-CPU slot is left untouched (the page is freed on hypercall
+ * failure).
+ */
+void xen_pmu_init(int cpu)
+{
+ int err;
+ struct xen_pmu_params xp;
+ unsigned long pfn;
+ struct xen_pmu_data *xenpmu_data;
+
+ /* The shared structure must fit in the single page allocated below. */
+ BUILD_BUG_ON(sizeof(struct xen_pmu_data) > PAGE_SIZE);
+
+ if (xen_hvm_domain())
+ return;
+
+ xenpmu_data = (struct xen_pmu_data *)get_zeroed_page(GFP_KERNEL);
+ if (!xenpmu_data) {
+ pr_err("VPMU init: No memory\n");
+ return;
+ }
+ pfn = virt_to_pfn(xenpmu_data);
+
+ xp.val = pfn_to_mfn(pfn);
+ xp.vcpu = cpu;
+ xp.version.maj = XENPMU_VER_MAJ;
+ xp.version.min = XENPMU_VER_MIN;
+ err = HYPERVISOR_xenpmu_op(XENPMU_init, &xp);
+ if (err)
+ goto fail;
+
+ per_cpu(xenpmu_shared, cpu).xenpmu_data = xenpmu_data;
+ per_cpu(xenpmu_shared, cpu).flags = 0;
+
+ /* One-time, system-wide setup is done when called for cpu 0. */
+ if (cpu == 0) {
+ perf_register_guest_info_callbacks(&xen_guest_cbs);
+ xen_pmu_arch_init();
+ }
+
+ return;
+
+fail:
+ pr_warn_once("Could not initialize VPMU for cpu %d, error %d\n",
+ cpu, err);
+ free_pages((unsigned long)xenpmu_data, 0);
+}
+
+/*
+ * Tear down the VPMU for @cpu: issue XENPMU_finish (its result is
+ * deliberately ignored), free @cpu's shared page and clear the per-CPU
+ * pointer. Does nothing on HVM domains.
+ */
+void xen_pmu_finish(int cpu)
+{
+ struct xen_pmu_params xp;
+
+ if (xen_hvm_domain())
+ return;
+
+ xp.vcpu = cpu;
+ xp.version.maj = XENPMU_VER_MAJ;
+ xp.version.min = XENPMU_VER_MIN;
+
+ (void)HYPERVISOR_xenpmu_op(XENPMU_finish, &xp);
+
+ free_pages((unsigned long)per_cpu(xenpmu_shared, cpu).xenpmu_data, 0);
+ per_cpu(xenpmu_shared, cpu).xenpmu_data = NULL;
+}
diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h
new file mode 100644
index 000000000000..af5f0ad94078
--- /dev/null
+++ b/arch/x86/xen/pmu.h
@@ -0,0 +1,15 @@
+/* Interface of the Xen VPMU support code in pmu.c. */
+#ifndef __XEN_PMU_H
+#define __XEN_PMU_H
+
+#include <xen/interface/xenpmu.h>
+
+/* PMU interrupt handler */
+irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id);
+/* Per-CPU setup and teardown of the VPMU shared page */
+void xen_pmu_init(int cpu);
+void xen_pmu_finish(int cpu);
+/* True if a VPMU shared page is set up */
+bool is_xen_pmu(int cpu);
+/* MSR accessors: return false if msr is not a PMU MSR */
+bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err);
+bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err);
+/* Record a new LVTPC value and pass it on to the hypervisor */
+int pmu_apic_update(uint32_t reg);
+/* Read a performance counter */
+unsigned long long xen_read_pmc(int counter);
+
+#endif /* __XEN_PMU_H */
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 55f388ef481a..f5ef6746d47a 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -27,17 +27,23 @@
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
+#include <xen/hvc-console.h>
#include "xen-ops.h"
#include "vdso.h"
-#include "p2m.h"
#include "mmu.h"
+#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)
+
/* Amount of extra memory space we add to the e820 ranges */
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;
+/* E820 map used during setting up memory. */
+static struct e820entry xen_e820_map[E820MAX] __initdata;
+static u32 xen_e820_map_entries __initdata;
+
/*
* Buffer used to remap identity mapped pages. We only need the virtual space.
* The physical page behind this address is remapped as needed to different
@@ -64,62 +70,89 @@ static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
*/
#define EXTRA_MEM_RATIO (10)
-static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size)
+static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);
+
+/*
+ * Look for "xen_512gb_limit" on the Xen command line and update
+ * xen_512gb_limit accordingly: the bare option enables the limit,
+ * "xen_512gb_limit=<bool>" sets it to the given value. The current setting
+ * is kept when the option is absent or its value cannot be parsed.
+ */
+static void __init xen_parse_512gb(void)
+{
+	bool enable = true;
+	char *assign;
+
+	if (!strstr(xen_start_info->cmd_line, "xen_512gb_limit"))
+		return;
+
+	/* With an explicit "=value" the parsed value replaces the default. */
+	assign = strstr(xen_start_info->cmd_line, "xen_512gb_limit=");
+	if (assign &&
+	    strtobool(assign + strlen("xen_512gb_limit="), &enable))
+		return;
+
+	xen_512gb_limit = enable;
+}
+
+static void __init xen_add_extra_mem(unsigned long start_pfn,
+ unsigned long n_pfns)
{
int i;
+ /*
+ * No need to check for zero size, should happen rarely and will only
+ * write a new entry regarded to be unused due to zero size.
+ */
for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
/* Add new region. */
- if (xen_extra_mem[i].size == 0) {
- xen_extra_mem[i].start = start;
- xen_extra_mem[i].size = size;
+ if (xen_extra_mem[i].n_pfns == 0) {
+ xen_extra_mem[i].start_pfn = start_pfn;
+ xen_extra_mem[i].n_pfns = n_pfns;
break;
}
/* Append to existing region. */
- if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
- xen_extra_mem[i].size += size;
+ if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
+ start_pfn) {
+ xen_extra_mem[i].n_pfns += n_pfns;
break;
}
}
if (i == XEN_EXTRA_MEM_MAX_REGIONS)
printk(KERN_WARNING "Warning: not enough extra memory regions\n");
- memblock_reserve(start, size);
+ memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
}
-static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size)
+static void __init xen_del_extra_mem(unsigned long start_pfn,
+ unsigned long n_pfns)
{
int i;
- phys_addr_t start_r, size_r;
+ unsigned long start_r, size_r;
for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
- start_r = xen_extra_mem[i].start;
- size_r = xen_extra_mem[i].size;
+ start_r = xen_extra_mem[i].start_pfn;
+ size_r = xen_extra_mem[i].n_pfns;
/* Start of region. */
- if (start_r == start) {
- BUG_ON(size > size_r);
- xen_extra_mem[i].start += size;
- xen_extra_mem[i].size -= size;
+ if (start_r == start_pfn) {
+ BUG_ON(n_pfns > size_r);
+ xen_extra_mem[i].start_pfn += n_pfns;
+ xen_extra_mem[i].n_pfns -= n_pfns;
break;
}
/* End of region. */
- if (start_r + size_r == start + size) {
- BUG_ON(size > size_r);
- xen_extra_mem[i].size -= size;
+ if (start_r + size_r == start_pfn + n_pfns) {
+ BUG_ON(n_pfns > size_r);
+ xen_extra_mem[i].n_pfns -= n_pfns;
break;
}
/* Mid of region. */
- if (start > start_r && start < start_r + size_r) {
- BUG_ON(start + size > start_r + size_r);
- xen_extra_mem[i].size = start - start_r;
+ if (start_pfn > start_r && start_pfn < start_r + size_r) {
+ BUG_ON(start_pfn + n_pfns > start_r + size_r);
+ xen_extra_mem[i].n_pfns = start_pfn - start_r;
/* Calling memblock_reserve() again is okay. */
- xen_add_extra_mem(start + size, start_r + size_r -
- (start + size));
+ xen_add_extra_mem(start_pfn + n_pfns, start_r + size_r -
+ (start_pfn + n_pfns));
break;
}
}
- memblock_free(start, size);
+ memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
}
/*
@@ -130,11 +163,10 @@ static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size)
unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
{
int i;
- phys_addr_t addr = PFN_PHYS(pfn);
for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
- if (addr >= xen_extra_mem[i].start &&
- addr < xen_extra_mem[i].start + xen_extra_mem[i].size)
+ if (pfn >= xen_extra_mem[i].start_pfn &&
+ pfn < xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns)
return INVALID_P2M_ENTRY;
}
@@ -150,10 +182,10 @@ void __init xen_inv_extra_mem(void)
int i;
for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
- if (!xen_extra_mem[i].size)
+ if (!xen_extra_mem[i].n_pfns)
continue;
- pfn_s = PFN_DOWN(xen_extra_mem[i].start);
- pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size);
+ pfn_s = xen_extra_mem[i].start_pfn;
+ pfn_e = pfn_s + xen_extra_mem[i].n_pfns;
for (pfn = pfn_s; pfn < pfn_e; pfn++)
set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
}
@@ -164,15 +196,13 @@ void __init xen_inv_extra_mem(void)
* This function updates min_pfn with the pfn found and returns
* the size of that range or zero if not found.
*/
-static unsigned long __init xen_find_pfn_range(
- const struct e820entry *list, size_t map_size,
- unsigned long *min_pfn)
+static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn)
{
- const struct e820entry *entry;
+ const struct e820entry *entry = xen_e820_map;
unsigned int i;
unsigned long done = 0;
- for (i = 0, entry = list; i < map_size; i++, entry++) {
+ for (i = 0; i < xen_e820_map_entries; i++, entry++) {
unsigned long s_pfn;
unsigned long e_pfn;
@@ -221,7 +251,7 @@ static int __init xen_free_mfn(unsigned long mfn)
* as a fallback if the remapping fails.
*/
static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
- unsigned long end_pfn, unsigned long nr_pages, unsigned long *released)
+ unsigned long end_pfn, unsigned long nr_pages)
{
unsigned long pfn, end;
int ret;
@@ -241,7 +271,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
if (ret == 1) {
- (*released)++;
+ xen_released_pages++;
if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
break;
} else
@@ -356,9 +386,8 @@ static void __init xen_do_set_identity_and_remap_chunk(
* to Xen and not remapped.
*/
static unsigned long __init xen_set_identity_and_remap_chunk(
- const struct e820entry *list, size_t map_size, unsigned long start_pfn,
- unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn,
- unsigned long *released, unsigned long *remapped)
+ unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
+ unsigned long remap_pfn)
{
unsigned long pfn;
unsigned long i = 0;
@@ -379,12 +408,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
if (cur_pfn + size > nr_pages)
size = nr_pages - cur_pfn;
- remap_range_size = xen_find_pfn_range(list, map_size,
- &remap_pfn);
+ remap_range_size = xen_find_pfn_range(&remap_pfn);
if (!remap_range_size) {
pr_warning("Unable to find available pfn range, not remapping identity pages\n");
xen_set_identity_and_release_chunk(cur_pfn,
- cur_pfn + left, nr_pages, released);
+ cur_pfn + left, nr_pages);
break;
}
/* Adjust size to fit in current e820 RAM region */
@@ -396,7 +424,6 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
/* Update variables to reflect new mappings. */
i += size;
remap_pfn += size;
- *remapped += size;
}
/*
@@ -411,15 +438,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
return remap_pfn;
}
-static void __init xen_set_identity_and_remap(
- const struct e820entry *list, size_t map_size, unsigned long nr_pages,
- unsigned long *released, unsigned long *remapped)
+static void __init xen_set_identity_and_remap(unsigned long nr_pages)
{
phys_addr_t start = 0;
unsigned long last_pfn = nr_pages;
- const struct e820entry *entry;
- unsigned long num_released = 0;
- unsigned long num_remapped = 0;
+ const struct e820entry *entry = xen_e820_map;
int i;
/*
@@ -433,9 +456,9 @@ static void __init xen_set_identity_and_remap(
* example) the DMI tables in a reserved region that begins on
* a non-page boundary.
*/
- for (i = 0, entry = list; i < map_size; i++, entry++) {
+ for (i = 0; i < xen_e820_map_entries; i++, entry++) {
phys_addr_t end = entry->addr + entry->size;
- if (entry->type == E820_RAM || i == map_size - 1) {
+ if (entry->type == E820_RAM || i == xen_e820_map_entries - 1) {
unsigned long start_pfn = PFN_DOWN(start);
unsigned long end_pfn = PFN_UP(end);
@@ -444,17 +467,13 @@ static void __init xen_set_identity_and_remap(
if (start_pfn < end_pfn)
last_pfn = xen_set_identity_and_remap_chunk(
- list, map_size, start_pfn,
- end_pfn, nr_pages, last_pfn,
- &num_released, &num_remapped);
+ start_pfn, end_pfn, nr_pages,
+ last_pfn);
start = end;
}
}
- *released = num_released;
- *remapped = num_remapped;
-
- pr_info("Released %ld page(s)\n", num_released);
+ pr_info("Released %ld page(s)\n", xen_released_pages);
}
/*
@@ -494,7 +513,7 @@ void __init xen_remap_memory(void)
} else if (pfn_s + len == xen_remap_buf.target_pfn) {
len += xen_remap_buf.size;
} else {
- xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
+ xen_del_extra_mem(pfn_s, len);
pfn_s = xen_remap_buf.target_pfn;
len = xen_remap_buf.size;
}
@@ -504,19 +523,36 @@ void __init xen_remap_memory(void)
}
if (pfn_s != ~0UL && len)
- xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
+ xen_del_extra_mem(pfn_s, len);
set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
pr_info("Remapped %ld page(s)\n", remapped);
}
+/*
+ * Upper bound, in pages, on the memory the domain may use: 64GB worth of
+ * pages on 32-bit; on 64-bit MAXMEM worth, reduced to 512GB worth for a
+ * non-initial domain when xen_512gb_limit is set.
+ */
+static unsigned long __init xen_get_pages_limit(void)
+{
+#ifdef CONFIG_X86_32
+	return GB(64) / PAGE_SIZE;
+#else
+	if (!xen_initial_domain() && xen_512gb_limit)
+		return GB(512) / PAGE_SIZE;
+
+	return MAXMEM / PAGE_SIZE;
+#endif
+}
+
static unsigned long __init xen_get_max_pages(void)
{
- unsigned long max_pages = MAX_DOMAIN_PAGES;
+ unsigned long max_pages, limit;
domid_t domid = DOMID_SELF;
int ret;
+ limit = xen_get_pages_limit();
+ max_pages = limit;
+
/*
* For the initial domain we use the maximum reservation as
* the maximum page.
@@ -532,7 +568,7 @@ static unsigned long __init xen_get_max_pages(void)
max_pages = ret;
}
- return min(max_pages, MAX_DOMAIN_PAGES);
+ return min(max_pages, limit);
}
static void __init xen_align_and_add_e820_region(phys_addr_t start,
@@ -549,39 +585,188 @@ static void __init xen_align_and_add_e820_region(phys_addr_t start,
e820_add_region(start, end - start, type);
}
-static void __init xen_ignore_unusable(struct e820entry *list, size_t map_size)
+static void __init xen_ignore_unusable(void)
{
- struct e820entry *entry;
+ struct e820entry *entry = xen_e820_map;
unsigned int i;
- for (i = 0, entry = list; i < map_size; i++, entry++) {
+ for (i = 0; i < xen_e820_map_entries; i++, entry++) {
if (entry->type == E820_UNUSABLE)
entry->type = E820_RAM;
}
}
+static unsigned long __init xen_count_remap_pages(unsigned long max_pfn)
+{
+ unsigned long extra = 0;
+ unsigned long start_pfn, end_pfn;
+ const struct e820entry *entry = xen_e820_map;
+ int i;
+
+ end_pfn = 0;
+ for (i = 0; i < xen_e820_map_entries; i++, entry++) {
+ start_pfn = PFN_DOWN(entry->addr);
+ /* Adjacent regions on non-page boundaries handling! */
+ end_pfn = min(end_pfn, start_pfn);
+
+ if (start_pfn >= max_pfn)
+ return extra + max_pfn - end_pfn;
+
+ /* Add any holes in map to result. */
+ extra += start_pfn - end_pfn;
+
+ end_pfn = PFN_UP(entry->addr + entry->size);
+ end_pfn = min(end_pfn, max_pfn);
+
+ if (entry->type != E820_RAM)
+ extra += end_pfn - start_pfn;
+ }
+
+ return extra;
+}
+
+bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
+{
+ struct e820entry *entry;
+ unsigned mapcnt;
+ phys_addr_t end;
+
+ if (!size)
+ return false;
+
+ end = start + size;
+ entry = xen_e820_map;
+
+ for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++) {
+ if (entry->type == E820_RAM && entry->addr <= start &&
+ (entry->addr + entry->size) >= end)
+ return false;
+
+ entry++;
+ }
+
+ return true;
+}
+
+/*
+ * Find a free area in physical memory not yet reserved and compliant with
+ * E820 map.
+ * Used to relocate pre-allocated areas like initrd or p2m list which are in
+ * conflict with the to be used E820 map.
+ * In case no area is found, return 0. Otherwise return the physical address
+ * of the area which is already reserved for convenience.
+ */
+phys_addr_t __init xen_find_free_area(phys_addr_t size)
+{
+ unsigned mapcnt;
+ phys_addr_t addr, start;
+ struct e820entry *entry = xen_e820_map;
+
+ for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++, entry++) {
+ if (entry->type != E820_RAM || entry->size < size)
+ continue;
+ start = entry->addr;
+ for (addr = start; addr < start + size; addr += PAGE_SIZE) {
+ if (!memblock_is_reserved(addr))
+ continue;
+ start = addr + PAGE_SIZE;
+ if (start + size > entry->addr + entry->size)
+ break;
+ }
+ if (addr >= start + size) {
+ memblock_reserve(start, size);
+ return start;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Like memcpy, but with physical addresses for dest and src.
+ */
+static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src,
+ phys_addr_t n)
+{
+ phys_addr_t dest_off, src_off, dest_len, src_len, len;
+ void *from, *to;
+
+ while (n) {
+ dest_off = dest & ~PAGE_MASK;
+ src_off = src & ~PAGE_MASK;
+ dest_len = n;
+ if (dest_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off)
+ dest_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off;
+ src_len = n;
+ if (src_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off)
+ src_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off;
+ len = min(dest_len, src_len);
+ to = early_memremap(dest - dest_off, dest_len + dest_off);
+ from = early_memremap(src - src_off, src_len + src_off);
+ memcpy(to, from, len);
+ early_memunmap(to, dest_len + dest_off);
+ early_memunmap(from, src_len + src_off);
+ n -= len;
+ dest += len;
+ src += len;
+ }
+}
+
+/*
+ * Reserve Xen mfn_list.
+ */
+static void __init xen_reserve_xen_mfnlist(void)
+{
+ phys_addr_t start, size;
+
+ if (xen_start_info->mfn_list >= __START_KERNEL_map) {
+ start = __pa(xen_start_info->mfn_list);
+ size = PFN_ALIGN(xen_start_info->nr_pages *
+ sizeof(unsigned long));
+ } else {
+ start = PFN_PHYS(xen_start_info->first_p2m_pfn);
+ size = PFN_PHYS(xen_start_info->nr_p2m_frames);
+ }
+
+ if (!xen_is_e820_reserved(start, size)) {
+ memblock_reserve(start, size);
+ return;
+ }
+
+#ifdef CONFIG_X86_32
+ /*
+ * Relocating the p2m on 32 bit system to an arbitrary virtual address
+ * is not supported, so just give up.
+ */
+ xen_raw_console_write("Xen hypervisor allocated p2m list conflicts with E820 map\n");
+ BUG();
+#else
+ xen_relocate_p2m();
+#endif
+}
+
/**
* machine_specific_memory_setup - Hook for machine specific memory setup.
**/
char * __init xen_memory_setup(void)
{
- static struct e820entry map[E820MAX] __initdata;
-
- unsigned long max_pfn = xen_start_info->nr_pages;
- phys_addr_t mem_end;
+ unsigned long max_pfn, pfn_s, n_pfns;
+ phys_addr_t mem_end, addr, size, chunk_size;
+ u32 type;
int rc;
struct xen_memory_map memmap;
unsigned long max_pages;
unsigned long extra_pages = 0;
- unsigned long remapped_pages;
int i;
int op;
- max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
+ xen_parse_512gb();
+ max_pfn = xen_get_pages_limit();
+ max_pfn = min(max_pfn, xen_start_info->nr_pages);
mem_end = PFN_PHYS(max_pfn);
memmap.nr_entries = E820MAX;
- set_xen_guest_handle(memmap.buffer, map);
+ set_xen_guest_handle(memmap.buffer, xen_e820_map);
op = xen_initial_domain() ?
XENMEM_machine_memory_map :
@@ -590,15 +775,16 @@ char * __init xen_memory_setup(void)
if (rc == -ENOSYS) {
BUG_ON(xen_initial_domain());
memmap.nr_entries = 1;
- map[0].addr = 0ULL;
- map[0].size = mem_end;
+ xen_e820_map[0].addr = 0ULL;
+ xen_e820_map[0].size = mem_end;
/* 8MB slack (to balance backend allocations). */
- map[0].size += 8ULL << 20;
- map[0].type = E820_RAM;
+ xen_e820_map[0].size += 8ULL << 20;
+ xen_e820_map[0].type = E820_RAM;
rc = 0;
}
BUG_ON(rc);
BUG_ON(memmap.nr_entries == 0);
+ xen_e820_map_entries = memmap.nr_entries;
/*
* Xen won't allow a 1:1 mapping to be created to UNUSABLE
@@ -609,24 +795,19 @@ char * __init xen_memory_setup(void)
* a patch in the future.
*/
if (xen_initial_domain())
- xen_ignore_unusable(map, memmap.nr_entries);
+ xen_ignore_unusable();
/* Make sure the Xen-supplied memory map is well-ordered. */
- sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);
+ sanitize_e820_map(xen_e820_map, xen_e820_map_entries,
+ &xen_e820_map_entries);
max_pages = xen_get_max_pages();
- if (max_pages > max_pfn)
- extra_pages += max_pages - max_pfn;
- /*
- * Set identity map on non-RAM pages and prepare remapping the
- * underlying RAM.
- */
- xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
- &xen_released_pages, &remapped_pages);
+ /* How many extra pages do we need due to remapping? */
+ max_pages += xen_count_remap_pages(max_pfn);
- extra_pages += xen_released_pages;
- extra_pages += remapped_pages;
+ if (max_pages > max_pfn)
+ extra_pages += max_pages - max_pfn;
/*
* Clamp the amount of extra memory to a EXTRA_MEM_RATIO
@@ -635,46 +816,54 @@ char * __init xen_memory_setup(void)
* is limited to the max size of lowmem, so that it doesn't
* get completely filled.
*
+ * Make sure we have no memory above max_pages, as this area
+ * isn't handled by the p2m management.
+ *
* In principle there could be a problem in lowmem systems if
* the initial memory is also very large with respect to
* lowmem, but we won't try to deal with that here.
*/
- extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
- extra_pages);
+ extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
+ extra_pages, max_pages - max_pfn);
i = 0;
- while (i < memmap.nr_entries) {
- phys_addr_t addr = map[i].addr;
- phys_addr_t size = map[i].size;
- u32 type = map[i].type;
+ addr = xen_e820_map[0].addr;
+ size = xen_e820_map[0].size;
+ while (i < xen_e820_map_entries) {
+ chunk_size = size;
+ type = xen_e820_map[i].type;
if (type == E820_RAM) {
if (addr < mem_end) {
- size = min(size, mem_end - addr);
+ chunk_size = min(size, mem_end - addr);
} else if (extra_pages) {
- size = min(size, PFN_PHYS(extra_pages));
- extra_pages -= PFN_DOWN(size);
- xen_add_extra_mem(addr, size);
- xen_max_p2m_pfn = PFN_DOWN(addr + size);
+ chunk_size = min(size, PFN_PHYS(extra_pages));
+ pfn_s = PFN_UP(addr);
+ n_pfns = PFN_DOWN(addr + chunk_size) - pfn_s;
+ extra_pages -= n_pfns;
+ xen_add_extra_mem(pfn_s, n_pfns);
+ xen_max_p2m_pfn = pfn_s + n_pfns;
} else
type = E820_UNUSABLE;
}
- xen_align_and_add_e820_region(addr, size, type);
+ xen_align_and_add_e820_region(addr, chunk_size, type);
- map[i].addr += size;
- map[i].size -= size;
- if (map[i].size == 0)
+ addr += chunk_size;
+ size -= chunk_size;
+ if (size == 0) {
i++;
+ if (i < xen_e820_map_entries) {
+ addr = xen_e820_map[i].addr;
+ size = xen_e820_map[i].size;
+ }
+ }
}
/*
* Set the rest as identity mapped, in case PCI BARs are
* located here.
- *
- * PFNs above MAX_P2M_PFN are considered identity mapped as
- * well.
*/
- set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul);
+ set_phys_range_identity(addr / PAGE_SIZE, ~0ul);
/*
* In domU, the ISA region is normal, usable memory, but we
@@ -684,34 +873,53 @@ char * __init xen_memory_setup(void)
e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
E820_RESERVED);
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+
/*
- * Reserve Xen bits:
- * - mfn_list
- * - xen_start_info
- * See comment above "struct start_info" in <xen/interface/xen.h>
- * We tried to make the the memblock_reserve more selective so
- * that it would be clear what region is reserved. Sadly we ran
- * in the problem wherein on a 64-bit hypervisor with a 32-bit
- * initial domain, the pt_base has the cr3 value which is not
- * neccessarily where the pagetable starts! As Jan put it: "
- * Actually, the adjustment turns out to be correct: The page
- * tables for a 32-on-64 dom0 get allocated in the order "first L1",
- * "first L2", "first L3", so the offset to the page table base is
- * indeed 2. When reading xen/include/public/xen.h's comment
- * very strictly, this is not a violation (since there nothing is said
- * that the first thing in the page table space is pointed to by
- * pt_base; I admit that this seems to be implied though, namely
- * do I think that it is implied that the page table space is the
- * range [pt_base, pt_base + nt_pt_frames), whereas that
- * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames),
- * which - without a priori knowledge - the kernel would have
- * difficulty to figure out)." - so lets just fall back to the
- * easy way and reserve the whole region.
+ * Check whether the kernel itself conflicts with the target E820 map.
+ * Failing now is better than running into weird problems later due
+ * to relocating (and even reusing) pages with kernel text or data.
*/
- memblock_reserve(__pa(xen_start_info->mfn_list),
- xen_start_info->pt_base - xen_start_info->mfn_list);
+ if (xen_is_e820_reserved(__pa_symbol(_text),
+ __pa_symbol(__bss_stop) - __pa_symbol(_text))) {
+ xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n");
+ BUG();
+ }
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ /*
+ * Check for a conflict of the hypervisor supplied page tables with
+ * the target E820 map.
+ */
+ xen_pt_check_e820();
+
+ xen_reserve_xen_mfnlist();
+
+ /* Check for a conflict of the initrd with the target E820 map. */
+ if (xen_is_e820_reserved(boot_params.hdr.ramdisk_image,
+ boot_params.hdr.ramdisk_size)) {
+ phys_addr_t new_area, start, size;
+
+ new_area = xen_find_free_area(boot_params.hdr.ramdisk_size);
+ if (!new_area) {
+ xen_raw_console_write("Can't find new memory area for initrd needed due to E820 map conflict\n");
+ BUG();
+ }
+
+ start = boot_params.hdr.ramdisk_image;
+ size = boot_params.hdr.ramdisk_size;
+ xen_phys_memcpy(new_area, start, size);
+ pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n",
+ start, start + size, new_area, new_area + size);
+ memblock_free(start, size);
+ boot_params.hdr.ramdisk_image = new_area;
+ boot_params.ext_ramdisk_image = new_area >> 32;
+ }
+
+ /*
+ * Set identity map on non-RAM pages and prepare remapping the
+ * underlying RAM.
+ */
+ xen_set_identity_and_remap(max_pfn);
return "Xen";
}
@@ -721,26 +929,30 @@ char * __init xen_memory_setup(void)
*/
char * __init xen_auto_xlated_memory_setup(void)
{
- static struct e820entry map[E820MAX] __initdata;
-
struct xen_memory_map memmap;
int i;
int rc;
memmap.nr_entries = E820MAX;
- set_xen_guest_handle(memmap.buffer, map);
+ set_xen_guest_handle(memmap.buffer, xen_e820_map);
rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
if (rc < 0)
panic("No memory map (%d)\n", rc);
- sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries);
+ xen_e820_map_entries = memmap.nr_entries;
+
+ sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map),
+ &xen_e820_map_entries);
- for (i = 0; i < memmap.nr_entries; i++)
- e820_add_region(map[i].addr, map[i].size, map[i].type);
+ for (i = 0; i < xen_e820_map_entries; i++)
+ e820_add_region(xen_e820_map[i].addr, xen_e820_map[i].size,
+ xen_e820_map[i].type);
- memblock_reserve(__pa(xen_start_info->mfn_list),
- xen_start_info->pt_base - xen_start_info->mfn_list);
+ /* Remove p2m info, it is not needed. */
+ xen_start_info->mfn_list = 0;
+ xen_start_info->first_p2m_pfn = 0;
+ xen_start_info->nr_p2m_frames = 0;
return "Xen";
}
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 86484384492e..3f4ebf0261f2 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -26,6 +26,7 @@
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
+#include <xen/interface/xenpmu.h>
#include <asm/xen/interface.h>
#include <asm/xen/hypercall.h>
@@ -38,6 +39,7 @@
#include "xen-ops.h"
#include "mmu.h"
#include "smp.h"
+#include "pmu.h"
cpumask_var_t xen_cpu_initialized_map;
@@ -50,6 +52,7 @@ static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 };
static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 };
static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 };
static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 };
+static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 };
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
@@ -148,11 +151,18 @@ static void xen_smp_intr_free(unsigned int cpu)
kfree(per_cpu(xen_irq_work, cpu).name);
per_cpu(xen_irq_work, cpu).name = NULL;
}
+
+ if (per_cpu(xen_pmu_irq, cpu).irq >= 0) {
+ unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL);
+ per_cpu(xen_pmu_irq, cpu).irq = -1;
+ kfree(per_cpu(xen_pmu_irq, cpu).name);
+ per_cpu(xen_pmu_irq, cpu).name = NULL;
+ }
};
static int xen_smp_intr_init(unsigned int cpu)
{
int rc;
- char *resched_name, *callfunc_name, *debug_name;
+ char *resched_name, *callfunc_name, *debug_name, *pmu_name;
resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
@@ -218,6 +228,18 @@ static int xen_smp_intr_init(unsigned int cpu)
per_cpu(xen_irq_work, cpu).irq = rc;
per_cpu(xen_irq_work, cpu).name = callfunc_name;
+ if (is_xen_pmu(cpu)) {
+ pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu);
+ rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu,
+ xen_pmu_irq_handler,
+ IRQF_PERCPU|IRQF_NOBALANCING,
+ pmu_name, NULL);
+ if (rc < 0)
+ goto fail;
+ per_cpu(xen_pmu_irq, cpu).irq = rc;
+ per_cpu(xen_pmu_irq, cpu).name = pmu_name;
+ }
+
return 0;
fail:
@@ -335,6 +357,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
}
set_cpu_sibling_map(0);
+ xen_pmu_init(0);
+
if (xen_smp_intr_init(0))
BUG();
@@ -429,7 +453,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
}
#endif
ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
- ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
+ ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir));
if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
BUG();
@@ -462,6 +486,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
if (rc)
return rc;
+ xen_pmu_init(cpu);
+
rc = xen_smp_intr_init(cpu);
if (rc)
return rc;
@@ -503,6 +529,7 @@ static void xen_cpu_die(unsigned int cpu)
xen_smp_intr_free(cpu);
xen_uninit_lock_cpu(cpu);
xen_teardown_timer(cpu);
+ xen_pmu_finish(cpu);
}
}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 53b4c0811f4f..feddabdab448 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -11,6 +11,7 @@
#include "xen-ops.h"
#include "mmu.h"
+#include "pmu.h"
static void xen_pv_pre_suspend(void)
{
@@ -67,16 +68,26 @@ static void xen_pv_post_suspend(int suspend_cancelled)
void xen_arch_pre_suspend(void)
{
- if (xen_pv_domain())
- xen_pv_pre_suspend();
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ xen_pmu_finish(cpu);
+
+ if (xen_pv_domain())
+ xen_pv_pre_suspend();
}
void xen_arch_post_suspend(int cancelled)
{
- if (xen_pv_domain())
- xen_pv_post_suspend(cancelled);
- else
- xen_hvm_post_suspend(cancelled);
+ int cpu;
+
+ if (xen_pv_domain())
+ xen_pv_post_suspend(cancelled);
+ else
+ xen_hvm_post_suspend(cancelled);
+
+ for_each_online_cpu(cpu)
+ xen_pmu_init(cpu);
}
static void xen_vcpu_notify_restore(void *data)
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 8afdfccf6086..b65f59a358a2 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -104,6 +104,8 @@ ENTRY(hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
#else
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
+ /* Map the p2m table to a 512GB-aligned user address. */
+ ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad PGDIR_SIZE)
#endif
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 2292721b1d10..1399423f3418 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -35,13 +35,20 @@ void xen_build_mfn_list_list(void);
void xen_setup_machphys_mapping(void);
void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
void xen_reserve_top(void);
+void __init xen_reserve_special_pages(void);
+void __init xen_pt_check_e820(void);
void xen_mm_pin_all(void);
void xen_mm_unpin_all(void);
+#ifdef CONFIG_X86_64
+void __init xen_relocate_p2m(void);
+#endif
+bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size);
unsigned long __ref xen_chk_extra_mem(unsigned long pfn);
void __init xen_inv_extra_mem(void);
void __init xen_remap_memory(void);
+phys_addr_t __init xen_find_free_area(phys_addr_t size);
char * __init xen_memory_setup(void);
char * xen_auto_xlated_memory_setup(void);
void __init xen_arch_setup(void);