diff options
Diffstat (limited to 'arch')
54 files changed, 1914 insertions, 102 deletions
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index f24628db5409..e2bd35b6780c 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -4,6 +4,7 @@ # source "virt/kvm/Kconfig" +source "virt/lib/Kconfig" menuconfig VIRTUALIZATION bool "Virtualization" @@ -23,6 +24,8 @@ config KVM select PREEMPT_NOTIFIERS select ANON_INODES select ARM_GIC + select ARM_GIC_V3 + select ARM_GIC_V3_ITS select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_MMIO @@ -36,6 +39,8 @@ config KVM select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_MSI + select IRQ_BYPASS_MANAGER + select HAVE_KVM_IRQ_BYPASS depends on ARM_VIRT_EXT && ARM_LPAE && ARM_ARCH_TIMER ---help--- Support hosting virtualized guest machines. diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index f550abd64a25..48de846f2246 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -32,6 +32,7 @@ obj-y += $(KVM)/arm/vgic/vgic-init.o obj-y += $(KVM)/arm/vgic/vgic-irqfd.o obj-y += $(KVM)/arm/vgic/vgic-v2.o obj-y += $(KVM)/arm/vgic/vgic-v3.o +obj-y += $(KVM)/arm/vgic/vgic-v4.o obj-y += $(KVM)/arm/vgic/vgic-mmio.o obj-y += $(KVM)/arm/vgic/vgic-mmio-v2.o obj-y += $(KVM)/arm/vgic/vgic-mmio-v3.o diff --git a/arch/arm/mach-pxa/cm-x255.c b/arch/arm/mach-pxa/cm-x255.c index b592f79a1742..fa8e7dd4d898 100644 --- a/arch/arm/mach-pxa/cm-x255.c +++ b/arch/arm/mach-pxa/cm-x255.c @@ -14,7 +14,7 @@ #include <linux/mtd/partitions.h> #include <linux/mtd/physmap.h> #include <linux/mtd/nand-gpio.h> - +#include <linux/gpio/machine.h> #include <linux/spi/spi.h> #include <linux/spi/pxa2xx_spi.h> @@ -176,6 +176,17 @@ static inline void cmx255_init_nor(void) {} #endif #if defined(CONFIG_MTD_NAND_GPIO) || defined(CONFIG_MTD_NAND_GPIO_MODULE) + +static struct gpiod_lookup_table cmx255_nand_gpiod_table = { + .dev_id = "gpio-nand", + .table = { + GPIO_LOOKUP("gpio-pxa", GPIO_NAND_CS, "nce", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("gpio-pxa", GPIO_NAND_CLE, "cle", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("gpio-pxa", GPIO_NAND_ALE, "ale", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("gpio-pxa", GPIO_NAND_RB, "rdy", GPIO_ACTIVE_HIGH), + }, +}; + static struct resource cmx255_nand_resource[] = { [0] = { .start = PXA_CS1_PHYS, @@ -198,11 +209,6 @@ static struct mtd_partition cmx255_nand_parts[] = { }; static struct gpio_nand_platdata cmx255_nand_platdata = { - .gpio_nce = GPIO_NAND_CS, - .gpio_cle = GPIO_NAND_CLE, - .gpio_ale = GPIO_NAND_ALE, - .gpio_rdy = GPIO_NAND_RB, - .gpio_nwp = -1, .parts = cmx255_nand_parts, .num_parts = ARRAY_SIZE(cmx255_nand_parts), .chip_delay = 25, @@ -220,6 +226,7 @@ static struct platform_device cmx255_nand = { static void __init cmx255_init_nand(void) { + gpiod_add_lookup_table(&cmx255_nand_gpiod_table); platform_device_register(&cmx255_nand); } #else diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 13f81f971390..2257dfcc44cc 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -4,6 +4,7 @@ # source "virt/kvm/Kconfig" +source "virt/lib/Kconfig" menuconfig VIRTUALIZATION bool "Virtualization" @@ -36,6 +37,8 @@ config KVM select HAVE_KVM_MSI select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQ_ROUTING + select IRQ_BYPASS_MANAGER + select HAVE_KVM_IRQ_BYPASS ---help--- Support hosting virtualized guest machines. We don't support KVM with 16K page tables yet, due to the multiple diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 861acbbac385..87c4f7ae24de 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -27,6 +27,7 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-init.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-irqfd.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-v2.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-v3.o +kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-v4.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v2.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v3.o diff --git a/arch/powerpc/include/asm/imc-pmu.h b/arch/powerpc/include/asm/imc-pmu.h index 7f74c282710f..fad0e6ff460f 100644 --- a/arch/powerpc/include/asm/imc-pmu.h +++ b/arch/powerpc/include/asm/imc-pmu.h @@ -21,11 +21,6 @@ #include <asm/opal.h> /* - * For static allocation of some of the structures. - */ -#define IMC_MAX_PMUS 32 - -/* * Compatibility macros for IMC devices */ #define IMC_DTB_COMPAT "ibm,opal-in-memory-counters" @@ -125,4 +120,5 @@ enum { extern int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id); extern void thread_imc_disable(void); +extern int get_max_nest_dev(void); #endif /* __ASM_POWERPC_IMC_PMU_H */ diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 602e0fde19b4..8bdc2f96c5d6 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -735,8 +735,8 @@ static __init void cpufeatures_cpu_quirks(void) */ if ((version & 0xffffff00) == 0x004e0100) cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD1; - else if ((version & 0xffffefff) == 0x004e0200) - cur_cpu_spec->cpu_features &= ~CPU_FTR_POWER9_DD2_1; + else if ((version & 0xffffefff) == 0x004e0201) + cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1; } static void __init cpufeatures_setup_finished(void) diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index c9de03e0c1f1..d469224c4ada 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -21,6 +21,7 @@ #include <asm/tlbflush.h> #include <asm/page.h> #include <asm/code-patching.h> +#include <asm/setup.h> static int __patch_instruction(unsigned int *addr, unsigned int instr) { @@ -146,11 +147,8 @@ int patch_instruction(unsigned int *addr, unsigned int instr) * During early early boot patch_instruction is called * when text_poke_area is not ready, but we still need * to allow patching. We just do the plain old patching - * We use slab_is_available and per cpu read * via this_cpu_read - * of text_poke_area. Per-CPU areas might not be up early - * this can create problems with just using this_cpu_read() */ - if (!slab_is_available() || !this_cpu_read(text_poke_area)) + if (!this_cpu_read(*PTRRELOC(&text_poke_area))) return __patch_instruction(addr, instr); local_irq_save(flags); diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 564fff06f5c1..23ec2c5e3b78 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -122,7 +122,8 @@ static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice) return !slice_area_is_free(mm, start, end - start); } -static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret) +static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret, + unsigned long high_limit) { unsigned long i; @@ -133,15 +134,16 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret) if (!slice_low_has_vma(mm, i)) ret->low_slices |= 1u << i; - if (mm->context.slb_addr_limit <= SLICE_LOW_TOP) + if (high_limit <= SLICE_LOW_TOP) return; - for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); i++) + for (i = 0; i < GET_HIGH_SLICE_INDEX(high_limit); i++) if (!slice_high_has_vma(mm, i)) __set_bit(i, ret->high_slices); } -static void slice_mask_for_size(struct mm_struct *mm, int psize, struct slice_mask *ret) +static void slice_mask_for_size(struct mm_struct *mm, int psize, struct slice_mask *ret, + unsigned long high_limit) { unsigned char *hpsizes; int index, mask_index; @@ -156,8 +158,11 @@ static void slice_mask_for_size(struct mm_struct *mm, int psize, struct slice_ma if (((lpsizes >> (i * 4)) & 0xf) == psize) ret->low_slices |= 1u << i; + if (high_limit <= SLICE_LOW_TOP) + return; + hpsizes = mm->context.high_slices_psize; - for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); i++) { + for (i = 0; i < GET_HIGH_SLICE_INDEX(high_limit); i++) { mask_index = i & 0x1; index = i >> 1; if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize) @@ -169,6 +174,10 @@ static int slice_check_fit(struct mm_struct *mm, struct slice_mask mask, struct slice_mask available) { DECLARE_BITMAP(result, SLICE_NUM_HIGH); + /* + * Make sure we just do bit compare only to the max + * addr limit and not the full bit map size. + */ unsigned long slice_count = GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); bitmap_and(result, mask.high_slices, @@ -472,7 +481,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* First make up a "good" mask of slices that have the right size * already */ - slice_mask_for_size(mm, psize, &good_mask); + slice_mask_for_size(mm, psize, &good_mask, high_limit); slice_print_mask(" good_mask", good_mask); /* @@ -497,7 +506,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, #ifdef CONFIG_PPC_64K_PAGES /* If we support combo pages, we can allow 64k pages in 4k slices */ if (psize == MMU_PAGE_64K) { - slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask); + slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask, high_limit); if (fixed) slice_or_mask(&good_mask, &compat_mask); } @@ -530,11 +539,11 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, return newaddr; } } - - /* We don't fit in the good mask, check what other slices are + /* + * We don't fit in the good mask, check what other slices are * empty and thus can be converted */ - slice_mask_for_free(mm, &potential_mask); + slice_mask_for_free(mm, &potential_mask, high_limit); slice_or_mask(&potential_mask, &good_mask); slice_print_mask(" potential", potential_mask); @@ -744,17 +753,18 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, { struct slice_mask mask, available; unsigned int psize = mm->context.user_psize; + unsigned long high_limit = mm->context.slb_addr_limit; if (radix_enabled()) return 0; slice_range_to_mask(addr, len, &mask); - slice_mask_for_size(mm, psize, &available); + slice_mask_for_size(mm, psize, &available, high_limit); #ifdef CONFIG_PPC_64K_PAGES /* We need to account for 4k slices too */ if (psize == MMU_PAGE_64K) { struct slice_mask compat_mask; - slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask); + slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask, high_limit); slice_or_mask(&available, &compat_mask); } #endif diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 36344117c680..0ead3cd73caa 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -26,7 +26,7 @@ */ static DEFINE_MUTEX(nest_init_lock); static DEFINE_PER_CPU(struct imc_pmu_ref *, local_nest_imc_refc); -static struct imc_pmu *per_nest_pmu_arr[IMC_MAX_PMUS]; +static struct imc_pmu **per_nest_pmu_arr; static cpumask_t nest_imc_cpumask; struct imc_pmu_ref *nest_imc_refc; static int nest_pmus; @@ -286,13 +286,14 @@ static struct imc_pmu_ref *get_nest_pmu_ref(int cpu) static void nest_change_cpu_context(int old_cpu, int new_cpu) { struct imc_pmu **pn = per_nest_pmu_arr; - int i; if (old_cpu < 0 || new_cpu < 0) return; - for (i = 0; *pn && i < IMC_MAX_PMUS; i++, pn++) + while (*pn) { perf_pmu_migrate_context(&(*pn)->pmu, old_cpu, new_cpu); + pn++; + } } static int ppc_nest_imc_cpu_offline(unsigned int cpu) @@ -467,7 +468,7 @@ static int nest_imc_event_init(struct perf_event *event) * Nest HW counter memory resides in a per-chip reserve-memory (HOMER). * Get the base memory addresss for this cpu. */ - chip_id = topology_physical_package_id(event->cpu); + chip_id = cpu_to_chip_id(event->cpu); pcni = pmu->mem_info; do { if (pcni->id == chip_id) { @@ -524,19 +525,19 @@ static int nest_imc_event_init(struct perf_event *event) */ static int core_imc_mem_init(int cpu, int size) { - int phys_id, rc = 0, core_id = (cpu / threads_per_core); + int nid, rc = 0, core_id = (cpu / threads_per_core); struct imc_mem_info *mem_info; /* * alloc_pages_node() will allocate memory for core in the * local node only. */ - phys_id = topology_physical_package_id(cpu); + nid = cpu_to_node(cpu); mem_info = &core_imc_pmu->mem_info[core_id]; mem_info->id = core_id; /* We need only vbase for core counters */ - mem_info->vbase = page_address(alloc_pages_node(phys_id, + mem_info->vbase = page_address(alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE | __GFP_NOWARN, get_order(size))); if (!mem_info->vbase) @@ -797,14 +798,14 @@ static int core_imc_event_init(struct perf_event *event) static int thread_imc_mem_alloc(int cpu_id, int size) { u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, cpu_id); - int phys_id = topology_physical_package_id(cpu_id); + int nid = cpu_to_node(cpu_id); if (!local_mem) { /* * This case could happen only once at start, since we dont * free the memory in cpu offline path. */ - local_mem = page_address(alloc_pages_node(phys_id, + local_mem = page_address(alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE | __GFP_NOWARN, get_order(size))); if (!local_mem) @@ -1194,6 +1195,7 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs); kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]); kfree(pmu_ptr); + kfree(per_nest_pmu_arr); return; } @@ -1218,6 +1220,13 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, return -ENOMEM; /* Needed for hotplug/migration */ + if (!per_nest_pmu_arr) { + per_nest_pmu_arr = kcalloc(get_max_nest_dev() + 1, + sizeof(struct imc_pmu *), + GFP_KERNEL); + if (!per_nest_pmu_arr) + return -ENOMEM; + } per_nest_pmu_arr[pmu_index] = pmu_ptr; break; case IMC_DOMAIN_CORE: diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 21f6531fae20..465ea105b771 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -153,6 +153,22 @@ static void disable_core_pmu_counters(void) put_online_cpus(); } +int get_max_nest_dev(void) +{ + struct device_node *node; + u32 pmu_units = 0, type; + + for_each_compatible_node(node, NULL, IMC_DTB_UNIT_COMPAT) { + if (of_property_read_u32(node, "type", &type)) + continue; + + if (type == IMC_TYPE_CHIP) + pmu_units++; + } + + return pmu_units; +} + static int opal_imc_counters_probe(struct platform_device *pdev) { struct device_node *imc_dev = pdev->dev.of_node; @@ -191,8 +207,10 @@ static int opal_imc_counters_probe(struct platform_device *pdev) break; } - if (!imc_pmu_create(imc_dev, pmu_count, domain)) - pmu_count++; + if (!imc_pmu_create(imc_dev, pmu_count, domain)) { + if (domain == IMC_DOMAIN_NEST) + pmu_count++; + } } return 0; diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c index c488621dbec3..aebbe95c9230 100644 --- a/arch/powerpc/platforms/powernv/vas.c +++ b/arch/powerpc/platforms/powernv/vas.c @@ -135,6 +135,7 @@ int chip_to_vas_id(int chipid) } return -1; } +EXPORT_SYMBOL(chip_to_vas_id); static int vas_probe(struct platform_device *pdev) { diff --git a/arch/sparc/Kbuild b/arch/sparc/Kbuild index 675afa285ddb..b4d0f570cc00 100644 --- a/arch/sparc/Kbuild +++ b/arch/sparc/Kbuild @@ -7,3 +7,4 @@ obj-y += mm/ obj-y += math-emu/ obj-y += net/ obj-y += crypto/ +obj-$(CONFIG_SPARC64) += vdso/ diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 987a57502909..6bf594ace663 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -84,6 +84,8 @@ config SPARC64 select HAVE_REGS_AND_STACK_ACCESS_API select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS + select GENERIC_TIME_VSYSCALL + select ARCH_CLOCKSOURCE_DATA config ARCH_DEFCONFIG string diff --git a/arch/sparc/Makefile b/arch/sparc/Makefile index dbc448923f48..edac927e4952 100644 --- a/arch/sparc/Makefile +++ b/arch/sparc/Makefile @@ -81,6 +81,10 @@ install: archclean: $(Q)$(MAKE) $(clean)=$(boot) +PHONY += vdso_install +vdso_install: + $(Q)$(MAKE) $(build)=arch/sparc/vdso $@ + # This is the image used for packaging KBUILD_IMAGE := $(boot)/zImage diff --git a/arch/sparc/include/asm/bitops_64.h b/arch/sparc/include/asm/bitops_64.h index a90eea24b286..ca7ea5913494 100644 --- a/arch/sparc/include/asm/bitops_64.h +++ b/arch/sparc/include/asm/bitops_64.h @@ -23,10 +23,11 @@ void set_bit(unsigned long nr, volatile unsigned long *addr); void clear_bit(unsigned long nr, volatile unsigned long *addr); void change_bit(unsigned long nr, volatile unsigned long *addr); +int fls(unsigned int word); +int __fls(unsigned long word); + #include <asm-generic/bitops/non-atomic.h> -#include <asm-generic/bitops/fls.h> -#include <asm-generic/bitops/__fls.h> #include <asm-generic/bitops/fls64.h> #ifdef __KERNEL__ diff --git a/arch/sparc/include/asm/clocksource.h b/arch/sparc/include/asm/clocksource.h new file mode 100644 index 000000000000..d63ef224befe --- /dev/null +++ b/arch/sparc/include/asm/clocksource.h @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2017 Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _ASM_SPARC_CLOCKSOURCE_H +#define _ASM_SPARC_CLOCKSOURCE_H + +/* VDSO clocksources */ +#define VCLOCK_NONE 0 /* Nothing userspace can do. */ +#define VCLOCK_TICK 1 /* Use %tick. */ +#define VCLOCK_STICK 2 /* Use %stick. */ + +struct arch_clocksource_data { + int vclock_mode; +}; + +#endif /* _ASM_SPARC_CLOCKSOURCE_H */ diff --git a/arch/sparc/include/asm/cmpxchg_32.h b/arch/sparc/include/asm/cmpxchg_32.h index 3e3823db303e..c73b5a3ab7b9 100644 --- a/arch/sparc/include/asm/cmpxchg_32.h +++ b/arch/sparc/include/asm/cmpxchg_32.h @@ -63,6 +63,9 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new_, int size) (unsigned long)_n_, sizeof(*(ptr))); \ }) +u64 __cmpxchg_u64(u64 *ptr, u64 old, u64 new); +#define cmpxchg64(ptr, old, new) __cmpxchg_u64(ptr, old, new) + #include <asm-generic/cmpxchg-local.h> /* diff --git a/arch/sparc/include/asm/elf_64.h b/arch/sparc/include/asm/elf_64.h index 5894389f5ed5..25340df3570c 100644 --- a/arch/sparc/include/asm/elf_64.h +++ b/arch/sparc/include/asm/elf_64.h @@ -211,4 +211,18 @@ do { if ((ex).e_ident[EI_CLASS] == ELFCLASS32) \ (current->personality & (~PER_MASK))); \ } while (0) +extern unsigned int vdso_enabled; + +#define ARCH_DLINFO \ +do { \ + if (vdso_enabled) \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, \ + (unsigned long)current->mm->context.vdso); \ +} while (0) + +struct linux_binprm; + +#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 +extern int arch_setup_additional_pages(struct linux_binprm *bprm, + int uses_interp); #endif /* !(__ASM_SPARC64_ELF_H) */ diff --git a/arch/sparc/include/asm/mmu_64.h b/arch/sparc/include/asm/mmu_64.h index 5fe64a57b4ba..ad4fb93508ba 100644 --- a/arch/sparc/include/asm/mmu_64.h +++ b/arch/sparc/include/asm/mmu_64.h @@ -97,6 +97,7 @@ typedef struct { unsigned long thp_pte_count; struct tsb_config tsb_block[MM_NUM_TSBS]; struct hv_tsb_descr tsb_descr[MM_NUM_TSBS]; + void *vdso; } mm_context_t; #endif /* !__ASSEMBLY__ */ diff --git a/arch/sparc/include/asm/mmu_context_64.h b/arch/sparc/include/asm/mmu_context_64.h index e25d25b0a34b..b361702ef52a 100644 --- a/arch/sparc/include/asm/mmu_context_64.h +++ b/arch/sparc/include/asm/mmu_context_64.h @@ -8,9 +8,11 @@ #include <linux/spinlock.h> #include <linux/mm_types.h> +#include <linux/smp.h> #include <asm/spitfire.h> #include <asm-generic/mm_hooks.h> +#include <asm/percpu.h> static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { diff --git a/arch/sparc/include/asm/processor_64.h b/arch/sparc/include/asm/processor_64.h index c7c79fe8d265..aac23d4a4ddd 100644 --- a/arch/sparc/include/asm/processor_64.h +++ b/arch/sparc/include/asm/processor_64.h @@ -200,6 +200,13 @@ unsigned long get_wchan(struct task_struct *task); * To make a long story short, we are trying to yield the current cpu * strand during busy loops. */ +#ifdef BUILD_VDSO +#define cpu_relax() asm volatile("\n99:\n\t" \ + "rd %%ccr, %%g0\n\t" \ + "rd %%ccr, %%g0\n\t" \ + "rd %%ccr, %%g0\n\t" \ + ::: "memory") +#else /* ! BUILD_VDSO */ #define cpu_relax() asm volatile("\n99:\n\t" \ "rd %%ccr, %%g0\n\t" \ "rd %%ccr, %%g0\n\t" \ @@ -211,6 +218,7 @@ unsigned long get_wchan(struct task_struct *task); "nop\n\t" \ ".previous" \ ::: "memory") +#endif /* Prefetch support. This is tuned for UltraSPARC-III and later. * UltraSPARC-I will treat these as nops, and UltraSPARC-II has diff --git a/arch/sparc/include/asm/tsb.h b/arch/sparc/include/asm/tsb.h index 25b6abdb3908..522a677e050d 100644 --- a/arch/sparc/include/asm/tsb.h +++ b/arch/sparc/include/asm/tsb.h @@ -217,7 +217,7 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end; sllx REG2, 32, REG2; \ andcc REG1, REG2, %g0; \ be,pt %xcc, 700f; \ - sethi %hi(0x1ffc0000), REG2; \ + sethi %hi(0xffe00000), REG2; \ sllx REG2, 1, REG2; \ brgez,pn REG1, FAIL_LABEL; \ andn REG1, REG2, REG1; \ diff --git a/arch/sparc/include/asm/vdso.h b/arch/sparc/include/asm/vdso.h new file mode 100644 index 000000000000..93b628731a5e --- /dev/null +++ b/arch/sparc/include/asm/vdso.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2017 Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _ASM_SPARC_VDSO_H +#define _ASM_SPARC_VDSO_H + +struct vdso_image { + void *data; + unsigned long size; /* Always a multiple of PAGE_SIZE */ + long sym_vvar_start; /* Negative offset to the vvar area */ + long sym_vread_tick; /* Start of vread_tick section */ + long sym_vread_tick_patch_start; /* Start of tick read */ + long sym_vread_tick_patch_end; /* End of tick read */ +}; + +#ifdef CONFIG_SPARC64 +extern const struct vdso_image vdso_image_64_builtin; +#endif +#ifdef CONFIG_COMPAT +extern const struct vdso_image vdso_image_32_builtin; +#endif + +#endif /* _ASM_SPARC_VDSO_H */ diff --git a/arch/sparc/include/asm/vvar.h b/arch/sparc/include/asm/vvar.h new file mode 100644 index 000000000000..0289503d1cb0 --- /dev/null +++ b/arch/sparc/include/asm/vvar.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2017 Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _ASM_SPARC_VVAR_DATA_H +#define _ASM_SPARC_VVAR_DATA_H + +#include <asm/clocksource.h> +#include <linux/seqlock.h> +#include <linux/time.h> +#include <linux/types.h> + +struct vvar_data { + unsigned int seq; + + int vclock_mode; + struct { /* extract of a clocksource struct */ + u64 cycle_last; + u64 mask; + int mult; + int shift; + } clock; + /* open coded 'struct timespec' */ + u64 wall_time_sec; + u64 wall_time_snsec; + u64 monotonic_time_snsec; + u64 monotonic_time_sec; + u64 monotonic_time_coarse_sec; + u64 monotonic_time_coarse_nsec; + u64 wall_time_coarse_sec; + u64 wall_time_coarse_nsec; + + int tz_minuteswest; + int tz_dsttime; +}; + +extern struct vvar_data *vvar_data; +extern int vdso_fix_stick; + +static inline unsigned int vvar_read_begin(const struct vvar_data *s) +{ + unsigned int ret; + +repeat: + ret = READ_ONCE(s->seq); + if (unlikely(ret & 1)) { + cpu_relax(); + goto repeat; + } + smp_rmb(); /* Finish all reads before we return seq */ + return ret; +} + +static inline int vvar_read_retry(const struct vvar_data *s, + unsigned int start) +{ + smp_rmb(); /* Finish all reads before checking the value of seq */ + return unlikely(s->seq != start); +} + +static inline void vvar_write_begin(struct vvar_data *s) +{ + ++s->seq; + smp_wmb(); /* Makes sure that increment of seq is reflected */ +} + +static inline void vvar_write_end(struct vvar_data *s) +{ + smp_wmb(); /* Makes the value of seq current before we increment */ + ++s->seq; +} + + +#endif /* _ASM_SPARC_VVAR_DATA_H */ diff --git a/arch/sparc/include/uapi/asm/auxvec.h b/arch/sparc/include/uapi/asm/auxvec.h index ad6f360261f6..5f80a70cc901 100644 --- a/arch/sparc/include/uapi/asm/auxvec.h +++ b/arch/sparc/include/uapi/asm/auxvec.h @@ -1,4 +1,8 @@ #ifndef __ASMSPARC_AUXVEC_H #define __ASMSPARC_AUXVEC_H +#define AT_SYSINFO_EHDR 33 + +#define AT_VECTOR_SIZE_ARCH 1 + #endif /* !(__ASMSPARC_AUXVEC_H) */ diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile index 8de9617589a5..cc97545737f0 100644 --- a/arch/sparc/kernel/Makefile +++ b/arch/sparc/kernel/Makefile @@ -43,6 +43,7 @@ obj-$(CONFIG_SPARC32) += systbls_32.o obj-y += time_$(BITS).o obj-$(CONFIG_SPARC32) += windows.o obj-y += cpu.o +obj-$(CONFIG_SPARC64) += vdso.o obj-$(CONFIG_SPARC32) += devices.o obj-y += ptrace_$(BITS).o obj-y += unaligned_$(BITS).o diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S index 9e293de12052..a41e6e16eb36 100644 --- a/arch/sparc/kernel/head_64.S +++ b/arch/sparc/kernel/head_64.S @@ -641,6 +641,8 @@ niagara4_patch: nop call niagara4_patch_pageops nop + call niagara4_patch_fls + nop ba,a,pt %xcc, 80f nop diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c index 1ef6156b1530..418592a09b41 100644 --- a/arch/sparc/kernel/mdesc.c +++ b/arch/sparc/kernel/mdesc.c @@ -13,6 +13,7 @@ #include <linux/miscdevice.h> #include <linux/bootmem.h> #include <linux/export.h> +#include <linux/refcount.h> #include <asm/cpudata.h> #include <asm/hypervisor.h> @@ -71,7 +72,7 @@ struct mdesc_handle { struct list_head list; struct mdesc_mem_ops *mops; void *self_base; - atomic_t refcnt; + refcount_t refcnt; unsigned int handle_size; struct mdesc_hdr mdesc; }; @@ -153,7 +154,7 @@ static void mdesc_handle_init(struct mdesc_handle *hp, memset(hp, 0, handle_size); INIT_LIST_HEAD(&hp->list); hp->self_base = base; - atomic_set(&hp->refcnt, 1); + refcount_set(&hp->refcnt, 1); hp->handle_size = handle_size; } @@ -183,7 +184,7 @@ static void __init mdesc_memblock_free(struct mdesc_handle *hp) unsigned int alloc_size; unsigned long start; - BUG_ON(atomic_read(&hp->refcnt) != 0); + BUG_ON(refcount_read(&hp->refcnt) != 0); BUG_ON(!list_empty(&hp->list)); alloc_size = PAGE_ALIGN(hp->handle_size); @@ -221,7 +222,7 @@ static struct mdesc_handle *mdesc_kmalloc(unsigned int mdesc_size) static void mdesc_kfree(struct mdesc_handle *hp) { - BUG_ON(atomic_read(&hp->refcnt) != 0); + BUG_ON(refcount_read(&hp->refcnt) != 0); BUG_ON(!list_empty(&hp->list)); kfree(hp->self_base); @@ -260,7 +261,7 @@ struct mdesc_handle *mdesc_grab(void) spin_lock_irqsave(&mdesc_lock, flags); hp = cur_mdesc; if (hp) - atomic_inc(&hp->refcnt); + refcount_inc(&hp->refcnt); spin_unlock_irqrestore(&mdesc_lock, flags); return hp; @@ -272,7 +273,7 @@ void mdesc_release(struct mdesc_handle *hp) unsigned long flags; spin_lock_irqsave(&mdesc_lock, flags); - if (atomic_dec_and_test(&hp->refcnt)) { + if (refcount_dec_and_test(&hp->refcnt)) { list_del_init(&hp->list); hp->mops->free(hp); } @@ -514,7 +515,7 @@ void mdesc_update(void) if (status != HV_EOK || real_len > len) { printk(KERN_ERR "MD: mdesc reread fails with %lu\n", status); - atomic_dec(&hp->refcnt); + refcount_dec(&hp->refcnt); mdesc_free(hp); goto out; } @@ -527,7 +528,7 @@ void mdesc_update(void) mdesc_notify_clients(orig_hp, hp); spin_lock_irqsave(&mdesc_lock, flags); - if (atomic_dec_and_test(&orig_hp->refcnt)) + if (refcount_dec_and_test(&orig_hp->refcnt)) mdesc_free(orig_hp); else list_add(&orig_hp->list, &mdesc_zombie_list); diff --git a/arch/sparc/kernel/time_64.c b/arch/sparc/kernel/time_64.c index 3b397081047a..2ef8cfa9677e 100644 --- a/arch/sparc/kernel/time_64.c +++ b/arch/sparc/kernel/time_64.c @@ -28,7 +28,6 @@ #include <linux/jiffies.h> #include <linux/cpufreq.h> #include <linux/percpu.h> -#include <linux/miscdevice.h> #include <linux/rtc/m48t59.h> #include <linux/kernel_stat.h> #include <linux/clockchips.h> @@ -54,6 +53,8 @@ DEFINE_SPINLOCK(rtc_lock); +unsigned int __read_mostly vdso_fix_stick; + #ifdef CONFIG_SMP unsigned long profile_pc(struct pt_regs *regs) { @@ -831,12 +832,17 @@ static void init_tick_ops(struct sparc64_tick_ops *ops) void __init time_init_early(void) { if (tlb_type == spitfire) { - if (is_hummingbird()) + if (is_hummingbird()) { init_tick_ops(&hbtick_operations); - else + clocksource_tick.archdata.vclock_mode = VCLOCK_NONE; + } else { init_tick_ops(&tick_operations); + clocksource_tick.archdata.vclock_mode = VCLOCK_TICK; + vdso_fix_stick = 1; + } } else { init_tick_ops(&stick_operations); + clocksource_tick.archdata.vclock_mode = VCLOCK_STICK; } } diff --git a/arch/sparc/kernel/vdso.c b/arch/sparc/kernel/vdso.c new file mode 100644 index 000000000000..58880662b271 --- /dev/null +++ b/arch/sparc/kernel/vdso.c @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE + * Copyright 2003 Andi Kleen, SuSE Labs. + * + * Thanks to hpa@transmeta.com for some useful hint. + * Special thanks to Ingo Molnar for his early experience with + * a different vsyscall implementation for Linux/IA32 and for the name. + */ + +#include <linux/seqlock.h> +#include <linux/time.h> +#include <linux/timekeeper_internal.h> + +#include <asm/vvar.h> + +void update_vsyscall_tz(void) +{ + if (unlikely(vvar_data == NULL)) + return; + + vvar_data->tz_minuteswest = sys_tz.tz_minuteswest; + vvar_data->tz_dsttime = sys_tz.tz_dsttime; +} + +void update_vsyscall(struct timekeeper *tk) +{ + struct vvar_data *vdata = vvar_data; + + if (unlikely(vdata == NULL)) + return; + + vvar_write_begin(vdata); + vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + vdata->clock.cycle_last = tk->tkr_mono.cycle_last; + vdata->clock.mask = tk->tkr_mono.mask; + vdata->clock.mult = tk->tkr_mono.mult; + vdata->clock.shift = tk->tkr_mono.shift; + + vdata->wall_time_sec = tk->xtime_sec; + vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec; + + vdata->monotonic_time_sec = tk->xtime_sec + + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec + + (tk->wall_to_monotonic.tv_nsec << + tk->tkr_mono.shift); + + while (vdata->monotonic_time_snsec >= + (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { + vdata->monotonic_time_snsec -= + ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; + vdata->monotonic_time_sec++; + } + + vdata->wall_time_coarse_sec = tk->xtime_sec; + vdata->wall_time_coarse_nsec = + (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); + + vdata->monotonic_time_coarse_sec = + vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_coarse_nsec = + vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec; + + while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) { + vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC; + vdata->monotonic_time_coarse_sec++; + } + + vvar_write_end(vdata); +} diff --git a/arch/sparc/kernel/viohs.c b/arch/sparc/kernel/viohs.c index c858f5f3ce2c..635d67ffc9a3 100644 --- a/arch/sparc/kernel/viohs.c +++ b/arch/sparc/kernel/viohs.c @@ -798,9 +798,9 @@ void vio_port_up(struct vio_driver_state *vio) } EXPORT_SYMBOL(vio_port_up); -static void vio_port_timer(unsigned long _arg) +static void vio_port_timer(struct timer_list *t) { - struct vio_driver_state *vio = (struct vio_driver_state *) _arg; + struct vio_driver_state *vio = from_timer(vio, t, timer); vio_port_up(vio); } @@ -849,7 +849,7 @@ int vio_driver_init(struct vio_driver_state *vio, struct vio_dev *vdev, vio->ops = ops; - setup_timer(&vio->timer, vio_port_timer, (unsigned long) vio); + timer_setup(&vio->timer, vio_port_timer, 0); return 0; } diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile index 44829a8dc458..0f0f76b4f6cd 100644 --- a/arch/sparc/lib/Makefile +++ b/arch/sparc/lib/Makefile @@ -17,6 +17,9 @@ lib-$(CONFIG_SPARC64) += atomic_64.o lib-$(CONFIG_SPARC32) += lshrdi3.o ashldi3.o lib-$(CONFIG_SPARC32) += muldi3.o bitext.o cmpdi2.o lib-$(CONFIG_SPARC64) += multi3.o +lib-$(CONFIG_SPARC64) += fls.o +lib-$(CONFIG_SPARC64) += fls64.o +obj-$(CONFIG_SPARC64) += NG4fls.o lib-$(CONFIG_SPARC64) += copy_page.o clear_page.o bzero.o lib-$(CONFIG_SPARC64) += csum_copy.o csum_copy_from_user.o csum_copy_to_user.o diff --git a/arch/sparc/lib/NG4fls.S b/arch/sparc/lib/NG4fls.S new file mode 100644 index 000000000000..2d0991e5b034 --- /dev/null +++ b/arch/sparc/lib/NG4fls.S @@ -0,0 +1,30 @@ +/* NG4fls.S: SPARC optimized fls and __fls for T4 and above. + * + * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved. + */ + +#include <linux/linkage.h> + +#define LZCNT_O0_G2 \ + .word 0x85b002e8 + + .text + .register %g2, #scratch + .register %g3, #scratch + +ENTRY(NG4fls) + LZCNT_O0_G2 !lzcnt %o0, %g2 + mov 64, %g3 + retl + sub %g3, %g2, %o0 +ENDPROC(NG4fls) + +ENTRY(__NG4fls) + brz,pn %o0, 1f + LZCNT_O0_G2 !lzcnt %o0, %g2 + mov 63, %g3 + sub %g3, %g2, %o0 +1: + retl + nop +ENDPROC(__NG4fls) diff --git a/arch/sparc/lib/NG4patch.S b/arch/sparc/lib/NG4patch.S index aa58ab39f9a6..37866175c921 100644 --- a/arch/sparc/lib/NG4patch.S +++ b/arch/sparc/lib/NG4patch.S @@ -4,6 +4,8 @@ * Copyright (C) 2012 David S. Miller <davem@davemloft.net> */ +#include <linux/linkage.h> + #define BRANCH_ALWAYS 0x10680000 #define NOP 0x01000000 #define NG_DO_PATCH(OLD, NEW) \ @@ -53,3 +55,10 @@ niagara4_patch_pageops: retl nop .size niagara4_patch_pageops,.-niagara4_patch_pageops + +ENTRY(niagara4_patch_fls) + NG_DO_PATCH(fls, NG4fls) + NG_DO_PATCH(__fls, __NG4fls) + retl + nop +ENDPROC(niagara4_patch_fls) diff --git a/arch/sparc/lib/atomic32.c b/arch/sparc/lib/atomic32.c index 5010df497387..465a901a0ada 100644 --- a/arch/sparc/lib/atomic32.c +++ b/arch/sparc/lib/atomic32.c @@ -173,6 +173,20 @@ unsigned long __cmpxchg_u32(volatile u32 *ptr, u32 old, u32 new) } EXPORT_SYMBOL(__cmpxchg_u32); +u64 __cmpxchg_u64(u64 *ptr, u64 old, u64 new) +{ + unsigned long flags; + u64 prev; + + spin_lock_irqsave(ATOMIC_HASH(ptr), flags); + if ((prev = *ptr) == old) + *ptr = new; + spin_unlock_irqrestore(ATOMIC_HASH(ptr), flags); + + return prev; +} +EXPORT_SYMBOL(__cmpxchg_u64); + unsigned long __xchg_u32(volatile u32 *ptr, u32 new) { unsigned long flags; diff --git a/arch/sparc/lib/fls.S b/arch/sparc/lib/fls.S new file mode 100644 index 000000000000..06b8d300bcae --- /dev/null +++ b/arch/sparc/lib/fls.S @@ -0,0 +1,67 @@ +/* fls.S: SPARC default fls definition. + * + * SPARC default fls definition, which follows the same algorithm as + * in generic fls(). This function will be boot time patched on T4 + * and onward. + */ + +#include <linux/linkage.h> +#include <asm/export.h> + + .text + .register %g2, #scratch + .register %g3, #scratch +ENTRY(fls) + brz,pn %o0, 6f + mov 0, %o1 + sethi %hi(0xffff0000), %g3 + mov %o0, %g2 + andcc %o0, %g3, %g0 + be,pt %icc, 8f + mov 32, %o1 + sethi %hi(0xff000000), %g3 + andcc %g2, %g3, %g0 + bne,pt %icc, 3f + sethi %hi(0xf0000000), %g3 + sll %o0, 8, %o0 +1: + add %o1, -8, %o1 + sra %o0, 0, %o0 + mov %o0, %g2 +2: + sethi %hi(0xf0000000), %g3 +3: + andcc %g2, %g3, %g0 + bne,pt %icc, 4f + sethi %hi(0xc0000000), %g3 + sll %o0, 4, %o0 + add %o1, -4, %o1 + sra %o0, 0, %o0 + mov %o0, %g2 +4: + andcc %g2, %g3, %g0 + be,a,pt %icc, 7f + sll %o0, 2, %o0 +5: + xnor %g0, %o0, %o0 + srl %o0, 31, %o0 + sub %o1, %o0, %o1 +6: + jmp %o7 + 8 + sra %o1, 0, %o0 +7: + add %o1, -2, %o1 + ba,pt %xcc, 5b + sra %o0, 0, %o0 +8: + sll %o0, 16, %o0 + sethi %hi(0xff000000), %g3 + sra %o0, 0, %o0 + mov %o0, %g2 + andcc %g2, %g3, %g0 + bne,pt %icc, 2b + mov 16, %o1 + ba,pt %xcc, 1b + sll %o0, 8, %o0 +ENDPROC(fls) +EXPORT_SYMBOL(fls) diff --git a/arch/sparc/lib/fls64.S b/arch/sparc/lib/fls64.S new file mode 100644 index 000000000000..c83e22ae9586 --- /dev/null +++ b/arch/sparc/lib/fls64.S @@ -0,0 +1,61 @@ +/* fls64.S: SPARC default __fls definition. + * + * SPARC default __fls definition, which follows the same algorithm as + * in generic __fls(). This function will be boot time patched on T4 + * and onward. + */ + +#include <linux/linkage.h> +#include <asm/export.h> + + .text + .register %g2, #scratch + .register %g3, #scratch +ENTRY(__fls) + mov -1, %g2 + sllx %g2, 32, %g2 + and %o0, %g2, %g2 + brnz,pt %g2, 1f + mov 63, %g1 + sllx %o0, 32, %o0 + mov 31, %g1 +1: + mov -1, %g2 + sllx %g2, 48, %g2 + and %o0, %g2, %g2 + brnz,pt %g2, 2f + mov -1, %g2 + sllx %o0, 16, %o0 + add %g1, -16, %g1 +2: + mov -1, %g2 + sllx %g2, 56, %g2 + and %o0, %g2, %g2 + brnz,pt %g2, 3f + mov -1, %g2 + sllx %o0, 8, %o0 + add %g1, -8, %g1 +3: + sllx %g2, 60, %g2 + and %o0, %g2, %g2 + brnz,pt %g2, 4f + mov -1, %g2 + sllx %o0, 4, %o0 + add %g1, -4, %g1 +4: + sllx %g2, 62, %g2 + and %o0, %g2, %g2 + brnz,pt %g2, 5f + mov -1, %g3 + sllx %o0, 2, %o0 + add %g1, -2, %g1 +5: + mov 0, %g2 + sllx %g3, 63, %g3 + and %o0, %g3, %o0 + movre %o0, 1, %g2 + sub %g1, %g2, %g1 + jmp %o7+8 + sra %g1, 0, %o0 +ENDPROC(__fls) +EXPORT_SYMBOL(__fls) diff --git a/arch/sparc/vdso/.gitignore b/arch/sparc/vdso/.gitignore new file mode 100644 index 000000000000..ef925b998222 --- /dev/null +++ b/arch/sparc/vdso/.gitignore @@ -0,0 +1,3 @@ +vdso.lds +vdso-image-*.c +vdso2c diff --git a/arch/sparc/vdso/Makefile b/arch/sparc/vdso/Makefile new file mode 100644 index 000000000000..a6615d8864f7 --- /dev/null +++ b/arch/sparc/vdso/Makefile @@ -0,0 +1,149 @@ +# +# Building vDSO images for sparc. +# + +KBUILD_CFLAGS += $(DISABLE_LTO) + +VDSO64-$(CONFIG_SPARC64) := y +VDSOCOMPAT-$(CONFIG_COMPAT) := y + +# files to link into the vdso +vobjs-y := vdso-note.o vclock_gettime.o + +# files to link into kernel +obj-y += vma.o + +# vDSO images to build +vdso_img-$(VDSO64-y) += 64 +vdso_img-$(VDSOCOMPAT-y) += 32 + +vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) + +$(obj)/vdso.o: $(obj)/vdso.so + +targets += vdso.lds $(vobjs-y) + +# Build the vDSO image C files and link them in. +vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o) +vdso_img_cfiles := $(vdso_img-y:%=vdso-image-%.c) +vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg) +obj-y += $(vdso_img_objs) +targets += $(vdso_img_cfiles) +targets += $(vdso_img_sodbg) +.SECONDARY: $(vdso_img-y:%=$(obj)/vdso-image-%.c) \ + $(vdso_img-y:%=$(obj)/vdso%.so) + +export CPPFLAGS_vdso.lds += -P -C + +VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ + -Wl,--no-undefined \ + -Wl,-z,max-page-size=8192 -Wl,-z,common-page-size=8192 \ + $(DISABLE_LTO) + +$(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE + $(call if_changed,vdso) + +HOST_EXTRACFLAGS += -I$(srctree)/tools/include +hostprogs-y += vdso2c + +quiet_cmd_vdso2c = VDSO2C $@ +define cmd_vdso2c + $(obj)/vdso2c $< $(<:%.dbg=%) $@ +endef + +$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE + $(call if_changed,vdso2c) + +# +# Don't omit frame pointers for ease of userspace debugging, but do +# optimize sibling calls. +# +CFL := $(PROFILING) -mcmodel=medlow -fPIC -O2 -fasynchronous-unwind-tables \ + -m64 -ffixed-g2 -ffixed-g3 -fcall-used-g4 -fcall-used-g5 -ffixed-g6 \ + -ffixed-g7 $(filter -g%,$(KBUILD_CFLAGS)) \ + $(call cc-option, -fno-stack-protector) -fno-omit-frame-pointer \ + -foptimize-sibling-calls -DBUILD_VDSO + +$(vobjs): KBUILD_CFLAGS += $(CFL) + +# +# vDSO code runs in userspace and -pg doesn't help with profiling anyway. +# +CFLAGS_REMOVE_vdso-note.o = -pg +CFLAGS_REMOVE_vclock_gettime.o = -pg + +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg + $(call if_changed,objcopy) + +CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) +VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf32_sparc,-soname=linux-gate.so.1 + +#This makes sure the $(obj) subdirectory exists even though vdso32/ +#is not a kbuild sub-make subdirectory +override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ + +targets += vdso32/vdso32.lds +targets += vdso32/vdso-note.o +targets += vdso32/vclock_gettime.o + +KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO +$(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) +$(obj)/vdso32.so.dbg: asflags-$(CONFIG_SPARC64) += -m32 + +KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS_32 := $(filter-out -mcmodel=medlow,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 += -m32 -msoft-float -fpic -mno-app-regs -ffixed-g7 +KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) +KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) +KBUILD_CFLAGS_32 += -fno-omit-frame-pointer +KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING +KBUILD_CFLAGS_32 += -mv8plus +$(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) + +$(obj)/vdso32.so.dbg: FORCE \ + $(obj)/vdso32/vdso32.lds \ + $(obj)/vdso32/vclock_gettime.o \ + $(obj)/vdso32/vdso-note.o + $(call if_changed,vdso) + +# +# The DSO images are built using a special linker script. +# +quiet_cmd_vdso = VDSO $@ + cmd_vdso = $(CC) -nostdlib -o $@ \ + $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ + -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) + +VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \ + $(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic +GCOV_PROFILE := n + +# +# Install the unstripped copies of vdso*.so. If our toolchain supports +# build-id, install .build-id links as well. +# +quiet_cmd_vdso_install = INSTALL $(@:install_%=%) +define cmd_vdso_install + cp $< "$(MODLIB)/vdso/$(@:install_%=%)"; \ + if readelf -n $< |grep -q 'Build ID'; then \ + buildid=`readelf -n $< |grep 'Build ID' |sed -e 's/^.*Build ID: \(.*\)$$/\1/'`; \ + first=`echo $$buildid | cut -b-2`; \ + last=`echo $$buildid | cut -b3-`; \ + mkdir -p "$(MODLIB)/vdso/.build-id/$$first"; \ + ln -sf "../../$(@:install_%=%)" "$(MODLIB)/vdso/.build-id/$$first/$$last.debug"; \ + fi +endef + +vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%) + +$(MODLIB)/vdso: FORCE + @mkdir -p $(MODLIB)/vdso + +$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE + $(call cmd,vdso_install) + +PHONY += vdso_install $(vdso_img_insttargets) +vdso_install: $(vdso_img_insttargets) FORCE diff --git a/arch/sparc/vdso/vclock_gettime.c b/arch/sparc/vdso/vclock_gettime.c new file mode 100644 index 000000000000..3feb3d960ca5 --- /dev/null +++ b/arch/sparc/vdso/vclock_gettime.c @@ -0,0 +1,264 @@ +/* + * Copyright 2006 Andi Kleen, SUSE Labs. + * Subject to the GNU Public License, v.2 + * + * Fast user context implementation of clock_gettime, gettimeofday, and time. + * + * The code should have no internal unresolved relocations. + * Check with readelf after changing. + * Also alternative() doesn't work. + */ +/* + * Copyright (c) 2017 Oracle and/or its affiliates. All rights reserved. + */ + +/* Disable profiling for userspace code: */ +#ifndef DISABLE_BRANCH_PROFILING +#define DISABLE_BRANCH_PROFILING +#endif + +#include <linux/kernel.h> +#include <linux/time.h> +#include <linux/string.h> +#include <asm/io.h> +#include <asm/unistd.h> +#include <asm/timex.h> +#include <asm/clocksource.h> +#include <asm/vvar.h> + +#undef TICK_PRIV_BIT +#ifdef CONFIG_SPARC64 +#define TICK_PRIV_BIT (1UL << 63) +#else +#define TICK_PRIV_BIT (1ULL << 63) +#endif + +#define SYSCALL_STRING \ + "ta 0x6d;" \ + "sub %%g0, %%o0, %%o0;" \ + +#define SYSCALL_CLOBBERS \ + "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", \ + "f8", "f9", "f10", "f11", "f12", "f13", "f14", "f15", \ + "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23", \ + "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31", \ + "f32", "f34", "f36", "f38", "f40", "f42", "f44", "f46", \ + "f48", "f50", "f52", "f54", "f56", "f58", "f60", "f62", \ + "cc", "memory" + +/* + * Compute the vvar page's address in the process address space, and return it + * as a pointer to the vvar_data. + */ +static notrace noinline struct vvar_data * +get_vvar_data(void) +{ + unsigned long ret; + + /* + * vdso data page is the first vDSO page so grab the return address + * and move up a page to get to the data page. + */ + ret = (unsigned long)__builtin_return_address(0); + ret &= ~(8192 - 1); + ret -= 8192; + + return (struct vvar_data *) ret; +} + +static notrace long +vdso_fallback_gettime(long clock, struct timespec *ts) +{ + register long num __asm__("g1") = __NR_clock_gettime; + register long o0 __asm__("o0") = clock; + register long o1 __asm__("o1") = (long) ts; + + __asm__ __volatile__(SYSCALL_STRING : "=r" (o0) : "r" (num), + "0" (o0), "r" (o1) : SYSCALL_CLOBBERS); + return o0; +} + +static notrace __always_inline long +vdso_fallback_gettimeofday(struct timeval *tv, struct timezone *tz) +{ + register long num __asm__("g1") = __NR_gettimeofday; + register long o0 __asm__("o0") = (long) tv; + register long o1 __asm__("o1") = (long) tz; + + __asm__ __volatile__(SYSCALL_STRING : "=r" (o0) : "r" (num), + "0" (o0), "r" (o1) : SYSCALL_CLOBBERS); + return o0; +} + +#ifdef CONFIG_SPARC64 +static notrace noinline u64 +vread_tick(void) { + u64 ret; + + __asm__ __volatile__("rd %%asr24, %0 \n" + ".section .vread_tick_patch, \"ax\" \n" + "rd %%tick, %0 \n" + ".previous \n" + : "=&r" (ret)); + return ret & ~TICK_PRIV_BIT; +} +#else +static notrace noinline u64 +vread_tick(void) +{ + unsigned int lo, hi; + + __asm__ __volatile__("rd %%asr24, %%g1\n\t" + "srlx %%g1, 32, %1\n\t" + "srl %%g1, 0, %0\n" + ".section .vread_tick_patch, \"ax\" \n" + "rd %%tick, %%g1\n" + ".previous \n" + : "=&r" (lo), "=&r" (hi) + : + : "g1"); + return lo | ((u64)hi << 32); +} +#endif + +static notrace inline u64 +vgetsns(struct vvar_data *vvar) +{ + u64 v; + u64 cycles; + + cycles = vread_tick(); + v = (cycles - vvar->clock.cycle_last) & vvar->clock.mask; + return v * vvar->clock.mult; +} + +static notrace noinline int +do_realtime(struct vvar_data *vvar, struct timespec *ts) +{ + unsigned long seq; + u64 ns; + + ts->tv_nsec = 0; + do { + seq = vvar_read_begin(vvar); + ts->tv_sec = vvar->wall_time_sec; + ns = vvar->wall_time_snsec; + ns += vgetsns(vvar); + ns >>= vvar->clock.shift; + } while (unlikely(vvar_read_retry(vvar, seq))); + + timespec_add_ns(ts, ns); + + return 0; +} + +static notrace noinline int +do_monotonic(struct vvar_data *vvar, struct timespec *ts) +{ + unsigned long seq; + u64 ns; + + ts->tv_nsec = 0; + do { + seq = vvar_read_begin(vvar); + ts->tv_sec = vvar->monotonic_time_sec; + ns = vvar->monotonic_time_snsec; + ns += vgetsns(vvar); + ns >>= vvar->clock.shift; + } while (unlikely(vvar_read_retry(vvar, seq))); + + timespec_add_ns(ts, ns); + + return 0; +} + +static notrace noinline int +do_realtime_coarse(struct vvar_data *vvar, struct timespec *ts) +{ + unsigned long seq; + + do { + seq = vvar_read_begin(vvar); + ts->tv_sec = vvar->wall_time_coarse_sec; + ts->tv_nsec = vvar->wall_time_coarse_nsec; + } while (unlikely(vvar_read_retry(vvar, seq))); + return 0; +} + +static notrace noinline int +do_monotonic_coarse(struct vvar_data *vvar, struct timespec *ts) +{ + unsigned long seq; + + do { + seq = vvar_read_begin(vvar); + ts->tv_sec = vvar->monotonic_time_coarse_sec; + ts->tv_nsec = vvar->monotonic_time_coarse_nsec; + } while (unlikely(vvar_read_retry(vvar, seq))); + + return 0; +} + +notrace int +__vdso_clock_gettime(clockid_t clock, struct timespec *ts) +{ + struct vvar_data *vvd = get_vvar_data(); + + switch (clock) { + case CLOCK_REALTIME: + if (unlikely(vvd->vclock_mode == VCLOCK_NONE)) + break; + return do_realtime(vvd, ts); + case CLOCK_MONOTONIC: + if (unlikely(vvd->vclock_mode == VCLOCK_NONE)) + break; + return do_monotonic(vvd, ts); + case CLOCK_REALTIME_COARSE: + return do_realtime_coarse(vvd, ts); + case CLOCK_MONOTONIC_COARSE: + return do_monotonic_coarse(vvd, ts); + } + /* + * Unknown clock ID ? Fall back to the syscall. + */ + return vdso_fallback_gettime(clock, ts); +} +int +clock_gettime(clockid_t, struct timespec *) + __attribute__((weak, alias("__vdso_clock_gettime"))); + +notrace int +__vdso_gettimeofday(struct timeval *tv, struct timezone *tz) +{ + struct vvar_data *vvd = get_vvar_data(); + + if (likely(vvd->vclock_mode != VCLOCK_NONE)) { + if (likely(tv != NULL)) { + union tstv_t { + struct timespec ts; + struct timeval tv; + } *tstv = (union tstv_t *) tv; + do_realtime(vvd, &tstv->ts); + /* + * Assign before dividing to ensure that the division is + * done in the type of tv_usec, not tv_nsec. + * + * There cannot be > 1 billion usec in a second: + * do_realtime() has already distributed such overflow + * into tv_sec. So we can assign it to an int safely. + */ + tstv->tv.tv_usec = tstv->ts.tv_nsec; + tstv->tv.tv_usec /= 1000; + } + if (unlikely(tz != NULL)) { + /* Avoid memcpy. Some old compilers fail to inline it */ + tz->tz_minuteswest = vvd->tz_minuteswest; + tz->tz_dsttime = vvd->tz_dsttime; + } + return 0; + } + return vdso_fallback_gettimeofday(tv, tz); +} +int +gettimeofday(struct timeval *, struct timezone *) + __attribute__((weak, alias("__vdso_gettimeofday"))); diff --git a/arch/sparc/vdso/vdso-layout.lds.S b/arch/sparc/vdso/vdso-layout.lds.S new file mode 100644 index 000000000000..f2c83abaca12 --- /dev/null +++ b/arch/sparc/vdso/vdso-layout.lds.S @@ -0,0 +1,104 @@ +/* + * Linker script for vDSO. This is an ELF shared object prelinked to + * its virtual address, and with only one read-only segment. + * This script controls its layout. + */ + +#if defined(BUILD_VDSO64) +# define SHDR_SIZE 64 +#elif defined(BUILD_VDSO32) +# define SHDR_SIZE 40 +#else +# error unknown VDSO target +#endif + +#define NUM_FAKE_SHDRS 7 + +SECTIONS +{ + /* + * User/kernel shared data is before the vDSO. This may be a little + * uglier than putting it after the vDSO, but it avoids issues with + * non-allocatable things that dangle past the end of the PT_LOAD + * segment. Page size is 8192 for both 64-bit and 32-bit vdso binaries + */ + + vvar_start = . -8192; + vvar_data = vvar_start; + + . = SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .dynamic : { *(.dynamic) } :text :dynamic + + .rodata : { + *(.rodata*) + *(.data*) + *(.sdata*) + *(.got.plt) *(.got) + *(.gnu.linkonce.d.*) + *(.bss*) + *(.dynbss*) + *(.gnu.linkonce.b.*) + + /* + * Ideally this would live in a C file: kept in here for + * compatibility with x86-64. + */ + VDSO_FAKE_SECTION_TABLE_START = .; + . = . + NUM_FAKE_SHDRS * SHDR_SIZE; + VDSO_FAKE_SECTION_TABLE_END = .; + } :text + + .fake_shstrtab : { *(.fake_shstrtab) } :text + + + .note : { *(.note.*) } :text :note + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + + + /* + * Text is well-separated from actual data: there's plenty of + * stuff that isn't used at runtime in between. + */ + + .text : { *(.text*) } :text =0x90909090, + + .vread_tick_patch : { + vread_tick_patch_start = .; + *(.vread_tick_patch) + vread_tick_patch_end = .; + } + + /DISCARD/ : { + *(.discard) + *(.discard.*) + *(__bug_table) + } +} + +/* + * Very old versions of ld do not recognize this name token; use the constant. + */ +#define PT_GNU_EH_FRAME 0x6474e550 + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr PT_GNU_EH_FRAME; +} diff --git a/arch/sparc/vdso/vdso-note.S b/arch/sparc/vdso/vdso-note.S new file mode 100644 index 000000000000..79a071e4357e --- /dev/null +++ b/arch/sparc/vdso/vdso-note.S @@ -0,0 +1,12 @@ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. + */ + +#include <linux/uts.h> +#include <linux/version.h> +#include <linux/elfnote.h> + +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END diff --git a/arch/sparc/vdso/vdso.lds.S b/arch/sparc/vdso/vdso.lds.S new file mode 100644 index 000000000000..f3caa29a331c --- /dev/null +++ b/arch/sparc/vdso/vdso.lds.S @@ -0,0 +1,25 @@ +/* + * Linker script for 64-bit vDSO. + * We #include the file to define the layout details. + * + * This file defines the version script giving the user-exported symbols in + * the DSO. + */ + +#define BUILD_VDSO64 + +#include "vdso-layout.lds.S" + +/* + * This controls what userland symbols we export from the vDSO. + */ +VERSION { + LINUX_2.6 { + global: + clock_gettime; + __vdso_clock_gettime; + gettimeofday; + __vdso_gettimeofday; + local: *; + }; +} diff --git a/arch/sparc/vdso/vdso2c.c b/arch/sparc/vdso/vdso2c.c new file mode 100644 index 000000000000..9f5b1cd6d51d --- /dev/null +++ b/arch/sparc/vdso/vdso2c.c @@ -0,0 +1,234 @@ +/* + * vdso2c - A vdso image preparation tool + * Copyright (c) 2014 Andy Lutomirski and others + * Licensed under the GPL v2 + * + * vdso2c requires stripped and unstripped input. It would be trivial + * to fully strip the input in here, but, for reasons described below, + * we need to write a section table. Doing this is more or less + * equivalent to dropping all non-allocatable sections, but it's + * easier to let objcopy handle that instead of doing it ourselves. + * If we ever need to do something fancier than what objcopy provides, + * it would be straightforward to add here. + * + * We keep a section table for a few reasons: + * + * Binutils has issues debugging the vDSO: it reads the section table to + * find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which + * would break build-id if we removed the section table. Binutils + * also requires that shstrndx != 0. See: + * https://sourceware.org/bugzilla/show_bug.cgi?id=17064 + * + * elfutils might not look for PT_NOTE if there is a section table at + * all. I don't know whether this matters for any practical purpose. + * + * For simplicity, rather than hacking up a partial section table, we + * just write a mostly complete one. We omit non-dynamic symbols, + * though, since they're rather large. + * + * Once binutils gets fixed, we might be able to drop this for all but + * the 64-bit vdso, since build-id only works in kernel RPMs, and + * systems that update to new enough kernel RPMs will likely update + * binutils in sync. build-id has never worked for home-built kernel + * RPMs without manual symlinking, and I suspect that no one ever does + * that. + */ + +/* + * Copyright (c) 2017 Oracle and/or its affiliates. All rights reserved. + */ + +#include <inttypes.h> +#include <stdint.h> +#include <unistd.h> +#include <stdarg.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <fcntl.h> +#include <err.h> + +#include <sys/mman.h> +#include <sys/types.h> +#include <tools/be_byteshift.h> + +#include <linux/elf.h> +#include <linux/types.h> +#include <linux/kernel.h> + +const char *outfilename; + +/* Symbols that we need in vdso2c. */ +enum { + sym_vvar_start, + sym_VDSO_FAKE_SECTION_TABLE_START, + sym_VDSO_FAKE_SECTION_TABLE_END, + sym_vread_tick, + sym_vread_tick_patch_start, + sym_vread_tick_patch_end +}; + +struct vdso_sym { + const char *name; + int export; +}; + +struct vdso_sym required_syms[] = { + [sym_vvar_start] = {"vvar_start", 1}, + [sym_VDSO_FAKE_SECTION_TABLE_START] = { + "VDSO_FAKE_SECTION_TABLE_START", 0 + }, + [sym_VDSO_FAKE_SECTION_TABLE_END] = { + "VDSO_FAKE_SECTION_TABLE_END", 0 + }, + [sym_vread_tick] = {"vread_tick", 1}, + [sym_vread_tick_patch_start] = {"vread_tick_patch_start", 1}, + [sym_vread_tick_patch_end] = {"vread_tick_patch_end", 1} +}; + +__attribute__((format(printf, 1, 2))) __attribute__((noreturn)) +static void fail(const char *format, ...) +{ + va_list ap; + + va_start(ap, format); + fprintf(stderr, "Error: "); + vfprintf(stderr, format, ap); + if (outfilename) + unlink(outfilename); + exit(1); + va_end(ap); +} + +/* + * Evil macros for big-endian reads and writes + */ +#define GBE(x, bits, ifnot) \ + __builtin_choose_expr( \ + (sizeof(*(x)) == bits/8), \ + (__typeof__(*(x)))get_unaligned_be##bits(x), ifnot) + +#define LAST_GBE(x) \ + __builtin_choose_expr(sizeof(*(x)) == 1, *(x), (void)(0)) + +#define GET_BE(x) \ + GBE(x, 64, GBE(x, 32, GBE(x, 16, LAST_GBE(x)))) + +#define PBE(x, val, bits, ifnot) \ + __builtin_choose_expr( \ + (sizeof(*(x)) == bits/8), \ + put_unaligned_be##bits((val), (x)), ifnot) + +#define LAST_PBE(x, val) \ + __builtin_choose_expr(sizeof(*(x)) == 1, *(x) = (val), (void)(0)) + +#define PUT_BE(x, val) \ + PBE(x, val, 64, PBE(x, val, 32, PBE(x, val, 16, LAST_PBE(x, val)))) + +#define NSYMS ARRAY_SIZE(required_syms) + +#define BITSFUNC3(name, bits, suffix) name##bits##suffix +#define BITSFUNC2(name, bits, suffix) BITSFUNC3(name, bits, suffix) +#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS, ) + +#define INT_BITS BITSFUNC2(int, ELF_BITS, _t) + +#define ELF_BITS_XFORM2(bits, x) Elf##bits##_##x +#define ELF_BITS_XFORM(bits, x) ELF_BITS_XFORM2(bits, x) +#define ELF(x) ELF_BITS_XFORM(ELF_BITS, x) + +#define ELF_BITS 64 +#include "vdso2c.h" +#undef ELF_BITS + +#define ELF_BITS 32 +#include "vdso2c.h" +#undef ELF_BITS + +static void go(void *raw_addr, size_t raw_len, + void *stripped_addr, size_t stripped_len, + FILE *outfile, const char *name) +{ + Elf64_Ehdr *hdr = (Elf64_Ehdr *)raw_addr; + + if (hdr->e_ident[EI_CLASS] == ELFCLASS64) { + go64(raw_addr, raw_len, stripped_addr, stripped_len, + outfile, name); + } else if (hdr->e_ident[EI_CLASS] == ELFCLASS32) { + go32(raw_addr, raw_len, stripped_addr, stripped_len, + outfile, name); + } else { + fail("unknown ELF class\n"); + } +} + +static void map_input(const char *name, void **addr, size_t *len, int prot) +{ + off_t tmp_len; + + int fd = open(name, O_RDONLY); + + if (fd == -1) + err(1, "%s", name); + + tmp_len = lseek(fd, 0, SEEK_END); + if (tmp_len == (off_t)-1) + err(1, "lseek"); + *len = (size_t)tmp_len; + + *addr = mmap(NULL, tmp_len, prot, MAP_PRIVATE, fd, 0); + if (*addr == MAP_FAILED) + err(1, "mmap"); + + close(fd); +} + +int main(int argc, char **argv) +{ + size_t raw_len, stripped_len; + void *raw_addr, *stripped_addr; + FILE *outfile; + char *name, *tmp; + int namelen; + + if (argc != 4) { + printf("Usage: vdso2c RAW_INPUT STRIPPED_INPUT OUTPUT\n"); + return 1; + } + + /* + * Figure out the struct name. If we're writing to a .so file, + * generate raw output insted. + */ + name = strdup(argv[3]); + namelen = strlen(name); + if (namelen >= 3 && !strcmp(name + namelen - 3, ".so")) { + name = NULL; + } else { + tmp = strrchr(name, '/'); + if (tmp) + name = tmp + 1; + tmp = strchr(name, '.'); + if (tmp) + *tmp = '\0'; + for (tmp = name; *tmp; tmp++) + if (*tmp == '-') + *tmp = '_'; + } + + map_input(argv[1], &raw_addr, &raw_len, PROT_READ); + map_input(argv[2], &stripped_addr, &stripped_len, PROT_READ); + + outfilename = argv[3]; + outfile = fopen(outfilename, "w"); + if (!outfile) + err(1, "%s", argv[2]); + + go(raw_addr, raw_len, stripped_addr, stripped_len, outfile, name); + + munmap(raw_addr, raw_len); + munmap(stripped_addr, stripped_len); + fclose(outfile); + + return 0; +} diff --git a/arch/sparc/vdso/vdso2c.h b/arch/sparc/vdso/vdso2c.h new file mode 100644 index 000000000000..808decb0f7be --- /dev/null +++ b/arch/sparc/vdso/vdso2c.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2017 Oracle and/or its affiliates. All rights reserved. + */ + +/* + * This file is included up to twice from vdso2c.c. It generates code for + * 32-bit and 64-bit vDSOs. We will eventually need both for 64-bit builds, + * since 32-bit vDSOs will then be built for 32-bit userspace. + */ + +static void BITSFUNC(go)(void *raw_addr, size_t raw_len, + void *stripped_addr, size_t stripped_len, + FILE *outfile, const char *name) +{ + int found_load = 0; + unsigned long load_size = -1; /* Work around bogus warning */ + unsigned long mapping_size; + int i; + unsigned long j; + + ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr; + ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr; + ELF(Dyn) *dyn = 0, *dyn_end = 0; + INT_BITS syms[NSYMS] = {}; + + ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_BE(&hdr->e_phoff)); + + /* Walk the segment table. */ + for (i = 0; i < GET_BE(&hdr->e_phnum); i++) { + if (GET_BE(&pt[i].p_type) == PT_LOAD) { + if (found_load) + fail("multiple PT_LOAD segs\n"); + + if (GET_BE(&pt[i].p_offset) != 0 || + GET_BE(&pt[i].p_vaddr) != 0) + fail("PT_LOAD in wrong place\n"); + + if (GET_BE(&pt[i].p_memsz) != GET_BE(&pt[i].p_filesz)) + fail("cannot handle memsz != filesz\n"); + + load_size = GET_BE(&pt[i].p_memsz); + found_load = 1; + } else if (GET_BE(&pt[i].p_type) == PT_DYNAMIC) { + dyn = raw_addr + GET_BE(&pt[i].p_offset); + dyn_end = raw_addr + GET_BE(&pt[i].p_offset) + + GET_BE(&pt[i].p_memsz); + } + } + if (!found_load) + fail("no PT_LOAD seg\n"); + + if (stripped_len < load_size) + fail("stripped input is too short\n"); + + /* Walk the dynamic table */ + for (i = 0; dyn + i < dyn_end && + GET_BE(&dyn[i].d_tag) != DT_NULL; i++) { + typeof(dyn[i].d_tag) tag = GET_BE(&dyn[i].d_tag); + typeof(dyn[i].d_un.d_val) val = GET_BE(&dyn[i].d_un.d_val); + + if ((tag == DT_RELSZ || tag == DT_RELASZ) && (val != 0)) + fail("vdso image contains dynamic relocations\n"); + } + + /* Walk the section table */ + for (i = 0; i < GET_BE(&hdr->e_shnum); i++) { + ELF(Shdr) *sh = raw_addr + GET_BE(&hdr->e_shoff) + + GET_BE(&hdr->e_shentsize) * i; + if (GET_BE(&sh->sh_type) == SHT_SYMTAB) + symtab_hdr = sh; + } + + if (!symtab_hdr) + fail("no symbol table\n"); + + strtab_hdr = raw_addr + GET_BE(&hdr->e_shoff) + + GET_BE(&hdr->e_shentsize) * GET_BE(&symtab_hdr->sh_link); + + /* Walk the symbol table */ + for (i = 0; + i < GET_BE(&symtab_hdr->sh_size) / GET_BE(&symtab_hdr->sh_entsize); + i++) { + int k; + + ELF(Sym) *sym = raw_addr + GET_BE(&symtab_hdr->sh_offset) + + GET_BE(&symtab_hdr->sh_entsize) * i; + const char *name = raw_addr + GET_BE(&strtab_hdr->sh_offset) + + GET_BE(&sym->st_name); + + for (k = 0; k < NSYMS; k++) { + if (!strcmp(name, required_syms[k].name)) { + if (syms[k]) { + fail("duplicate symbol %s\n", + required_syms[k].name); + } + + /* + * Careful: we use negative addresses, but + * st_value is unsigned, so we rely + * on syms[k] being a signed type of the + * correct width. + */ + syms[k] = GET_BE(&sym->st_value); + } + } + } + + /* Validate mapping addresses. */ + if (syms[sym_vvar_start] % 8192) + fail("vvar_begin must be a multiple of 8192\n"); + + if (!name) { + fwrite(stripped_addr, stripped_len, 1, outfile); + return; + } + + mapping_size = (stripped_len + 8191) / 8192 * 8192; + + fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n"); + fprintf(outfile, "#include <linux/cache.h>\n"); + fprintf(outfile, "#include <asm/vdso.h>\n"); + fprintf(outfile, "\n"); + fprintf(outfile, + "static unsigned char raw_data[%lu] __ro_after_init __aligned(8192)= {", + mapping_size); + for (j = 0; j < stripped_len; j++) { + if (j % 10 == 0) + fprintf(outfile, "\n\t"); + fprintf(outfile, "0x%02X, ", + (int)((unsigned char *)stripped_addr)[j]); + } + fprintf(outfile, "\n};\n\n"); + + fprintf(outfile, "const struct vdso_image %s_builtin = {\n", name); + fprintf(outfile, "\t.data = raw_data,\n"); + fprintf(outfile, "\t.size = %lu,\n", mapping_size); + for (i = 0; i < NSYMS; i++) { + if (required_syms[i].export && syms[i]) + fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n", + required_syms[i].name, (int64_t)syms[i]); + } + fprintf(outfile, "};\n"); +} diff --git a/arch/sparc/vdso/vdso32/.gitignore b/arch/sparc/vdso/vdso32/.gitignore new file mode 100644 index 000000000000..e45fba9d0ced --- /dev/null +++ b/arch/sparc/vdso/vdso32/.gitignore @@ -0,0 +1 @@ +vdso32.lds diff --git a/arch/sparc/vdso/vdso32/vclock_gettime.c b/arch/sparc/vdso/vdso32/vclock_gettime.c new file mode 100644 index 000000000000..026abb3b826c --- /dev/null +++ b/arch/sparc/vdso/vdso32/vclock_gettime.c @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2017 Oracle and/or its affiliates. All rights reserved. + */ + +#define BUILD_VDSO32 + +#ifndef CONFIG_CC_OPTIMIZE_FOR_SIZE +#undef CONFIG_OPTIMIZE_INLINING +#endif + +#ifdef CONFIG_SPARC64 + +/* + * in case of a 32 bit VDSO for a 64 bit kernel fake a 32 bit kernel + * configuration + */ +#undef CONFIG_64BIT +#undef CONFIG_SPARC64 +#define BUILD_VDSO32_64 +#define CONFIG_32BIT +#undef CONFIG_QUEUED_RWLOCKS +#undef CONFIG_QUEUED_SPINLOCKS + +#endif + +#include "../vclock_gettime.c" diff --git a/arch/sparc/vdso/vdso32/vdso-note.S b/arch/sparc/vdso/vdso32/vdso-note.S new file mode 100644 index 000000000000..e234983cf0d8 --- /dev/null +++ b/arch/sparc/vdso/vdso32/vdso-note.S @@ -0,0 +1,12 @@ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO + * text. Here we can supply some information useful to userland. + */ + +#include <linux/uts.h> +#include <linux/version.h> +#include <linux/elfnote.h> + +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END diff --git a/arch/sparc/vdso/vdso32/vdso32.lds.S b/arch/sparc/vdso/vdso32/vdso32.lds.S new file mode 100644 index 000000000000..53575ee154c4 --- /dev/null +++ b/arch/sparc/vdso/vdso32/vdso32.lds.S @@ -0,0 +1,24 @@ +/* + * Linker script for sparc32 vDSO + * We #include the file to define the layout details. + * + * This file defines the version script giving the user-exported symbols in + * the DSO. + */ + +#define BUILD_VDSO32 +#include "../vdso-layout.lds.S" + +/* + * This controls what userland symbols we export from the vDSO. + */ +VERSION { + LINUX_2.6 { + global: + clock_gettime; + __vdso_clock_gettime; + gettimeofday; + __vdso_gettimeofday; + local: *; + }; +} diff --git a/arch/sparc/vdso/vma.c b/arch/sparc/vdso/vma.c new file mode 100644 index 000000000000..0a6f50098e23 --- /dev/null +++ b/arch/sparc/vdso/vma.c @@ -0,0 +1,268 @@ +/* + * Set up the VMAs to tell the VM about the vDSO. + * Copyright 2007 Andi Kleen, SUSE Labs. + * Subject to the GPL, v.2 + */ + +/* + * Copyright (c) 2017 Oracle and/or its affiliates. All rights reserved. + */ + +#include <linux/mm.h> +#include <linux/err.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/linkage.h> +#include <linux/random.h> +#include <linux/elf.h> +#include <asm/vdso.h> +#include <asm/vvar.h> +#include <asm/page.h> + +unsigned int __read_mostly vdso_enabled = 1; + +static struct vm_special_mapping vvar_mapping = { + .name = "[vvar]" +}; + +#ifdef CONFIG_SPARC64 +static struct vm_special_mapping vdso_mapping64 = { + .name = "[vdso]" +}; +#endif + +#ifdef CONFIG_COMPAT +static struct vm_special_mapping vdso_mapping32 = { + .name = "[vdso]" +}; +#endif + +struct vvar_data *vvar_data; + +#define SAVE_INSTR_SIZE 4 + +/* + * Allocate pages for the vdso and vvar, and copy in the vdso text from the + * kernel image. + */ +int __init init_vdso_image(const struct vdso_image *image, + struct vm_special_mapping *vdso_mapping) +{ + int i; + struct page *dp, **dpp = NULL; + int dnpages = 0; + struct page *cp, **cpp = NULL; + int cnpages = (image->size) / PAGE_SIZE; + + /* + * First, the vdso text. This is initialied data, an integral number of + * pages long. + */ + if (WARN_ON(image->size % PAGE_SIZE != 0)) + goto oom; + + cpp = kcalloc(cnpages, sizeof(struct page *), GFP_KERNEL); + vdso_mapping->pages = cpp; + + if (!cpp) + goto oom; + + if (vdso_fix_stick) { + /* + * If the system uses %tick instead of %stick, patch the VDSO + * with instruction reading %tick instead of %stick. + */ + unsigned int j, k = SAVE_INSTR_SIZE; + unsigned char *data = image->data; + + for (j = image->sym_vread_tick_patch_start; + j < image->sym_vread_tick_patch_end; j++) { + + data[image->sym_vread_tick + k] = data[j]; + k++; + } + } + + for (i = 0; i < cnpages; i++) { + cp = alloc_page(GFP_KERNEL); + if (!cp) + goto oom; + cpp[i] = cp; + copy_page(page_address(cp), image->data + i * PAGE_SIZE); + } + + /* + * Now the vvar page. This is uninitialized data. + */ + + if (vvar_data == NULL) { + dnpages = (sizeof(struct vvar_data) / PAGE_SIZE) + 1; + if (WARN_ON(dnpages != 1)) + goto oom; + dpp = kcalloc(dnpages, sizeof(struct page *), GFP_KERNEL); + vvar_mapping.pages = dpp; + + if (!dpp) + goto oom; + + dp = alloc_page(GFP_KERNEL); + if (!dp) + goto oom; + + dpp[0] = dp; + vvar_data = page_address(dp); + memset(vvar_data, 0, PAGE_SIZE); + + vvar_data->seq = 0; + } + + return 0; + oom: + if (cpp != NULL) { + for (i = 0; i < cnpages; i++) { + if (cpp[i] != NULL) + __free_page(cpp[i]); + } + kfree(cpp); + vdso_mapping->pages = NULL; + } + + if (dpp != NULL) { + for (i = 0; i < dnpages; i++) { + if (dpp[i] != NULL) + __free_page(dpp[i]); + } + kfree(dpp); + vvar_mapping.pages = NULL; + } + + pr_warn("Cannot allocate vdso\n"); + vdso_enabled = 0; + return -ENOMEM; +} + +static int __init init_vdso(void) +{ + int err = 0; +#ifdef CONFIG_SPARC64 + err = init_vdso_image(&vdso_image_64_builtin, &vdso_mapping64); + if (err) + return err; +#endif + +#ifdef CONFIG_COMPAT + err = init_vdso_image(&vdso_image_32_builtin, &vdso_mapping32); +#endif + return err; + +} +subsys_initcall(init_vdso); + +struct linux_binprm; + +/* Shuffle the vdso up a bit, randomly. */ +static unsigned long vdso_addr(unsigned long start, unsigned int len) +{ + unsigned int offset; + + /* This loses some more bits than a modulo, but is cheaper */ + offset = get_random_int() & (PTRS_PER_PTE - 1); + return start + (offset << PAGE_SHIFT); +} + +static int map_vdso(const struct vdso_image *image, + struct vm_special_mapping *vdso_mapping) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long text_start, addr = 0; + int ret = 0; + + down_write(&mm->mmap_sem); + + /* + * First, get an unmapped region: then randomize it, and make sure that + * region is free. + */ + if (current->flags & PF_RANDOMIZE) { + addr = get_unmapped_area(NULL, 0, + image->size - image->sym_vvar_start, + 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; + } + addr = vdso_addr(addr, image->size - image->sym_vvar_start); + } + addr = get_unmapped_area(NULL, addr, + image->size - image->sym_vvar_start, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; + } + + text_start = addr - image->sym_vvar_start; + current->mm->context.vdso = (void __user *)text_start; + + /* + * MAYWRITE to allow gdb to COW and set breakpoints + */ + vma = _install_special_mapping(mm, + text_start, + image->size, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, + vdso_mapping); + + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto up_fail; + } + + vma = _install_special_mapping(mm, + addr, + -image->sym_vvar_start, + VM_READ|VM_MAYREAD, + &vvar_mapping); + + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + do_munmap(mm, text_start, image->size, NULL); + } + +up_fail: + if (ret) + current->mm->context.vdso = NULL; + + up_write(&mm->mmap_sem); + return ret; +} + +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + + if (!vdso_enabled) + return 0; + +#if defined CONFIG_COMPAT + if (!(is_32bit_task())) + return map_vdso(&vdso_image_64_builtin, &vdso_mapping64); + else + return map_vdso(&vdso_image_32_builtin, &vdso_mapping32); +#else + return map_vdso(&vdso_image_64_builtin, &vdso_mapping64); +#endif + +} + +static __init int vdso_setup(char *s) +{ + int err; + unsigned long val; + + err = kstrtoul(s, 10, &val); + vdso_enabled = val; + return err; +} +__setup("vdso=", vdso_setup); diff --git a/arch/um/Kconfig.common b/arch/um/Kconfig.common index d9280482a2f8..c68add8df3ae 100644 --- a/arch/um/Kconfig.common +++ b/arch/um/Kconfig.common @@ -10,7 +10,6 @@ config UML select HAVE_DEBUG_KMEMLEAK select GENERIC_IRQ_SHOW select GENERIC_CPU_DEVICES - select GENERIC_IO select GENERIC_CLOCKEVENTS select HAVE_GCC_PLUGINS select TTY # Needed for line.c diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b71daed3cca2..59e13a79c2e3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3671,6 +3671,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) u32 ecx = msr->index; u64 data = msr->data; switch (ecx) { + case MSR_IA32_CR_PAT: + if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) + return 1; + vcpu->arch.pat = data; + svm->vmcb->save.g_pat = data; + mark_dirty(svm->vmcb, VMCB_NPT); + break; case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr); break; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7c3522a989d0..714a0673ec3c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -70,6 +70,9 @@ MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); static bool __read_mostly enable_vpid = 1; module_param_named(vpid, enable_vpid, bool, 0444); +static bool __read_mostly enable_vnmi = 1; +module_param_named(vnmi, enable_vnmi, bool, S_IRUGO); + static bool __read_mostly flexpriority_enabled = 1; module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); @@ -202,6 +205,10 @@ struct loaded_vmcs { bool nmi_known_unmasked; unsigned long vmcs_host_cr3; /* May not match real cr3 */ unsigned long vmcs_host_cr4; /* May not match real cr4 */ + /* Support for vnmi-less CPUs */ + int soft_vnmi_blocked; + ktime_t entry_time; + s64 vnmi_blocked_time; struct list_head loaded_vmcss_on_cpu_link; }; @@ -1291,6 +1298,11 @@ static inline bool cpu_has_vmx_invpcid(void) SECONDARY_EXEC_ENABLE_INVPCID; } +static inline bool cpu_has_virtual_nmis(void) +{ + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; +} + static inline bool cpu_has_vmx_wbinvd_exit(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -1348,11 +1360,6 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) (vmcs12->secondary_vm_exec_control & bit); } -static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) -{ - return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; -} - static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) { return vmcs12->pin_based_vm_exec_control & @@ -3712,9 +3719,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) &_vmexit_control) < 0) return -EIO; - min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | - PIN_BASED_VIRTUAL_NMIS; - opt = PIN_BASED_POSTED_INTR | PIN_BASED_VMX_PREEMPTION_TIMER; + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; + opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | + PIN_BASED_VMX_PREEMPTION_TIMER; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, &_pin_based_exec_control) < 0) return -EIO; @@ -5232,6 +5239,10 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) if (!kvm_vcpu_apicv_active(&vmx->vcpu)) pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; + + if (!enable_vnmi) + pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS; + /* Enable the preemption timer dynamically */ pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; return pin_based_exec_ctrl; @@ -5666,7 +5677,8 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) static void enable_nmi_window(struct kvm_vcpu *vcpu) { - if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { + if (!enable_vnmi || + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { enable_irq_window(vcpu); return; } @@ -5706,6 +5718,19 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (!enable_vnmi) { + /* + * Tracking the NMI-blocked state in software is built upon + * finding the next open IRQ window. This, in turn, depends on + * well-behaving guests: They have to keep IRQs disabled at + * least as long as the NMI handler runs. Otherwise we may + * cause NMI nesting, maybe breaking the guest. But as this is + * highly unlikely, we can live with the residual risk. + */ + vmx->loaded_vmcs->soft_vnmi_blocked = 1; + vmx->loaded_vmcs->vnmi_blocked_time = 0; + } + ++vcpu->stat.nmi_injections; vmx->loaded_vmcs->nmi_known_unmasked = false; @@ -5724,6 +5749,8 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); bool masked; + if (!enable_vnmi) + return vmx->loaded_vmcs->soft_vnmi_blocked; if (vmx->loaded_vmcs->nmi_known_unmasked) return false; masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; @@ -5735,13 +5762,20 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) { struct vcpu_vmx *vmx = to_vmx(vcpu); - vmx->loaded_vmcs->nmi_known_unmasked = !masked; - if (masked) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - else - vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); + if (!enable_vnmi) { + if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) { + vmx->loaded_vmcs->soft_vnmi_blocked = masked; + vmx->loaded_vmcs->vnmi_blocked_time = 0; + } + } else { + vmx->loaded_vmcs->nmi_known_unmasked = !masked; + if (masked) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + } } static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) @@ -5749,6 +5783,10 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) if (to_vmx(vcpu)->nested.nested_run_pending) return 0; + if (!enable_vnmi && + to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) + return 0; + return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | GUEST_INTR_STATE_NMI)); @@ -6476,6 +6514,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) * AAK134, BY25. */ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + enable_vnmi && (exit_qualification & INTR_INFO_UNBLOCK_NMI)) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); @@ -6535,6 +6574,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) static int handle_nmi_window(struct kvm_vcpu *vcpu) { + WARN_ON_ONCE(!enable_vnmi); vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_VIRTUAL_NMI_PENDING); ++vcpu->stat.nmi_window_exits; @@ -6758,6 +6798,9 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_flexpriority()) flexpriority_enabled = 0; + if (!cpu_has_virtual_nmis()) + enable_vnmi = 0; + /* * set_apic_access_page_addr() is used to reload apic access * page upon invalidation. No need to do anything if not @@ -6962,7 +7005,7 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) } /* Create a new VMCS */ - item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL); + item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL); if (!item) return NULL; item->vmcs02.vmcs = alloc_vmcs(); @@ -7979,6 +8022,7 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) * "blocked by NMI" bit has to be set before next VM entry. */ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + enable_vnmi && (exit_qualification & INTR_INFO_UNBLOCK_NMI)) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); @@ -8823,6 +8867,25 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) return 0; } + if (unlikely(!enable_vnmi && + vmx->loaded_vmcs->soft_vnmi_blocked)) { + if (vmx_interrupt_allowed(vcpu)) { + vmx->loaded_vmcs->soft_vnmi_blocked = 0; + } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL && + vcpu->arch.nmi_pending) { + /* + * This CPU don't support us in finding the end of an + * NMI-blocked window if the guest runs with IRQs + * disabled. So we pull the trigger after 1 s of + * futile waiting, but inform the user about this. + */ + printk(KERN_WARNING "%s: Breaking out of NMI-blocked " + "state on VCPU %d after 1 s timeout\n", + __func__, vcpu->vcpu_id); + vmx->loaded_vmcs->soft_vnmi_blocked = 0; + } + } + if (exit_reason < kvm_vmx_max_exit_handlers && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu); @@ -9105,33 +9168,38 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; - if (vmx->loaded_vmcs->nmi_known_unmasked) - return; - /* - * Can't use vmx->exit_intr_info since we're not sure what - * the exit reason is. - */ - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; - vector = exit_intr_info & INTR_INFO_VECTOR_MASK; - /* - * SDM 3: 27.7.1.2 (September 2008) - * Re-set bit "block by NMI" before VM entry if vmexit caused by - * a guest IRET fault. - * SDM 3: 23.2.2 (September 2008) - * Bit 12 is undefined in any of the following cases: - * If the VM exit sets the valid bit in the IDT-vectoring - * information field. - * If the VM exit is due to a double fault. - */ - if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && - vector != DF_VECTOR && !idtv_info_valid) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - else - vmx->loaded_vmcs->nmi_known_unmasked = - !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) - & GUEST_INTR_STATE_NMI); + if (enable_vnmi) { + if (vmx->loaded_vmcs->nmi_known_unmasked) + return; + /* + * Can't use vmx->exit_intr_info since we're not sure what + * the exit reason is. + */ + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; + /* + * SDM 3: 27.7.1.2 (September 2008) + * Re-set bit "block by NMI" before VM entry if vmexit caused by + * a guest IRET fault. + * SDM 3: 23.2.2 (September 2008) + * Bit 12 is undefined in any of the following cases: + * If the VM exit sets the valid bit in the IDT-vectoring + * information field. + * If the VM exit is due to a double fault. + */ + if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && + vector != DF_VECTOR && !idtv_info_valid) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmx->loaded_vmcs->nmi_known_unmasked = + !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) + & GUEST_INTR_STATE_NMI); + } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked)) + vmx->loaded_vmcs->vnmi_blocked_time += + ktime_to_ns(ktime_sub(ktime_get(), + vmx->loaded_vmcs->entry_time)); } static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, @@ -9248,6 +9316,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long debugctlmsr, cr3, cr4; + /* Record the guest's net vcpu time for enforced NMI injections. */ + if (unlikely(!enable_vnmi && + vmx->loaded_vmcs->soft_vnmi_blocked)) + vmx->loaded_vmcs->entry_time = ktime_get(); + /* Don't enter VMX if guest state is invalid, let the exit handler start emulation until we arrive back to a valid state */ if (vmx->emulation_required) |