summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig3
-rw-r--r--arch/x86/Makefile5
-rw-r--r--arch/x86/boot/bitops.h3
-rw-r--r--arch/x86/boot/compressed/Makefile8
-rw-r--r--arch/x86/boot/compressed/eboot.c545
-rw-r--r--arch/x86/boot/compressed/eboot.h12
-rw-r--r--arch/x86/boot/compressed/kaslr.c98
-rw-r--r--arch/x86/boot/compressed/pgtable_64.c73
-rw-r--r--arch/x86/boot/string.c5
-rw-r--r--arch/x86/crypto/aegis128-aesni-asm.S2
-rw-r--r--arch/x86/crypto/aegis128-aesni-glue.c12
-rw-r--r--arch/x86/crypto/aegis128l-aesni-asm.S2
-rw-r--r--arch/x86/crypto/aegis128l-aesni-glue.c12
-rw-r--r--arch/x86/crypto/aegis256-aesni-asm.S2
-rw-r--r--arch/x86/crypto/aegis256-aesni-glue.c12
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S8
-rw-r--r--arch/x86/crypto/aesni-intel_avx-x86_64.S4
-rw-r--r--arch/x86/crypto/morus1280-avx2-asm.S2
-rw-r--r--arch/x86/crypto/morus1280-avx2-glue.c10
-rw-r--r--arch/x86/crypto/morus1280-sse2-asm.S2
-rw-r--r--arch/x86/crypto/morus1280-sse2-glue.c10
-rw-r--r--arch/x86/crypto/morus640-sse2-asm.S2
-rw-r--r--arch/x86/crypto/morus640-sse2-glue.c10
-rw-r--r--arch/x86/crypto/sha1_ssse3_asm.S2
-rw-r--r--arch/x86/entry/entry_32.S632
-rw-r--r--arch/x86/entry/entry_64.S23
-rw-r--r--arch/x86/entry/vdso/Makefile26
-rw-r--r--arch/x86/events/amd/ibs.c6
-rw-r--r--arch/x86/events/intel/core.c34
-rw-r--r--arch/x86/events/intel/ds.c74
-rw-r--r--arch/x86/events/intel/lbr.c56
-rw-r--r--arch/x86/events/intel/uncore.h2
-rw-r--r--arch/x86/events/intel/uncore_snbep.c10
-rw-r--r--arch/x86/events/perf_event.h6
-rw-r--r--arch/x86/hyperv/hv_apic.c59
-rw-r--r--arch/x86/hyperv/mmu.c80
-rw-r--r--arch/x86/include/asm/apic.h9
-rw-r--r--arch/x86/include/asm/atomic.h32
-rw-r--r--arch/x86/include/asm/atomic64_32.h61
-rw-r--r--arch/x86/include/asm/atomic64_64.h50
-rw-r--r--arch/x86/include/asm/cmpxchg.h2
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h4
-rw-r--r--arch/x86/include/asm/cpufeatures.h6
-rw-r--r--arch/x86/include/asm/dmi.h2
-rw-r--r--arch/x86/include/asm/hardirq.h26
-rw-r--r--arch/x86/include/asm/hw_breakpoint.h7
-rw-r--r--arch/x86/include/asm/intel-family.h13
-rw-r--r--arch/x86/include/asm/intel-mid.h43
-rw-r--r--arch/x86/include/asm/intel_ds.h3
-rw-r--r--arch/x86/include/asm/irqflags.h2
-rw-r--r--arch/x86/include/asm/kprobes.h5
-rw-r--r--arch/x86/include/asm/kvm_guest.h7
-rw-r--r--arch/x86/include/asm/kvm_host.h6
-rw-r--r--arch/x86/include/asm/kvm_para.h1
-rw-r--r--arch/x86/include/asm/mmu_context.h5
-rw-r--r--arch/x86/include/asm/mshyperv.h34
-rw-r--r--arch/x86/include/asm/msr-index.h7
-rw-r--r--arch/x86/include/asm/nospec-branch.h2
-rw-r--r--arch/x86/include/asm/orc_types.h2
-rw-r--r--arch/x86/include/asm/page_32_types.h9
-rw-r--r--arch/x86/include/asm/percpu.h7
-rw-r--r--arch/x86/include/asm/pgtable-2level.h26
-rw-r--r--arch/x86/include/asm/pgtable-2level_types.h3
-rw-r--r--arch/x86/include/asm/pgtable-3level.h44
-rw-r--r--arch/x86/include/asm/pgtable-3level_types.h6
-rw-r--r--arch/x86/include/asm/pgtable-invert.h32
-rw-r--r--arch/x86/include/asm/pgtable.h168
-rw-r--r--arch/x86/include/asm/pgtable_32.h2
-rw-r--r--arch/x86/include/asm/pgtable_32_types.h9
-rw-r--r--arch/x86/include/asm/pgtable_64.h127
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h3
-rw-r--r--arch/x86/include/asm/pgtable_types.h28
-rw-r--r--arch/x86/include/asm/processor-flags.h8
-rw-r--r--arch/x86/include/asm/processor.h18
-rw-r--r--arch/x86/include/asm/pti.h3
-rw-r--r--arch/x86/include/asm/qspinlock_paravirt.h2
-rw-r--r--arch/x86/include/asm/refcount.h1
-rw-r--r--arch/x86/include/asm/sections.h1
-rw-r--r--arch/x86/include/asm/set_memory.h1
-rw-r--r--arch/x86/include/asm/switch_to.h16
-rw-r--r--arch/x86/include/asm/text-patching.h1
-rw-r--r--arch/x86/include/asm/tlbflush.h21
-rw-r--r--arch/x86/include/asm/topology.h6
-rw-r--r--arch/x86/include/asm/trace/hyperv.h15
-rw-r--r--arch/x86/include/asm/tsc.h4
-rw-r--r--arch/x86/include/asm/unwind_hints.h16
-rw-r--r--arch/x86/include/asm/vmx.h11
-rw-r--r--arch/x86/kernel/alternative.c7
-rw-r--r--arch/x86/kernel/apic/apic.c21
-rw-r--r--arch/x86/kernel/apic/io_apic.c1
-rw-r--r--arch/x86/kernel/apic/msi.c1
-rw-r--r--arch/x86/kernel/apic/vector.c20
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c4
-rw-r--r--arch/x86/kernel/asm-offsets.c5
-rw-r--r--arch/x86/kernel/asm-offsets_32.c10
-rw-r--r--arch/x86/kernel/asm-offsets_64.c2
-rw-r--r--arch/x86/kernel/cpu/Makefile4
-rw-r--r--arch/x86/kernel/cpu/amd.c64
-rw-r--r--arch/x86/kernel/cpu/bugs.c191
-rw-r--r--arch/x86/kernel/cpu/common.c104
-rw-r--r--arch/x86/kernel/cpu/cpu.h2
-rw-r--r--arch/x86/kernel/cpu/intel.c17
-rw-r--r--arch/x86/kernel/cpu/intel_rdt.c11
-rw-r--r--arch/x86/kernel/cpu/intel_rdt.h143
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c129
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c1522
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h43
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_rdtgroup.c808
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c202
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c16
-rw-r--r--arch/x86/kernel/cpu/topology.c41
-rw-r--r--arch/x86/kernel/dumpstack.c28
-rw-r--r--arch/x86/kernel/fpu/core.c1
-rw-r--r--arch/x86/kernel/head_32.S20
-rw-r--r--arch/x86/kernel/head_64.S2
-rw-r--r--arch/x86/kernel/hpet.c1
-rw-r--r--arch/x86/kernel/hw_breakpoint.c131
-rw-r--r--arch/x86/kernel/i8259.c1
-rw-r--r--arch/x86/kernel/idt.c1
-rw-r--r--arch/x86/kernel/irq.c1
-rw-r--r--arch/x86/kernel/irq_32.c1
-rw-r--r--arch/x86/kernel/irq_64.c1
-rw-r--r--arch/x86/kernel/irqinit.c1
-rw-r--r--arch/x86/kernel/jump_label.c11
-rw-r--r--arch/x86/kernel/kprobes/common.h10
-rw-r--r--arch/x86/kernel/kprobes/core.c124
-rw-r--r--arch/x86/kernel/kprobes/ftrace.c49
-rw-r--r--arch/x86/kernel/kprobes/opt.c1
-rw-r--r--arch/x86/kernel/kvm.c18
-rw-r--r--arch/x86/kernel/kvmclock.c259
-rw-r--r--arch/x86/kernel/ldt.c137
-rw-r--r--arch/x86/kernel/machine_kexec_32.c5
-rw-r--r--arch/x86/kernel/paravirt.c14
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c2
-rw-r--r--arch/x86/kernel/pci-dma.c2
-rw-r--r--arch/x86/kernel/pci-iommu_table.c2
-rw-r--r--arch/x86/kernel/pcspeaker.c2
-rw-r--r--arch/x86/kernel/process.c2
-rw-r--r--arch/x86/kernel/process_32.c2
-rw-r--r--arch/x86/kernel/process_64.c2
-rw-r--r--arch/x86/kernel/setup.c16
-rw-r--r--arch/x86/kernel/smp.c1
-rw-r--r--arch/x86/kernel/smpboot.c18
-rw-r--r--arch/x86/kernel/stacktrace.c42
-rw-r--r--arch/x86/kernel/time.c1
-rw-r--r--arch/x86/kernel/tsc.c259
-rw-r--r--arch/x86/kernel/tsc_msr.c96
-rw-r--r--arch/x86/kernel/unwind_orc.c52
-rw-r--r--arch/x86/kernel/vm86_32.c4
-rw-r--r--arch/x86/kernel/vmlinux.lds.S17
-rw-r--r--arch/x86/kernel/x86_init.c2
-rw-r--r--arch/x86/kvm/lapic.c2
-rw-r--r--arch/x86/kvm/mmu.c1
-rw-r--r--arch/x86/kvm/vmx.c477
-rw-r--r--arch/x86/kvm/x86.c34
-rw-r--r--arch/x86/lib/memcpy_64.S2
-rw-r--r--arch/x86/mm/dump_pagetables.c27
-rw-r--r--arch/x86/mm/fault.c2
-rw-r--r--arch/x86/mm/init.c62
-rw-r--r--arch/x86/mm/init_64.c14
-rw-r--r--arch/x86/mm/kmmio.c25
-rw-r--r--arch/x86/mm/mmap.c21
-rw-r--r--arch/x86/mm/numa_emulation.c107
-rw-r--r--arch/x86/mm/pageattr.c27
-rw-r--r--arch/x86/mm/pgtable.c169
-rw-r--r--arch/x86/mm/pti.c262
-rw-r--r--arch/x86/mm/tlb.c224
-rw-r--r--arch/x86/net/bpf_jit_comp32.c8
-rw-r--r--arch/x86/platform/efi/efi_64.c103
-rw-r--r--arch/x86/platform/efi/quirks.c14
-rw-r--r--arch/x86/platform/intel-mid/Makefile2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c1
-rw-r--r--arch/x86/platform/intel-mid/intel-mid.c23
-rw-r--r--arch/x86/platform/intel-mid/intel_mid_weak_decls.h18
-rw-r--r--arch/x86/platform/intel-mid/mfld.c70
-rw-r--r--arch/x86/platform/intel-mid/mrfld.c105
-rw-r--r--arch/x86/platform/olpc/olpc.c4
-rw-r--r--arch/x86/platform/uv/tlb_uv.c3
-rw-r--r--arch/x86/power/hibernate_asm_64.S2
-rw-r--r--arch/x86/tools/relocs.c1
-rw-r--r--arch/x86/um/vdso/.gitignore1
-rw-r--r--arch/x86/um/vdso/Makefile16
-rw-r--r--arch/x86/xen/enlighten.c1
-rw-r--r--arch/x86/xen/enlighten_pv.c51
-rw-r--r--arch/x86/xen/mmu_pv.c6
-rw-r--r--arch/x86/xen/suspend_pv.c5
-rw-r--r--arch/x86/xen/time.c18
-rw-r--r--arch/x86/xen/xen-ops.h6
188 files changed, 6700 insertions, 2690 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 887d3a7bb646..0b767d6577b7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -180,13 +180,14 @@ config X86
select HAVE_PERF_USER_STACK_DUMP
select HAVE_RCU_TABLE_FREE
select HAVE_REGS_AND_STACK_ACCESS_API
- select HAVE_RELIABLE_STACKTRACE if X86_64 && UNWINDER_FRAME_POINTER && STACK_VALIDATION
+ select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR
select HAVE_STACK_VALIDATION if X86_64
select HAVE_RSEQ
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_USER_RETURN_NOTIFIER
+ select HOTPLUG_SMT if SMP
select IRQ_FORCED_THREADING
select NEED_SG_DMA_LENGTH
select PCI_LOCKLESS_CONFIG
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index a08e82856563..7e3c07d6ad42 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -80,11 +80,6 @@ ifeq ($(CONFIG_X86_32),y)
# alignment instructions.
KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align4))
- # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use
- # a lot more stack due to the lack of sharing of stacklots:
- KBUILD_CFLAGS += $(call cc-ifversion, -lt, 0400, \
- $(call cc-option,-fno-unit-at-a-time))
-
# CPU-specific tuning. Anything which can be shared with UML should go here.
include arch/x86/Makefile_32.cpu
KBUILD_CFLAGS += $(cflags-y)
diff --git a/arch/x86/boot/bitops.h b/arch/x86/boot/bitops.h
index 0d41d68131cc..2e1382486e91 100644
--- a/arch/x86/boot/bitops.h
+++ b/arch/x86/boot/bitops.h
@@ -17,6 +17,7 @@
#define _LINUX_BITOPS_H /* Inhibit inclusion of <linux/bitops.h> */
#include <linux/types.h>
+#include <asm/asm.h>
static inline bool constant_test_bit(int nr, const void *addr)
{
@@ -28,7 +29,7 @@ static inline bool variable_test_bit(int nr, const void *addr)
bool v;
const u32 *p = (const u32 *)addr;
- asm("btl %2,%1; setc %0" : "=qm" (v) : "m" (*p), "Ir" (nr));
+ asm("btl %2,%1" CC_SET(c) : CC_OUT(c) (v) : "m" (*p), "Ir" (nr));
return v;
}
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index fa42f895fdde..169c2feda14a 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -106,9 +106,13 @@ define cmd_check_data_rel
done
endef
+# We need to run two commands under "if_changed", so merge them into a
+# single invocation.
+quiet_cmd_check-and-link-vmlinux = LD $@
+ cmd_check-and-link-vmlinux = $(cmd_check_data_rel); $(cmd_ld)
+
$(obj)/vmlinux: $(vmlinux-objs-y) FORCE
- $(call if_changed,check_data_rel)
- $(call if_changed,ld)
+ $(call if_changed,check-and-link-vmlinux)
OBJCOPYFLAGS_vmlinux.bin := -R .comment -S
$(obj)/vmlinux.bin: vmlinux FORCE
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index e98522ea6f09..1458b1700fc7 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -34,74 +34,13 @@ static void setup_boot_services##bits(struct efi_config *c) \
\
table = (typeof(table))sys_table; \
\
- c->runtime_services = table->runtime; \
- c->boot_services = table->boottime; \
- c->text_output = table->con_out; \
+ c->runtime_services = table->runtime; \
+ c->boot_services = table->boottime; \
+ c->text_output = table->con_out; \
}
BOOT_SERVICES(32);
BOOT_SERVICES(64);
-static inline efi_status_t __open_volume32(void *__image, void **__fh)
-{
- efi_file_io_interface_t *io;
- efi_loaded_image_32_t *image = __image;
- efi_file_handle_32_t *fh;
- efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID;
- efi_status_t status;
- void *handle = (void *)(unsigned long)image->device_handle;
- unsigned long func;
-
- status = efi_call_early(handle_protocol, handle,
- &fs_proto, (void **)&io);
- if (status != EFI_SUCCESS) {
- efi_printk(sys_table, "Failed to handle fs_proto\n");
- return status;
- }
-
- func = (unsigned long)io->open_volume;
- status = efi_early->call(func, io, &fh);
- if (status != EFI_SUCCESS)
- efi_printk(sys_table, "Failed to open volume\n");
-
- *__fh = fh;
- return status;
-}
-
-static inline efi_status_t __open_volume64(void *__image, void **__fh)
-{
- efi_file_io_interface_t *io;
- efi_loaded_image_64_t *image = __image;
- efi_file_handle_64_t *fh;
- efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID;
- efi_status_t status;
- void *handle = (void *)(unsigned long)image->device_handle;
- unsigned long func;
-
- status = efi_call_early(handle_protocol, handle,
- &fs_proto, (void **)&io);
- if (status != EFI_SUCCESS) {
- efi_printk(sys_table, "Failed to handle fs_proto\n");
- return status;
- }
-
- func = (unsigned long)io->open_volume;
- status = efi_early->call(func, io, &fh);
- if (status != EFI_SUCCESS)
- efi_printk(sys_table, "Failed to open volume\n");
-
- *__fh = fh;
- return status;
-}
-
-efi_status_t
-efi_open_volume(efi_system_table_t *sys_table, void *__image, void **__fh)
-{
- if (efi_early->is64)
- return __open_volume64(__image, __fh);
-
- return __open_volume32(__image, __fh);
-}
-
void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str)
{
efi_call_proto(efi_simple_text_output_protocol, output_string,
@@ -109,7 +48,7 @@ void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str)
}
static efi_status_t
-__setup_efi_pci(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom)
+preserve_pci_rom_image(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom)
{
struct pci_setup_rom *rom = NULL;
efi_status_t status;
@@ -134,16 +73,16 @@ __setup_efi_pci(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom)
status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom);
if (status != EFI_SUCCESS) {
- efi_printk(sys_table, "Failed to alloc mem for rom\n");
+ efi_printk(sys_table, "Failed to allocate memory for 'rom'\n");
return status;
}
memset(rom, 0, sizeof(*rom));
- rom->data.type = SETUP_PCI;
- rom->data.len = size - sizeof(struct setup_data);
- rom->data.next = 0;
- rom->pcilen = pci->romsize;
+ rom->data.type = SETUP_PCI;
+ rom->data.len = size - sizeof(struct setup_data);
+ rom->data.next = 0;
+ rom->pcilen = pci->romsize;
*__rom = rom;
status = efi_call_proto(efi_pci_io_protocol, pci.read, pci,
@@ -179,96 +118,6 @@ free_struct:
return status;
}
-static void
-setup_efi_pci32(struct boot_params *params, void **pci_handle,
- unsigned long size)
-{
- efi_pci_io_protocol_t *pci = NULL;
- efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID;
- u32 *handles = (u32 *)(unsigned long)pci_handle;
- efi_status_t status;
- unsigned long nr_pci;
- struct setup_data *data;
- int i;
-
- data = (struct setup_data *)(unsigned long)params->hdr.setup_data;
-
- while (data && data->next)
- data = (struct setup_data *)(unsigned long)data->next;
-
- nr_pci = size / sizeof(u32);
- for (i = 0; i < nr_pci; i++) {
- struct pci_setup_rom *rom = NULL;
- u32 h = handles[i];
-
- status = efi_call_early(handle_protocol, h,
- &pci_proto, (void **)&pci);
-
- if (status != EFI_SUCCESS)
- continue;
-
- if (!pci)
- continue;
-
- status = __setup_efi_pci(pci, &rom);
- if (status != EFI_SUCCESS)
- continue;
-
- if (data)
- data->next = (unsigned long)rom;
- else
- params->hdr.setup_data = (unsigned long)rom;
-
- data = (struct setup_data *)rom;
-
- }
-}
-
-static void
-setup_efi_pci64(struct boot_params *params, void **pci_handle,
- unsigned long size)
-{
- efi_pci_io_protocol_t *pci = NULL;
- efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID;
- u64 *handles = (u64 *)(unsigned long)pci_handle;
- efi_status_t status;
- unsigned long nr_pci;
- struct setup_data *data;
- int i;
-
- data = (struct setup_data *)(unsigned long)params->hdr.setup_data;
-
- while (data && data->next)
- data = (struct setup_data *)(unsigned long)data->next;
-
- nr_pci = size / sizeof(u64);
- for (i = 0; i < nr_pci; i++) {
- struct pci_setup_rom *rom = NULL;
- u64 h = handles[i];
-
- status = efi_call_early(handle_protocol, h,
- &pci_proto, (void **)&pci);
-
- if (status != EFI_SUCCESS)
- continue;
-
- if (!pci)
- continue;
-
- status = __setup_efi_pci(pci, &rom);
- if (status != EFI_SUCCESS)
- continue;
-
- if (data)
- data->next = (unsigned long)rom;
- else
- params->hdr.setup_data = (unsigned long)rom;
-
- data = (struct setup_data *)rom;
-
- }
-}
-
/*
* There's no way to return an informative status from this function,
* because any analysis (and printing of error messages) needs to be
@@ -284,6 +133,9 @@ static void setup_efi_pci(struct boot_params *params)
void **pci_handle = NULL;
efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID;
unsigned long size = 0;
+ unsigned long nr_pci;
+ struct setup_data *data;
+ int i;
status = efi_call_early(locate_handle,
EFI_LOCATE_BY_PROTOCOL,
@@ -295,7 +147,7 @@ static void setup_efi_pci(struct boot_params *params)
size, (void **)&pci_handle);
if (status != EFI_SUCCESS) {
- efi_printk(sys_table, "Failed to alloc mem for pci_handle\n");
+ efi_printk(sys_table, "Failed to allocate memory for 'pci_handle'\n");
return;
}
@@ -307,10 +159,34 @@ static void setup_efi_pci(struct boot_params *params)
if (status != EFI_SUCCESS)
goto free_handle;
- if (efi_early->is64)
- setup_efi_pci64(params, pci_handle, size);
- else
- setup_efi_pci32(params, pci_handle, size);
+ data = (struct setup_data *)(unsigned long)params->hdr.setup_data;
+
+ while (data && data->next)
+ data = (struct setup_data *)(unsigned long)data->next;
+
+ nr_pci = size / (efi_is_64bit() ? sizeof(u64) : sizeof(u32));
+ for (i = 0; i < nr_pci; i++) {
+ efi_pci_io_protocol_t *pci = NULL;
+ struct pci_setup_rom *rom;
+
+ status = efi_call_early(handle_protocol,
+ efi_is_64bit() ? ((u64 *)pci_handle)[i]
+ : ((u32 *)pci_handle)[i],
+ &pci_proto, (void **)&pci);
+ if (status != EFI_SUCCESS || !pci)
+ continue;
+
+ status = preserve_pci_rom_image(pci, &rom);
+ if (status != EFI_SUCCESS)
+ continue;
+
+ if (data)
+ data->next = (unsigned long)rom;
+ else
+ params->hdr.setup_data = (unsigned long)rom;
+
+ data = (struct setup_data *)rom;
+ }
free_handle:
efi_call_early(free_pool, pci_handle);
@@ -341,8 +217,7 @@ static void retrieve_apple_device_properties(struct boot_params *boot_params)
status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
size + sizeof(struct setup_data), &new);
if (status != EFI_SUCCESS) {
- efi_printk(sys_table,
- "Failed to alloc mem for properties\n");
+ efi_printk(sys_table, "Failed to allocate memory for 'properties'\n");
return;
}
@@ -358,9 +233,9 @@ static void retrieve_apple_device_properties(struct boot_params *boot_params)
new->next = 0;
data = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data;
- if (!data)
+ if (!data) {
boot_params->hdr.setup_data = (unsigned long)new;
- else {
+ } else {
while (data->next)
data = (struct setup_data *)(unsigned long)data->next;
data->next = (unsigned long)new;
@@ -380,81 +255,55 @@ static void setup_quirks(struct boot_params *boot_params)
}
}
+/*
+ * See if we have Universal Graphics Adapter (UGA) protocol
+ */
static efi_status_t
-setup_uga32(void **uga_handle, unsigned long size, u32 *width, u32 *height)
+setup_uga(struct screen_info *si, efi_guid_t *uga_proto, unsigned long size)
{
- struct efi_uga_draw_protocol *uga = NULL, *first_uga;
- efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID;
+ efi_status_t status;
+ u32 width, height;
+ void **uga_handle = NULL;
+ efi_uga_draw_protocol_t *uga = NULL, *first_uga;
unsigned long nr_ugas;
- u32 *handles = (u32 *)uga_handle;
- efi_status_t status = EFI_INVALID_PARAMETER;
int i;
- first_uga = NULL;
- nr_ugas = size / sizeof(u32);
- for (i = 0; i < nr_ugas; i++) {
- efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID;
- u32 w, h, depth, refresh;
- void *pciio;
- u32 handle = handles[i];
-
- status = efi_call_early(handle_protocol, handle,
- &uga_proto, (void **)&uga);
- if (status != EFI_SUCCESS)
- continue;
-
- efi_call_early(handle_protocol, handle, &pciio_proto, &pciio);
-
- status = efi_early->call((unsigned long)uga->get_mode, uga,
- &w, &h, &depth, &refresh);
- if (status == EFI_SUCCESS && (!first_uga || pciio)) {
- *width = w;
- *height = h;
-
- /*
- * Once we've found a UGA supporting PCIIO,
- * don't bother looking any further.
- */
- if (pciio)
- break;
-
- first_uga = uga;
- }
- }
+ status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
+ size, (void **)&uga_handle);
+ if (status != EFI_SUCCESS)
+ return status;
- return status;
-}
+ status = efi_call_early(locate_handle,
+ EFI_LOCATE_BY_PROTOCOL,
+ uga_proto, NULL, &size, uga_handle);
+ if (status != EFI_SUCCESS)
+ goto free_handle;
-static efi_status_t
-setup_uga64(void **uga_handle, unsigned long size, u32 *width, u32 *height)
-{
- struct efi_uga_draw_protocol *uga = NULL, *first_uga;
- efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID;
- unsigned long nr_ugas;
- u64 *handles = (u64 *)uga_handle;
- efi_status_t status = EFI_INVALID_PARAMETER;
- int i;
+ height = 0;
+ width = 0;
first_uga = NULL;
- nr_ugas = size / sizeof(u64);
+ nr_ugas = size / (efi_is_64bit() ? sizeof(u64) : sizeof(u32));
for (i = 0; i < nr_ugas; i++) {
efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID;
u32 w, h, depth, refresh;
void *pciio;
- u64 handle = handles[i];
+ unsigned long handle = efi_is_64bit() ? ((u64 *)uga_handle)[i]
+ : ((u32 *)uga_handle)[i];
status = efi_call_early(handle_protocol, handle,
- &uga_proto, (void **)&uga);
+ uga_proto, (void **)&uga);
if (status != EFI_SUCCESS)
continue;
+ pciio = NULL;
efi_call_early(handle_protocol, handle, &pciio_proto, &pciio);
- status = efi_early->call((unsigned long)uga->get_mode, uga,
- &w, &h, &depth, &refresh);
+ status = efi_call_proto(efi_uga_draw_protocol, get_mode, uga,
+ &w, &h, &depth, &refresh);
if (status == EFI_SUCCESS && (!first_uga || pciio)) {
- *width = w;
- *height = h;
+ width = w;
+ height = h;
/*
* Once we've found a UGA supporting PCIIO,
@@ -467,59 +316,28 @@ setup_uga64(void **uga_handle, unsigned long size, u32 *width, u32 *height)
}
}
- return status;
-}
-
-/*
- * See if we have Universal Graphics Adapter (UGA) protocol
- */
-static efi_status_t setup_uga(struct screen_info *si, efi_guid_t *uga_proto,
- unsigned long size)
-{
- efi_status_t status;
- u32 width, height;
- void **uga_handle = NULL;
-
- status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
- size, (void **)&uga_handle);
- if (status != EFI_SUCCESS)
- return status;
-
- status = efi_call_early(locate_handle,
- EFI_LOCATE_BY_PROTOCOL,
- uga_proto, NULL, &size, uga_handle);
- if (status != EFI_SUCCESS)
- goto free_handle;
-
- height = 0;
- width = 0;
-
- if (efi_early->is64)
- status = setup_uga64(uga_handle, size, &width, &height);
- else
- status = setup_uga32(uga_handle, size, &width, &height);
-
if (!width && !height)
goto free_handle;
/* EFI framebuffer */
- si->orig_video_isVGA = VIDEO_TYPE_EFI;
+ si->orig_video_isVGA = VIDEO_TYPE_EFI;
- si->lfb_depth = 32;
- si->lfb_width = width;
- si->lfb_height = height;
+ si->lfb_depth = 32;
+ si->lfb_width = width;
+ si->lfb_height = height;
- si->red_size = 8;
- si->red_pos = 16;
- si->green_size = 8;
- si->green_pos = 8;
- si->blue_size = 8;
- si->blue_pos = 0;
- si->rsvd_size = 8;
- si->rsvd_pos = 24;
+ si->red_size = 8;
+ si->red_pos = 16;
+ si->green_size = 8;
+ si->green_pos = 8;
+ si->blue_size = 8;
+ si->blue_pos = 0;
+ si->rsvd_size = 8;
+ si->rsvd_pos = 24;
free_handle:
efi_call_early(free_pool, uga_handle);
+
return status;
}
@@ -586,7 +404,7 @@ struct boot_params *make_boot_params(struct efi_config *c)
if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
return NULL;
- if (efi_early->is64)
+ if (efi_is_64bit())
setup_boot_services64(efi_early);
else
setup_boot_services32(efi_early);
@@ -601,7 +419,7 @@ struct boot_params *make_boot_params(struct efi_config *c)
status = efi_low_alloc(sys_table, 0x4000, 1,
(unsigned long *)&boot_params);
if (status != EFI_SUCCESS) {
- efi_printk(sys_table, "Failed to alloc lowmem for boot params\n");
+ efi_printk(sys_table, "Failed to allocate lowmem for boot params\n");
return NULL;
}
@@ -617,9 +435,9 @@ struct boot_params *make_boot_params(struct efi_config *c)
* Fill out some of the header fields ourselves because the
* EFI firmware loader doesn't load the first sector.
*/
- hdr->root_flags = 1;
- hdr->vid_mode = 0xffff;
- hdr->boot_flag = 0xAA55;
+ hdr->root_flags = 1;
+ hdr->vid_mode = 0xffff;
+ hdr->boot_flag = 0xAA55;
hdr->type_of_loader = 0x21;
@@ -627,6 +445,7 @@ struct boot_params *make_boot_params(struct efi_config *c)
cmdline_ptr = efi_convert_cmdline(sys_table, image, &options_size);
if (!cmdline_ptr)
goto fail;
+
hdr->cmd_line_ptr = (unsigned long)cmdline_ptr;
/* Fill in upper bits of command line address, NOP on 32 bit */
boot_params->ext_cmd_line_ptr = (u64)(unsigned long)cmdline_ptr >> 32;
@@ -663,10 +482,12 @@ struct boot_params *make_boot_params(struct efi_config *c)
boot_params->ext_ramdisk_size = (u64)ramdisk_size >> 32;
return boot_params;
+
fail2:
efi_free(sys_table, options_size, hdr->cmd_line_ptr);
fail:
efi_free(sys_table, 0x4000, (unsigned long)boot_params);
+
return NULL;
}
@@ -678,7 +499,7 @@ static void add_e820ext(struct boot_params *params,
unsigned long size;
e820ext->type = SETUP_E820_EXT;
- e820ext->len = nr_entries * sizeof(struct boot_e820_entry);
+ e820ext->len = nr_entries * sizeof(struct boot_e820_entry);
e820ext->next = 0;
data = (struct setup_data *)(unsigned long)params->hdr.setup_data;
@@ -692,8 +513,8 @@ static void add_e820ext(struct boot_params *params,
params->hdr.setup_data = (unsigned long)e820ext;
}
-static efi_status_t setup_e820(struct boot_params *params,
- struct setup_data *e820ext, u32 e820ext_size)
+static efi_status_t
+setup_e820(struct boot_params *params, struct setup_data *e820ext, u32 e820ext_size)
{
struct boot_e820_entry *entry = params->e820_table;
struct efi_info *efi = &params->efi_info;
@@ -814,11 +635,10 @@ static efi_status_t alloc_e820ext(u32 nr_desc, struct setup_data **e820ext,
}
struct exit_boot_struct {
- struct boot_params *boot_params;
- struct efi_info *efi;
- struct setup_data *e820ext;
- __u32 e820ext_size;
- bool is64;
+ struct boot_params *boot_params;
+ struct efi_info *efi;
+ struct setup_data *e820ext;
+ __u32 e820ext_size;
};
static efi_status_t exit_boot_func(efi_system_table_t *sys_table_arg,
@@ -845,25 +665,25 @@ static efi_status_t exit_boot_func(efi_system_table_t *sys_table_arg,
first = false;
}
- signature = p->is64 ? EFI64_LOADER_SIGNATURE : EFI32_LOADER_SIGNATURE;
+ signature = efi_is_64bit() ? EFI64_LOADER_SIGNATURE
+ : EFI32_LOADER_SIGNATURE;
memcpy(&p->efi->efi_loader_signature, signature, sizeof(__u32));
- p->efi->efi_systab = (unsigned long)sys_table_arg;
- p->efi->efi_memdesc_size = *map->desc_size;
- p->efi->efi_memdesc_version = *map->desc_ver;
- p->efi->efi_memmap = (unsigned long)*map->map;
- p->efi->efi_memmap_size = *map->map_size;
+ p->efi->efi_systab = (unsigned long)sys_table_arg;
+ p->efi->efi_memdesc_size = *map->desc_size;
+ p->efi->efi_memdesc_version = *map->desc_ver;
+ p->efi->efi_memmap = (unsigned long)*map->map;
+ p->efi->efi_memmap_size = *map->map_size;
#ifdef CONFIG_X86_64
- p->efi->efi_systab_hi = (unsigned long)sys_table_arg >> 32;
- p->efi->efi_memmap_hi = (unsigned long)*map->map >> 32;
+ p->efi->efi_systab_hi = (unsigned long)sys_table_arg >> 32;
+ p->efi->efi_memmap_hi = (unsigned long)*map->map >> 32;
#endif
return EFI_SUCCESS;
}
-static efi_status_t exit_boot(struct boot_params *boot_params,
- void *handle, bool is64)
+static efi_status_t exit_boot(struct boot_params *boot_params, void *handle)
{
unsigned long map_sz, key, desc_size, buff_size;
efi_memory_desc_t *mem_map;
@@ -874,17 +694,16 @@ static efi_status_t exit_boot(struct boot_params *boot_params,
struct efi_boot_memmap map;
struct exit_boot_struct priv;
- map.map = &mem_map;
- map.map_size = &map_sz;
- map.desc_size = &desc_size;
- map.desc_ver = &desc_version;
- map.key_ptr = &key;
- map.buff_size = &buff_size;
- priv.boot_params = boot_params;
- priv.efi = &boot_params->efi_info;
- priv.e820ext = NULL;
- priv.e820ext_size = 0;
- priv.is64 = is64;
+ map.map = &mem_map;
+ map.map_size = &map_sz;
+ map.desc_size = &desc_size;
+ map.desc_ver = &desc_version;
+ map.key_ptr = &key;
+ map.buff_size = &buff_size;
+ priv.boot_params = boot_params;
+ priv.efi = &boot_params->efi_info;
+ priv.e820ext = NULL;
+ priv.e820ext_size = 0;
/* Might as well exit boot services now */
status = efi_exit_boot_services(sys_table, handle, &map, &priv,
@@ -892,10 +711,11 @@ static efi_status_t exit_boot(struct boot_params *boot_params,
if (status != EFI_SUCCESS)
return status;
- e820ext = priv.e820ext;
- e820ext_size = priv.e820ext_size;
+ e820ext = priv.e820ext;
+ e820ext_size = priv.e820ext_size;
+
/* Historic? */
- boot_params->alt_mem_k = 32 * 1024;
+ boot_params->alt_mem_k = 32 * 1024;
status = setup_e820(boot_params, e820ext, e820ext_size);
if (status != EFI_SUCCESS)
@@ -908,8 +728,8 @@ static efi_status_t exit_boot(struct boot_params *boot_params,
* On success we return a pointer to a boot_params structure, and NULL
* on failure.
*/
-struct boot_params *efi_main(struct efi_config *c,
- struct boot_params *boot_params)
+struct boot_params *
+efi_main(struct efi_config *c, struct boot_params *boot_params)
{
struct desc_ptr *gdt = NULL;
efi_loaded_image_t *image;
@@ -918,13 +738,11 @@ struct boot_params *efi_main(struct efi_config *c,
struct desc_struct *desc;
void *handle;
efi_system_table_t *_table;
- bool is64;
efi_early = c;
_table = (efi_system_table_t *)(unsigned long)efi_early->table;
handle = (void *)(unsigned long)efi_early->image_handle;
- is64 = efi_early->is64;
sys_table = _table;
@@ -932,7 +750,7 @@ struct boot_params *efi_main(struct efi_config *c,
if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
goto fail;
- if (is64)
+ if (efi_is_64bit())
setup_boot_services64(efi_early);
else
setup_boot_services32(efi_early);
@@ -957,7 +775,7 @@ struct boot_params *efi_main(struct efi_config *c,
status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
sizeof(*gdt), (void **)&gdt);
if (status != EFI_SUCCESS) {
- efi_printk(sys_table, "Failed to alloc mem for gdt structure\n");
+ efi_printk(sys_table, "Failed to allocate memory for 'gdt' structure\n");
goto fail;
}
@@ -965,7 +783,7 @@ struct boot_params *efi_main(struct efi_config *c,
status = efi_low_alloc(sys_table, gdt->size, 8,
(unsigned long *)&gdt->address);
if (status != EFI_SUCCESS) {
- efi_printk(sys_table, "Failed to alloc mem for gdt\n");
+ efi_printk(sys_table, "Failed to allocate memory for 'gdt'\n");
goto fail;
}
@@ -988,7 +806,7 @@ struct boot_params *efi_main(struct efi_config *c,
hdr->code32_start = bzimage_addr;
}
- status = exit_boot(boot_params, handle, is64);
+ status = exit_boot(boot_params, handle);
if (status != EFI_SUCCESS) {
efi_printk(sys_table, "exit_boot() failed!\n");
goto fail;
@@ -1002,19 +820,20 @@ struct boot_params *efi_main(struct efi_config *c,
if (IS_ENABLED(CONFIG_X86_64)) {
/* __KERNEL32_CS */
- desc->limit0 = 0xffff;
- desc->base0 = 0x0000;
- desc->base1 = 0x0000;
- desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ;
- desc->s = DESC_TYPE_CODE_DATA;
- desc->dpl = 0;
- desc->p = 1;
- desc->limit1 = 0xf;
- desc->avl = 0;
- desc->l = 0;
- desc->d = SEG_OP_SIZE_32BIT;
- desc->g = SEG_GRANULARITY_4KB;
- desc->base2 = 0x00;
+ desc->limit0 = 0xffff;
+ desc->base0 = 0x0000;
+ desc->base1 = 0x0000;
+ desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ;
+ desc->s = DESC_TYPE_CODE_DATA;
+ desc->dpl = 0;
+ desc->p = 1;
+ desc->limit1 = 0xf;
+ desc->avl = 0;
+ desc->l = 0;
+ desc->d = SEG_OP_SIZE_32BIT;
+ desc->g = SEG_GRANULARITY_4KB;
+ desc->base2 = 0x00;
+
desc++;
} else {
/* Second entry is unused on 32-bit */
@@ -1022,15 +841,16 @@ struct boot_params *efi_main(struct efi_config *c,
}
/* __KERNEL_CS */
- desc->limit0 = 0xffff;
- desc->base0 = 0x0000;
- desc->base1 = 0x0000;
- desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ;
- desc->s = DESC_TYPE_CODE_DATA;
- desc->dpl = 0;
- desc->p = 1;
- desc->limit1 = 0xf;
- desc->avl = 0;
+ desc->limit0 = 0xffff;
+ desc->base0 = 0x0000;
+ desc->base1 = 0x0000;
+ desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ;
+ desc->s = DESC_TYPE_CODE_DATA;
+ desc->dpl = 0;
+ desc->p = 1;
+ desc->limit1 = 0xf;
+ desc->avl = 0;
+
if (IS_ENABLED(CONFIG_X86_64)) {
desc->l = 1;
desc->d = 0;
@@ -1038,41 +858,41 @@ struct boot_params *efi_main(struct efi_config *c,
desc->l = 0;
desc->d = SEG_OP_SIZE_32BIT;
}
- desc->g = SEG_GRANULARITY_4KB;
- desc->base2 = 0x00;
+ desc->g = SEG_GRANULARITY_4KB;
+ desc->base2 = 0x00;
desc++;
/* __KERNEL_DS */
- desc->limit0 = 0xffff;
- desc->base0 = 0x0000;
- desc->base1 = 0x0000;
- desc->type = SEG_TYPE_DATA | SEG_TYPE_READ_WRITE;
- desc->s = DESC_TYPE_CODE_DATA;
- desc->dpl = 0;
- desc->p = 1;
- desc->limit1 = 0xf;
- desc->avl = 0;
- desc->l = 0;
- desc->d = SEG_OP_SIZE_32BIT;
- desc->g = SEG_GRANULARITY_4KB;
- desc->base2 = 0x00;
+ desc->limit0 = 0xffff;
+ desc->base0 = 0x0000;
+ desc->base1 = 0x0000;
+ desc->type = SEG_TYPE_DATA | SEG_TYPE_READ_WRITE;
+ desc->s = DESC_TYPE_CODE_DATA;
+ desc->dpl = 0;
+ desc->p = 1;
+ desc->limit1 = 0xf;
+ desc->avl = 0;
+ desc->l = 0;
+ desc->d = SEG_OP_SIZE_32BIT;
+ desc->g = SEG_GRANULARITY_4KB;
+ desc->base2 = 0x00;
desc++;
if (IS_ENABLED(CONFIG_X86_64)) {
/* Task segment value */
- desc->limit0 = 0x0000;
- desc->base0 = 0x0000;
- desc->base1 = 0x0000;
- desc->type = SEG_TYPE_TSS;
- desc->s = 0;
- desc->dpl = 0;
- desc->p = 1;
- desc->limit1 = 0x0;
- desc->avl = 0;
- desc->l = 0;
- desc->d = 0;
- desc->g = SEG_GRANULARITY_4KB;
- desc->base2 = 0x00;
+ desc->limit0 = 0x0000;
+ desc->base0 = 0x0000;
+ desc->base1 = 0x0000;
+ desc->type = SEG_TYPE_TSS;
+ desc->s = 0;
+ desc->dpl = 0;
+ desc->p = 1;
+ desc->limit1 = 0x0;
+ desc->avl = 0;
+ desc->l = 0;
+ desc->d = 0;
+ desc->g = SEG_GRANULARITY_4KB;
+ desc->base2 = 0x00;
desc++;
}
@@ -1082,5 +902,6 @@ struct boot_params *efi_main(struct efi_config *c,
return boot_params;
fail:
efi_printk(sys_table, "efi_main() failed!\n");
+
return NULL;
}
diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h
index e799dc5c6448..8297387c4676 100644
--- a/arch/x86/boot/compressed/eboot.h
+++ b/arch/x86/boot/compressed/eboot.h
@@ -12,22 +12,22 @@
#define DESC_TYPE_CODE_DATA (1 << 0)
-struct efi_uga_draw_protocol_32 {
+typedef struct {
u32 get_mode;
u32 set_mode;
u32 blt;
-};
+} efi_uga_draw_protocol_32_t;
-struct efi_uga_draw_protocol_64 {
+typedef struct {
u64 get_mode;
u64 set_mode;
u64 blt;
-};
+} efi_uga_draw_protocol_64_t;
-struct efi_uga_draw_protocol {
+typedef struct {
void *get_mode;
void *set_mode;
void *blt;
-};
+} efi_uga_draw_protocol_t;
#endif /* BOOT_COMPRESSED_EBOOT_H */
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index b87a7582853d..302517929932 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -102,7 +102,7 @@ static bool memmap_too_large;
/* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */
-unsigned long long mem_limit = ULLONG_MAX;
+static unsigned long long mem_limit = ULLONG_MAX;
enum mem_avoid_index {
@@ -215,7 +215,36 @@ static void mem_avoid_memmap(char *str)
memmap_too_large = true;
}
-static int handle_mem_memmap(void)
+/* Store the number of 1GB huge pages which users specified: */
+static unsigned long max_gb_huge_pages;
+
+static void parse_gb_huge_pages(char *param, char *val)
+{
+ static bool gbpage_sz;
+ char *p;
+
+ if (!strcmp(param, "hugepagesz")) {
+ p = val;
+ if (memparse(p, &p) != PUD_SIZE) {
+ gbpage_sz = false;
+ return;
+ }
+
+ if (gbpage_sz)
+ warn("Repeatedly set hugeTLB page size of 1G!\n");
+ gbpage_sz = true;
+ return;
+ }
+
+ if (!strcmp(param, "hugepages") && gbpage_sz) {
+ p = val;
+ max_gb_huge_pages = simple_strtoull(p, &p, 0);
+ return;
+ }
+}
+
+
+static int handle_mem_options(void)
{
char *args = (char *)get_cmd_line_ptr();
size_t len = strlen((char *)args);
@@ -223,7 +252,8 @@ static int handle_mem_memmap(void)
char *param, *val;
u64 mem_size;
- if (!strstr(args, "memmap=") && !strstr(args, "mem="))
+ if (!strstr(args, "memmap=") && !strstr(args, "mem=") &&
+ !strstr(args, "hugepages"))
return 0;
tmp_cmdline = malloc(len + 1);
@@ -248,6 +278,8 @@ static int handle_mem_memmap(void)
if (!strcmp(param, "memmap")) {
mem_avoid_memmap(val);
+ } else if (strstr(param, "hugepages")) {
+ parse_gb_huge_pages(param, val);
} else if (!strcmp(param, "mem")) {
char *p = val;
@@ -387,7 +419,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
/* We don't need to set a mapping for setup_data. */
/* Mark the memmap regions we need to avoid */
- handle_mem_memmap();
+ handle_mem_options();
#ifdef CONFIG_X86_VERBOSE_BOOTUP
/* Make sure video RAM can be used. */
@@ -466,6 +498,60 @@ static void store_slot_info(struct mem_vector *region, unsigned long image_size)
}
}
+/*
+ * Skip as many 1GB huge pages as possible in the passed region
+ * according to the number which users specified:
+ */
+static void
+process_gb_huge_pages(struct mem_vector *region, unsigned long image_size)
+{
+ unsigned long addr, size = 0;
+ struct mem_vector tmp;
+ int i = 0;
+
+ if (!max_gb_huge_pages) {
+ store_slot_info(region, image_size);
+ return;
+ }
+
+ addr = ALIGN(region->start, PUD_SIZE);
+ /* Did we raise the address above the passed in memory entry? */
+ if (addr < region->start + region->size)
+ size = region->size - (addr - region->start);
+
+ /* Check how many 1GB huge pages can be filtered out: */
+ while (size > PUD_SIZE && max_gb_huge_pages) {
+ size -= PUD_SIZE;
+ max_gb_huge_pages--;
+ i++;
+ }
+
+ /* No good 1GB huge pages found: */
+ if (!i) {
+ store_slot_info(region, image_size);
+ return;
+ }
+
+ /*
+ * Skip those 'i'*1GB good huge pages, and continue checking and
+ * processing the remaining head or tail part of the passed region
+ * if available.
+ */
+
+ if (addr >= region->start + image_size) {
+ tmp.start = region->start;
+ tmp.size = addr - region->start;
+ store_slot_info(&tmp, image_size);
+ }
+
+ size = region->size - (addr - region->start) - i * PUD_SIZE;
+ if (size >= image_size) {
+ tmp.start = addr + i * PUD_SIZE;
+ tmp.size = size;
+ store_slot_info(&tmp, image_size);
+ }
+}
+
static unsigned long slots_fetch_random(void)
{
unsigned long slot;
@@ -546,7 +632,7 @@ static void process_mem_region(struct mem_vector *entry,
/* If nothing overlaps, store the region and return. */
if (!mem_avoid_overlap(&region, &overlap)) {
- store_slot_info(&region, image_size);
+ process_gb_huge_pages(&region, image_size);
return;
}
@@ -556,7 +642,7 @@ static void process_mem_region(struct mem_vector *entry,
beginning.start = region.start;
beginning.size = overlap.start - region.start;
- store_slot_info(&beginning, image_size);
+ process_gb_huge_pages(&beginning, image_size);
}
/* Return if overlap extends to or past end of region. */
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index 8c5107545251..9e2157371491 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -1,3 +1,4 @@
+#include <asm/e820/types.h>
#include <asm/processor.h>
#include "pgtable.h"
#include "../string.h"
@@ -34,10 +35,62 @@ unsigned long *trampoline_32bit __section(.data);
extern struct boot_params *boot_params;
int cmdline_find_option_bool(const char *option);
+static unsigned long find_trampoline_placement(void)
+{
+ unsigned long bios_start, ebda_start;
+ unsigned long trampoline_start;
+ struct boot_e820_entry *entry;
+ int i;
+
+ /*
+ * Find a suitable spot for the trampoline.
+ * This code is based on reserve_bios_regions().
+ */
+
+ ebda_start = *(unsigned short *)0x40e << 4;
+ bios_start = *(unsigned short *)0x413 << 10;
+
+ if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX)
+ bios_start = BIOS_START_MAX;
+
+ if (ebda_start > BIOS_START_MIN && ebda_start < bios_start)
+ bios_start = ebda_start;
+
+ bios_start = round_down(bios_start, PAGE_SIZE);
+
+ /* Find the first usable memory region under bios_start. */
+ for (i = boot_params->e820_entries - 1; i >= 0; i--) {
+ entry = &boot_params->e820_table[i];
+
+ /* Skip all entries above bios_start. */
+ if (bios_start <= entry->addr)
+ continue;
+
+ /* Skip non-RAM entries. */
+ if (entry->type != E820_TYPE_RAM)
+ continue;
+
+ /* Adjust bios_start to the end of the entry if needed. */
+ if (bios_start > entry->addr + entry->size)
+ bios_start = entry->addr + entry->size;
+
+ /* Keep bios_start page-aligned. */
+ bios_start = round_down(bios_start, PAGE_SIZE);
+
+ /* Skip the entry if it's too small. */
+ if (bios_start - TRAMPOLINE_32BIT_SIZE < entry->addr)
+ continue;
+
+ break;
+ }
+
+ /* Place the trampoline just below the end of low memory */
+ return bios_start - TRAMPOLINE_32BIT_SIZE;
+}
+
struct paging_config paging_prepare(void *rmode)
{
struct paging_config paging_config = {};
- unsigned long bios_start, ebda_start;
/* Initialize boot_params. Required for cmdline_find_option_bool(). */
boot_params = rmode;
@@ -61,23 +114,7 @@ struct paging_config paging_prepare(void *rmode)
paging_config.l5_required = 1;
}
- /*
- * Find a suitable spot for the trampoline.
- * This code is based on reserve_bios_regions().
- */
-
- ebda_start = *(unsigned short *)0x40e << 4;
- bios_start = *(unsigned short *)0x413 << 10;
-
- if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX)
- bios_start = BIOS_START_MAX;
-
- if (ebda_start > BIOS_START_MIN && ebda_start < bios_start)
- bios_start = ebda_start;
-
- /* Place the trampoline just below the end of low memory, aligned to 4k */
- paging_config.trampoline_start = bios_start - TRAMPOLINE_32BIT_SIZE;
- paging_config.trampoline_start = round_down(paging_config.trampoline_start, PAGE_SIZE);
+ paging_config.trampoline_start = find_trampoline_placement();
trampoline_32bit = (unsigned long *)paging_config.trampoline_start;
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 16f49123d747..c4428a176973 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -13,6 +13,7 @@
*/
#include <linux/types.h>
+#include <asm/asm.h>
#include "ctype.h"
#include "string.h"
@@ -28,8 +29,8 @@
int memcmp(const void *s1, const void *s2, size_t len)
{
bool diff;
- asm("repe; cmpsb; setnz %0"
- : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
+ asm("repe; cmpsb" CC_SET(nz)
+ : CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len));
return diff;
}
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index 717bf0776421..5f7e43d4f64a 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -75,7 +75,7 @@
* %r9
*/
__load_partial:
- xor %r9, %r9
+ xor %r9d, %r9d
pxor MSG, MSG
mov LEN, %r8
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index 5de7c0d46edf..acd11b3bf639 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -375,16 +375,12 @@ static struct aead_alg crypto_aegis128_aesni_alg[] = {
}
};
-static const struct x86_cpu_id aesni_cpu_id[] = {
- X86_FEATURE_MATCH(X86_FEATURE_AES),
- X86_FEATURE_MATCH(X86_FEATURE_XMM2),
- {}
-};
-MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
-
static int __init crypto_aegis128_aesni_module_init(void)
{
- if (!x86_match_cpu(aesni_cpu_id))
+ if (!boot_cpu_has(X86_FEATURE_XMM2) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
+ !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
return -ENODEV;
return crypto_register_aeads(crypto_aegis128_aesni_alg,
diff --git a/arch/x86/crypto/aegis128l-aesni-asm.S b/arch/x86/crypto/aegis128l-aesni-asm.S
index 4eda2b8db9e1..491dd61c845c 100644
--- a/arch/x86/crypto/aegis128l-aesni-asm.S
+++ b/arch/x86/crypto/aegis128l-aesni-asm.S
@@ -66,7 +66,7 @@
* %r9
*/
__load_partial:
- xor %r9, %r9
+ xor %r9d, %r9d
pxor MSG0, MSG0
pxor MSG1, MSG1
diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c b/arch/x86/crypto/aegis128l-aesni-glue.c
index 876e4866e633..2071c3d1ae07 100644
--- a/arch/x86/crypto/aegis128l-aesni-glue.c
+++ b/arch/x86/crypto/aegis128l-aesni-glue.c
@@ -375,16 +375,12 @@ static struct aead_alg crypto_aegis128l_aesni_alg[] = {
}
};
-static const struct x86_cpu_id aesni_cpu_id[] = {
- X86_FEATURE_MATCH(X86_FEATURE_AES),
- X86_FEATURE_MATCH(X86_FEATURE_XMM2),
- {}
-};
-MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
-
static int __init crypto_aegis128l_aesni_module_init(void)
{
- if (!x86_match_cpu(aesni_cpu_id))
+ if (!boot_cpu_has(X86_FEATURE_XMM2) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
+ !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
return -ENODEV;
return crypto_register_aeads(crypto_aegis128l_aesni_alg,
diff --git a/arch/x86/crypto/aegis256-aesni-asm.S b/arch/x86/crypto/aegis256-aesni-asm.S
index 32aae8397268..8870c7c5d9a4 100644
--- a/arch/x86/crypto/aegis256-aesni-asm.S
+++ b/arch/x86/crypto/aegis256-aesni-asm.S
@@ -59,7 +59,7 @@
* %r9
*/
__load_partial:
- xor %r9, %r9
+ xor %r9d, %r9d
pxor MSG, MSG
mov LEN, %r8
diff --git a/arch/x86/crypto/aegis256-aesni-glue.c b/arch/x86/crypto/aegis256-aesni-glue.c
index 2b5dd3af8f4d..b5f2a8fd5a71 100644
--- a/arch/x86/crypto/aegis256-aesni-glue.c
+++ b/arch/x86/crypto/aegis256-aesni-glue.c
@@ -375,16 +375,12 @@ static struct aead_alg crypto_aegis256_aesni_alg[] = {
}
};
-static const struct x86_cpu_id aesni_cpu_id[] = {
- X86_FEATURE_MATCH(X86_FEATURE_AES),
- X86_FEATURE_MATCH(X86_FEATURE_XMM2),
- {}
-};
-MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
-
static int __init crypto_aegis256_aesni_module_init(void)
{
- if (!x86_match_cpu(aesni_cpu_id))
+ if (!boot_cpu_has(X86_FEATURE_XMM2) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
+ !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
return -ENODEV;
return crypto_register_aeads(crypto_aegis256_aesni_alg,
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index e762ef417562..9bd139569b41 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -258,7 +258,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
.macro GCM_INIT Iv SUBKEY AAD AADLEN
mov \AADLEN, %r11
mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
- xor %r11, %r11
+ xor %r11d, %r11d
mov %r11, InLen(%arg2) # ctx_data.in_length = 0
mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
@@ -286,7 +286,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
movdqu HashKey(%arg2), %xmm13
add %arg5, InLen(%arg2)
- xor %r11, %r11 # initialise the data pointer offset as zero
+ xor %r11d, %r11d # initialise the data pointer offset as zero
PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
sub %r11, %arg5 # sub partial block data used
@@ -702,7 +702,7 @@ _no_extra_mask_1_\@:
# GHASH computation for the last <16 Byte block
GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- xor %rax,%rax
+ xor %eax, %eax
mov %rax, PBlockLen(%arg2)
jmp _dec_done_\@
@@ -737,7 +737,7 @@ _no_extra_mask_2_\@:
# GHASH computation for the last <16 Byte block
GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- xor %rax,%rax
+ xor %eax, %eax
mov %rax, PBlockLen(%arg2)
jmp _encode_done_\@
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index faecb1518bf8..1985ea0b551b 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -463,7 +463,7 @@ _get_AAD_rest_final\@:
_get_AAD_done\@:
# initialize the data pointer offset as zero
- xor %r11, %r11
+ xor %r11d, %r11d
# start AES for num_initial_blocks blocks
mov arg5, %rax # rax = *Y0
@@ -1770,7 +1770,7 @@ _get_AAD_rest_final\@:
_get_AAD_done\@:
# initialize the data pointer offset as zero
- xor %r11, %r11
+ xor %r11d, %r11d
# start AES for num_initial_blocks blocks
mov arg5, %rax # rax = *Y0
diff --git a/arch/x86/crypto/morus1280-avx2-asm.S b/arch/x86/crypto/morus1280-avx2-asm.S
index 07653d4582a6..de182c460f82 100644
--- a/arch/x86/crypto/morus1280-avx2-asm.S
+++ b/arch/x86/crypto/morus1280-avx2-asm.S
@@ -113,7 +113,7 @@ ENDPROC(__morus1280_update_zero)
* %r9
*/
__load_partial:
- xor %r9, %r9
+ xor %r9d, %r9d
vpxor MSG, MSG, MSG
mov %rcx, %r8
diff --git a/arch/x86/crypto/morus1280-avx2-glue.c b/arch/x86/crypto/morus1280-avx2-glue.c
index f111f36d26dc..6634907d6ccd 100644
--- a/arch/x86/crypto/morus1280-avx2-glue.c
+++ b/arch/x86/crypto/morus1280-avx2-glue.c
@@ -37,15 +37,11 @@ asmlinkage void crypto_morus1280_avx2_final(void *state, void *tag_xor,
MORUS1280_DECLARE_ALGS(avx2, "morus1280-avx2", 400);
-static const struct x86_cpu_id avx2_cpu_id[] = {
- X86_FEATURE_MATCH(X86_FEATURE_AVX2),
- {}
-};
-MODULE_DEVICE_TABLE(x86cpu, avx2_cpu_id);
-
static int __init crypto_morus1280_avx2_module_init(void)
{
- if (!x86_match_cpu(avx2_cpu_id))
+ if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
+ !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
return -ENODEV;
return crypto_register_aeads(crypto_morus1280_avx2_algs,
diff --git a/arch/x86/crypto/morus1280-sse2-asm.S b/arch/x86/crypto/morus1280-sse2-asm.S
index bd1aa1b60869..da5d2905db60 100644
--- a/arch/x86/crypto/morus1280-sse2-asm.S
+++ b/arch/x86/crypto/morus1280-sse2-asm.S
@@ -235,7 +235,7 @@ ENDPROC(__morus1280_update_zero)
* %r9
*/
__load_partial:
- xor %r9, %r9
+ xor %r9d, %r9d
pxor MSG_LO, MSG_LO
pxor MSG_HI, MSG_HI
diff --git a/arch/x86/crypto/morus1280-sse2-glue.c b/arch/x86/crypto/morus1280-sse2-glue.c
index 839270aa713c..95cf857d2cbb 100644
--- a/arch/x86/crypto/morus1280-sse2-glue.c
+++ b/arch/x86/crypto/morus1280-sse2-glue.c
@@ -37,15 +37,11 @@ asmlinkage void crypto_morus1280_sse2_final(void *state, void *tag_xor,
MORUS1280_DECLARE_ALGS(sse2, "morus1280-sse2", 350);
-static const struct x86_cpu_id sse2_cpu_id[] = {
- X86_FEATURE_MATCH(X86_FEATURE_XMM2),
- {}
-};
-MODULE_DEVICE_TABLE(x86cpu, sse2_cpu_id);
-
static int __init crypto_morus1280_sse2_module_init(void)
{
- if (!x86_match_cpu(sse2_cpu_id))
+ if (!boot_cpu_has(X86_FEATURE_XMM2) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
+ !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
return -ENODEV;
return crypto_register_aeads(crypto_morus1280_sse2_algs,
diff --git a/arch/x86/crypto/morus640-sse2-asm.S b/arch/x86/crypto/morus640-sse2-asm.S
index efa02816d921..414db480250e 100644
--- a/arch/x86/crypto/morus640-sse2-asm.S
+++ b/arch/x86/crypto/morus640-sse2-asm.S
@@ -113,7 +113,7 @@ ENDPROC(__morus640_update_zero)
* %r9
*/
__load_partial:
- xor %r9, %r9
+ xor %r9d, %r9d
pxor MSG, MSG
mov %rcx, %r8
diff --git a/arch/x86/crypto/morus640-sse2-glue.c b/arch/x86/crypto/morus640-sse2-glue.c
index 26b47e2db8d2..615fb7bc9a32 100644
--- a/arch/x86/crypto/morus640-sse2-glue.c
+++ b/arch/x86/crypto/morus640-sse2-glue.c
@@ -37,15 +37,11 @@ asmlinkage void crypto_morus640_sse2_final(void *state, void *tag_xor,
MORUS640_DECLARE_ALGS(sse2, "morus640-sse2", 400);
-static const struct x86_cpu_id sse2_cpu_id[] = {
- X86_FEATURE_MATCH(X86_FEATURE_XMM2),
- {}
-};
-MODULE_DEVICE_TABLE(x86cpu, sse2_cpu_id);
-
static int __init crypto_morus640_sse2_module_init(void)
{
- if (!x86_match_cpu(sse2_cpu_id))
+ if (!boot_cpu_has(X86_FEATURE_XMM2) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
+ !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
return -ENODEV;
return crypto_register_aeads(crypto_morus640_sse2_algs,
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index 6204bd53528c..613d0bfc3d84 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -96,7 +96,7 @@
# cleanup workspace
mov $8, %ecx
mov %rsp, %rdi
- xor %rax, %rax
+ xor %eax, %eax
rep stosq
mov %rbp, %rsp # deallocate workspace
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index c371bfee137a..2767c625a52c 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -65,7 +65,7 @@
# define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
#else
# define preempt_stop(clobbers)
-# define resume_kernel restore_all
+# define resume_kernel restore_all_kernel
#endif
.macro TRACE_IRQS_IRET
@@ -77,6 +77,8 @@
#endif
.endm
+#define PTI_SWITCH_MASK (1 << PAGE_SHIFT)
+
/*
* User gs save/restore
*
@@ -154,7 +156,52 @@
#endif /* CONFIG_X86_32_LAZY_GS */
-.macro SAVE_ALL pt_regs_ax=%eax
+/* Unconditionally switch to user cr3 */
+.macro SWITCH_TO_USER_CR3 scratch_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+
+ movl %cr3, \scratch_reg
+ orl $PTI_SWITCH_MASK, \scratch_reg
+ movl \scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+.macro BUG_IF_WRONG_CR3 no_user_check=0
+#ifdef CONFIG_DEBUG_ENTRY
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ .if \no_user_check == 0
+ /* coming from usermode? */
+ testl $SEGMENT_RPL_MASK, PT_CS(%esp)
+ jz .Lend_\@
+ .endif
+ /* On user-cr3? */
+ movl %cr3, %eax
+ testl $PTI_SWITCH_MASK, %eax
+ jnz .Lend_\@
+ /* From userspace with kernel cr3 - BUG */
+ ud2
+.Lend_\@:
+#endif
+.endm
+
+/*
+ * Switch to kernel cr3 if not already loaded and return current cr3 in
+ * \scratch_reg
+ */
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ movl %cr3, \scratch_reg
+ /* Test if we are already on kernel CR3 */
+ testl $PTI_SWITCH_MASK, \scratch_reg
+ jz .Lend_\@
+ andl $(~PTI_SWITCH_MASK), \scratch_reg
+ movl \scratch_reg, %cr3
+ /* Return original CR3 in \scratch_reg */
+ orl $PTI_SWITCH_MASK, \scratch_reg
+.Lend_\@:
+.endm
+
+.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0
cld
PUSH_GS
pushl %fs
@@ -173,6 +220,29 @@
movl $(__KERNEL_PERCPU), %edx
movl %edx, %fs
SET_KERNEL_GS %edx
+
+ /* Switch to kernel stack if necessary */
+.if \switch_stacks > 0
+ SWITCH_TO_KERNEL_STACK
+.endif
+
+.endm
+
+.macro SAVE_ALL_NMI cr3_reg:req
+ SAVE_ALL
+
+ BUG_IF_WRONG_CR3
+
+ /*
+ * Now switch the CR3 when PTI is enabled.
+ *
+ * We can enter with either user or kernel cr3, the code will
+ * store the old cr3 in \cr3_reg and switches to the kernel cr3
+ * if necessary.
+ */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=\cr3_reg
+
+.Lend_\@:
.endm
/*
@@ -221,6 +291,349 @@
POP_GS_EX
.endm
+.macro RESTORE_ALL_NMI cr3_reg:req pop=0
+ /*
+ * Now switch the CR3 when PTI is enabled.
+ *
+ * We enter with kernel cr3 and switch the cr3 to the value
+ * stored on \cr3_reg, which is either a user or a kernel cr3.
+ */
+ ALTERNATIVE "jmp .Lswitched_\@", "", X86_FEATURE_PTI
+
+ testl $PTI_SWITCH_MASK, \cr3_reg
+ jz .Lswitched_\@
+
+ /* User cr3 in \cr3_reg - write it to hardware cr3 */
+ movl \cr3_reg, %cr3
+
+.Lswitched_\@:
+
+ BUG_IF_WRONG_CR3
+
+ RESTORE_REGS pop=\pop
+.endm
+
+.macro CHECK_AND_APPLY_ESPFIX
+#ifdef CONFIG_X86_ESPFIX32
+#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
+
+ ALTERNATIVE "jmp .Lend_\@", "", X86_BUG_ESPFIX
+
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
+ /*
+ * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
+ * are returning to the kernel.
+ * See comments in process.c:copy_thread() for details.
+ */
+ movb PT_OLDSS(%esp), %ah
+ movb PT_CS(%esp), %al
+ andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
+ cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
+ jne .Lend_\@ # returning to user-space with LDT SS
+
+ /*
+ * Setup and switch to ESPFIX stack
+ *
+ * We're returning to userspace with a 16 bit stack. The CPU will not
+ * restore the high word of ESP for us on executing iret... This is an
+ * "official" bug of all the x86-compatible CPUs, which we can work
+ * around to make dosemu and wine happy. We do this by preloading the
+ * high word of ESP with the high word of the userspace ESP while
+ * compensating for the offset by changing to the ESPFIX segment with
+ * a base address that matches for the difference.
+ */
+ mov %esp, %edx /* load kernel esp */
+ mov PT_OLDESP(%esp), %eax /* load userspace esp */
+ mov %dx, %ax /* eax: new kernel esp */
+ sub %eax, %edx /* offset (low word is 0) */
+ shr $16, %edx
+ mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
+ mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
+ pushl $__ESPFIX_SS
+ pushl %eax /* new kernel esp */
+ /*
+ * Disable interrupts, but do not irqtrace this section: we
+ * will soon execute iret and the tracer was already set to
+ * the irqstate after the IRET:
+ */
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ lss (%esp), %esp /* switch to espfix segment */
+.Lend_\@:
+#endif /* CONFIG_X86_ESPFIX32 */
+.endm
+
+/*
+ * Called with pt_regs fully populated and kernel segments loaded,
+ * so we can access PER_CPU and use the integer registers.
+ *
+ * We need to be very careful here with the %esp switch, because an NMI
+ * can happen everywhere. If the NMI handler finds itself on the
+ * entry-stack, it will overwrite the task-stack and everything we
+ * copied there. So allocate the stack-frame on the task-stack and
+ * switch to it before we do any copying.
+ */
+
+#define CS_FROM_ENTRY_STACK (1 << 31)
+#define CS_FROM_USER_CR3 (1 << 30)
+
+.macro SWITCH_TO_KERNEL_STACK
+
+ ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV
+
+ BUG_IF_WRONG_CR3
+
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
+
+ /*
+ * %eax now contains the entry cr3 and we carry it forward in
+ * that register for the time this macro runs
+ */
+
+ /* Are we on the entry stack? Bail out if not! */
+ movl PER_CPU_VAR(cpu_entry_area), %ecx
+ addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
+ subl %esp, %ecx /* ecx = (end of entry_stack) - esp */
+ cmpl $SIZEOF_entry_stack, %ecx
+ jae .Lend_\@
+
+ /* Load stack pointer into %esi and %edi */
+ movl %esp, %esi
+ movl %esi, %edi
+
+ /* Move %edi to the top of the entry stack */
+ andl $(MASK_entry_stack), %edi
+ addl $(SIZEOF_entry_stack), %edi
+
+ /* Load top of task-stack into %edi */
+ movl TSS_entry2task_stack(%edi), %edi
+
+ /*
+ * Clear unused upper bits of the dword containing the word-sized CS
+ * slot in pt_regs in case hardware didn't clear it for us.
+ */
+ andl $(0x0000ffff), PT_CS(%esp)
+
+ /* Special case - entry from kernel mode via entry stack */
+#ifdef CONFIG_VM86
+ movl PT_EFLAGS(%esp), %ecx # mix EFLAGS and CS
+ movb PT_CS(%esp), %cl
+ andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %ecx
+#else
+ movl PT_CS(%esp), %ecx
+ andl $SEGMENT_RPL_MASK, %ecx
+#endif
+ cmpl $USER_RPL, %ecx
+ jb .Lentry_from_kernel_\@
+
+ /* Bytes to copy */
+ movl $PTREGS_SIZE, %ecx
+
+#ifdef CONFIG_VM86
+ testl $X86_EFLAGS_VM, PT_EFLAGS(%esi)
+ jz .Lcopy_pt_regs_\@
+
+ /*
+ * Stack-frame contains 4 additional segment registers when
+ * coming from VM86 mode
+ */
+ addl $(4 * 4), %ecx
+
+#endif
+.Lcopy_pt_regs_\@:
+
+ /* Allocate frame on task-stack */
+ subl %ecx, %edi
+
+ /* Switch to task-stack */
+ movl %edi, %esp
+
+ /*
+ * We are now on the task-stack and can safely copy over the
+ * stack-frame
+ */
+ shrl $2, %ecx
+ cld
+ rep movsl
+
+ jmp .Lend_\@
+
+.Lentry_from_kernel_\@:
+
+ /*
+ * This handles the case when we enter the kernel from
+ * kernel-mode and %esp points to the entry-stack. When this
+ * happens we need to switch to the task-stack to run C code,
+ * but switch back to the entry-stack again when we approach
+ * iret and return to the interrupted code-path. This usually
+ * happens when we hit an exception while restoring user-space
+ * segment registers on the way back to user-space or when the
+ * sysenter handler runs with eflags.tf set.
+ *
+ * When we switch to the task-stack here, we can't trust the
+ * contents of the entry-stack anymore, as the exception handler
+ * might be scheduled out or moved to another CPU. Therefore we
+ * copy the complete entry-stack to the task-stack and set a
+ * marker in the iret-frame (bit 31 of the CS dword) to detect
+ * what we've done on the iret path.
+ *
+ * On the iret path we copy everything back and switch to the
+ * entry-stack, so that the interrupted kernel code-path
+ * continues on the same stack it was interrupted with.
+ *
+ * Be aware that an NMI can happen anytime in this code.
+ *
+ * %esi: Entry-Stack pointer (same as %esp)
+ * %edi: Top of the task stack
+ * %eax: CR3 on kernel entry
+ */
+
+ /* Calculate number of bytes on the entry stack in %ecx */
+ movl %esi, %ecx
+
+ /* %ecx to the top of entry-stack */
+ andl $(MASK_entry_stack), %ecx
+ addl $(SIZEOF_entry_stack), %ecx
+
+ /* Number of bytes on the entry stack to %ecx */
+ sub %esi, %ecx
+
+ /* Mark stackframe as coming from entry stack */
+ orl $CS_FROM_ENTRY_STACK, PT_CS(%esp)
+
+ /*
+ * Test the cr3 used to enter the kernel and add a marker
+ * so that we can switch back to it before iret.
+ */
+ testl $PTI_SWITCH_MASK, %eax
+ jz .Lcopy_pt_regs_\@
+ orl $CS_FROM_USER_CR3, PT_CS(%esp)
+
+ /*
+ * %esi and %edi are unchanged, %ecx contains the number of
+ * bytes to copy. The code at .Lcopy_pt_regs_\@ will allocate
+ * the stack-frame on task-stack and copy everything over
+ */
+ jmp .Lcopy_pt_regs_\@
+
+.Lend_\@:
+.endm
+
+/*
+ * Switch back from the kernel stack to the entry stack.
+ *
+ * The %esp register must point to pt_regs on the task stack. It will
+ * first calculate the size of the stack-frame to copy, depending on
+ * whether we return to VM86 mode or not. With that it uses 'rep movsl'
+ * to copy the contents of the stack over to the entry stack.
+ *
+ * We must be very careful here, as we can't trust the contents of the
+ * task-stack once we switched to the entry-stack. When an NMI happens
+ * while on the entry-stack, the NMI handler will switch back to the top
+ * of the task stack, overwriting our stack-frame we are about to copy.
+ * Therefore we switch the stack only after everything is copied over.
+ */
+.macro SWITCH_TO_ENTRY_STACK
+
+ ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV
+
+ /* Bytes to copy */
+ movl $PTREGS_SIZE, %ecx
+
+#ifdef CONFIG_VM86
+ testl $(X86_EFLAGS_VM), PT_EFLAGS(%esp)
+ jz .Lcopy_pt_regs_\@
+
+ /* Additional 4 registers to copy when returning to VM86 mode */
+ addl $(4 * 4), %ecx
+
+.Lcopy_pt_regs_\@:
+#endif
+
+ /* Initialize source and destination for movsl */
+ movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
+ subl %ecx, %edi
+ movl %esp, %esi
+
+ /* Save future stack pointer in %ebx */
+ movl %edi, %ebx
+
+ /* Copy over the stack-frame */
+ shrl $2, %ecx
+ cld
+ rep movsl
+
+ /*
+ * Switch to entry-stack - needs to happen after everything is
+ * copied because the NMI handler will overwrite the task-stack
+ * when on entry-stack
+ */
+ movl %ebx, %esp
+
+.Lend_\@:
+.endm
+
+/*
+ * This macro handles the case when we return to kernel-mode on the iret
+ * path and have to switch back to the entry stack and/or user-cr3
+ *
+ * See the comments below the .Lentry_from_kernel_\@ label in the
+ * SWITCH_TO_KERNEL_STACK macro for more details.
+ */
+.macro PARANOID_EXIT_TO_KERNEL_MODE
+
+ /*
+ * Test if we entered the kernel with the entry-stack. Most
+ * likely we did not, because this code only runs on the
+ * return-to-kernel path.
+ */
+ testl $CS_FROM_ENTRY_STACK, PT_CS(%esp)
+ jz .Lend_\@
+
+ /* Unlikely slow-path */
+
+ /* Clear marker from stack-frame */
+ andl $(~CS_FROM_ENTRY_STACK), PT_CS(%esp)
+
+ /* Copy the remaining task-stack contents to entry-stack */
+ movl %esp, %esi
+ movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
+
+ /* Bytes on the task-stack to ecx */
+ movl PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %ecx
+ subl %esi, %ecx
+
+ /* Allocate stack-frame on entry-stack */
+ subl %ecx, %edi
+
+ /*
+ * Save future stack-pointer, we must not switch until the
+ * copy is done, otherwise the NMI handler could destroy the
+ * contents of the task-stack we are about to copy.
+ */
+ movl %edi, %ebx
+
+ /* Do the copy */
+ shrl $2, %ecx
+ cld
+ rep movsl
+
+ /* Safe to switch to entry-stack now */
+ movl %ebx, %esp
+
+ /*
+ * We came from entry-stack and need to check if we also need to
+ * switch back to user cr3.
+ */
+ testl $CS_FROM_USER_CR3, PT_CS(%esp)
+ jz .Lend_\@
+
+ /* Clear marker from stack-frame */
+ andl $(~CS_FROM_USER_CR3), PT_CS(%esp)
+
+ SWITCH_TO_USER_CR3 scratch_reg=%eax
+
+.Lend_\@:
+.endm
/*
* %eax: prev task
* %edx: next task
@@ -351,9 +764,9 @@ ENTRY(resume_kernel)
DISABLE_INTERRUPTS(CLBR_ANY)
.Lneed_resched:
cmpl $0, PER_CPU_VAR(__preempt_count)
- jnz restore_all
+ jnz restore_all_kernel
testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
- jz restore_all
+ jz restore_all_kernel
call preempt_schedule_irq
jmp .Lneed_resched
END(resume_kernel)
@@ -412,7 +825,21 @@ ENTRY(xen_sysenter_target)
* 0(%ebp) arg6
*/
ENTRY(entry_SYSENTER_32)
- movl TSS_sysenter_sp0(%esp), %esp
+ /*
+ * On entry-stack with all userspace-regs live - save and
+ * restore eflags and %eax to use it as scratch-reg for the cr3
+ * switch.
+ */
+ pushfl
+ pushl %eax
+ BUG_IF_WRONG_CR3 no_user_check=1
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
+ popl %eax
+ popfl
+
+ /* Stack empty again, switch to task stack */
+ movl TSS_entry2task_stack(%esp), %esp
+
.Lsysenter_past_esp:
pushl $__USER_DS /* pt_regs->ss */
pushl %ebp /* pt_regs->sp (stashed in bp) */
@@ -421,7 +848,7 @@ ENTRY(entry_SYSENTER_32)
pushl $__USER_CS /* pt_regs->cs */
pushl $0 /* pt_regs->ip = 0 (placeholder) */
pushl %eax /* pt_regs->orig_ax */
- SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
+ SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest, stack already switched */
/*
* SYSENTER doesn't filter flags, so we need to clear NT, AC
@@ -460,25 +887,49 @@ ENTRY(entry_SYSENTER_32)
/* Opportunistic SYSEXIT */
TRACE_IRQS_ON /* User mode traces as IRQs on. */
+
+ /*
+ * Setup entry stack - we keep the pointer in %eax and do the
+ * switch after almost all user-state is restored.
+ */
+
+ /* Load entry stack pointer and allocate frame for eflags/eax */
+ movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %eax
+ subl $(2*4), %eax
+
+ /* Copy eflags and eax to entry stack */
+ movl PT_EFLAGS(%esp), %edi
+ movl PT_EAX(%esp), %esi
+ movl %edi, (%eax)
+ movl %esi, 4(%eax)
+
+ /* Restore user registers and segments */
movl PT_EIP(%esp), %edx /* pt_regs->ip */
movl PT_OLDESP(%esp), %ecx /* pt_regs->sp */
1: mov PT_FS(%esp), %fs
PTGS_TO_GS
+
popl %ebx /* pt_regs->bx */
addl $2*4, %esp /* skip pt_regs->cx and pt_regs->dx */
popl %esi /* pt_regs->si */
popl %edi /* pt_regs->di */
popl %ebp /* pt_regs->bp */
- popl %eax /* pt_regs->ax */
+
+ /* Switch to entry stack */
+ movl %eax, %esp
+
+ /* Now ready to switch the cr3 */
+ SWITCH_TO_USER_CR3 scratch_reg=%eax
/*
* Restore all flags except IF. (We restore IF separately because
* STI gives a one-instruction window in which we won't be interrupted,
* whereas POPF does not.)
*/
- addl $PT_EFLAGS-PT_DS, %esp /* point esp at pt_regs->flags */
btrl $X86_EFLAGS_IF_BIT, (%esp)
+ BUG_IF_WRONG_CR3 no_user_check=1
popfl
+ popl %eax
/*
* Return back to the vDSO, which will pop ecx and edx.
@@ -532,7 +983,8 @@ ENDPROC(entry_SYSENTER_32)
ENTRY(entry_INT80_32)
ASM_CLAC
pushl %eax /* pt_regs->orig_ax */
- SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
+
+ SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1 /* save rest */
/*
* User mode is traced as though IRQs are on, and the interrupt gate
@@ -546,24 +998,17 @@ ENTRY(entry_INT80_32)
restore_all:
TRACE_IRQS_IRET
+ SWITCH_TO_ENTRY_STACK
.Lrestore_all_notrace:
-#ifdef CONFIG_X86_ESPFIX32
- ALTERNATIVE "jmp .Lrestore_nocheck", "", X86_BUG_ESPFIX
-
- movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
- /*
- * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
- * are returning to the kernel.
- * See comments in process.c:copy_thread() for details.
- */
- movb PT_OLDSS(%esp), %ah
- movb PT_CS(%esp), %al
- andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
- cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
- je .Lldt_ss # returning to user-space with LDT SS
-#endif
+ CHECK_AND_APPLY_ESPFIX
.Lrestore_nocheck:
- RESTORE_REGS 4 # skip orig_eax/error_code
+ /* Switch back to user CR3 */
+ SWITCH_TO_USER_CR3 scratch_reg=%eax
+
+ BUG_IF_WRONG_CR3
+
+ /* Restore user state */
+ RESTORE_REGS pop=4 # skip orig_eax/error_code
.Lirq_return:
/*
* ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
@@ -572,46 +1017,33 @@ restore_all:
*/
INTERRUPT_RETURN
+restore_all_kernel:
+ TRACE_IRQS_IRET
+ PARANOID_EXIT_TO_KERNEL_MODE
+ BUG_IF_WRONG_CR3
+ RESTORE_REGS 4
+ jmp .Lirq_return
+
.section .fixup, "ax"
ENTRY(iret_exc )
pushl $0 # no error code
pushl $do_iret_error
- jmp common_exception
-.previous
- _ASM_EXTABLE(.Lirq_return, iret_exc)
-#ifdef CONFIG_X86_ESPFIX32
-.Lldt_ss:
-/*
- * Setup and switch to ESPFIX stack
- *
- * We're returning to userspace with a 16 bit stack. The CPU will not
- * restore the high word of ESP for us on executing iret... This is an
- * "official" bug of all the x86-compatible CPUs, which we can work
- * around to make dosemu and wine happy. We do this by preloading the
- * high word of ESP with the high word of the userspace ESP while
- * compensating for the offset by changing to the ESPFIX segment with
- * a base address that matches for the difference.
- */
-#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
- mov %esp, %edx /* load kernel esp */
- mov PT_OLDESP(%esp), %eax /* load userspace esp */
- mov %dx, %ax /* eax: new kernel esp */
- sub %eax, %edx /* offset (low word is 0) */
- shr $16, %edx
- mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
- mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
- pushl $__ESPFIX_SS
- pushl %eax /* new kernel esp */
+#ifdef CONFIG_DEBUG_ENTRY
/*
- * Disable interrupts, but do not irqtrace this section: we
- * will soon execute iret and the tracer was already set to
- * the irqstate after the IRET:
+ * The stack-frame here is the one that iret faulted on, so its a
+ * return-to-user frame. We are on kernel-cr3 because we come here from
+ * the fixup code. This confuses the CR3 checker, so switch to user-cr3
+ * as the checker expects it.
*/
- DISABLE_INTERRUPTS(CLBR_ANY)
- lss (%esp), %esp /* switch to espfix segment */
- jmp .Lrestore_nocheck
+ pushl %eax
+ SWITCH_TO_USER_CR3 scratch_reg=%eax
+ popl %eax
#endif
+
+ jmp common_exception
+.previous
+ _ASM_EXTABLE(.Lirq_return, iret_exc)
ENDPROC(entry_INT80_32)
.macro FIXUP_ESPFIX_STACK
@@ -671,7 +1103,8 @@ END(irq_entries_start)
common_interrupt:
ASM_CLAC
addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */
- SAVE_ALL
+
+ SAVE_ALL switch_stacks=1
ENCODE_FRAME_POINTER
TRACE_IRQS_OFF
movl %esp, %eax
@@ -679,16 +1112,16 @@ common_interrupt:
jmp ret_from_intr
ENDPROC(common_interrupt)
-#define BUILD_INTERRUPT3(name, nr, fn) \
-ENTRY(name) \
- ASM_CLAC; \
- pushl $~(nr); \
- SAVE_ALL; \
- ENCODE_FRAME_POINTER; \
- TRACE_IRQS_OFF \
- movl %esp, %eax; \
- call fn; \
- jmp ret_from_intr; \
+#define BUILD_INTERRUPT3(name, nr, fn) \
+ENTRY(name) \
+ ASM_CLAC; \
+ pushl $~(nr); \
+ SAVE_ALL switch_stacks=1; \
+ ENCODE_FRAME_POINTER; \
+ TRACE_IRQS_OFF \
+ movl %esp, %eax; \
+ call fn; \
+ jmp ret_from_intr; \
ENDPROC(name)
#define BUILD_INTERRUPT(name, nr) \
@@ -920,16 +1353,20 @@ common_exception:
pushl %es
pushl %ds
pushl %eax
+ movl $(__USER_DS), %eax
+ movl %eax, %ds
+ movl %eax, %es
+ movl $(__KERNEL_PERCPU), %eax
+ movl %eax, %fs
pushl %ebp
pushl %edi
pushl %esi
pushl %edx
pushl %ecx
pushl %ebx
+ SWITCH_TO_KERNEL_STACK
ENCODE_FRAME_POINTER
cld
- movl $(__KERNEL_PERCPU), %ecx
- movl %ecx, %fs
UNWIND_ESPFIX_STACK
GS_TO_REG %ecx
movl PT_GS(%esp), %edi # get the function address
@@ -937,9 +1374,6 @@ common_exception:
movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
REG_TO_PTGS %ecx
SET_KERNEL_GS %ecx
- movl $(__USER_DS), %ecx
- movl %ecx, %ds
- movl %ecx, %es
TRACE_IRQS_OFF
movl %esp, %eax # pt_regs pointer
CALL_NOSPEC %edi
@@ -948,40 +1382,12 @@ END(common_exception)
ENTRY(debug)
/*
- * #DB can happen at the first instruction of
- * entry_SYSENTER_32 or in Xen's SYSENTER prologue. If this
- * happens, then we will be running on a very small stack. We
- * need to detect this condition and switch to the thread
- * stack before calling any C code at all.
- *
- * If you edit this code, keep in mind that NMIs can happen in here.
+ * Entry from sysenter is now handled in common_exception
*/
ASM_CLAC
pushl $-1 # mark this as an int
- SAVE_ALL
- ENCODE_FRAME_POINTER
- xorl %edx, %edx # error code 0
- movl %esp, %eax # pt_regs pointer
-
- /* Are we currently on the SYSENTER stack? */
- movl PER_CPU_VAR(cpu_entry_area), %ecx
- addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
- subl %eax, %ecx /* ecx = (end of entry_stack) - esp */
- cmpl $SIZEOF_entry_stack, %ecx
- jb .Ldebug_from_sysenter_stack
-
- TRACE_IRQS_OFF
- call do_debug
- jmp ret_from_exception
-
-.Ldebug_from_sysenter_stack:
- /* We're on the SYSENTER stack. Switch off. */
- movl %esp, %ebx
- movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
- TRACE_IRQS_OFF
- call do_debug
- movl %ebx, %esp
- jmp ret_from_exception
+ pushl $do_debug
+ jmp common_exception
END(debug)
/*
@@ -993,6 +1399,7 @@ END(debug)
*/
ENTRY(nmi)
ASM_CLAC
+
#ifdef CONFIG_X86_ESPFIX32
pushl %eax
movl %ss, %eax
@@ -1002,7 +1409,7 @@ ENTRY(nmi)
#endif
pushl %eax # pt_regs->orig_ax
- SAVE_ALL
+ SAVE_ALL_NMI cr3_reg=%edi
ENCODE_FRAME_POINTER
xorl %edx, %edx # zero error code
movl %esp, %eax # pt_regs pointer
@@ -1016,7 +1423,7 @@ ENTRY(nmi)
/* Not on SYSENTER stack. */
call do_nmi
- jmp .Lrestore_all_notrace
+ jmp .Lnmi_return
.Lnmi_from_sysenter_stack:
/*
@@ -1027,7 +1434,11 @@ ENTRY(nmi)
movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
call do_nmi
movl %ebx, %esp
- jmp .Lrestore_all_notrace
+
+.Lnmi_return:
+ CHECK_AND_APPLY_ESPFIX
+ RESTORE_ALL_NMI cr3_reg=%edi pop=4
+ jmp .Lirq_return
#ifdef CONFIG_X86_ESPFIX32
.Lnmi_espfix_stack:
@@ -1042,12 +1453,12 @@ ENTRY(nmi)
pushl 16(%esp)
.endr
pushl %eax
- SAVE_ALL
+ SAVE_ALL_NMI cr3_reg=%edi
ENCODE_FRAME_POINTER
FIXUP_ESPFIX_STACK # %eax == %esp
xorl %edx, %edx # zero error code
call do_nmi
- RESTORE_REGS
+ RESTORE_ALL_NMI cr3_reg=%edi
lss 12+4(%esp), %esp # back to espfix stack
jmp .Lirq_return
#endif
@@ -1056,7 +1467,8 @@ END(nmi)
ENTRY(int3)
ASM_CLAC
pushl $-1 # mark this as an int
- SAVE_ALL
+
+ SAVE_ALL switch_stacks=1
ENCODE_FRAME_POINTER
TRACE_IRQS_OFF
xorl %edx, %edx # zero error code
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 73a522d53b53..957dfb693ecc 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -92,7 +92,7 @@ END(native_usergs_sysret64)
.endm
.macro TRACE_IRQS_IRETQ_DEBUG
- bt $9, EFLAGS(%rsp) /* interrupts off? */
+ btl $9, EFLAGS(%rsp) /* interrupts off? */
jnc 1f
TRACE_IRQS_ON_DEBUG
1:
@@ -408,6 +408,7 @@ ENTRY(ret_from_fork)
1:
/* kernel thread */
+ UNWIND_HINT_EMPTY
movq %r12, %rdi
CALL_NOSPEC %rbx
/*
@@ -701,7 +702,7 @@ retint_kernel:
#ifdef CONFIG_PREEMPT
/* Interrupts are off */
/* Check if we need preemption */
- bt $9, EFLAGS(%rsp) /* were interrupts off? */
+ btl $9, EFLAGS(%rsp) /* were interrupts off? */
jnc 1f
0: cmpl $0, PER_CPU_VAR(__preempt_count)
jnz 1f
@@ -981,7 +982,7 @@ ENTRY(\sym)
call \do_sym
- jmp error_exit /* %ebx: no swapgs flag */
+ jmp error_exit
.endif
END(\sym)
.endm
@@ -1222,7 +1223,6 @@ END(paranoid_exit)
/*
* Save all registers in pt_regs, and switch GS if needed.
- * Return: EBX=0: came from user mode; EBX=1: otherwise
*/
ENTRY(error_entry)
UNWIND_HINT_FUNC
@@ -1269,7 +1269,6 @@ ENTRY(error_entry)
* for these here too.
*/
.Lerror_kernelspace:
- incl %ebx
leaq native_irq_return_iret(%rip), %rcx
cmpq %rcx, RIP+8(%rsp)
je .Lerror_bad_iret
@@ -1303,28 +1302,20 @@ ENTRY(error_entry)
/*
* Pretend that the exception came from user mode: set up pt_regs
- * as if we faulted immediately after IRET and clear EBX so that
- * error_exit knows that we will be returning to user mode.
+ * as if we faulted immediately after IRET.
*/
mov %rsp, %rdi
call fixup_bad_iret
mov %rax, %rsp
- decl %ebx
jmp .Lerror_entry_from_usermode_after_swapgs
END(error_entry)
-
-/*
- * On entry, EBX is a "return to kernel mode" flag:
- * 1: already in kernel mode, don't need SWAPGS
- * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode
- */
ENTRY(error_exit)
UNWIND_HINT_REGS
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
- testl %ebx, %ebx
- jnz retint_kernel
+ testb $3, CS(%rsp)
+ jz retint_kernel
jmp retint_user
END(error_exit)
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 261802b1cc50..9f695f517747 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -46,10 +46,8 @@ targets += $(vdso_img_sodbg) $(vdso_img-y:%=vdso%.so)
CPPFLAGS_vdso.lds += -P -C
-VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
- -Wl,--no-undefined \
- -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 \
- $(DISABLE_LTO)
+VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 --no-undefined \
+ -z max-page-size=4096 -z common-page-size=4096
$(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE
$(call if_changed,vdso)
@@ -58,9 +56,7 @@ HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(src
hostprogs-y += vdso2c
quiet_cmd_vdso2c = VDSO2C $@
-define cmd_vdso2c
- $(obj)/vdso2c $< $(<:%.dbg=%) $@
-endef
+ cmd_vdso2c = $(obj)/vdso2c $< $(<:%.dbg=%) $@
$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE
$(call if_changed,vdso2c)
@@ -95,10 +91,8 @@ CFLAGS_REMOVE_vvar.o = -pg
#
CPPFLAGS_vdsox32.lds = $(CPPFLAGS_vdso.lds)
-VDSO_LDFLAGS_vdsox32.lds = -Wl,-m,elf32_x86_64 \
- -Wl,-soname=linux-vdso.so.1 \
- -Wl,-z,max-page-size=4096 \
- -Wl,-z,common-page-size=4096
+VDSO_LDFLAGS_vdsox32.lds = -m elf32_x86_64 -soname linux-vdso.so.1 \
+ -z max-page-size=4096 -z common-page-size=4096
# x32-rebranded versions
vobjx32s-y := $(vobjs-y:.o=-x32.o)
@@ -123,7 +117,7 @@ $(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE
$(call if_changed,vdso)
CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
-VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1
+VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -soname linux-gate.so.1
targets += vdso32/vdso32.lds
targets += vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
@@ -157,13 +151,13 @@ $(obj)/vdso32.so.dbg: FORCE \
# The DSO images are built using a special linker script.
#
quiet_cmd_vdso = VDSO $@
- cmd_vdso = $(CC) -nostdlib -o $@ \
+ cmd_vdso = $(LD) -nostdlib -o $@ \
$(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
- -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \
+ -T $(filter %.lds,$^) $(filter %.o,$^) && \
sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
-VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=both) \
- $(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS)
+VDSO_LDFLAGS = -shared $(call ld-option, --hash-style=both) \
+ $(call ld-option, --build-id) -Bsymbolic
GCOV_PROFILE := n
#
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 4b98101209a1..d50bb4dc0650 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -579,7 +579,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
{
struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
struct perf_event *event = pcpu->event;
- struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event *hwc;
struct perf_sample_data data;
struct perf_raw_record raw;
struct pt_regs regs;
@@ -602,6 +602,10 @@ fail:
return 0;
}
+ if (WARN_ON_ONCE(!event))
+ goto fail;
+
+ hwc = &event->hw;
msr = hwc->config_base;
buf = ibs_data.regs;
rdmsrl(msr, *buf);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 707b2a96e516..035c37481f57 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2041,15 +2041,15 @@ static void intel_pmu_disable_event(struct perf_event *event)
cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
cpuc->intel_cp_status &= ~(1ull << hwc->idx);
+ if (unlikely(event->attr.precise_ip))
+ intel_pmu_pebs_disable(event);
+
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
intel_pmu_disable_fixed(hwc);
return;
}
x86_pmu_disable_event(event);
-
- if (unlikely(event->attr.precise_ip))
- intel_pmu_pebs_disable(event);
}
static void intel_pmu_del_event(struct perf_event *event)
@@ -2068,17 +2068,19 @@ static void intel_pmu_read_event(struct perf_event *event)
x86_perf_event_update(event);
}
-static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
+static void intel_pmu_enable_fixed(struct perf_event *event)
{
+ struct hw_perf_event *hwc = &event->hw;
int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
- u64 ctrl_val, bits, mask;
+ u64 ctrl_val, mask, bits = 0;
/*
- * Enable IRQ generation (0x8),
+ * Enable IRQ generation (0x8), if not PEBS,
* and enable ring-3 counting (0x2) and ring-0 counting (0x1)
* if requested:
*/
- bits = 0x8ULL;
+ if (!event->attr.precise_ip)
+ bits |= 0x8;
if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
bits |= 0x2;
if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
@@ -2120,14 +2122,14 @@ static void intel_pmu_enable_event(struct perf_event *event)
if (unlikely(event_is_checkpointed(event)))
cpuc->intel_cp_status |= (1ull << hwc->idx);
+ if (unlikely(event->attr.precise_ip))
+ intel_pmu_pebs_enable(event);
+
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
- intel_pmu_enable_fixed(hwc);
+ intel_pmu_enable_fixed(event);
return;
}
- if (unlikely(event->attr.precise_ip))
- intel_pmu_pebs_enable(event);
-
__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
}
@@ -2280,7 +2282,10 @@ again:
* counters from the GLOBAL_STATUS mask and we always process PEBS
* events via drain_pebs().
*/
- status &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
+ if (x86_pmu.flags & PMU_FL_PEBS_ALL)
+ status &= ~cpuc->pebs_enabled;
+ else
+ status &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
/*
* PEBS overflow sets bit 62 in the global status register
@@ -2997,6 +3002,9 @@ static int intel_pmu_hw_config(struct perf_event *event)
}
if (x86_pmu.pebs_aliases)
x86_pmu.pebs_aliases(event);
+
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY;
}
if (needs_branch_stack(event)) {
@@ -4069,7 +4077,6 @@ __init int intel_pmu_init(void)
intel_pmu_lbr_init_skl();
x86_pmu.event_constraints = intel_slm_event_constraints;
- x86_pmu.pebs_constraints = intel_glp_pebs_event_constraints;
x86_pmu.extra_regs = intel_glm_extra_regs;
/*
* It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS
@@ -4079,6 +4086,7 @@ __init int intel_pmu_init(void)
x86_pmu.pebs_prec_dist = true;
x86_pmu.lbr_pt_coexist = true;
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_PEBS_ALL;
x86_pmu.get_event_constraints = glp_get_event_constraints;
x86_pmu.cpu_events = glm_events_attrs;
/* Goldmont Plus has 4-wide pipeline */
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 8cf03f101938..b7b01d762d32 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -713,12 +713,6 @@ struct event_constraint intel_glm_pebs_event_constraints[] = {
EVENT_CONSTRAINT_END
};
-struct event_constraint intel_glp_pebs_event_constraints[] = {
- /* Allow all events as PEBS with no flags */
- INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
- EVENT_CONSTRAINT_END
-};
-
struct event_constraint intel_nehalem_pebs_event_constraints[] = {
INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */
INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
@@ -871,6 +865,13 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event)
}
}
+ /*
+ * Extended PEBS support
+ * Makes the PEBS code search the normal constraints.
+ */
+ if (x86_pmu.flags & PMU_FL_PEBS_ALL)
+ return NULL;
+
return &emptyconstraint;
}
@@ -896,10 +897,16 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
{
struct debug_store *ds = cpuc->ds;
u64 threshold;
+ int reserved;
+
+ if (x86_pmu.flags & PMU_FL_PEBS_ALL)
+ reserved = x86_pmu.max_pebs_events + x86_pmu.num_counters_fixed;
+ else
+ reserved = x86_pmu.max_pebs_events;
if (cpuc->n_pebs == cpuc->n_large_pebs) {
threshold = ds->pebs_absolute_maximum -
- x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
+ reserved * x86_pmu.pebs_record_size;
} else {
threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
}
@@ -963,7 +970,11 @@ void intel_pmu_pebs_enable(struct perf_event *event)
* This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD.
*/
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
- ds->pebs_event_reset[hwc->idx] =
+ unsigned int idx = hwc->idx;
+
+ if (idx >= INTEL_PMC_IDX_FIXED)
+ idx = MAX_PEBS_EVENTS + (idx - INTEL_PMC_IDX_FIXED);
+ ds->pebs_event_reset[idx] =
(u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
} else {
ds->pebs_event_reset[hwc->idx] = 0;
@@ -1186,16 +1197,20 @@ static void setup_pebs_sample_data(struct perf_event *event,
}
/*
+ * We must however always use iregs for the unwinder to stay sane; the
+ * record BP,SP,IP can point into thin air when the record is from a
+ * previous PMI context or an (I)RET happend between the record and
+ * PMI.
+ */
+ if (sample_type & PERF_SAMPLE_CALLCHAIN)
+ data->callchain = perf_callchain(event, iregs);
+
+ /*
* We use the interrupt regs as a base because the PEBS record does not
* contain a full regs set, specifically it seems to lack segment
* descriptors, which get used by things like user_mode().
*
* In the simple case fix up only the IP for PERF_SAMPLE_IP.
- *
- * We must however always use BP,SP from iregs for the unwinder to stay
- * sane; the record BP,SP can point into thin air when the record is
- * from a previous PMI context or an (I)RET happend between the record
- * and PMI.
*/
*regs = *iregs;
@@ -1214,15 +1229,8 @@ static void setup_pebs_sample_data(struct perf_event *event,
regs->si = pebs->si;
regs->di = pebs->di;
- /*
- * Per the above; only set BP,SP if we don't need callchains.
- *
- * XXX: does this make sense?
- */
- if (!(sample_type & PERF_SAMPLE_CALLCHAIN)) {
- regs->bp = pebs->bp;
- regs->sp = pebs->sp;
- }
+ regs->bp = pebs->bp;
+ regs->sp = pebs->sp;
#ifndef CONFIG_X86_32
regs->r8 = pebs->r8;
@@ -1484,9 +1492,10 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
struct debug_store *ds = cpuc->ds;
struct perf_event *event;
void *base, *at, *top;
- short counts[MAX_PEBS_EVENTS] = {};
- short error[MAX_PEBS_EVENTS] = {};
- int bit, i;
+ short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
+ short error[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
+ int bit, i, size;
+ u64 mask;
if (!x86_pmu.pebs_active)
return;
@@ -1496,6 +1505,13 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
ds->pebs_index = ds->pebs_buffer_base;
+ mask = (1ULL << x86_pmu.max_pebs_events) - 1;
+ size = x86_pmu.max_pebs_events;
+ if (x86_pmu.flags & PMU_FL_PEBS_ALL) {
+ mask |= ((1ULL << x86_pmu.num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED;
+ size = INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed;
+ }
+
if (unlikely(base >= top)) {
/*
* The drain_pebs() could be called twice in a short period
@@ -1505,7 +1521,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
* update the event->count for this case.
*/
for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled,
- x86_pmu.max_pebs_events) {
+ size) {
event = cpuc->events[bit];
if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
intel_pmu_save_and_restart_reload(event, 0);
@@ -1518,12 +1534,12 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
u64 pebs_status;
pebs_status = p->status & cpuc->pebs_enabled;
- pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1;
+ pebs_status &= mask;
/* PEBS v3 has more accurate status bits */
if (x86_pmu.intel_cap.pebs_format >= 3) {
for_each_set_bit(bit, (unsigned long *)&pebs_status,
- x86_pmu.max_pebs_events)
+ size)
counts[bit]++;
continue;
@@ -1571,7 +1587,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
counts[bit]++;
}
- for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
+ for (bit = 0; bit < size; bit++) {
if ((counts[bit] == 0) && (error[bit] == 0))
continue;
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index cf372b90557e..f3e006bed9a7 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -216,6 +216,8 @@ static void intel_pmu_lbr_reset_64(void)
void intel_pmu_lbr_reset(void)
{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
if (!x86_pmu.lbr_nr)
return;
@@ -223,6 +225,9 @@ void intel_pmu_lbr_reset(void)
intel_pmu_lbr_reset_32();
else
intel_pmu_lbr_reset_64();
+
+ cpuc->last_task_ctx = NULL;
+ cpuc->last_log_id = 0;
}
/*
@@ -334,6 +339,7 @@ static inline u64 rdlbr_to(unsigned int idx)
static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int i;
unsigned lbr_idx, mask;
u64 tos;
@@ -344,9 +350,21 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
return;
}
- mask = x86_pmu.lbr_nr - 1;
tos = task_ctx->tos;
- for (i = 0; i < tos; i++) {
+ /*
+ * Does not restore the LBR registers, if
+ * - No one else touched them, and
+ * - Did not enter C6
+ */
+ if ((task_ctx == cpuc->last_task_ctx) &&
+ (task_ctx->log_id == cpuc->last_log_id) &&
+ rdlbr_from(tos)) {
+ task_ctx->lbr_stack_state = LBR_NONE;
+ return;
+ }
+
+ mask = x86_pmu.lbr_nr - 1;
+ for (i = 0; i < task_ctx->valid_lbrs; i++) {
lbr_idx = (tos - i) & mask;
wrlbr_from(lbr_idx, task_ctx->lbr_from[i]);
wrlbr_to (lbr_idx, task_ctx->lbr_to[i]);
@@ -354,14 +372,24 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
}
+
+ for (; i < x86_pmu.lbr_nr; i++) {
+ lbr_idx = (tos - i) & mask;
+ wrlbr_from(lbr_idx, 0);
+ wrlbr_to(lbr_idx, 0);
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+ wrmsrl(MSR_LBR_INFO_0 + lbr_idx, 0);
+ }
+
wrmsrl(x86_pmu.lbr_tos, tos);
task_ctx->lbr_stack_state = LBR_NONE;
}
static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
unsigned lbr_idx, mask;
- u64 tos;
+ u64 tos, from;
int i;
if (task_ctx->lbr_callstack_users == 0) {
@@ -371,15 +399,22 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
mask = x86_pmu.lbr_nr - 1;
tos = intel_pmu_lbr_tos();
- for (i = 0; i < tos; i++) {
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
lbr_idx = (tos - i) & mask;
- task_ctx->lbr_from[i] = rdlbr_from(lbr_idx);
+ from = rdlbr_from(lbr_idx);
+ if (!from)
+ break;
+ task_ctx->lbr_from[i] = from;
task_ctx->lbr_to[i] = rdlbr_to(lbr_idx);
if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
}
+ task_ctx->valid_lbrs = i;
task_ctx->tos = tos;
task_ctx->lbr_stack_state = LBR_VALID;
+
+ cpuc->last_task_ctx = task_ctx;
+ cpuc->last_log_id = ++task_ctx->log_id;
}
void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
@@ -531,7 +566,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
*/
static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
{
- bool need_info = false;
+ bool need_info = false, call_stack = false;
unsigned long mask = x86_pmu.lbr_nr - 1;
int lbr_format = x86_pmu.intel_cap.lbr_format;
u64 tos = intel_pmu_lbr_tos();
@@ -542,7 +577,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
if (cpuc->lbr_sel) {
need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO);
if (cpuc->lbr_sel->config & LBR_CALL_STACK)
- num = tos;
+ call_stack = true;
}
for (i = 0; i < num; i++) {
@@ -555,6 +590,13 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
from = rdlbr_from(lbr_idx);
to = rdlbr_to(lbr_idx);
+ /*
+ * Read LBR call stack entries
+ * until invalid entry (0s) is detected.
+ */
+ if (call_stack && !from)
+ break;
+
if (lbr_format == LBR_FORMAT_INFO && need_info) {
u64 info;
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index c9e1e0bef3c3..e17ab885b1e9 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -28,7 +28,7 @@
#define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff)
#define UNCORE_PCI_DEV_IDX(data) (data & 0xff)
#define UNCORE_EXTRA_PCI_DEV 0xff
-#define UNCORE_EXTRA_PCI_DEV_MAX 3
+#define UNCORE_EXTRA_PCI_DEV_MAX 4
#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 87dc0263a2e1..51d7c117e3c7 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1029,6 +1029,7 @@ void snbep_uncore_cpu_init(void)
enum {
SNBEP_PCI_QPI_PORT0_FILTER,
SNBEP_PCI_QPI_PORT1_FILTER,
+ BDX_PCI_QPI_PORT2_FILTER,
HSWEP_PCI_PCU_3,
};
@@ -3286,15 +3287,18 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = {
},
{ /* QPI Port 0 filter */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f86),
- .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 0),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT0_FILTER),
},
{ /* QPI Port 1 filter */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f96),
- .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 1),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT1_FILTER),
},
{ /* QPI Port 2 filter */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46),
- .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ BDX_PCI_QPI_PORT2_FILTER),
},
{ /* PCU.3 (for Capability registers) */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fc0),
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 9f3711470ec1..156286335351 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -163,6 +163,7 @@ struct intel_excl_cntrs {
unsigned core_id; /* per-core: core id */
};
+struct x86_perf_task_context;
#define MAX_LBR_ENTRIES 32
enum {
@@ -214,6 +215,8 @@ struct cpu_hw_events {
struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
struct er_account *lbr_sel;
u64 br_sel;
+ struct x86_perf_task_context *last_task_ctx;
+ int last_log_id;
/*
* Intel host/guest exclude bits
@@ -648,8 +651,10 @@ struct x86_perf_task_context {
u64 lbr_to[MAX_LBR_ENTRIES];
u64 lbr_info[MAX_LBR_ENTRIES];
int tos;
+ int valid_lbrs;
int lbr_callstack_users;
int lbr_stack_state;
+ int log_id;
};
#define x86_add_quirk(func_) \
@@ -668,6 +673,7 @@ do { \
#define PMU_FL_HAS_RSP_1 0x2 /* has 2 equivalent offcore_rsp regs */
#define PMU_FL_EXCL_CNTRS 0x4 /* has exclusive counter requirements */
#define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */
+#define PMU_FL_PEBS_ALL 0x10 /* all events are valid PEBS events */
#define EVENT_VAR(_id) event_attr_##_id
#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index 402338365651..5b0f613428c2 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -31,6 +31,8 @@
#include <asm/mshyperv.h>
#include <asm/apic.h>
+#include <asm/trace/hyperv.h>
+
static struct apic orig_apic;
static u64 hv_apic_icr_read(void)
@@ -99,6 +101,9 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector)
int nr_bank = 0;
int ret = 1;
+ if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
+ return false;
+
local_irq_save(flags);
arg = (struct ipi_arg_ex **)this_cpu_ptr(hyperv_pcpu_input_arg);
@@ -130,10 +135,10 @@ ipi_mask_ex_done:
static bool __send_ipi_mask(const struct cpumask *mask, int vector)
{
int cur_cpu, vcpu;
- struct ipi_arg_non_ex **arg;
- struct ipi_arg_non_ex *ipi_arg;
+ struct ipi_arg_non_ex ipi_arg;
int ret = 1;
- unsigned long flags;
+
+ trace_hyperv_send_ipi_mask(mask, vector);
if (cpumask_empty(mask))
return true;
@@ -144,40 +149,43 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector)
if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
return false;
- if ((ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
- return __send_ipi_mask_ex(mask, vector);
-
- local_irq_save(flags);
- arg = (struct ipi_arg_non_ex **)this_cpu_ptr(hyperv_pcpu_input_arg);
-
- ipi_arg = *arg;
- if (unlikely(!ipi_arg))
- goto ipi_mask_done;
-
- ipi_arg->vector = vector;
- ipi_arg->reserved = 0;
- ipi_arg->cpu_mask = 0;
+ /*
+ * From the supplied CPU set we need to figure out if we can get away
+ * with cheaper HVCALL_SEND_IPI hypercall. This is possible when the
+ * highest VP number in the set is < 64. As VP numbers are usually in
+ * ascending order and match Linux CPU ids, here is an optimization:
+ * we check the VP number for the highest bit in the supplied set first
+ * so we can quickly find out if using HVCALL_SEND_IPI_EX hypercall is
+ * a must. We will also check all VP numbers when walking the supplied
+ * CPU set to remain correct in all cases.
+ */
+ if (hv_cpu_number_to_vp_number(cpumask_last(mask)) >= 64)
+ goto do_ex_hypercall;
+
+ ipi_arg.vector = vector;
+ ipi_arg.cpu_mask = 0;
for_each_cpu(cur_cpu, mask) {
vcpu = hv_cpu_number_to_vp_number(cur_cpu);
if (vcpu == VP_INVAL)
- goto ipi_mask_done;
+ return false;
/*
* This particular version of the IPI hypercall can
* only target upto 64 CPUs.
*/
if (vcpu >= 64)
- goto ipi_mask_done;
+ goto do_ex_hypercall;
- __set_bit(vcpu, (unsigned long *)&ipi_arg->cpu_mask);
+ __set_bit(vcpu, (unsigned long *)&ipi_arg.cpu_mask);
}
- ret = hv_do_hypercall(HVCALL_SEND_IPI, ipi_arg, NULL);
-
-ipi_mask_done:
- local_irq_restore(flags);
+ ret = hv_do_fast_hypercall16(HVCALL_SEND_IPI, ipi_arg.vector,
+ ipi_arg.cpu_mask);
return ((ret == 0) ? true : false);
+
+do_ex_hypercall:
+ return __send_ipi_mask_ex(mask, vector);
}
static bool __send_ipi_one(int cpu, int vector)
@@ -233,10 +241,7 @@ static void hv_send_ipi_self(int vector)
void __init hv_apic_init(void)
{
if (ms_hyperv.hints & HV_X64_CLUSTER_IPI_RECOMMENDED) {
- if ((ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
- pr_info("Hyper-V: Using ext hypercalls for IPI\n");
- else
- pr_info("Hyper-V: Using IPI hypercalls\n");
+ pr_info("Hyper-V: Using IPI hypercalls\n");
/*
* Set the IPI entry points.
*/
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index de27615c51ea..1147e1fed7ff 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -16,6 +16,8 @@
/* Each gva in gva_list encodes up to 4096 pages to flush */
#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
+static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
+ const struct flush_tlb_info *info);
/*
* Fills in gva_list starting from offset. Returns the number of items added.
@@ -93,10 +95,29 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
if (cpumask_equal(cpus, cpu_present_mask)) {
flush->flags |= HV_FLUSH_ALL_PROCESSORS;
} else {
+ /*
+ * From the supplied CPU set we need to figure out if we can get
+ * away with cheaper HVCALL_FLUSH_VIRTUAL_ADDRESS_{LIST,SPACE}
+ * hypercalls. This is possible when the highest VP number in
+ * the set is < 64. As VP numbers are usually in ascending order
+ * and match Linux CPU ids, here is an optimization: we check
+ * the VP number for the highest bit in the supplied set first
+ * so we can quickly find out if using *_EX hypercalls is a
+ * must. We will also check all VP numbers when walking the
+ * supplied CPU set to remain correct in all cases.
+ */
+ if (hv_cpu_number_to_vp_number(cpumask_last(cpus)) >= 64)
+ goto do_ex_hypercall;
+
for_each_cpu(cpu, cpus) {
vcpu = hv_cpu_number_to_vp_number(cpu);
- if (vcpu >= 64)
+ if (vcpu == VP_INVAL) {
+ local_irq_restore(flags);
goto do_native;
+ }
+
+ if (vcpu >= 64)
+ goto do_ex_hypercall;
__set_bit(vcpu, (unsigned long *)
&flush->processor_mask);
@@ -123,7 +144,12 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
status = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST,
gva_n, 0, flush, NULL);
}
+ goto check_status;
+
+do_ex_hypercall:
+ status = hyperv_flush_tlb_others_ex(cpus, info);
+check_status:
local_irq_restore(flags);
if (!(status & HV_HYPERCALL_RESULT_MASK))
@@ -132,35 +158,22 @@ do_native:
native_flush_tlb_others(cpus, info);
}
-static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
- const struct flush_tlb_info *info)
+static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
+ const struct flush_tlb_info *info)
{
int nr_bank = 0, max_gvas, gva_n;
struct hv_tlb_flush_ex **flush_pcpu;
struct hv_tlb_flush_ex *flush;
- u64 status = U64_MAX;
- unsigned long flags;
+ u64 status;
- trace_hyperv_mmu_flush_tlb_others(cpus, info);
-
- if (!hv_hypercall_pg)
- goto do_native;
-
- if (cpumask_empty(cpus))
- return;
-
- local_irq_save(flags);
+ if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
+ return U64_MAX;
flush_pcpu = (struct hv_tlb_flush_ex **)
this_cpu_ptr(hyperv_pcpu_input_arg);
flush = *flush_pcpu;
- if (unlikely(!flush)) {
- local_irq_restore(flags);
- goto do_native;
- }
-
if (info->mm) {
/*
* AddressSpace argument must match the CR3 with PCID bits
@@ -176,15 +189,10 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
flush->hv_vp_set.valid_bank_mask = 0;
- if (!cpumask_equal(cpus, cpu_present_mask)) {
- flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
- nr_bank = cpumask_to_vpset(&(flush->hv_vp_set), cpus);
- }
-
- if (!nr_bank) {
- flush->hv_vp_set.format = HV_GENERIC_SET_ALL;
- flush->flags |= HV_FLUSH_ALL_PROCESSORS;
- }
+ flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ nr_bank = cpumask_to_vpset(&(flush->hv_vp_set), cpus);
+ if (nr_bank < 0)
+ return U64_MAX;
/*
* We can flush not more than max_gvas with one hypercall. Flush the
@@ -213,12 +221,7 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
gva_n, nr_bank, flush, NULL);
}
- local_irq_restore(flags);
-
- if (!(status & HV_HYPERCALL_RESULT_MASK))
- return;
-do_native:
- native_flush_tlb_others(cpus, info);
+ return status;
}
void hyperv_setup_mmu_ops(void)
@@ -226,11 +229,6 @@ void hyperv_setup_mmu_ops(void)
if (!(ms_hyperv.hints & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED))
return;
- if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) {
- pr_info("Using hypercall for remote TLB flush\n");
- pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others;
- } else {
- pr_info("Using ext hypercall for remote TLB flush\n");
- pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others_ex;
- }
+ pr_info("Using hypercall for remote TLB flush\n");
+ pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others;
}
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 74a9e06b6cfd..130e81e10fc7 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -10,6 +10,7 @@
#include <asm/fixmap.h>
#include <asm/mpspec.h>
#include <asm/msr.h>
+#include <asm/hardirq.h>
#define ARCH_APICTIMER_STOPS_ON_C3 1
@@ -502,12 +503,19 @@ extern int default_check_phys_apicid_present(int phys_apicid);
#endif /* CONFIG_X86_LOCAL_APIC */
+#ifdef CONFIG_SMP
+bool apic_id_is_primary_thread(unsigned int id);
+#else
+static inline bool apic_id_is_primary_thread(unsigned int id) { return false; }
+#endif
+
extern void irq_enter(void);
extern void irq_exit(void);
static inline void entering_irq(void)
{
irq_enter();
+ kvm_set_cpu_l1tf_flush_l1d();
}
static inline void entering_ack_irq(void)
@@ -520,6 +528,7 @@ static inline void ipi_entering_ack_irq(void)
{
irq_enter();
ack_APIC_irq();
+ kvm_set_cpu_l1tf_flush_l1d();
}
static inline void exiting_irq(void)
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 0db6bec95489..b143717b92b3 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -80,6 +80,7 @@ static __always_inline void arch_atomic_sub(int i, atomic_t *v)
* true if the result is zero, or false for all
* other cases.
*/
+#define arch_atomic_sub_and_test arch_atomic_sub_and_test
static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v)
{
GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", e);
@@ -91,6 +92,7 @@ static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v)
*
* Atomically increments @v by 1.
*/
+#define arch_atomic_inc arch_atomic_inc
static __always_inline void arch_atomic_inc(atomic_t *v)
{
asm volatile(LOCK_PREFIX "incl %0"
@@ -103,6 +105,7 @@ static __always_inline void arch_atomic_inc(atomic_t *v)
*
* Atomically decrements @v by 1.
*/
+#define arch_atomic_dec arch_atomic_dec
static __always_inline void arch_atomic_dec(atomic_t *v)
{
asm volatile(LOCK_PREFIX "decl %0"
@@ -117,6 +120,7 @@ static __always_inline void arch_atomic_dec(atomic_t *v)
* returns true if the result is 0, or false for all other
* cases.
*/
+#define arch_atomic_dec_and_test arch_atomic_dec_and_test
static __always_inline bool arch_atomic_dec_and_test(atomic_t *v)
{
GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", e);
@@ -130,6 +134,7 @@ static __always_inline bool arch_atomic_dec_and_test(atomic_t *v)
* and returns true if the result is zero, or false for all
* other cases.
*/
+#define arch_atomic_inc_and_test arch_atomic_inc_and_test
static __always_inline bool arch_atomic_inc_and_test(atomic_t *v)
{
GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", e);
@@ -144,6 +149,7 @@ static __always_inline bool arch_atomic_inc_and_test(atomic_t *v)
* if the result is negative, or false when
* result is greater than or equal to zero.
*/
+#define arch_atomic_add_negative arch_atomic_add_negative
static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v)
{
GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", s);
@@ -173,9 +179,6 @@ static __always_inline int arch_atomic_sub_return(int i, atomic_t *v)
return arch_atomic_add_return(-i, v);
}
-#define arch_atomic_inc_return(v) (arch_atomic_add_return(1, v))
-#define arch_atomic_dec_return(v) (arch_atomic_sub_return(1, v))
-
static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v)
{
return xadd(&v->counter, i);
@@ -199,7 +202,7 @@ static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int n
static inline int arch_atomic_xchg(atomic_t *v, int new)
{
- return xchg(&v->counter, new);
+ return arch_xchg(&v->counter, new);
}
static inline void arch_atomic_and(int i, atomic_t *v)
@@ -253,27 +256,6 @@ static inline int arch_atomic_fetch_xor(int i, atomic_t *v)
return val;
}
-/**
- * __arch_atomic_add_unless - add unless the number is already a given value
- * @v: pointer of type atomic_t
- * @a: the amount to add to v...
- * @u: ...unless v is equal to u.
- *
- * Atomically adds @a to @v, so long as @v was not already @u.
- * Returns the old value of @v.
- */
-static __always_inline int __arch_atomic_add_unless(atomic_t *v, int a, int u)
-{
- int c = arch_atomic_read(v);
-
- do {
- if (unlikely(c == u))
- break;
- } while (!arch_atomic_try_cmpxchg(v, &c, c + a));
-
- return c;
-}
-
#ifdef CONFIG_X86_32
# include <asm/atomic64_32.h>
#else
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 92212bf0484f..ef959f02d070 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -158,6 +158,7 @@ static inline long long arch_atomic64_inc_return(atomic64_t *v)
"S" (v) : "memory", "ecx");
return a;
}
+#define arch_atomic64_inc_return arch_atomic64_inc_return
static inline long long arch_atomic64_dec_return(atomic64_t *v)
{
@@ -166,6 +167,7 @@ static inline long long arch_atomic64_dec_return(atomic64_t *v)
"S" (v) : "memory", "ecx");
return a;
}
+#define arch_atomic64_dec_return arch_atomic64_dec_return
/**
* arch_atomic64_add - add integer to atomic64 variable
@@ -198,25 +200,12 @@ static inline long long arch_atomic64_sub(long long i, atomic64_t *v)
}
/**
- * arch_atomic64_sub_and_test - subtract value from variable and test result
- * @i: integer value to subtract
- * @v: pointer to type atomic64_t
- *
- * Atomically subtracts @i from @v and returns
- * true if the result is zero, or false for all
- * other cases.
- */
-static inline int arch_atomic64_sub_and_test(long long i, atomic64_t *v)
-{
- return arch_atomic64_sub_return(i, v) == 0;
-}
-
-/**
* arch_atomic64_inc - increment atomic64 variable
* @v: pointer to type atomic64_t
*
* Atomically increments @v by 1.
*/
+#define arch_atomic64_inc arch_atomic64_inc
static inline void arch_atomic64_inc(atomic64_t *v)
{
__alternative_atomic64(inc, inc_return, /* no output */,
@@ -229,6 +218,7 @@ static inline void arch_atomic64_inc(atomic64_t *v)
*
* Atomically decrements @v by 1.
*/
+#define arch_atomic64_dec arch_atomic64_dec
static inline void arch_atomic64_dec(atomic64_t *v)
{
__alternative_atomic64(dec, dec_return, /* no output */,
@@ -236,46 +226,6 @@ static inline void arch_atomic64_dec(atomic64_t *v)
}
/**
- * arch_atomic64_dec_and_test - decrement and test
- * @v: pointer to type atomic64_t
- *
- * Atomically decrements @v by 1 and
- * returns true if the result is 0, or false for all other
- * cases.
- */
-static inline int arch_atomic64_dec_and_test(atomic64_t *v)
-{
- return arch_atomic64_dec_return(v) == 0;
-}
-
-/**
- * atomic64_inc_and_test - increment and test
- * @v: pointer to type atomic64_t
- *
- * Atomically increments @v by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-static inline int arch_atomic64_inc_and_test(atomic64_t *v)
-{
- return arch_atomic64_inc_return(v) == 0;
-}
-
-/**
- * arch_atomic64_add_negative - add and test if negative
- * @i: integer value to add
- * @v: pointer to type atomic64_t
- *
- * Atomically adds @i to @v and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
- */
-static inline int arch_atomic64_add_negative(long long i, atomic64_t *v)
-{
- return arch_atomic64_add_return(i, v) < 0;
-}
-
-/**
* arch_atomic64_add_unless - add unless the number is a given value
* @v: pointer of type atomic64_t
* @a: the amount to add to v...
@@ -295,7 +245,7 @@ static inline int arch_atomic64_add_unless(atomic64_t *v, long long a,
return (int)a;
}
-
+#define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero
static inline int arch_atomic64_inc_not_zero(atomic64_t *v)
{
int r;
@@ -304,6 +254,7 @@ static inline int arch_atomic64_inc_not_zero(atomic64_t *v)
return r;
}
+#define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive
static inline long long arch_atomic64_dec_if_positive(atomic64_t *v)
{
long long r;
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 6106b59d3260..4343d9b4f30e 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -71,6 +71,7 @@ static inline void arch_atomic64_sub(long i, atomic64_t *v)
* true if the result is zero, or false for all
* other cases.
*/
+#define arch_atomic64_sub_and_test arch_atomic64_sub_and_test
static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v)
{
GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, "er", i, "%0", e);
@@ -82,6 +83,7 @@ static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v)
*
* Atomically increments @v by 1.
*/
+#define arch_atomic64_inc arch_atomic64_inc
static __always_inline void arch_atomic64_inc(atomic64_t *v)
{
asm volatile(LOCK_PREFIX "incq %0"
@@ -95,6 +97,7 @@ static __always_inline void arch_atomic64_inc(atomic64_t *v)
*
* Atomically decrements @v by 1.
*/
+#define arch_atomic64_dec arch_atomic64_dec
static __always_inline void arch_atomic64_dec(atomic64_t *v)
{
asm volatile(LOCK_PREFIX "decq %0"
@@ -110,6 +113,7 @@ static __always_inline void arch_atomic64_dec(atomic64_t *v)
* returns true if the result is 0, or false for all other
* cases.
*/
+#define arch_atomic64_dec_and_test arch_atomic64_dec_and_test
static inline bool arch_atomic64_dec_and_test(atomic64_t *v)
{
GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", e);
@@ -123,6 +127,7 @@ static inline bool arch_atomic64_dec_and_test(atomic64_t *v)
* and returns true if the result is zero, or false for all
* other cases.
*/
+#define arch_atomic64_inc_and_test arch_atomic64_inc_and_test
static inline bool arch_atomic64_inc_and_test(atomic64_t *v)
{
GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", e);
@@ -137,6 +142,7 @@ static inline bool arch_atomic64_inc_and_test(atomic64_t *v)
* if the result is negative, or false when
* result is greater than or equal to zero.
*/
+#define arch_atomic64_add_negative arch_atomic64_add_negative
static inline bool arch_atomic64_add_negative(long i, atomic64_t *v)
{
GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, "er", i, "%0", s);
@@ -169,9 +175,6 @@ static inline long arch_atomic64_fetch_sub(long i, atomic64_t *v)
return xadd(&v->counter, -i);
}
-#define arch_atomic64_inc_return(v) (arch_atomic64_add_return(1, (v)))
-#define arch_atomic64_dec_return(v) (arch_atomic64_sub_return(1, (v)))
-
static inline long arch_atomic64_cmpxchg(atomic64_t *v, long old, long new)
{
return arch_cmpxchg(&v->counter, old, new);
@@ -185,46 +188,7 @@ static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, l
static inline long arch_atomic64_xchg(atomic64_t *v, long new)
{
- return xchg(&v->counter, new);
-}
-
-/**
- * arch_atomic64_add_unless - add unless the number is a given value
- * @v: pointer of type atomic64_t
- * @a: the amount to add to v...
- * @u: ...unless v is equal to u.
- *
- * Atomically adds @a to @v, so long as it was not @u.
- * Returns the old value of @v.
- */
-static inline bool arch_atomic64_add_unless(atomic64_t *v, long a, long u)
-{
- s64 c = arch_atomic64_read(v);
- do {
- if (unlikely(c == u))
- return false;
- } while (!arch_atomic64_try_cmpxchg(v, &c, c + a));
- return true;
-}
-
-#define arch_atomic64_inc_not_zero(v) arch_atomic64_add_unless((v), 1, 0)
-
-/*
- * arch_atomic64_dec_if_positive - decrement by 1 if old value positive
- * @v: pointer of type atomic_t
- *
- * The function returns the old value of *v minus 1, even if
- * the atomic variable, v, was not decremented.
- */
-static inline long arch_atomic64_dec_if_positive(atomic64_t *v)
-{
- s64 dec, c = arch_atomic64_read(v);
- do {
- dec = c - 1;
- if (unlikely(dec < 0))
- break;
- } while (!arch_atomic64_try_cmpxchg(v, &c, dec));
- return dec;
+ return arch_xchg(&v->counter, new);
}
static inline void arch_atomic64_and(long i, atomic64_t *v)
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index e3efd8a06066..a55d79b233d3 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -75,7 +75,7 @@ extern void __add_wrong_size(void)
* use "asm volatile" and "memory" clobbers to prevent gcc from moving
* information around.
*/
-#define xchg(ptr, v) __xchg_op((ptr), (v), xchg, "")
+#define arch_xchg(ptr, v) __xchg_op((ptr), (v), xchg, "")
/*
* Atomic compare and exchange. Compare OLD with MEM, if identical,
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index bfca3b346c74..072e5459fe2f 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -10,13 +10,13 @@ static inline void set_64bit(volatile u64 *ptr, u64 val)
#define arch_cmpxchg64(ptr, o, n) \
({ \
BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
- cmpxchg((ptr), (o), (n)); \
+ arch_cmpxchg((ptr), (o), (n)); \
})
#define arch_cmpxchg64_local(ptr, o, n) \
({ \
BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
- cmpxchg_local((ptr), (o), (n)); \
+ arch_cmpxchg_local((ptr), (o), (n)); \
})
#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16)
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 5701f5cecd31..89a048c2faec 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -219,6 +219,8 @@
#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */
#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */
#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */
+#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */
+#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
@@ -229,7 +231,7 @@
#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */
#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */
-
+#define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
@@ -341,6 +343,7 @@
#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */
#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */
@@ -373,5 +376,6 @@
#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */
#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */
+#define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
index 0ab2ab27ad1f..b825cb201251 100644
--- a/arch/x86/include/asm/dmi.h
+++ b/arch/x86/include/asm/dmi.h
@@ -4,8 +4,8 @@
#include <linux/compiler.h>
#include <linux/init.h>
+#include <linux/io.h>
-#include <asm/io.h>
#include <asm/setup.h>
static __always_inline __init void *dmi_alloc(unsigned len)
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 740a428acf1e..d9069bb26c7f 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -3,10 +3,12 @@
#define _ASM_X86_HARDIRQ_H
#include <linux/threads.h>
-#include <linux/irq.h>
typedef struct {
- unsigned int __softirq_pending;
+ u16 __softirq_pending;
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+ u8 kvm_cpu_l1tf_flush_l1d;
+#endif
unsigned int __nmi_count; /* arch dependent */
#ifdef CONFIG_X86_LOCAL_APIC
unsigned int apic_timer_irqs; /* arch dependent */
@@ -58,4 +60,24 @@ extern u64 arch_irq_stat_cpu(unsigned int cpu);
extern u64 arch_irq_stat(void);
#define arch_irq_stat arch_irq_stat
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+static inline void kvm_set_cpu_l1tf_flush_l1d(void)
+{
+ __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1);
+}
+
+static inline void kvm_clear_cpu_l1tf_flush_l1d(void)
+{
+ __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 0);
+}
+
+static inline bool kvm_get_cpu_l1tf_flush_l1d(void)
+{
+ return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d);
+}
+#else /* !IS_ENABLED(CONFIG_KVM_INTEL) */
+static inline void kvm_set_cpu_l1tf_flush_l1d(void) { }
+#endif /* IS_ENABLED(CONFIG_KVM_INTEL) */
+
#endif /* _ASM_X86_HARDIRQ_H */
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
index f59c39835a5a..a1f0e90d0818 100644
--- a/arch/x86/include/asm/hw_breakpoint.h
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -49,11 +49,14 @@ static inline int hw_breakpoint_slots(int type)
return HBP_NUM;
}
+struct perf_event_attr;
struct perf_event;
struct pmu;
-extern int arch_check_bp_in_kernelspace(struct perf_event *bp);
-extern int arch_validate_hwbkpt_settings(struct perf_event *bp);
+extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw);
+extern int hw_breakpoint_arch_parse(struct perf_event *bp,
+ const struct perf_event_attr *attr,
+ struct arch_hw_breakpoint *hw);
extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
unsigned long val, void *data);
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index cf090e584202..7ed08a7c3398 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -76,4 +76,17 @@
#define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */
#define INTEL_FAM6_XEON_PHI_KNM 0x85 /* Knights Mill */
+/* Useful macros */
+#define INTEL_CPU_FAM_ANY(_family, _model, _driver_data) \
+{ \
+ .vendor = X86_VENDOR_INTEL, \
+ .family = _family, \
+ .model = _model, \
+ .feature = X86_FEATURE_ANY, \
+ .driver_data = (kernel_ulong_t)&_driver_data \
+}
+
+#define INTEL_CPU_FAM6(_model, _driver_data) \
+ INTEL_CPU_FAM_ANY(6, INTEL_FAM6_##_model, _driver_data)
+
#endif /* _ASM_X86_INTEL_FAMILY_H */
diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h
index fe04491130ae..52f815a80539 100644
--- a/arch/x86/include/asm/intel-mid.h
+++ b/arch/x86/include/asm/intel-mid.h
@@ -80,35 +80,6 @@ enum intel_mid_cpu_type {
extern enum intel_mid_cpu_type __intel_mid_cpu_chip;
-/**
- * struct intel_mid_ops - Interface between intel-mid & sub archs
- * @arch_setup: arch_setup function to re-initialize platform
- * structures (x86_init, x86_platform_init)
- *
- * This structure can be extended if any new interface is required
- * between intel-mid & its sub arch files.
- */
-struct intel_mid_ops {
- void (*arch_setup)(void);
-};
-
-/* Helper API's for INTEL_MID_OPS_INIT */
-#define DECLARE_INTEL_MID_OPS_INIT(cpuname, cpuid) \
- [cpuid] = get_##cpuname##_ops
-
-/* Maximum number of CPU ops */
-#define MAX_CPU_OPS(a) (sizeof(a)/sizeof(void *))
-
-/*
- * For every new cpu addition, a weak get_<cpuname>_ops() function needs be
- * declared in arch/x86/platform/intel_mid/intel_mid_weak_decls.h.
- */
-#define INTEL_MID_OPS_INIT { \
- DECLARE_INTEL_MID_OPS_INIT(penwell, INTEL_MID_CPU_CHIP_PENWELL), \
- DECLARE_INTEL_MID_OPS_INIT(cloverview, INTEL_MID_CPU_CHIP_CLOVERVIEW), \
- DECLARE_INTEL_MID_OPS_INIT(tangier, INTEL_MID_CPU_CHIP_TANGIER) \
-};
-
#ifdef CONFIG_X86_INTEL_MID
static inline enum intel_mid_cpu_type intel_mid_identify_cpu(void)
@@ -136,20 +107,6 @@ enum intel_mid_timer_options {
extern enum intel_mid_timer_options intel_mid_timer_options;
-/*
- * Penwell uses spread spectrum clock, so the freq number is not exactly
- * the same as reported by MSR based on SDM.
- */
-#define FSB_FREQ_83SKU 83200
-#define FSB_FREQ_100SKU 99840
-#define FSB_FREQ_133SKU 133000
-
-#define FSB_FREQ_167SKU 167000
-#define FSB_FREQ_200SKU 200000
-#define FSB_FREQ_267SKU 267000
-#define FSB_FREQ_333SKU 333000
-#define FSB_FREQ_400SKU 400000
-
/* Bus Select SoC Fuse value */
#define BSEL_SOC_FUSE_MASK 0x7
/* FSB 133MHz */
diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h
index 62a9f4966b42..ae26df1c2789 100644
--- a/arch/x86/include/asm/intel_ds.h
+++ b/arch/x86/include/asm/intel_ds.h
@@ -8,6 +8,7 @@
/* The maximal number of PEBS events: */
#define MAX_PEBS_EVENTS 8
+#define MAX_FIXED_PEBS_EVENTS 3
/*
* A debug store configuration.
@@ -23,7 +24,7 @@ struct debug_store {
u64 pebs_index;
u64 pebs_absolute_maximum;
u64 pebs_interrupt_threshold;
- u64 pebs_event_reset[MAX_PEBS_EVENTS];
+ u64 pebs_event_reset[MAX_PEBS_EVENTS + MAX_FIXED_PEBS_EVENTS];
} __aligned(PAGE_SIZE);
DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index c4fc17220df9..c14f2a74b2be 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -13,6 +13,8 @@
* Interrupt control:
*/
+/* Declaration required for gcc < 4.9 to prevent -Werror=missing-prototypes */
+extern inline unsigned long native_save_fl(void);
extern inline unsigned long native_save_fl(void)
{
unsigned long flags;
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 367d99cff426..c8cec1b39b88 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -78,7 +78,7 @@ struct arch_specific_insn {
* boostable = true: This instruction has been boosted: we have
* added a relative jump after the instruction copy in insn,
* so no single-step and fixup are needed (unless there's
- * a post_handler or break_handler).
+ * a post_handler).
*/
bool boostable;
bool if_modifier;
@@ -111,9 +111,6 @@ struct kprobe_ctlblk {
unsigned long kprobe_status;
unsigned long kprobe_old_flags;
unsigned long kprobe_saved_flags;
- unsigned long *jprobe_saved_sp;
- struct pt_regs jprobe_saved_regs;
- kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE];
struct prev_kprobe prev_kprobe;
};
diff --git a/arch/x86/include/asm/kvm_guest.h b/arch/x86/include/asm/kvm_guest.h
deleted file mode 100644
index 46185263d9c2..000000000000
--- a/arch/x86/include/asm/kvm_guest.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_KVM_GUEST_H
-#define _ASM_X86_KVM_GUEST_H
-
-int kvm_setup_vsyscall_timeinfo(void);
-
-#endif /* _ASM_X86_KVM_GUEST_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c13cd28d9d1b..acebb808c4b5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -17,6 +17,7 @@
#include <linux/tracepoint.h>
#include <linux/cpumask.h>
#include <linux/irq_work.h>
+#include <linux/irq.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
@@ -713,6 +714,9 @@ struct kvm_vcpu_arch {
/* be preempted when it's in kernel-mode(cpl=0) */
bool preempted_in_kernel;
+
+ /* Flush the L1 Data cache for L1TF mitigation on VMENTER */
+ bool l1tf_flush_l1d;
};
struct kvm_lpage_info {
@@ -881,6 +885,7 @@ struct kvm_vcpu_stat {
u64 signal_exits;
u64 irq_window_exits;
u64 nmi_window_exits;
+ u64 l1d_flush;
u64 halt_exits;
u64 halt_successful_poll;
u64 halt_attempted_poll;
@@ -1413,6 +1418,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
+u64 kvm_get_arch_capabilities(void);
void kvm_define_shared_msr(unsigned index, u32 msr);
int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 3aea2658323a..4c723632c036 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -7,7 +7,6 @@
#include <uapi/asm/kvm_para.h>
extern void kvmclock_init(void);
-extern int kvm_register_clock(char *txt);
#ifdef CONFIG_KVM_GUEST
bool kvm_check_and_clear_guest_paused(void);
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index bbc796eb0a3b..eeeb9289c764 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -71,12 +71,7 @@ struct ldt_struct {
static inline void *ldt_slot_va(int slot)
{
-#ifdef CONFIG_X86_64
return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
-#else
- BUG();
- return (void *)fix_to_virt(FIX_HOLE);
-#endif
}
/*
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 5a7375ed5f7c..19886fef1dfc 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -194,6 +194,40 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
return hv_status;
}
+/* Fast hypercall with 16 bytes of input */
+static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2)
+{
+ u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT;
+
+#ifdef CONFIG_X86_64
+ {
+ __asm__ __volatile__("mov %4, %%r8\n"
+ CALL_NOSPEC
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+ "+c" (control), "+d" (input1)
+ : "r" (input2),
+ THUNK_TARGET(hv_hypercall_pg)
+ : "cc", "r8", "r9", "r10", "r11");
+ }
+#else
+ {
+ u32 input1_hi = upper_32_bits(input1);
+ u32 input1_lo = lower_32_bits(input1);
+ u32 input2_hi = upper_32_bits(input2);
+ u32 input2_lo = lower_32_bits(input2);
+
+ __asm__ __volatile__ (CALL_NOSPEC
+ : "=A"(hv_status),
+ "+c"(input1_lo), ASM_CALL_CONSTRAINT
+ : "A" (control), "b" (input1_hi),
+ "D"(input2_hi), "S"(input2_lo),
+ THUNK_TARGET(hv_hypercall_pg)
+ : "cc");
+ }
+#endif
+ return hv_status;
+}
+
/*
* Rep hypercalls. Callers of this functions are supposed to ensure that
* rep_count and varhead_size comply with Hyper-V hypercall definition.
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 68b2c3150de1..4731f0cf97c5 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -70,12 +70,19 @@
#define MSR_IA32_ARCH_CAPABILITIES 0x0000010a
#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */
#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */
+#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH (1 << 3) /* Skip L1D flush on vmentry */
#define ARCH_CAP_SSB_NO (1 << 4) /*
* Not susceptible to Speculative Store Bypass
* attack, so no Speculative Store Bypass
* control required.
*/
+#define MSR_IA32_FLUSH_CMD 0x0000010b
+#define L1D_FLUSH (1 << 0) /*
+ * Writeback and invalidate the
+ * L1 data cache.
+ */
+
#define MSR_IA32_BBL_CR_CTL 0x00000119
#define MSR_IA32_BBL_CR_CTL3 0x0000011e
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index f6f6c63da62f..fd2a8c1b88bc 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -214,7 +214,7 @@ enum spectre_v2_mitigation {
SPECTRE_V2_RETPOLINE_MINIMAL_AMD,
SPECTRE_V2_RETPOLINE_GENERIC,
SPECTRE_V2_RETPOLINE_AMD,
- SPECTRE_V2_IBRS,
+ SPECTRE_V2_IBRS_ENHANCED,
};
/* The Speculative Store Bypass disable variants */
diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h
index 9c9dc579bd7d..46f516dd80ce 100644
--- a/arch/x86/include/asm/orc_types.h
+++ b/arch/x86/include/asm/orc_types.h
@@ -88,6 +88,7 @@ struct orc_entry {
unsigned sp_reg:4;
unsigned bp_reg:4;
unsigned type:2;
+ unsigned end:1;
} __packed;
/*
@@ -101,6 +102,7 @@ struct unwind_hint {
s16 sp_offset;
u8 sp_reg;
u8 type;
+ u8 end;
};
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index aa30c3241ea7..0d5c739eebd7 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -29,8 +29,13 @@
#define N_EXCEPTION_STACKS 1
#ifdef CONFIG_X86_PAE
-/* 44=32+12, the limit we can fit into an unsigned long pfn */
-#define __PHYSICAL_MASK_SHIFT 44
+/*
+ * This is beyond the 44 bit limit imposed by the 32bit long pfns,
+ * but we need the full mask to make sure inverted PROT_NONE
+ * entries have all the host bits set in a guest.
+ * The real limit is still 44 bits.
+ */
+#define __PHYSICAL_MASK_SHIFT 52
#define __VIRTUAL_MASK_SHIFT 32
#else /* !CONFIG_X86_PAE */
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index a06b07399d17..e9202a0de8f0 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -450,9 +450,10 @@ do { \
bool __ret; \
typeof(pcp1) __o1 = (o1), __n1 = (n1); \
typeof(pcp2) __o2 = (o2), __n2 = (n2); \
- asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t" \
- : "=a" (__ret), "+m" (pcp1), "+m" (pcp2), "+d" (__o2) \
- : "b" (__n1), "c" (__n2), "a" (__o1)); \
+ asm volatile("cmpxchg8b "__percpu_arg(1) \
+ CC_SET(z) \
+ : CC_OUT(z) (__ret), "+m" (pcp1), "+m" (pcp2), "+a" (__o1), "+d" (__o2) \
+ : "b" (__n1), "c" (__n2)); \
__ret; \
})
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 685ffe8a0eaf..24c6cf5f16b7 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -19,6 +19,9 @@ static inline void native_set_pte(pte_t *ptep , pte_t pte)
static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ pmd.pud.p4d.pgd = pti_set_user_pgtbl(&pmdp->pud.p4d.pgd, pmd.pud.p4d.pgd);
+#endif
*pmdp = pmd;
}
@@ -58,6 +61,9 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
#ifdef CONFIG_SMP
static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ pti_set_user_pgtbl(&xp->pud.p4d.pgd, __pgd(0));
+#endif
return __pmd(xchg((pmdval_t *)xp, 0));
}
#else
@@ -67,6 +73,9 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
#ifdef CONFIG_SMP
static inline pud_t native_pudp_get_and_clear(pud_t *xp)
{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ pti_set_user_pgtbl(&xp->p4d.pgd, __pgd(0));
+#endif
return __pud(xchg((pudval_t *)xp, 0));
}
#else
@@ -95,4 +104,21 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi
#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low })
#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
+/* No inverted PFNs on 2 level page tables */
+
+static inline u64 protnone_mask(u64 val)
+{
+ return 0;
+}
+
+static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
+{
+ return val;
+}
+
+static inline bool __pte_needs_invert(u64 val)
+{
+ return false;
+}
+
#endif /* _ASM_X86_PGTABLE_2LEVEL_H */
diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h
index f982ef808e7e..6deb6cd236e3 100644
--- a/arch/x86/include/asm/pgtable-2level_types.h
+++ b/arch/x86/include/asm/pgtable-2level_types.h
@@ -35,4 +35,7 @@ typedef union {
#define PTRS_PER_PTE 1024
+/* This covers all VMSPLIT_* and VMSPLIT_*_OPT variants */
+#define PGD_KERNEL_START (CONFIG_PAGE_OFFSET >> PGDIR_SHIFT)
+
#endif /* _ASM_X86_PGTABLE_2LEVEL_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index f24df59c40b2..a564084c6141 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -98,6 +98,9 @@ static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
static inline void native_set_pud(pud_t *pudp, pud_t pud)
{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ pud.p4d.pgd = pti_set_user_pgtbl(&pudp->p4d.pgd, pud.p4d.pgd);
+#endif
set_64bit((unsigned long long *)(pudp), native_pud_val(pud));
}
@@ -229,6 +232,10 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
{
union split_pud res, *orig = (union split_pud *)pudp;
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ pti_set_user_pgtbl(&pudp->p4d.pgd, __pgd(0));
+#endif
+
/* xchg acts as a barrier before setting of the high bits */
res.pud_low = xchg(&orig->pud_low, 0);
res.pud_high = orig->pud_high;
@@ -241,12 +248,43 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
#endif
/* Encode and de-code a swap entry */
+#define SWP_TYPE_BITS 5
+
+#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
+
+/* We always extract/encode the offset by shifting it all the way up, and then down again */
+#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS)
+
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
#define __swp_type(x) (((x).val) & 0x1f)
#define __swp_offset(x) ((x).val >> 5)
#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
-#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
-#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
+
+/*
+ * Normally, __swp_entry() converts from arch-independent swp_entry_t to
+ * arch-dependent swp_entry_t, and __swp_entry_to_pte() just stores the result
+ * to pte. But here we have 32bit swp_entry_t and 64bit pte, and need to use the
+ * whole 64 bits. Thus, we shift the "real" arch-dependent conversion to
+ * __swp_entry_to_pte() through the following helper macro based on 64bit
+ * __swp_entry().
+ */
+#define __swp_pteval_entry(type, offset) ((pteval_t) { \
+ (~(pteval_t)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
+ | ((pteval_t)(type) << (64 - SWP_TYPE_BITS)) })
+
+#define __swp_entry_to_pte(x) ((pte_t){ .pte = \
+ __swp_pteval_entry(__swp_type(x), __swp_offset(x)) })
+/*
+ * Analogically, __pte_to_swp_entry() doesn't just extract the arch-dependent
+ * swp_entry_t, but also has to convert it from 64bit to the 32bit
+ * intermediate representation, using the following macros based on 64bit
+ * __swp_type() and __swp_offset().
+ */
+#define __pteval_swp_type(x) ((unsigned long)((x).pte >> (64 - SWP_TYPE_BITS)))
+#define __pteval_swp_offset(x) ((unsigned long)(~((x).pte) << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT))
+
+#define __pte_to_swp_entry(pte) (__swp_entry(__pteval_swp_type(pte), \
+ __pteval_swp_offset(pte)))
#define gup_get_pte gup_get_pte
/*
@@ -295,4 +333,6 @@ static inline pte_t gup_get_pte(pte_t *ptep)
return pte;
}
+#include <asm/pgtable-invert.h>
+
#endif /* _ASM_X86_PGTABLE_3LEVEL_H */
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
index 6a59a6d0cc50..858358a82b14 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -21,9 +21,10 @@ typedef union {
#endif /* !__ASSEMBLY__ */
#ifdef CONFIG_PARAVIRT
-#define SHARED_KERNEL_PMD (pv_info.shared_kernel_pmd)
+#define SHARED_KERNEL_PMD ((!static_cpu_has(X86_FEATURE_PTI) && \
+ (pv_info.shared_kernel_pmd)))
#else
-#define SHARED_KERNEL_PMD 1
+#define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI))
#endif
/*
@@ -45,5 +46,6 @@ typedef union {
#define PTRS_PER_PTE 512
#define MAX_POSSIBLE_PHYSMEM_BITS 36
+#define PGD_KERNEL_START (CONFIG_PAGE_OFFSET >> PGDIR_SHIFT)
#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable-invert.h b/arch/x86/include/asm/pgtable-invert.h
new file mode 100644
index 000000000000..44b1203ece12
--- /dev/null
+++ b/arch/x86/include/asm/pgtable-invert.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_PGTABLE_INVERT_H
+#define _ASM_PGTABLE_INVERT_H 1
+
+#ifndef __ASSEMBLY__
+
+static inline bool __pte_needs_invert(u64 val)
+{
+ return !(val & _PAGE_PRESENT);
+}
+
+/* Get a mask to xor with the page table entry to get the correct pfn. */
+static inline u64 protnone_mask(u64 val)
+{
+ return __pte_needs_invert(val) ? ~0ull : 0;
+}
+
+static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
+{
+ /*
+ * When a PTE transitions from NONE to !NONE or vice-versa
+ * invert the PFN part to stop speculation.
+ * pte_pfn undoes this when needed.
+ */
+ if (__pte_needs_invert(oldval) != __pte_needs_invert(val))
+ val = (val & ~mask) | (~val & mask);
+ return val;
+}
+
+#endif /* __ASSEMBLY__ */
+
+#endif
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5715647fc4fe..e4ffa565a69f 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -30,11 +30,14 @@ int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user);
void ptdump_walk_pgd_level_checkwx(void);
+void ptdump_walk_user_pgd_level_checkwx(void);
#ifdef CONFIG_DEBUG_WX
-#define debug_checkwx() ptdump_walk_pgd_level_checkwx()
+#define debug_checkwx() ptdump_walk_pgd_level_checkwx()
+#define debug_checkwx_user() ptdump_walk_user_pgd_level_checkwx()
#else
-#define debug_checkwx() do { } while (0)
+#define debug_checkwx() do { } while (0)
+#define debug_checkwx_user() do { } while (0)
#endif
/*
@@ -185,19 +188,29 @@ static inline int pte_special(pte_t pte)
return pte_flags(pte) & _PAGE_SPECIAL;
}
+/* Entries that were set to PROT_NONE are inverted */
+
+static inline u64 protnone_mask(u64 val);
+
static inline unsigned long pte_pfn(pte_t pte)
{
- return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT;
+ phys_addr_t pfn = pte_val(pte);
+ pfn ^= protnone_mask(pfn);
+ return (pfn & PTE_PFN_MASK) >> PAGE_SHIFT;
}
static inline unsigned long pmd_pfn(pmd_t pmd)
{
- return (pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
+ phys_addr_t pfn = pmd_val(pmd);
+ pfn ^= protnone_mask(pfn);
+ return (pfn & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
}
static inline unsigned long pud_pfn(pud_t pud)
{
- return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT;
+ phys_addr_t pfn = pud_val(pud);
+ pfn ^= protnone_mask(pfn);
+ return (pfn & pud_pfn_mask(pud)) >> PAGE_SHIFT;
}
static inline unsigned long p4d_pfn(p4d_t p4d)
@@ -400,11 +413,6 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
return pmd_set_flags(pmd, _PAGE_RW);
}
-static inline pmd_t pmd_mknotpresent(pmd_t pmd)
-{
- return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE);
-}
-
static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
{
pudval_t v = native_pud_val(pud);
@@ -459,11 +467,6 @@ static inline pud_t pud_mkwrite(pud_t pud)
return pud_set_flags(pud, _PAGE_RW);
}
-static inline pud_t pud_mknotpresent(pud_t pud)
-{
- return pud_clear_flags(pud, _PAGE_PRESENT | _PAGE_PROTNONE);
-}
-
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
static inline int pte_soft_dirty(pte_t pte)
{
@@ -545,25 +548,45 @@ static inline pgprotval_t check_pgprot(pgprot_t pgprot)
static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
{
- return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) |
- check_pgprot(pgprot));
+ phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
+ pfn ^= protnone_mask(pgprot_val(pgprot));
+ pfn &= PTE_PFN_MASK;
+ return __pte(pfn | check_pgprot(pgprot));
}
static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
{
- return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) |
- check_pgprot(pgprot));
+ phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
+ pfn ^= protnone_mask(pgprot_val(pgprot));
+ pfn &= PHYSICAL_PMD_PAGE_MASK;
+ return __pmd(pfn | check_pgprot(pgprot));
}
static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot)
{
- return __pud(((phys_addr_t)page_nr << PAGE_SHIFT) |
- check_pgprot(pgprot));
+ phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
+ pfn ^= protnone_mask(pgprot_val(pgprot));
+ pfn &= PHYSICAL_PUD_PAGE_MASK;
+ return __pud(pfn | check_pgprot(pgprot));
+}
+
+static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+{
+ return pfn_pmd(pmd_pfn(pmd),
+ __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE)));
+}
+
+static inline pud_t pud_mknotpresent(pud_t pud)
+{
+ return pfn_pud(pud_pfn(pud),
+ __pgprot(pud_flags(pud) & ~(_PAGE_PRESENT|_PAGE_PROTNONE)));
}
+static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask);
+
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
- pteval_t val = pte_val(pte);
+ pteval_t val = pte_val(pte), oldval = val;
/*
* Chop off the NX bit (if present), and add the NX portion of
@@ -571,17 +594,17 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
*/
val &= _PAGE_CHG_MASK;
val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK;
-
+ val = flip_protnone_guard(oldval, val, PTE_PFN_MASK);
return __pte(val);
}
static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
- pmdval_t val = pmd_val(pmd);
+ pmdval_t val = pmd_val(pmd), oldval = val;
val &= _HPAGE_CHG_MASK;
val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK;
-
+ val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK);
return __pmd(val);
}
@@ -640,8 +663,31 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
pmd_t *populate_extra_pmd(unsigned long vaddr);
pte_t *populate_extra_pte(unsigned long vaddr);
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd);
+
+/*
+ * Take a PGD location (pgdp) and a pgd value that needs to be set there.
+ * Populates the user and returns the resulting PGD that must be set in
+ * the kernel copy of the page tables.
+ */
+static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
+{
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return pgd;
+ return __pti_set_user_pgtbl(pgdp, pgd);
+}
+#else /* CONFIG_PAGE_TABLE_ISOLATION */
+static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
+{
+ return pgd;
+}
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
#endif /* __ASSEMBLY__ */
+
#ifdef CONFIG_X86_32
# include <asm/pgtable_32.h>
#else
@@ -1154,6 +1200,70 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
}
}
#endif
+/*
+ * Page table pages are page-aligned. The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ *
+ * Returns true for parts of the PGD that map userspace and
+ * false for the parts that map the kernel.
+ */
+static inline bool pgdp_maps_userspace(void *__ptr)
+{
+ unsigned long ptr = (unsigned long)__ptr;
+
+ return (((ptr & ~PAGE_MASK) / sizeof(pgd_t)) < PGD_KERNEL_START);
+}
+
+static inline int pgd_large(pgd_t pgd) { return 0; }
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
+ * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and
+ * the user one is in the last 4k. To switch between them, you
+ * just need to flip the 12th bit in their addresses.
+ */
+#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT
+
+/*
+ * This generates better code than the inline assembly in
+ * __set_bit().
+ */
+static inline void *ptr_set_bit(void *ptr, int bit)
+{
+ unsigned long __ptr = (unsigned long)ptr;
+
+ __ptr |= BIT(bit);
+ return (void *)__ptr;
+}
+static inline void *ptr_clear_bit(void *ptr, int bit)
+{
+ unsigned long __ptr = (unsigned long)ptr;
+
+ __ptr &= ~BIT(bit);
+ return (void *)__ptr;
+}
+
+static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
+{
+ return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
+{
+ return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
+{
+ return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
+{
+ return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
+}
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
/*
* clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
@@ -1320,6 +1430,14 @@ static inline bool pud_access_permitted(pud_t pud, bool write)
return __pte_access_permitted(pud_val(pud), write);
}
+#define __HAVE_ARCH_PFN_MODIFY_ALLOWED 1
+extern bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot);
+
+static inline bool arch_has_pfn_modify_check(void)
+{
+ return boot_cpu_has_bug(X86_BUG_L1TF);
+}
+
#include <asm-generic/pgtable.h>
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 88a056b01db4..b3ec519e3982 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -34,8 +34,6 @@ static inline void check_pgt_cache(void) { }
void paging_init(void);
void sync_initial_page_table(void);
-static inline int pgd_large(pgd_t pgd) { return 0; }
-
/*
* Define this if things work differently on an i386 and an i486:
* it will (on an i486) warn about kernel memory accesses that are
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index d9a001a4a872..b0bc0fff5f1f 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -50,13 +50,18 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
((FIXADDR_TOT_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) \
& PMD_MASK)
-#define PKMAP_BASE \
+#define LDT_BASE_ADDR \
((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)
+#define LDT_END_ADDR (LDT_BASE_ADDR + PMD_SIZE)
+
+#define PKMAP_BASE \
+ ((LDT_BASE_ADDR - PAGE_SIZE) & PMD_MASK)
+
#ifdef CONFIG_HIGHMEM
# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
#else
-# define VMALLOC_END (CPU_ENTRY_AREA_BASE - 2 * PAGE_SIZE)
+# define VMALLOC_END (LDT_BASE_ADDR - 2 * PAGE_SIZE)
#endif
#define MODULES_VADDR VMALLOC_START
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 3c5385f9a88f..f773d5e6c8cc 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -132,90 +132,6 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp)
#endif
}
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
-/*
- * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
- * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and
- * the user one is in the last 4k. To switch between them, you
- * just need to flip the 12th bit in their addresses.
- */
-#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT
-
-/*
- * This generates better code than the inline assembly in
- * __set_bit().
- */
-static inline void *ptr_set_bit(void *ptr, int bit)
-{
- unsigned long __ptr = (unsigned long)ptr;
-
- __ptr |= BIT(bit);
- return (void *)__ptr;
-}
-static inline void *ptr_clear_bit(void *ptr, int bit)
-{
- unsigned long __ptr = (unsigned long)ptr;
-
- __ptr &= ~BIT(bit);
- return (void *)__ptr;
-}
-
-static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
-{
- return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
-}
-
-static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
-{
- return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
-}
-
-static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
-{
- return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
-}
-
-static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
-{
- return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
-}
-#endif /* CONFIG_PAGE_TABLE_ISOLATION */
-
-/*
- * Page table pages are page-aligned. The lower half of the top
- * level is used for userspace and the top half for the kernel.
- *
- * Returns true for parts of the PGD that map userspace and
- * false for the parts that map the kernel.
- */
-static inline bool pgdp_maps_userspace(void *__ptr)
-{
- unsigned long ptr = (unsigned long)__ptr;
-
- return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
-}
-
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
-pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd);
-
-/*
- * Take a PGD location (pgdp) and a pgd value that needs to be set there.
- * Populates the user and returns the resulting PGD that must be set in
- * the kernel copy of the page tables.
- */
-static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
-{
- if (!static_cpu_has(X86_FEATURE_PTI))
- return pgd;
- return __pti_set_user_pgd(pgdp, pgd);
-}
-#else
-static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
-{
- return pgd;
-}
-#endif
-
static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
{
pgd_t pgd;
@@ -226,7 +142,7 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
}
pgd = native_make_pgd(native_p4d_val(p4d));
- pgd = pti_set_user_pgd((pgd_t *)p4dp, pgd);
+ pgd = pti_set_user_pgtbl((pgd_t *)p4dp, pgd);
*p4dp = native_make_p4d(native_pgd_val(pgd));
}
@@ -237,7 +153,7 @@ static inline void native_p4d_clear(p4d_t *p4d)
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
- *pgdp = pti_set_user_pgd(pgdp, pgd);
+ *pgdp = pti_set_user_pgtbl(pgdp, pgd);
}
static inline void native_pgd_clear(pgd_t *pgd)
@@ -255,7 +171,6 @@ extern void sync_global_pgds(unsigned long start, unsigned long end);
/*
* Level 4 access.
*/
-static inline int pgd_large(pgd_t pgd) { return 0; }
#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
/* PUD - Level3 access */
@@ -273,7 +188,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
*
* | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number
* | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
- * | OFFSET (14->63) | TYPE (9-13) |0|0|X|X| X| X|X|SD|0| <- swp entry
+ * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|X|SD|0| <- swp entry
*
* G (8) is aliased and used as a PROT_NONE indicator for
* !present ptes. We need to start storing swap entries above
@@ -286,20 +201,34 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
*
* Bit 7 in swp entry should be 0 because pmd_present checks not only P,
* but also L and G.
+ *
+ * The offset is inverted by a binary not operation to make the high
+ * physical bits set.
*/
-#define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
-#define SWP_TYPE_BITS 5
-/* Place the offset above the type: */
-#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS)
+#define SWP_TYPE_BITS 5
+
+#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
+
+/* We always extract/encode the offset by shifting it all the way up, and then down again */
+#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT+SWP_TYPE_BITS)
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
-#define __swp_type(x) (((x).val >> (SWP_TYPE_FIRST_BIT)) \
- & ((1U << SWP_TYPE_BITS) - 1))
-#define __swp_offset(x) ((x).val >> SWP_OFFSET_FIRST_BIT)
-#define __swp_entry(type, offset) ((swp_entry_t) { \
- ((type) << (SWP_TYPE_FIRST_BIT)) \
- | ((offset) << SWP_OFFSET_FIRST_BIT) })
+/* Extract the high bits for type */
+#define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS))
+
+/* Shift up (to get rid of type), then down to get value */
+#define __swp_offset(x) (~(x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT)
+
+/*
+ * Shift the offset up "too far" by TYPE bits, then down again
+ * The offset is inverted by a binary not operation to make the high
+ * physical bits set.
+ */
+#define __swp_entry(type, offset) ((swp_entry_t) { \
+ (~(unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
+ | ((unsigned long)(type) << (64-SWP_TYPE_BITS)) })
+
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })
#define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val((pmd)) })
#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
@@ -343,5 +272,7 @@ static inline bool gup_fast_permitted(unsigned long start, int nr_pages,
return true;
}
+#include <asm/pgtable-invert.h>
+
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 054765ab2da2..04edd2d58211 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -115,6 +115,7 @@ extern unsigned int ptrs_per_p4d;
#define LDT_PGD_ENTRY_L5 -112UL
#define LDT_PGD_ENTRY (pgtable_l5_enabled() ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4)
#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
+#define LDT_END_ADDR (LDT_BASE_ADDR + PGDIR_SIZE)
#define __VMALLOC_BASE_L4 0xffffc90000000000UL
#define __VMALLOC_BASE_L5 0xffa0000000000000UL
@@ -153,4 +154,6 @@ extern unsigned int ptrs_per_p4d;
#define EARLY_DYNAMIC_PAGE_TABLES 64
+#define PGD_KERNEL_START ((PAGE_SIZE / 2) / sizeof(pgd_t))
+
#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 99fff853c944..b64acb08a62b 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -50,6 +50,7 @@
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
+#define _PAGE_SOFTW3 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW3)
#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
@@ -266,14 +267,37 @@ typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
typedef struct { pgdval_t pgd; } pgd_t;
+#ifdef CONFIG_X86_PAE
+
+/*
+ * PHYSICAL_PAGE_MASK might be non-constant when SME is compiled in, so we can't
+ * use it here.
+ */
+
+#define PGD_PAE_PAGE_MASK ((signed long)PAGE_MASK)
+#define PGD_PAE_PHYS_MASK (((1ULL << __PHYSICAL_MASK_SHIFT)-1) & PGD_PAE_PAGE_MASK)
+
+/*
+ * PAE allows Base Address, P, PWT, PCD and AVL bits to be set in PGD entries.
+ * All other bits are Reserved MBZ
+ */
+#define PGD_ALLOWED_BITS (PGD_PAE_PHYS_MASK | _PAGE_PRESENT | \
+ _PAGE_PWT | _PAGE_PCD | \
+ _PAGE_SOFTW1 | _PAGE_SOFTW2 | _PAGE_SOFTW3)
+
+#else
+/* No need to mask any bits for !PAE */
+#define PGD_ALLOWED_BITS (~0ULL)
+#endif
+
static inline pgd_t native_make_pgd(pgdval_t val)
{
- return (pgd_t) { val };
+ return (pgd_t) { val & PGD_ALLOWED_BITS };
}
static inline pgdval_t native_pgd_val(pgd_t pgd)
{
- return pgd.pgd;
+ return pgd.pgd & PGD_ALLOWED_BITS;
}
static inline pgdval_t pgd_flags(pgd_t pgd)
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 625a52a5594f..02c2cbda4a74 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -39,10 +39,6 @@
#define CR3_PCID_MASK 0xFFFull
#define CR3_NOFLUSH BIT_ULL(63)
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
-# define X86_CR3_PTI_PCID_USER_BIT 11
-#endif
-
#else
/*
* CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
@@ -53,4 +49,8 @@
#define CR3_NOFLUSH 0
#endif
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define X86_CR3_PTI_PCID_USER_BIT 11
+#endif
+
#endif /* _ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index cfd29ee8c3da..682286aca881 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -181,6 +181,11 @@ extern const struct seq_operations cpuinfo_op;
extern void cpu_detect(struct cpuinfo_x86 *c);
+static inline unsigned long l1tf_pfn_limit(void)
+{
+ return BIT(boot_cpu_data.x86_phys_bits - 1 - PAGE_SHIFT) - 1;
+}
+
extern void early_cpu_init(void);
extern void identify_boot_cpu(void);
extern void identify_secondary_cpu(struct cpuinfo_x86 *);
@@ -966,6 +971,7 @@ static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
extern unsigned long arch_align_stack(unsigned long sp);
extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
+extern void free_kernel_image_pages(void *begin, void *end);
void default_idle(void);
#ifdef CONFIG_XEN
@@ -977,4 +983,16 @@ bool xen_set_default_idle(void);
void stop_this_cpu(void *dummy);
void df_debug(struct pt_regs *regs, long error_code);
void microcode_check(void);
+
+enum l1tf_mitigations {
+ L1TF_MITIGATION_OFF,
+ L1TF_MITIGATION_FLUSH_NOWARN,
+ L1TF_MITIGATION_FLUSH,
+ L1TF_MITIGATION_FLUSH_NOSMT,
+ L1TF_MITIGATION_FULL,
+ L1TF_MITIGATION_FULL_FORCE
+};
+
+extern enum l1tf_mitigations l1tf_mitigation;
+
#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h
index 38a17f1d5c9d..5df09a0b80b8 100644
--- a/arch/x86/include/asm/pti.h
+++ b/arch/x86/include/asm/pti.h
@@ -6,10 +6,9 @@
#ifdef CONFIG_PAGE_TABLE_ISOLATION
extern void pti_init(void);
extern void pti_check_boottime_disable(void);
-extern void pti_clone_kernel_text(void);
+extern void pti_finalize(void);
#else
static inline void pti_check_boottime_disable(void) { }
-static inline void pti_clone_kernel_text(void) { }
#endif
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index 9ef5ee03d2d7..159622ee0674 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -43,7 +43,7 @@ asm (".pushsection .text;"
"push %rdx;"
"mov $0x1,%eax;"
"xor %edx,%edx;"
- "lock cmpxchg %dl,(%rdi);"
+ LOCK_PREFIX "cmpxchg %dl,(%rdi);"
"cmp $0x1,%al;"
"jne .slowpath;"
"pop %rdx;"
diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h
index 4cf11d88d3b3..19b90521954c 100644
--- a/arch/x86/include/asm/refcount.h
+++ b/arch/x86/include/asm/refcount.h
@@ -5,6 +5,7 @@
* PaX/grsecurity.
*/
#include <linux/refcount.h>
+#include <asm/bug.h>
/*
* This is the first portion of the refcount error handling, which lives in
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 5c019d23d06b..4a911a382ade 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -7,6 +7,7 @@
extern char __brk_base[], __brk_limit[];
extern struct exception_table_entry __stop___ex_table[];
+extern char __end_rodata_aligned[];
#if defined(CONFIG_X86_64)
extern char __end_rodata_hpage_align[];
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index bd090367236c..34cffcef7375 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -46,6 +46,7 @@ int set_memory_np(unsigned long addr, int numpages);
int set_memory_4k(unsigned long addr, int numpages);
int set_memory_encrypted(unsigned long addr, int numpages);
int set_memory_decrypted(unsigned long addr, int numpages);
+int set_memory_np_noalias(unsigned long addr, int numpages);
int set_memory_array_uc(unsigned long *addr, int addrinarray);
int set_memory_array_wc(unsigned long *addr, int addrinarray);
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index eb5f7999a893..36bd243843d6 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -87,15 +87,25 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread)
#endif
/* This is used when switching tasks or entering/exiting vm86 mode. */
-static inline void update_sp0(struct task_struct *task)
+static inline void update_task_stack(struct task_struct *task)
{
- /* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */
+ /* sp0 always points to the entry trampoline stack, which is constant: */
#ifdef CONFIG_X86_32
- load_sp0(task->thread.sp0);
+ if (static_cpu_has(X86_FEATURE_XENPV))
+ load_sp0(task->thread.sp0);
+ else
+ this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
#else
+ /*
+ * x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That
+ * doesn't work on x86-32 because sp1 and
+ * cpu_current_top_of_stack have different values (because of
+ * the non-zero stack-padding on 32bit).
+ */
if (static_cpu_has(X86_FEATURE_XENPV))
load_sp0(task_top_of_stack(task));
#endif
+
}
#endif /* _ASM_X86_SWITCH_TO_H */
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index 2ecd34e2d46c..e85ff65c43c3 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -37,5 +37,6 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len);
extern void *text_poke(void *addr, const void *opcode, size_t len);
extern int poke_int3_handler(struct pt_regs *regs);
extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
+extern int after_bootmem;
#endif /* _ASM_X86_TEXT_PATCHING_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6690cd3fc8b1..511bf5fae8b8 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
#endif
-static inline bool tlb_defer_switch_to_init_mm(void)
-{
- /*
- * If we have PCID, then switching to init_mm is reasonably
- * fast. If we don't have PCID, then switching to init_mm is
- * quite slow, so we try to defer it in the hopes that we can
- * avoid it entirely. The latter approach runs the risk of
- * receiving otherwise unnecessary IPIs.
- *
- * This choice is just a heuristic. The tlb code can handle this
- * function returning true or false regardless of whether we have
- * PCID.
- */
- return !static_cpu_has(X86_FEATURE_PCID);
-}
-
struct tlb_context {
u64 ctx_id;
u64 tlb_gen;
@@ -554,4 +538,9 @@ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
native_flush_tlb_others(mask, info)
#endif
+extern void tlb_flush_remove_tables(struct mm_struct *mm);
+extern void tlb_flush_remove_tables_local(void *arg);
+
+#define HAVE_TLB_FLUSH_REMOVE_TABLES
+
#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index c1d2a9892352..453cf38a1c33 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -123,13 +123,17 @@ static inline int topology_max_smt_threads(void)
}
int topology_update_package_map(unsigned int apicid, unsigned int cpu);
-extern int topology_phys_to_logical_pkg(unsigned int pkg);
+int topology_phys_to_logical_pkg(unsigned int pkg);
+bool topology_is_primary_thread(unsigned int cpu);
+bool topology_smt_supported(void);
#else
#define topology_max_packages() (1)
static inline int
topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; }
static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; }
static inline int topology_max_smt_threads(void) { return 1; }
+static inline bool topology_is_primary_thread(unsigned int cpu) { return true; }
+static inline bool topology_smt_supported(void) { return false; }
#endif
static inline void arch_fix_phys_package_id(int num, u32 slot)
diff --git a/arch/x86/include/asm/trace/hyperv.h b/arch/x86/include/asm/trace/hyperv.h
index 4253bca99989..9c0d4b588e3f 100644
--- a/arch/x86/include/asm/trace/hyperv.h
+++ b/arch/x86/include/asm/trace/hyperv.h
@@ -28,6 +28,21 @@ TRACE_EVENT(hyperv_mmu_flush_tlb_others,
__entry->addr, __entry->end)
);
+TRACE_EVENT(hyperv_send_ipi_mask,
+ TP_PROTO(const struct cpumask *cpus,
+ int vector),
+ TP_ARGS(cpus, vector),
+ TP_STRUCT__entry(
+ __field(unsigned int, ncpus)
+ __field(int, vector)
+ ),
+ TP_fast_assign(__entry->ncpus = cpumask_weight(cpus);
+ __entry->vector = vector;
+ ),
+ TP_printk("ncpus %d vector %x",
+ __entry->ncpus, __entry->vector)
+ );
+
#endif /* CONFIG_HYPERV */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 2701d221583a..eb5bbfeccb66 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -33,13 +33,13 @@ static inline cycles_t get_cycles(void)
extern struct system_counterval_t convert_art_to_tsc(u64 art);
extern struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns);
-extern void tsc_early_delay_calibrate(void);
+extern void tsc_early_init(void);
extern void tsc_init(void);
extern void mark_tsc_unstable(char *reason);
extern int unsynchronized_tsc(void);
extern int check_tsc_unstable(void);
extern void mark_tsc_async_resets(char *reason);
-extern unsigned long native_calibrate_cpu(void);
+extern unsigned long native_calibrate_cpu_early(void);
extern unsigned long native_calibrate_tsc(void);
extern unsigned long long native_sched_clock_from_tsc(u64 tsc);
diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
index bae46fc6b9de..0bcdb1279361 100644
--- a/arch/x86/include/asm/unwind_hints.h
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -26,7 +26,7 @@
* the debuginfo as necessary. It will also warn if it sees any
* inconsistencies.
*/
-.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL
+.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL end=0
#ifdef CONFIG_STACK_VALIDATION
.Lunwind_hint_ip_\@:
.pushsection .discard.unwind_hints
@@ -35,12 +35,14 @@
.short \sp_offset
.byte \sp_reg
.byte \type
+ .byte \end
+ .balign 4
.popsection
#endif
.endm
.macro UNWIND_HINT_EMPTY
- UNWIND_HINT sp_reg=ORC_REG_UNDEFINED
+ UNWIND_HINT sp_reg=ORC_REG_UNDEFINED end=1
.endm
.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 iret=0
@@ -86,19 +88,21 @@
#else /* !__ASSEMBLY__ */
-#define UNWIND_HINT(sp_reg, sp_offset, type) \
+#define UNWIND_HINT(sp_reg, sp_offset, type, end) \
"987: \n\t" \
".pushsection .discard.unwind_hints\n\t" \
/* struct unwind_hint */ \
".long 987b - .\n\t" \
- ".short " __stringify(sp_offset) "\n\t" \
+ ".short " __stringify(sp_offset) "\n\t" \
".byte " __stringify(sp_reg) "\n\t" \
".byte " __stringify(type) "\n\t" \
+ ".byte " __stringify(end) "\n\t" \
+ ".balign 4 \n\t" \
".popsection\n\t"
-#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE)
+#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE, 0)
-#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE)
+#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE, 0)
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 6aa8499e1f62..95f9107449bf 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -576,4 +576,15 @@ enum vm_instruction_error_number {
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28,
};
+enum vmx_l1d_flush_state {
+ VMENTER_L1D_FLUSH_AUTO,
+ VMENTER_L1D_FLUSH_NEVER,
+ VMENTER_L1D_FLUSH_COND,
+ VMENTER_L1D_FLUSH_ALWAYS,
+ VMENTER_L1D_FLUSH_EPT_DISABLED,
+ VMENTER_L1D_FLUSH_NOT_REQUIRED,
+};
+
+extern enum vmx_l1d_flush_state l1tf_vmx_mitigation;
+
#endif
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a481763a3776..014f214da581 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -668,6 +668,7 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
local_irq_save(flags);
memcpy(addr, opcode, len);
local_irq_restore(flags);
+ sync_core();
/* Could also do a CLFLUSH here to speed up CPU recovery; but
that causes hangs on some VIA CPUs. */
return addr;
@@ -693,6 +694,12 @@ void *text_poke(void *addr, const void *opcode, size_t len)
struct page *pages[2];
int i;
+ /*
+ * While boot memory allocator is runnig we cannot use struct
+ * pages as they are not yet initialized.
+ */
+ BUG_ON(!after_bootmem);
+
if (!core_kernel_text((unsigned long)addr)) {
pages[0] = vmalloc_to_page(addr);
pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 2aabd4cb0e3f..87ff6235bbfe 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -56,6 +56,7 @@
#include <asm/hypervisor.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
+#include <asm/irq_regs.h>
unsigned int num_processors;
@@ -573,6 +574,9 @@ static u32 skx_deadline_rev(void)
case 0x04: return 0x02000014;
}
+ if (boot_cpu_data.x86_stepping > 4)
+ return 0;
+
return ~0U;
}
@@ -937,7 +941,7 @@ static int __init calibrate_APIC_clock(void)
if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
pr_warning("APIC timer disabled due to verification failure\n");
- return -1;
+ return -1;
}
return 0;
@@ -2189,6 +2193,21 @@ static int cpuid_to_apicid[] = {
[0 ... NR_CPUS - 1] = -1,
};
+/**
+ * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread
+ * @id: APIC ID to check
+ */
+bool apic_id_is_primary_thread(unsigned int apicid)
+{
+ u32 mask;
+
+ if (smp_num_siblings == 1)
+ return true;
+ /* Isolate the SMT bit(s) in the APICID and check for 0 */
+ mask = (1U << (fls(smp_num_siblings) - 1)) - 1;
+ return !(apicid & mask);
+}
+
/*
* Should use this API to allocate logical CPU IDs to keep nr_logical_cpuids
* and cpuid_to_apicid[] synchronized.
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 3982f79d2377..ff0d14cd9e82 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -33,6 +33,7 @@
#include <linux/mm.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/sched.h>
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index ce503c99f5c4..72a94401f9e0 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -12,6 +12,7 @@
*/
#include <linux/mm.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/hpet.h>
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 35aaee4fc028..9f148e3d45b4 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -11,6 +11,7 @@
* published by the Free Software Foundation.
*/
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/compiler.h>
@@ -218,7 +219,8 @@ static int reserve_irq_vector(struct irq_data *irqd)
return 0;
}
-static int allocate_vector(struct irq_data *irqd, const struct cpumask *dest)
+static int
+assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest)
{
struct apic_chip_data *apicd = apic_chip_data(irqd);
bool resvd = apicd->has_reserved;
@@ -245,22 +247,12 @@ static int allocate_vector(struct irq_data *irqd, const struct cpumask *dest)
return -EBUSY;
vector = irq_matrix_alloc(vector_matrix, dest, resvd, &cpu);
- if (vector > 0)
- apic_update_vector(irqd, vector, cpu);
trace_vector_alloc(irqd->irq, vector, resvd, vector);
- return vector;
-}
-
-static int assign_vector_locked(struct irq_data *irqd,
- const struct cpumask *dest)
-{
- struct apic_chip_data *apicd = apic_chip_data(irqd);
- int vector = allocate_vector(irqd, dest);
-
if (vector < 0)
return vector;
+ apic_update_vector(irqd, vector, cpu);
+ apic_update_irq_cfg(irqd, vector, cpu);
- apic_update_irq_cfg(irqd, apicd->vector, apicd->cpu);
return 0;
}
@@ -433,7 +425,7 @@ static int activate_managed(struct irq_data *irqd)
pr_err("Managed startup irq %u, no vector available\n",
irqd->irq);
}
- return ret;
+ return ret;
}
static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index d492752f79e1..391f358ebb4c 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -394,10 +394,10 @@ extern int uv_hub_info_version(void)
EXPORT_SYMBOL(uv_hub_info_version);
/* Default UV memory block size is 2GB */
-static unsigned long mem_block_size = (2UL << 30);
+static unsigned long mem_block_size __initdata = (2UL << 30);
/* Kernel parameter to specify UV mem block size */
-static int parse_mem_block_size(char *ptr)
+static int __init parse_mem_block_size(char *ptr)
{
unsigned long size = memparse(ptr, NULL);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index dcb008c320fe..01de31db300d 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -103,4 +103,9 @@ void common(void) {
OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
+ DEFINE(MASK_entry_stack, (~(sizeof(struct entry_stack) - 1)));
+
+ /* Offset for sp0 and sp1 into the tss_struct */
+ OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
+ OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
}
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index a4a3be399f4b..82826f2275cc 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -46,8 +46,14 @@ void foo(void)
OFFSET(saved_context_gdt_desc, saved_context, gdt_desc);
BLANK();
- /* Offset from the sysenter stack to tss.sp0 */
- DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
+ /*
+ * Offset from the entry stack to task stack stored in TSS. Kernel entry
+ * happens on the per-cpu entry-stack, and the asm code switches to the
+ * task-stack pointer stored in x86_tss.sp1, which is a copy of
+ * task->thread.sp0 where entry code can find it.
+ */
+ DEFINE(TSS_entry2task_stack,
+ offsetof(struct cpu_entry_area, tss.x86_tss.sp1) -
offsetofend(struct cpu_entry_area, entry_stack_page.stack));
#ifdef CONFIG_STACKPROTECTOR
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index b2dcd161f514..3b9405e7ba2b 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -65,8 +65,6 @@ int main(void)
#undef ENTRY
OFFSET(TSS_ist, tss_struct, x86_tss.ist);
- OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
- OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
BLANK();
#ifdef CONFIG_STACKPROTECTOR
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 7a40196967cb..347137e80bf5 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -35,7 +35,9 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
-obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o intel_rdt_ctrlmondata.o
+obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o
+obj-$(CONFIG_INTEL_RDT) += intel_rdt_ctrlmondata.o intel_rdt_pseudo_lock.o
+CFLAGS_intel_rdt_pseudo_lock.o = -I$(src)
obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 38915fbfae73..22ab408177b2 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -232,8 +232,6 @@ static void init_amd_k7(struct cpuinfo_x86 *c)
}
}
- set_cpu_cap(c, X86_FEATURE_K7);
-
/* calling is from identify_secondary_cpu() ? */
if (!c->cpu_index)
return;
@@ -315,6 +313,13 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c)
c->cpu_core_id %= cus_per_node;
}
+
+static void amd_get_topology_early(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_TOPOEXT))
+ smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
+}
+
/*
* Fixup core topology information for
* (1) AMD multi-node processors
@@ -334,7 +339,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
node_id = ecx & 0xff;
- smp_num_siblings = ((ebx >> 8) & 0xff) + 1;
if (c->x86 == 0x15)
c->cu_id = ebx & 0xff;
@@ -613,10 +617,19 @@ clear_sev:
static void early_init_amd(struct cpuinfo_x86 *c)
{
+ u64 value;
u32 dummy;
early_init_amd_mc(c);
+#ifdef CONFIG_X86_32
+ if (c->x86 == 6)
+ set_cpu_cap(c, X86_FEATURE_K7);
+#endif
+
+ if (c->x86 >= 0xf)
+ set_cpu_cap(c, X86_FEATURE_K8);
+
rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
/*
@@ -683,6 +696,22 @@ static void early_init_amd(struct cpuinfo_x86 *c)
set_cpu_bug(c, X86_BUG_AMD_E400);
early_detect_mem_encrypt(c);
+
+ /* Re-enable TopologyExtensions if switched off by BIOS */
+ if (c->x86 == 0x15 &&
+ (c->x86_model >= 0x10 && c->x86_model <= 0x6f) &&
+ !cpu_has(c, X86_FEATURE_TOPOEXT)) {
+
+ if (msr_set_bit(0xc0011005, 54) > 0) {
+ rdmsrl(0xc0011005, value);
+ if (value & BIT_64(54)) {
+ set_cpu_cap(c, X86_FEATURE_TOPOEXT);
+ pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
+ }
+ }
+ }
+
+ amd_get_topology_early(c);
}
static void init_amd_k8(struct cpuinfo_x86 *c)
@@ -774,19 +803,6 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
{
u64 value;
- /* re-enable TopologyExtensions if switched off by BIOS */
- if ((c->x86_model >= 0x10) && (c->x86_model <= 0x6f) &&
- !cpu_has(c, X86_FEATURE_TOPOEXT)) {
-
- if (msr_set_bit(0xc0011005, 54) > 0) {
- rdmsrl(0xc0011005, value);
- if (value & BIT_64(54)) {
- set_cpu_cap(c, X86_FEATURE_TOPOEXT);
- pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
- }
- }
- }
-
/*
* The way access filter has a performance penalty on some workloads.
* Disable it on the affected CPUs.
@@ -850,22 +866,12 @@ static void init_amd(struct cpuinfo_x86 *c)
cpu_detect_cache_sizes(c);
- /* Multi core CPU? */
- if (c->extended_cpuid_level >= 0x80000008) {
- amd_detect_cmp(c);
- amd_get_topology(c);
- srat_detect_node(c);
- }
-
-#ifdef CONFIG_X86_32
- detect_ht(c);
-#endif
+ amd_detect_cmp(c);
+ amd_get_topology(c);
+ srat_detect_node(c);
init_amd_cacheinfo(c);
- if (c->x86 >= 0xf)
- set_cpu_cap(c, X86_FEATURE_K8);
-
if (cpu_has(c, X86_FEATURE_XMM2)) {
unsigned long long val;
int ret;
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 5c0ea39311fe..27830880e7a7 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -22,15 +22,18 @@
#include <asm/processor-flags.h>
#include <asm/fpu/internal.h>
#include <asm/msr.h>
+#include <asm/vmx.h>
#include <asm/paravirt.h>
#include <asm/alternative.h>
#include <asm/pgtable.h>
#include <asm/set_memory.h>
#include <asm/intel-family.h>
+#include <asm/e820/api.h>
#include <asm/hypervisor.h>
static void __init spectre_v2_select_mitigation(void);
static void __init ssb_select_mitigation(void);
+static void __init l1tf_select_mitigation(void);
/*
* Our boot-time value of the SPEC_CTRL MSR. We read it once so that any
@@ -56,6 +59,12 @@ void __init check_bugs(void)
{
identify_boot_cpu();
+ /*
+ * identify_boot_cpu() initialized SMT support information, let the
+ * core code know.
+ */
+ cpu_smt_check_topology_early();
+
if (!IS_ENABLED(CONFIG_SMP)) {
pr_info("CPU: ");
print_cpu_info(&boot_cpu_data);
@@ -82,6 +91,8 @@ void __init check_bugs(void)
*/
ssb_select_mitigation();
+ l1tf_select_mitigation();
+
#ifdef CONFIG_X86_32
/*
* Check whether we are able to run this kernel safely on SMP.
@@ -130,6 +141,7 @@ static const char *spectre_v2_strings[] = {
[SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline",
[SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline",
[SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline",
+ [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS",
};
#undef pr_fmt
@@ -313,23 +325,6 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
return cmd;
}
-/* Check for Skylake-like CPUs (for RSB handling) */
-static bool __init is_skylake_era(void)
-{
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
- boot_cpu_data.x86 == 6) {
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_SKYLAKE_MOBILE:
- case INTEL_FAM6_SKYLAKE_DESKTOP:
- case INTEL_FAM6_SKYLAKE_X:
- case INTEL_FAM6_KABYLAKE_MOBILE:
- case INTEL_FAM6_KABYLAKE_DESKTOP:
- return true;
- }
- }
- return false;
-}
-
static void __init spectre_v2_select_mitigation(void)
{
enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
@@ -349,6 +344,13 @@ static void __init spectre_v2_select_mitigation(void)
case SPECTRE_V2_CMD_FORCE:
case SPECTRE_V2_CMD_AUTO:
+ if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
+ mode = SPECTRE_V2_IBRS_ENHANCED;
+ /* Force it so VMEXIT will restore correctly */
+ x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
+ wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+ goto specv2_set_mode;
+ }
if (IS_ENABLED(CONFIG_RETPOLINE))
goto retpoline_auto;
break;
@@ -386,26 +388,20 @@ retpoline_auto:
setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
}
+specv2_set_mode:
spectre_v2_enabled = mode;
pr_info("%s\n", spectre_v2_strings[mode]);
/*
- * If neither SMEP nor PTI are available, there is a risk of
- * hitting userspace addresses in the RSB after a context switch
- * from a shallow call stack to a deeper one. To prevent this fill
- * the entire RSB, even when using IBRS.
+ * If spectre v2 protection has been enabled, unconditionally fill
+ * RSB during a context switch; this protects against two independent
+ * issues:
*
- * Skylake era CPUs have a separate issue with *underflow* of the
- * RSB, when they will predict 'ret' targets from the generic BTB.
- * The proper mitigation for this is IBRS. If IBRS is not supported
- * or deactivated in favour of retpolines the RSB fill on context
- * switch is required.
+ * - RSB underflow (and switch to BTB) on Skylake+
+ * - SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs
*/
- if ((!boot_cpu_has(X86_FEATURE_PTI) &&
- !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) {
- setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
- pr_info("Spectre v2 mitigation: Filling RSB on context switch\n");
- }
+ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
+ pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
/* Initialize Indirect Branch Prediction Barrier if supported */
if (boot_cpu_has(X86_FEATURE_IBPB)) {
@@ -415,9 +411,16 @@ retpoline_auto:
/*
* Retpoline means the kernel is safe because it has no indirect
- * branches. But firmware isn't, so use IBRS to protect that.
+ * branches. Enhanced IBRS protects firmware too, so, enable restricted
+ * speculation around firmware calls only when Enhanced IBRS isn't
+ * supported.
+ *
+ * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because
+ * the user might select retpoline on the kernel command line and if
+ * the CPU supports Enhanced IBRS, kernel might un-intentionally not
+ * enable IBRS around firmware calls.
*/
- if (boot_cpu_has(X86_FEATURE_IBRS)) {
+ if (boot_cpu_has(X86_FEATURE_IBRS) && mode != SPECTRE_V2_IBRS_ENHANCED) {
setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
pr_info("Enabling Restricted Speculation for firmware calls\n");
}
@@ -654,8 +657,121 @@ void x86_spec_ctrl_setup_ap(void)
x86_amd_ssb_disable();
}
+#undef pr_fmt
+#define pr_fmt(fmt) "L1TF: " fmt
+
+/* Default mitigation for L1TF-affected CPUs */
+enum l1tf_mitigations l1tf_mitigation __ro_after_init = L1TF_MITIGATION_FLUSH;
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+EXPORT_SYMBOL_GPL(l1tf_mitigation);
+
+enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation);
+#endif
+
+static void __init l1tf_select_mitigation(void)
+{
+ u64 half_pa;
+
+ if (!boot_cpu_has_bug(X86_BUG_L1TF))
+ return;
+
+ switch (l1tf_mitigation) {
+ case L1TF_MITIGATION_OFF:
+ case L1TF_MITIGATION_FLUSH_NOWARN:
+ case L1TF_MITIGATION_FLUSH:
+ break;
+ case L1TF_MITIGATION_FLUSH_NOSMT:
+ case L1TF_MITIGATION_FULL:
+ cpu_smt_disable(false);
+ break;
+ case L1TF_MITIGATION_FULL_FORCE:
+ cpu_smt_disable(true);
+ break;
+ }
+
+#if CONFIG_PGTABLE_LEVELS == 2
+ pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n");
+ return;
+#endif
+
+ /*
+ * This is extremely unlikely to happen because almost all
+ * systems have far more MAX_PA/2 than RAM can be fit into
+ * DIMM slots.
+ */
+ half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT;
+ if (e820__mapped_any(half_pa, ULLONG_MAX - half_pa, E820_TYPE_RAM)) {
+ pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n");
+ return;
+ }
+
+ setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV);
+}
+
+static int __init l1tf_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_L1TF))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ l1tf_mitigation = L1TF_MITIGATION_OFF;
+ else if (!strcmp(str, "flush,nowarn"))
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOWARN;
+ else if (!strcmp(str, "flush"))
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH;
+ else if (!strcmp(str, "flush,nosmt"))
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
+ else if (!strcmp(str, "full"))
+ l1tf_mitigation = L1TF_MITIGATION_FULL;
+ else if (!strcmp(str, "full,force"))
+ l1tf_mitigation = L1TF_MITIGATION_FULL_FORCE;
+
+ return 0;
+}
+early_param("l1tf", l1tf_cmdline);
+
+#undef pr_fmt
+
#ifdef CONFIG_SYSFS
+#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion"
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+static const char *l1tf_vmx_states[] = {
+ [VMENTER_L1D_FLUSH_AUTO] = "auto",
+ [VMENTER_L1D_FLUSH_NEVER] = "vulnerable",
+ [VMENTER_L1D_FLUSH_COND] = "conditional cache flushes",
+ [VMENTER_L1D_FLUSH_ALWAYS] = "cache flushes",
+ [VMENTER_L1D_FLUSH_EPT_DISABLED] = "EPT disabled",
+ [VMENTER_L1D_FLUSH_NOT_REQUIRED] = "flush not necessary"
+};
+
+static ssize_t l1tf_show_state(char *buf)
+{
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO)
+ return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
+
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED ||
+ (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER &&
+ cpu_smt_control == CPU_SMT_ENABLED))
+ return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG,
+ l1tf_vmx_states[l1tf_vmx_mitigation]);
+
+ return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG,
+ l1tf_vmx_states[l1tf_vmx_mitigation],
+ cpu_smt_control == CPU_SMT_ENABLED ? "vulnerable" : "disabled");
+}
+#else
+static ssize_t l1tf_show_state(char *buf)
+{
+ return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
+}
+#endif
+
static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
char *buf, unsigned int bug)
{
@@ -684,6 +800,10 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_SPEC_STORE_BYPASS:
return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
+ case X86_BUG_L1TF:
+ if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV))
+ return l1tf_show_state(buf);
+ break;
default:
break;
}
@@ -710,4 +830,9 @@ ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *
{
return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS);
}
+
+ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_L1TF);
+}
#endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index eb4cb3efd20e..f3234010847c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -661,33 +661,36 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c)
tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]);
}
-void detect_ht(struct cpuinfo_x86 *c)
+int detect_ht_early(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
u32 eax, ebx, ecx, edx;
- int index_msb, core_bits;
- static bool printed;
if (!cpu_has(c, X86_FEATURE_HT))
- return;
+ return -1;
if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
- goto out;
+ return -1;
if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
- return;
+ return -1;
cpuid(1, &eax, &ebx, &ecx, &edx);
smp_num_siblings = (ebx & 0xff0000) >> 16;
-
- if (smp_num_siblings == 1) {
+ if (smp_num_siblings == 1)
pr_info_once("CPU0: Hyper-Threading is disabled\n");
- goto out;
- }
+#endif
+ return 0;
+}
- if (smp_num_siblings <= 1)
- goto out;
+void detect_ht(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ int index_msb, core_bits;
+
+ if (detect_ht_early(c) < 0)
+ return;
index_msb = get_count_order(smp_num_siblings);
c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
@@ -700,15 +703,6 @@ void detect_ht(struct cpuinfo_x86 *c)
c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
((1 << core_bits) - 1);
-
-out:
- if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) {
- pr_info("CPU: Physical Processor ID: %d\n",
- c->phys_proc_id);
- pr_info("CPU: Processor Core ID: %d\n",
- c->cpu_core_id);
- printed = 1;
- }
#endif
}
@@ -987,6 +981,21 @@ static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = {
{}
};
+static const __initconst struct x86_cpu_id cpu_no_l1tf[] = {
+ /* in addition to cpu_no_speculation */
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MOOREFIELD },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GEMINI_LAKE },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM },
+ {}
+};
+
static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
{
u64 ia32_cap = 0;
@@ -1005,6 +1014,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
!cpu_has(c, X86_FEATURE_AMD_SSB_NO))
setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
+ if (ia32_cap & ARCH_CAP_IBRS_ALL)
+ setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
+
if (x86_match_cpu(cpu_no_meltdown))
return;
@@ -1013,6 +1025,29 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
return;
setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
+
+ if (x86_match_cpu(cpu_no_l1tf))
+ return;
+
+ setup_force_cpu_bug(X86_BUG_L1TF);
+}
+
+/*
+ * The NOPL instruction is supposed to exist on all CPUs of family >= 6;
+ * unfortunately, that's not true in practice because of early VIA
+ * chips and (more importantly) broken virtualizers that are not easy
+ * to detect. In the latter case it doesn't even *fail* reliably, so
+ * probing for it doesn't even work. Disable it completely on 32-bit
+ * unless we can find a reliable way to detect all the broken cases.
+ * Enable it explicitly on 64-bit for non-constant inputs of cpu_has().
+ */
+static void detect_nopl(void)
+{
+#ifdef CONFIG_X86_32
+ setup_clear_cpu_cap(X86_FEATURE_NOPL);
+#else
+ setup_force_cpu_cap(X86_FEATURE_NOPL);
+#endif
}
/*
@@ -1089,6 +1124,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
*/
if (!pgtable_l5_enabled())
setup_clear_cpu_cap(X86_FEATURE_LA57);
+
+ detect_nopl();
}
void __init early_cpu_init(void)
@@ -1124,24 +1161,6 @@ void __init early_cpu_init(void)
early_identify_cpu(&boot_cpu_data);
}
-/*
- * The NOPL instruction is supposed to exist on all CPUs of family >= 6;
- * unfortunately, that's not true in practice because of early VIA
- * chips and (more importantly) broken virtualizers that are not easy
- * to detect. In the latter case it doesn't even *fail* reliably, so
- * probing for it doesn't even work. Disable it completely on 32-bit
- * unless we can find a reliable way to detect all the broken cases.
- * Enable it explicitly on 64-bit for non-constant inputs of cpu_has().
- */
-static void detect_nopl(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_X86_32
- clear_cpu_cap(c, X86_FEATURE_NOPL);
-#else
- set_cpu_cap(c, X86_FEATURE_NOPL);
-#endif
-}
-
static void detect_null_seg_behavior(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_64
@@ -1204,8 +1223,6 @@ static void generic_identify(struct cpuinfo_x86 *c)
get_model_name(c); /* Default name */
- detect_nopl(c);
-
detect_null_seg_behavior(c);
/*
@@ -1804,11 +1821,12 @@ void cpu_init(void)
enter_lazy_tlb(&init_mm, curr);
/*
- * Initialize the TSS. Don't bother initializing sp0, as the initial
- * task never enters user mode.
+ * Initialize the TSS. sp0 points to the entry trampoline stack
+ * regardless of what task is running.
*/
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
load_TR_desc();
+ load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
load_mm_ldt(&init_mm);
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 38216f678fc3..e59c0ea82a33 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -55,7 +55,9 @@ extern void init_intel_cacheinfo(struct cpuinfo_x86 *c);
extern void init_amd_cacheinfo(struct cpuinfo_x86 *c);
extern void detect_num_cpu_cores(struct cpuinfo_x86 *c);
+extern int detect_extended_topology_early(struct cpuinfo_x86 *c);
extern int detect_extended_topology(struct cpuinfo_x86 *c);
+extern int detect_ht_early(struct cpuinfo_x86 *c);
extern void detect_ht(struct cpuinfo_x86 *c);
unsigned int aperfmperf_get_khz(int cpu);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index eb75564f2d25..401e8c133108 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -301,6 +301,13 @@ static void early_init_intel(struct cpuinfo_x86 *c)
}
check_mpx_erratum(c);
+
+ /*
+ * Get the number of SMT siblings early from the extended topology
+ * leaf, if available. Otherwise try the legacy SMT detection.
+ */
+ if (detect_extended_topology_early(c) < 0)
+ detect_ht_early(c);
}
#ifdef CONFIG_X86_32
@@ -465,14 +472,17 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
+#define x86_VMX_FEATURE_EPT_CAP_AD 0x00200000
u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
+ u32 msr_vpid_cap, msr_ept_cap;
clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
clear_cpu_cap(c, X86_FEATURE_VNMI);
clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
clear_cpu_cap(c, X86_FEATURE_EPT);
clear_cpu_cap(c, X86_FEATURE_VPID);
+ clear_cpu_cap(c, X86_FEATURE_EPT_AD);
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
msr_ctl = vmx_msr_high | vmx_msr_low;
@@ -487,8 +497,13 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
(msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
+ if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) {
set_cpu_cap(c, X86_FEATURE_EPT);
+ rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
+ msr_ept_cap, msr_vpid_cap);
+ if (msr_ept_cap & x86_VMX_FEATURE_EPT_CAP_AD)
+ set_cpu_cap(c, X86_FEATURE_EPT_AD);
+ }
if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
set_cpu_cap(c, X86_FEATURE_VPID);
}
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index ec4754f81cbd..abb71ac70443 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -859,6 +859,8 @@ static __init bool get_rdt_resources(void)
return (rdt_mon_capable || rdt_alloc_capable);
}
+static enum cpuhp_state rdt_online;
+
static int __init intel_rdt_late_init(void)
{
struct rdt_resource *r;
@@ -880,6 +882,7 @@ static int __init intel_rdt_late_init(void)
cpuhp_remove_state(state);
return ret;
}
+ rdt_online = state;
for_each_alloc_capable_rdt_resource(r)
pr_info("Intel RDT %s allocation detected\n", r->name);
@@ -891,3 +894,11 @@ static int __init intel_rdt_late_init(void)
}
late_initcall(intel_rdt_late_init);
+
+static void __exit intel_rdt_exit(void)
+{
+ cpuhp_remove_state(rdt_online);
+ rdtgroup_exit();
+}
+
+__exitcall(intel_rdt_exit);
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 39752825e376..4e588f36228f 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -81,6 +81,34 @@ enum rdt_group_type {
};
/**
+ * enum rdtgrp_mode - Mode of a RDT resource group
+ * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations
+ * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed
+ * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking
+ * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations
+ * allowed AND the allocations are Cache Pseudo-Locked
+ *
+ * The mode of a resource group enables control over the allowed overlap
+ * between allocations associated with different resource groups (classes
+ * of service). User is able to modify the mode of a resource group by
+ * writing to the "mode" resctrl file associated with the resource group.
+ *
+ * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by
+ * writing the appropriate text to the "mode" file. A resource group enters
+ * "pseudo-locked" mode after the schemata is written while the resource
+ * group is in "pseudo-locksetup" mode.
+ */
+enum rdtgrp_mode {
+ RDT_MODE_SHAREABLE = 0,
+ RDT_MODE_EXCLUSIVE,
+ RDT_MODE_PSEUDO_LOCKSETUP,
+ RDT_MODE_PSEUDO_LOCKED,
+
+ /* Must be last */
+ RDT_NUM_MODES,
+};
+
+/**
* struct mongroup - store mon group's data in resctrl fs.
* @mon_data_kn kernlfs node for the mon_data directory
* @parent: parent rdtgrp
@@ -95,6 +123,43 @@ struct mongroup {
};
/**
+ * struct pseudo_lock_region - pseudo-lock region information
+ * @r: RDT resource to which this pseudo-locked region
+ * belongs
+ * @d: RDT domain to which this pseudo-locked region
+ * belongs
+ * @cbm: bitmask of the pseudo-locked region
+ * @lock_thread_wq: waitqueue used to wait on the pseudo-locking thread
+ * completion
+ * @thread_done: variable used by waitqueue to test if pseudo-locking
+ * thread completed
+ * @cpu: core associated with the cache on which the setup code
+ * will be run
+ * @line_size: size of the cache lines
+ * @size: size of pseudo-locked region in bytes
+ * @kmem: the kernel memory associated with pseudo-locked region
+ * @minor: minor number of character device associated with this
+ * region
+ * @debugfs_dir: pointer to this region's directory in the debugfs
+ * filesystem
+ * @pm_reqs: Power management QoS requests related to this region
+ */
+struct pseudo_lock_region {
+ struct rdt_resource *r;
+ struct rdt_domain *d;
+ u32 cbm;
+ wait_queue_head_t lock_thread_wq;
+ int thread_done;
+ int cpu;
+ unsigned int line_size;
+ unsigned int size;
+ void *kmem;
+ unsigned int minor;
+ struct dentry *debugfs_dir;
+ struct list_head pm_reqs;
+};
+
+/**
* struct rdtgroup - store rdtgroup's data in resctrl file system.
* @kn: kernfs node
* @rdtgroup_list: linked list for all rdtgroups
@@ -106,16 +171,20 @@ struct mongroup {
* @type: indicates type of this rdtgroup - either
* monitor only or ctrl_mon group
* @mon: mongroup related data
+ * @mode: mode of resource group
+ * @plr: pseudo-locked region
*/
struct rdtgroup {
- struct kernfs_node *kn;
- struct list_head rdtgroup_list;
- u32 closid;
- struct cpumask cpu_mask;
- int flags;
- atomic_t waitcount;
- enum rdt_group_type type;
- struct mongroup mon;
+ struct kernfs_node *kn;
+ struct list_head rdtgroup_list;
+ u32 closid;
+ struct cpumask cpu_mask;
+ int flags;
+ atomic_t waitcount;
+ enum rdt_group_type type;
+ struct mongroup mon;
+ enum rdtgrp_mode mode;
+ struct pseudo_lock_region *plr;
};
/* rdtgroup.flags */
@@ -148,6 +217,7 @@ extern struct list_head rdt_all_groups;
extern int max_name_width, max_data_width;
int __init rdtgroup_init(void);
+void __exit rdtgroup_exit(void);
/**
* struct rftype - describe each file in the resctrl file system
@@ -216,22 +286,24 @@ struct mbm_state {
* @mbps_val: When mba_sc is enabled, this holds the bandwidth in MBps
* @new_ctrl: new ctrl value to be loaded
* @have_new_ctrl: did user provide new_ctrl for this domain
+ * @plr: pseudo-locked region (if any) associated with domain
*/
struct rdt_domain {
- struct list_head list;
- int id;
- struct cpumask cpu_mask;
- unsigned long *rmid_busy_llc;
- struct mbm_state *mbm_total;
- struct mbm_state *mbm_local;
- struct delayed_work mbm_over;
- struct delayed_work cqm_limbo;
- int mbm_work_cpu;
- int cqm_work_cpu;
- u32 *ctrl_val;
- u32 *mbps_val;
- u32 new_ctrl;
- bool have_new_ctrl;
+ struct list_head list;
+ int id;
+ struct cpumask cpu_mask;
+ unsigned long *rmid_busy_llc;
+ struct mbm_state *mbm_total;
+ struct mbm_state *mbm_local;
+ struct delayed_work mbm_over;
+ struct delayed_work cqm_limbo;
+ int mbm_work_cpu;
+ int cqm_work_cpu;
+ u32 *ctrl_val;
+ u32 *mbps_val;
+ u32 new_ctrl;
+ bool have_new_ctrl;
+ struct pseudo_lock_region *plr;
};
/**
@@ -351,7 +423,7 @@ struct rdt_resource {
struct rdt_cache cache;
struct rdt_membw membw;
const char *format_str;
- int (*parse_ctrlval) (char *buf, struct rdt_resource *r,
+ int (*parse_ctrlval) (void *data, struct rdt_resource *r,
struct rdt_domain *d);
struct list_head evt_list;
int num_rmid;
@@ -359,8 +431,8 @@ struct rdt_resource {
unsigned long fflags;
};
-int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d);
-int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d);
+int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d);
+int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d);
extern struct mutex rdtgroup_mutex;
@@ -368,7 +440,7 @@ extern struct rdt_resource rdt_resources_all[];
extern struct rdtgroup rdtgroup_default;
DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
-int __init rdtgroup_init(void);
+extern struct dentry *debugfs_resctrl;
enum {
RDT_RESOURCE_L3,
@@ -439,13 +511,32 @@ void rdt_last_cmd_printf(const char *fmt, ...);
void rdt_ctrl_update(void *arg);
struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
void rdtgroup_kn_unlock(struct kernfs_node *kn);
+int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name);
+int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
+ umode_t mask);
struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
struct list_head **pos);
ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
int rdtgroup_schemata_show(struct kernfs_open_file *of,
struct seq_file *s, void *v);
+bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
+ u32 _cbm, int closid, bool exclusive);
+unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d,
+ u32 cbm);
+enum rdtgrp_mode rdtgroup_mode_by_closid(int closid);
+int rdtgroup_tasks_assigned(struct rdtgroup *r);
+int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
+int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp);
+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, u32 _cbm);
+bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d);
+int rdt_pseudo_lock_init(void);
+void rdt_pseudo_lock_release(void);
+int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
+void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r);
+int update_domains(struct rdt_resource *r, int closid);
+void closid_free(int closid);
int alloc_rmid(void);
void free_rmid(u32 rmid);
int rdt_get_mon_l3_config(struct rdt_resource *r);
diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
index 116d57b248d3..af358ca05160 100644
--- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
+++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
@@ -64,9 +64,10 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
return true;
}
-int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d)
+int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d)
{
unsigned long data;
+ char *buf = _buf;
if (d->have_new_ctrl) {
rdt_last_cmd_printf("duplicate domain %d\n", d->id);
@@ -87,7 +88,7 @@ int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d)
* are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.).
* Additionally Haswell requires at least two bits set.
*/
-static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r)
+static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
{
unsigned long first_bit, zero_bit, val;
unsigned int cbm_len = r->cache.cbm_len;
@@ -122,22 +123,64 @@ static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r)
return true;
}
+struct rdt_cbm_parse_data {
+ struct rdtgroup *rdtgrp;
+ char *buf;
+};
+
/*
* Read one cache bit mask (hex). Check that it is valid for the current
* resource type.
*/
-int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d)
+int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d)
{
- unsigned long data;
+ struct rdt_cbm_parse_data *data = _data;
+ struct rdtgroup *rdtgrp = data->rdtgrp;
+ u32 cbm_val;
if (d->have_new_ctrl) {
rdt_last_cmd_printf("duplicate domain %d\n", d->id);
return -EINVAL;
}
- if(!cbm_validate(buf, &data, r))
+ /*
+ * Cannot set up more than one pseudo-locked region in a cache
+ * hierarchy.
+ */
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
+ rdtgroup_pseudo_locked_in_hierarchy(d)) {
+ rdt_last_cmd_printf("pseudo-locked region in hierarchy\n");
return -EINVAL;
- d->new_ctrl = data;
+ }
+
+ if (!cbm_validate(data->buf, &cbm_val, r))
+ return -EINVAL;
+
+ if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
+ rdtgrp->mode == RDT_MODE_SHAREABLE) &&
+ rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) {
+ rdt_last_cmd_printf("CBM overlaps with pseudo-locked region\n");
+ return -EINVAL;
+ }
+
+ /*
+ * The CBM may not overlap with the CBM of another closid if
+ * either is exclusive.
+ */
+ if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, true)) {
+ rdt_last_cmd_printf("overlaps with exclusive group\n");
+ return -EINVAL;
+ }
+
+ if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, false)) {
+ if (rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
+ rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ rdt_last_cmd_printf("overlaps with other group\n");
+ return -EINVAL;
+ }
+ }
+
+ d->new_ctrl = cbm_val;
d->have_new_ctrl = true;
return 0;
@@ -149,8 +192,10 @@ int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d)
* separated by ";". The "id" is in decimal, and must match one of
* the "id"s for this resource.
*/
-static int parse_line(char *line, struct rdt_resource *r)
+static int parse_line(char *line, struct rdt_resource *r,
+ struct rdtgroup *rdtgrp)
{
+ struct rdt_cbm_parse_data data;
char *dom = NULL, *id;
struct rdt_domain *d;
unsigned long dom_id;
@@ -167,15 +212,32 @@ next:
dom = strim(dom);
list_for_each_entry(d, &r->domains, list) {
if (d->id == dom_id) {
- if (r->parse_ctrlval(dom, r, d))
+ data.buf = dom;
+ data.rdtgrp = rdtgrp;
+ if (r->parse_ctrlval(&data, r, d))
return -EINVAL;
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ /*
+ * In pseudo-locking setup mode and just
+ * parsed a valid CBM that should be
+ * pseudo-locked. Only one locked region per
+ * resource group and domain so just do
+ * the required initialization for single
+ * region and return.
+ */
+ rdtgrp->plr->r = r;
+ rdtgrp->plr->d = d;
+ rdtgrp->plr->cbm = d->new_ctrl;
+ d->plr = rdtgrp->plr;
+ return 0;
+ }
goto next;
}
}
return -EINVAL;
}
-static int update_domains(struct rdt_resource *r, int closid)
+int update_domains(struct rdt_resource *r, int closid)
{
struct msr_param msr_param;
cpumask_var_t cpu_mask;
@@ -220,13 +282,14 @@ done:
return 0;
}
-static int rdtgroup_parse_resource(char *resname, char *tok, int closid)
+static int rdtgroup_parse_resource(char *resname, char *tok,
+ struct rdtgroup *rdtgrp)
{
struct rdt_resource *r;
for_each_alloc_enabled_rdt_resource(r) {
- if (!strcmp(resname, r->name) && closid < r->num_closid)
- return parse_line(tok, r);
+ if (!strcmp(resname, r->name) && rdtgrp->closid < r->num_closid)
+ return parse_line(tok, r, rdtgrp);
}
rdt_last_cmd_printf("unknown/unsupported resource name '%s'\n", resname);
return -EINVAL;
@@ -239,7 +302,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
struct rdt_domain *dom;
struct rdt_resource *r;
char *tok, *resname;
- int closid, ret = 0;
+ int ret = 0;
/* Valid input requires a trailing newline */
if (nbytes == 0 || buf[nbytes - 1] != '\n')
@@ -253,7 +316,15 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
}
rdt_last_cmd_clear();
- closid = rdtgrp->closid;
+ /*
+ * No changes to pseudo-locked region allowed. It has to be removed
+ * and re-created instead.
+ */
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
+ ret = -EINVAL;
+ rdt_last_cmd_puts("resource group is pseudo-locked\n");
+ goto out;
+ }
for_each_alloc_enabled_rdt_resource(r) {
list_for_each_entry(dom, &r->domains, list)
@@ -272,17 +343,27 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
ret = -EINVAL;
goto out;
}
- ret = rdtgroup_parse_resource(resname, tok, closid);
+ ret = rdtgroup_parse_resource(resname, tok, rdtgrp);
if (ret)
goto out;
}
for_each_alloc_enabled_rdt_resource(r) {
- ret = update_domains(r, closid);
+ ret = update_domains(r, rdtgrp->closid);
if (ret)
goto out;
}
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ /*
+ * If pseudo-locking fails we keep the resource group in
+ * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service
+ * active and updated for just the domain the pseudo-locked
+ * region was requested for.
+ */
+ ret = rdtgroup_pseudo_lock_create(rdtgrp);
+ }
+
out:
rdtgroup_kn_unlock(of->kn);
return ret ?: nbytes;
@@ -318,10 +399,18 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of,
rdtgrp = rdtgroup_kn_lock_live(of->kn);
if (rdtgrp) {
- closid = rdtgrp->closid;
- for_each_alloc_enabled_rdt_resource(r) {
- if (closid < r->num_closid)
- show_doms(s, r, closid);
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ for_each_alloc_enabled_rdt_resource(r)
+ seq_printf(s, "%s:uninitialized\n", r->name);
+ } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
+ seq_printf(s, "%s:%d=%x\n", rdtgrp->plr->r->name,
+ rdtgrp->plr->d->id, rdtgrp->plr->cbm);
+ } else {
+ closid = rdtgrp->closid;
+ for_each_alloc_enabled_rdt_resource(r) {
+ if (closid < r->num_closid)
+ show_doms(s, r, closid);
+ }
}
} else {
ret = -ENOENT;
diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c
new file mode 100644
index 000000000000..40f3903ae5d9
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c
@@ -0,0 +1,1522 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Resource Director Technology (RDT)
+ *
+ * Pseudo-locking support built on top of Cache Allocation Technology (CAT)
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Author: Reinette Chatre <reinette.chatre@intel.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cacheinfo.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/debugfs.h>
+#include <linux/kthread.h>
+#include <linux/mman.h>
+#include <linux/pm_qos.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include <asm/cacheflush.h>
+#include <asm/intel-family.h>
+#include <asm/intel_rdt_sched.h>
+#include <asm/perf_event.h>
+
+#include "intel_rdt.h"
+
+#define CREATE_TRACE_POINTS
+#include "intel_rdt_pseudo_lock_event.h"
+
+/*
+ * MSR_MISC_FEATURE_CONTROL register enables the modification of hardware
+ * prefetcher state. Details about this register can be found in the MSR
+ * tables for specific platforms found in Intel's SDM.
+ */
+#define MSR_MISC_FEATURE_CONTROL 0x000001a4
+
+/*
+ * The bits needed to disable hardware prefetching varies based on the
+ * platform. During initialization we will discover which bits to use.
+ */
+static u64 prefetch_disable_bits;
+
+/*
+ * Major number assigned to and shared by all devices exposing
+ * pseudo-locked regions.
+ */
+static unsigned int pseudo_lock_major;
+static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0);
+static struct class *pseudo_lock_class;
+
+/**
+ * get_prefetch_disable_bits - prefetch disable bits of supported platforms
+ *
+ * Capture the list of platforms that have been validated to support
+ * pseudo-locking. This includes testing to ensure pseudo-locked regions
+ * with low cache miss rates can be created under variety of load conditions
+ * as well as that these pseudo-locked regions can maintain their low cache
+ * miss rates under variety of load conditions for significant lengths of time.
+ *
+ * After a platform has been validated to support pseudo-locking its
+ * hardware prefetch disable bits are included here as they are documented
+ * in the SDM.
+ *
+ * When adding a platform here also add support for its cache events to
+ * measure_cycles_perf_fn()
+ *
+ * Return:
+ * If platform is supported, the bits to disable hardware prefetchers, 0
+ * if platform is not supported.
+ */
+static u64 get_prefetch_disable_bits(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ boot_cpu_data.x86 != 6)
+ return 0;
+
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_BROADWELL_X:
+ /*
+ * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
+ * as:
+ * 0 L2 Hardware Prefetcher Disable (R/W)
+ * 1 L2 Adjacent Cache Line Prefetcher Disable (R/W)
+ * 2 DCU Hardware Prefetcher Disable (R/W)
+ * 3 DCU IP Prefetcher Disable (R/W)
+ * 63:4 Reserved
+ */
+ return 0xF;
+ case INTEL_FAM6_ATOM_GOLDMONT:
+ case INTEL_FAM6_ATOM_GEMINI_LAKE:
+ /*
+ * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
+ * as:
+ * 0 L2 Hardware Prefetcher Disable (R/W)
+ * 1 Reserved
+ * 2 DCU Hardware Prefetcher Disable (R/W)
+ * 63:3 Reserved
+ */
+ return 0x5;
+ }
+
+ return 0;
+}
+
+/*
+ * Helper to write 64bit value to MSR without tracing. Used when
+ * use of the cache should be restricted and use of registers used
+ * for local variables avoided.
+ */
+static inline void pseudo_wrmsrl_notrace(unsigned int msr, u64 val)
+{
+ __wrmsr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32));
+}
+
+/**
+ * pseudo_lock_minor_get - Obtain available minor number
+ * @minor: Pointer to where new minor number will be stored
+ *
+ * A bitmask is used to track available minor numbers. Here the next free
+ * minor number is marked as unavailable and returned.
+ *
+ * Return: 0 on success, <0 on failure.
+ */
+static int pseudo_lock_minor_get(unsigned int *minor)
+{
+ unsigned long first_bit;
+
+ first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS);
+
+ if (first_bit == MINORBITS)
+ return -ENOSPC;
+
+ __clear_bit(first_bit, &pseudo_lock_minor_avail);
+ *minor = first_bit;
+
+ return 0;
+}
+
+/**
+ * pseudo_lock_minor_release - Return minor number to available
+ * @minor: The minor number made available
+ */
+static void pseudo_lock_minor_release(unsigned int minor)
+{
+ __set_bit(minor, &pseudo_lock_minor_avail);
+}
+
+/**
+ * region_find_by_minor - Locate a pseudo-lock region by inode minor number
+ * @minor: The minor number of the device representing pseudo-locked region
+ *
+ * When the character device is accessed we need to determine which
+ * pseudo-locked region it belongs to. This is done by matching the minor
+ * number of the device to the pseudo-locked region it belongs.
+ *
+ * Minor numbers are assigned at the time a pseudo-locked region is associated
+ * with a cache instance.
+ *
+ * Return: On success return pointer to resource group owning the pseudo-locked
+ * region, NULL on failure.
+ */
+static struct rdtgroup *region_find_by_minor(unsigned int minor)
+{
+ struct rdtgroup *rdtgrp, *rdtgrp_match = NULL;
+
+ list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
+ if (rdtgrp->plr && rdtgrp->plr->minor == minor) {
+ rdtgrp_match = rdtgrp;
+ break;
+ }
+ }
+ return rdtgrp_match;
+}
+
+/**
+ * pseudo_lock_pm_req - A power management QoS request list entry
+ * @list: Entry within the @pm_reqs list for a pseudo-locked region
+ * @req: PM QoS request
+ */
+struct pseudo_lock_pm_req {
+ struct list_head list;
+ struct dev_pm_qos_request req;
+};
+
+static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr)
+{
+ struct pseudo_lock_pm_req *pm_req, *next;
+
+ list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) {
+ dev_pm_qos_remove_request(&pm_req->req);
+ list_del(&pm_req->list);
+ kfree(pm_req);
+ }
+}
+
+/**
+ * pseudo_lock_cstates_constrain - Restrict cores from entering C6
+ *
+ * To prevent the cache from being affected by power management entering
+ * C6 has to be avoided. This is accomplished by requesting a latency
+ * requirement lower than lowest C6 exit latency of all supported
+ * platforms as found in the cpuidle state tables in the intel_idle driver.
+ * At this time it is possible to do so with a single latency requirement
+ * for all supported platforms.
+ *
+ * Since Goldmont is supported, which is affected by X86_BUG_MONITOR,
+ * the ACPI latencies need to be considered while keeping in mind that C2
+ * may be set to map to deeper sleep states. In this case the latency
+ * requirement needs to prevent entering C2 also.
+ */
+static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr)
+{
+ struct pseudo_lock_pm_req *pm_req;
+ int cpu;
+ int ret;
+
+ for_each_cpu(cpu, &plr->d->cpu_mask) {
+ pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL);
+ if (!pm_req) {
+ rdt_last_cmd_puts("fail allocating mem for PM QoS\n");
+ ret = -ENOMEM;
+ goto out_err;
+ }
+ ret = dev_pm_qos_add_request(get_cpu_device(cpu),
+ &pm_req->req,
+ DEV_PM_QOS_RESUME_LATENCY,
+ 30);
+ if (ret < 0) {
+ rdt_last_cmd_printf("fail to add latency req cpu%d\n",
+ cpu);
+ kfree(pm_req);
+ ret = -1;
+ goto out_err;
+ }
+ list_add(&pm_req->list, &plr->pm_reqs);
+ }
+
+ return 0;
+
+out_err:
+ pseudo_lock_cstates_relax(plr);
+ return ret;
+}
+
+/**
+ * pseudo_lock_region_clear - Reset pseudo-lock region data
+ * @plr: pseudo-lock region
+ *
+ * All content of the pseudo-locked region is reset - any memory allocated
+ * freed.
+ *
+ * Return: void
+ */
+static void pseudo_lock_region_clear(struct pseudo_lock_region *plr)
+{
+ plr->size = 0;
+ plr->line_size = 0;
+ kfree(plr->kmem);
+ plr->kmem = NULL;
+ plr->r = NULL;
+ if (plr->d)
+ plr->d->plr = NULL;
+ plr->d = NULL;
+ plr->cbm = 0;
+ plr->debugfs_dir = NULL;
+}
+
+/**
+ * pseudo_lock_region_init - Initialize pseudo-lock region information
+ * @plr: pseudo-lock region
+ *
+ * Called after user provided a schemata to be pseudo-locked. From the
+ * schemata the &struct pseudo_lock_region is on entry already initialized
+ * with the resource, domain, and capacity bitmask. Here the information
+ * required for pseudo-locking is deduced from this data and &struct
+ * pseudo_lock_region initialized further. This information includes:
+ * - size in bytes of the region to be pseudo-locked
+ * - cache line size to know the stride with which data needs to be accessed
+ * to be pseudo-locked
+ * - a cpu associated with the cache instance on which the pseudo-locking
+ * flow can be executed
+ *
+ * Return: 0 on success, <0 on failure. Descriptive error will be written
+ * to last_cmd_status buffer.
+ */
+static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
+{
+ struct cpu_cacheinfo *ci;
+ int ret;
+ int i;
+
+ /* Pick the first cpu we find that is associated with the cache. */
+ plr->cpu = cpumask_first(&plr->d->cpu_mask);
+
+ if (!cpu_online(plr->cpu)) {
+ rdt_last_cmd_printf("cpu %u associated with cache not online\n",
+ plr->cpu);
+ ret = -ENODEV;
+ goto out_region;
+ }
+
+ ci = get_cpu_cacheinfo(plr->cpu);
+
+ plr->size = rdtgroup_cbm_to_size(plr->r, plr->d, plr->cbm);
+
+ for (i = 0; i < ci->num_leaves; i++) {
+ if (ci->info_list[i].level == plr->r->cache_level) {
+ plr->line_size = ci->info_list[i].coherency_line_size;
+ return 0;
+ }
+ }
+
+ ret = -1;
+ rdt_last_cmd_puts("unable to determine cache line size\n");
+out_region:
+ pseudo_lock_region_clear(plr);
+ return ret;
+}
+
+/**
+ * pseudo_lock_init - Initialize a pseudo-lock region
+ * @rdtgrp: resource group to which new pseudo-locked region will belong
+ *
+ * A pseudo-locked region is associated with a resource group. When this
+ * association is created the pseudo-locked region is initialized. The
+ * details of the pseudo-locked region are not known at this time so only
+ * allocation is done and association established.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static int pseudo_lock_init(struct rdtgroup *rdtgrp)
+{
+ struct pseudo_lock_region *plr;
+
+ plr = kzalloc(sizeof(*plr), GFP_KERNEL);
+ if (!plr)
+ return -ENOMEM;
+
+ init_waitqueue_head(&plr->lock_thread_wq);
+ INIT_LIST_HEAD(&plr->pm_reqs);
+ rdtgrp->plr = plr;
+ return 0;
+}
+
+/**
+ * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked
+ * @plr: pseudo-lock region
+ *
+ * Initialize the details required to set up the pseudo-locked region and
+ * allocate the contiguous memory that will be pseudo-locked to the cache.
+ *
+ * Return: 0 on success, <0 on failure. Descriptive error will be written
+ * to last_cmd_status buffer.
+ */
+static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr)
+{
+ int ret;
+
+ ret = pseudo_lock_region_init(plr);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * We do not yet support contiguous regions larger than
+ * KMALLOC_MAX_SIZE.
+ */
+ if (plr->size > KMALLOC_MAX_SIZE) {
+ rdt_last_cmd_puts("requested region exceeds maximum size\n");
+ ret = -E2BIG;
+ goto out_region;
+ }
+
+ plr->kmem = kzalloc(plr->size, GFP_KERNEL);
+ if (!plr->kmem) {
+ rdt_last_cmd_puts("unable to allocate memory\n");
+ ret = -ENOMEM;
+ goto out_region;
+ }
+
+ ret = 0;
+ goto out;
+out_region:
+ pseudo_lock_region_clear(plr);
+out:
+ return ret;
+}
+
+/**
+ * pseudo_lock_free - Free a pseudo-locked region
+ * @rdtgrp: resource group to which pseudo-locked region belonged
+ *
+ * The pseudo-locked region's resources have already been released, or not
+ * yet created at this point. Now it can be freed and disassociated from the
+ * resource group.
+ *
+ * Return: void
+ */
+static void pseudo_lock_free(struct rdtgroup *rdtgrp)
+{
+ pseudo_lock_region_clear(rdtgrp->plr);
+ kfree(rdtgrp->plr);
+ rdtgrp->plr = NULL;
+}
+
+/**
+ * pseudo_lock_fn - Load kernel memory into cache
+ * @_rdtgrp: resource group to which pseudo-lock region belongs
+ *
+ * This is the core pseudo-locking flow.
+ *
+ * First we ensure that the kernel memory cannot be found in the cache.
+ * Then, while taking care that there will be as little interference as
+ * possible, the memory to be loaded is accessed while core is running
+ * with class of service set to the bitmask of the pseudo-locked region.
+ * After this is complete no future CAT allocations will be allowed to
+ * overlap with this bitmask.
+ *
+ * Local register variables are utilized to ensure that the memory region
+ * to be locked is the only memory access made during the critical locking
+ * loop.
+ *
+ * Return: 0. Waiter on waitqueue will be woken on completion.
+ */
+static int pseudo_lock_fn(void *_rdtgrp)
+{
+ struct rdtgroup *rdtgrp = _rdtgrp;
+ struct pseudo_lock_region *plr = rdtgrp->plr;
+ u32 rmid_p, closid_p;
+ unsigned long i;
+#ifdef CONFIG_KASAN
+ /*
+ * The registers used for local register variables are also used
+ * when KASAN is active. When KASAN is active we use a regular
+ * variable to ensure we always use a valid pointer, but the cost
+ * is that this variable will enter the cache through evicting the
+ * memory we are trying to lock into the cache. Thus expect lower
+ * pseudo-locking success rate when KASAN is active.
+ */
+ unsigned int line_size;
+ unsigned int size;
+ void *mem_r;
+#else
+ register unsigned int line_size asm("esi");
+ register unsigned int size asm("edi");
+#ifdef CONFIG_X86_64
+ register void *mem_r asm("rbx");
+#else
+ register void *mem_r asm("ebx");
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_KASAN */
+
+ /*
+ * Make sure none of the allocated memory is cached. If it is we
+ * will get a cache hit in below loop from outside of pseudo-locked
+ * region.
+ * wbinvd (as opposed to clflush/clflushopt) is required to
+ * increase likelihood that allocated cache portion will be filled
+ * with associated memory.
+ */
+ native_wbinvd();
+
+ /*
+ * Always called with interrupts enabled. By disabling interrupts
+ * ensure that we will not be preempted during this critical section.
+ */
+ local_irq_disable();
+
+ /*
+ * Call wrmsr and rdmsr as directly as possible to avoid tracing
+ * clobbering local register variables or affecting cache accesses.
+ *
+ * Disable the hardware prefetcher so that when the end of the memory
+ * being pseudo-locked is reached the hardware will not read beyond
+ * the buffer and evict pseudo-locked memory read earlier from the
+ * cache.
+ */
+ __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
+ closid_p = this_cpu_read(pqr_state.cur_closid);
+ rmid_p = this_cpu_read(pqr_state.cur_rmid);
+ mem_r = plr->kmem;
+ size = plr->size;
+ line_size = plr->line_size;
+ /*
+ * Critical section begin: start by writing the closid associated
+ * with the capacity bitmask of the cache region being
+ * pseudo-locked followed by reading of kernel memory to load it
+ * into the cache.
+ */
+ __wrmsr(IA32_PQR_ASSOC, rmid_p, rdtgrp->closid);
+ /*
+ * Cache was flushed earlier. Now access kernel memory to read it
+ * into cache region associated with just activated plr->closid.
+ * Loop over data twice:
+ * - In first loop the cache region is shared with the page walker
+ * as it populates the paging structure caches (including TLB).
+ * - In the second loop the paging structure caches are used and
+ * cache region is populated with the memory being referenced.
+ */
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ /*
+ * Add a barrier to prevent speculative execution of this
+ * loop reading beyond the end of the buffer.
+ */
+ rmb();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ }
+ for (i = 0; i < size; i += line_size) {
+ /*
+ * Add a barrier to prevent speculative execution of this
+ * loop reading beyond the end of the buffer.
+ */
+ rmb();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ }
+ /*
+ * Critical section end: restore closid with capacity bitmask that
+ * does not overlap with pseudo-locked region.
+ */
+ __wrmsr(IA32_PQR_ASSOC, rmid_p, closid_p);
+
+ /* Re-enable the hardware prefetcher(s) */
+ wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
+ local_irq_enable();
+
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
+
+/**
+ * rdtgroup_monitor_in_progress - Test if monitoring in progress
+ * @r: resource group being queried
+ *
+ * Return: 1 if monitor groups have been created for this resource
+ * group, 0 otherwise.
+ */
+static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp)
+{
+ return !list_empty(&rdtgrp->mon.crdtgrp_list);
+}
+
+/**
+ * rdtgroup_locksetup_user_restrict - Restrict user access to group
+ * @rdtgrp: resource group needing access restricted
+ *
+ * A resource group used for cache pseudo-locking cannot have cpus or tasks
+ * assigned to it. This is communicated to the user by restricting access
+ * to all the files that can be used to make such changes.
+ *
+ * Permissions restored with rdtgroup_locksetup_user_restore()
+ *
+ * Return: 0 on success, <0 on failure. If a failure occurs during the
+ * restriction of access an attempt will be made to restore permissions but
+ * the state of the mode of these files will be uncertain when a failure
+ * occurs.
+ */
+static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp)
+{
+ int ret;
+
+ ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
+ if (ret)
+ return ret;
+
+ ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
+ if (ret)
+ goto err_tasks;
+
+ ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
+ if (ret)
+ goto err_cpus;
+
+ if (rdt_mon_capable) {
+ ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups");
+ if (ret)
+ goto err_cpus_list;
+ }
+
+ ret = 0;
+ goto out;
+
+err_cpus_list:
+ rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
+err_cpus:
+ rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
+err_tasks:
+ rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
+out:
+ return ret;
+}
+
+/**
+ * rdtgroup_locksetup_user_restore - Restore user access to group
+ * @rdtgrp: resource group needing access restored
+ *
+ * Restore all file access previously removed using
+ * rdtgroup_locksetup_user_restrict()
+ *
+ * Return: 0 on success, <0 on failure. If a failure occurs during the
+ * restoration of access an attempt will be made to restrict permissions
+ * again but the state of the mode of these files will be uncertain when
+ * a failure occurs.
+ */
+static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp)
+{
+ int ret;
+
+ ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
+ if (ret)
+ return ret;
+
+ ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
+ if (ret)
+ goto err_tasks;
+
+ ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
+ if (ret)
+ goto err_cpus;
+
+ if (rdt_mon_capable) {
+ ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777);
+ if (ret)
+ goto err_cpus_list;
+ }
+
+ ret = 0;
+ goto out;
+
+err_cpus_list:
+ rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
+err_cpus:
+ rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
+err_tasks:
+ rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
+out:
+ return ret;
+}
+
+/**
+ * rdtgroup_locksetup_enter - Resource group enters locksetup mode
+ * @rdtgrp: resource group requested to enter locksetup mode
+ *
+ * A resource group enters locksetup mode to reflect that it would be used
+ * to represent a pseudo-locked region and is in the process of being set
+ * up to do so. A resource group used for a pseudo-locked region would
+ * lose the closid associated with it so we cannot allow it to have any
+ * tasks or cpus assigned nor permit tasks or cpus to be assigned in the
+ * future. Monitoring of a pseudo-locked region is not allowed either.
+ *
+ * The above and more restrictions on a pseudo-locked region are checked
+ * for and enforced before the resource group enters the locksetup mode.
+ *
+ * Returns: 0 if the resource group successfully entered locksetup mode, <0
+ * on failure. On failure the last_cmd_status buffer is updated with text to
+ * communicate details of failure to the user.
+ */
+int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
+{
+ int ret;
+
+ /*
+ * The default resource group can neither be removed nor lose the
+ * default closid associated with it.
+ */
+ if (rdtgrp == &rdtgroup_default) {
+ rdt_last_cmd_puts("cannot pseudo-lock default group\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Cache Pseudo-locking not supported when CDP is enabled.
+ *
+ * Some things to consider if you would like to enable this
+ * support (using L3 CDP as example):
+ * - When CDP is enabled two separate resources are exposed,
+ * L3DATA and L3CODE, but they are actually on the same cache.
+ * The implication for pseudo-locking is that if a
+ * pseudo-locked region is created on a domain of one
+ * resource (eg. L3CODE), then a pseudo-locked region cannot
+ * be created on that same domain of the other resource
+ * (eg. L3DATA). This is because the creation of a
+ * pseudo-locked region involves a call to wbinvd that will
+ * affect all cache allocations on particular domain.
+ * - Considering the previous, it may be possible to only
+ * expose one of the CDP resources to pseudo-locking and
+ * hide the other. For example, we could consider to only
+ * expose L3DATA and since the L3 cache is unified it is
+ * still possible to place instructions there are execute it.
+ * - If only one region is exposed to pseudo-locking we should
+ * still keep in mind that availability of a portion of cache
+ * for pseudo-locking should take into account both resources.
+ * Similarly, if a pseudo-locked region is created in one
+ * resource, the portion of cache used by it should be made
+ * unavailable to all future allocations from both resources.
+ */
+ if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled ||
+ rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) {
+ rdt_last_cmd_puts("CDP enabled\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Not knowing the bits to disable prefetching implies that this
+ * platform does not support Cache Pseudo-Locking.
+ */
+ prefetch_disable_bits = get_prefetch_disable_bits();
+ if (prefetch_disable_bits == 0) {
+ rdt_last_cmd_puts("pseudo-locking not supported\n");
+ return -EINVAL;
+ }
+
+ if (rdtgroup_monitor_in_progress(rdtgrp)) {
+ rdt_last_cmd_puts("monitoring in progress\n");
+ return -EINVAL;
+ }
+
+ if (rdtgroup_tasks_assigned(rdtgrp)) {
+ rdt_last_cmd_puts("tasks assigned to resource group\n");
+ return -EINVAL;
+ }
+
+ if (!cpumask_empty(&rdtgrp->cpu_mask)) {
+ rdt_last_cmd_puts("CPUs assigned to resource group\n");
+ return -EINVAL;
+ }
+
+ if (rdtgroup_locksetup_user_restrict(rdtgrp)) {
+ rdt_last_cmd_puts("unable to modify resctrl permissions\n");
+ return -EIO;
+ }
+
+ ret = pseudo_lock_init(rdtgrp);
+ if (ret) {
+ rdt_last_cmd_puts("unable to init pseudo-lock region\n");
+ goto out_release;
+ }
+
+ /*
+ * If this system is capable of monitoring a rmid would have been
+ * allocated when the control group was created. This is not needed
+ * anymore when this group would be used for pseudo-locking. This
+ * is safe to call on platforms not capable of monitoring.
+ */
+ free_rmid(rdtgrp->mon.rmid);
+
+ ret = 0;
+ goto out;
+
+out_release:
+ rdtgroup_locksetup_user_restore(rdtgrp);
+out:
+ return ret;
+}
+
+/**
+ * rdtgroup_locksetup_exit - resource group exist locksetup mode
+ * @rdtgrp: resource group
+ *
+ * When a resource group exits locksetup mode the earlier restrictions are
+ * lifted.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
+{
+ int ret;
+
+ if (rdt_mon_capable) {
+ ret = alloc_rmid();
+ if (ret < 0) {
+ rdt_last_cmd_puts("out of RMIDs\n");
+ return ret;
+ }
+ rdtgrp->mon.rmid = ret;
+ }
+
+ ret = rdtgroup_locksetup_user_restore(rdtgrp);
+ if (ret) {
+ free_rmid(rdtgrp->mon.rmid);
+ return ret;
+ }
+
+ pseudo_lock_free(rdtgrp);
+ return 0;
+}
+
+/**
+ * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked
+ * @d: RDT domain
+ * @_cbm: CBM to test
+ *
+ * @d represents a cache instance and @_cbm a capacity bitmask that is
+ * considered for it. Determine if @_cbm overlaps with any existing
+ * pseudo-locked region on @d.
+ *
+ * Return: true if @_cbm overlaps with pseudo-locked region on @d, false
+ * otherwise.
+ */
+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, u32 _cbm)
+{
+ unsigned long *cbm = (unsigned long *)&_cbm;
+ unsigned long *cbm_b;
+ unsigned int cbm_len;
+
+ if (d->plr) {
+ cbm_len = d->plr->r->cache.cbm_len;
+ cbm_b = (unsigned long *)&d->plr->cbm;
+ if (bitmap_intersects(cbm, cbm_b, cbm_len))
+ return true;
+ }
+ return false;
+}
+
+/**
+ * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy
+ * @d: RDT domain under test
+ *
+ * The setup of a pseudo-locked region affects all cache instances within
+ * the hierarchy of the region. It is thus essential to know if any
+ * pseudo-locked regions exist within a cache hierarchy to prevent any
+ * attempts to create new pseudo-locked regions in the same hierarchy.
+ *
+ * Return: true if a pseudo-locked region exists in the hierarchy of @d or
+ * if it is not possible to test due to memory allocation issue,
+ * false otherwise.
+ */
+bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
+{
+ cpumask_var_t cpu_with_psl;
+ struct rdt_resource *r;
+ struct rdt_domain *d_i;
+ bool ret = false;
+
+ if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL))
+ return true;
+
+ /*
+ * First determine which cpus have pseudo-locked regions
+ * associated with them.
+ */
+ for_each_alloc_enabled_rdt_resource(r) {
+ list_for_each_entry(d_i, &r->domains, list) {
+ if (d_i->plr)
+ cpumask_or(cpu_with_psl, cpu_with_psl,
+ &d_i->cpu_mask);
+ }
+ }
+
+ /*
+ * Next test if new pseudo-locked region would intersect with
+ * existing region.
+ */
+ if (cpumask_intersects(&d->cpu_mask, cpu_with_psl))
+ ret = true;
+
+ free_cpumask_var(cpu_with_psl);
+ return ret;
+}
+
+/**
+ * measure_cycles_lat_fn - Measure cycle latency to read pseudo-locked memory
+ * @_plr: pseudo-lock region to measure
+ *
+ * There is no deterministic way to test if a memory region is cached. One
+ * way is to measure how long it takes to read the memory, the speed of
+ * access is a good way to learn how close to the cpu the data was. Even
+ * more, if the prefetcher is disabled and the memory is read at a stride
+ * of half the cache line, then a cache miss will be easy to spot since the
+ * read of the first half would be significantly slower than the read of
+ * the second half.
+ *
+ * Return: 0. Waiter on waitqueue will be woken on completion.
+ */
+static int measure_cycles_lat_fn(void *_plr)
+{
+ struct pseudo_lock_region *plr = _plr;
+ unsigned long i;
+ u64 start, end;
+#ifdef CONFIG_KASAN
+ /*
+ * The registers used for local register variables are also used
+ * when KASAN is active. When KASAN is active we use a regular
+ * variable to ensure we always use a valid pointer to access memory.
+ * The cost is that accessing this pointer, which could be in
+ * cache, will be included in the measurement of memory read latency.
+ */
+ void *mem_r;
+#else
+#ifdef CONFIG_X86_64
+ register void *mem_r asm("rbx");
+#else
+ register void *mem_r asm("ebx");
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_KASAN */
+
+ local_irq_disable();
+ /*
+ * The wrmsr call may be reordered with the assignment below it.
+ * Call wrmsr as directly as possible to avoid tracing clobbering
+ * local register variable used for memory pointer.
+ */
+ __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
+ mem_r = plr->kmem;
+ /*
+ * Dummy execute of the time measurement to load the needed
+ * instructions into the L1 instruction cache.
+ */
+ start = rdtsc_ordered();
+ for (i = 0; i < plr->size; i += 32) {
+ start = rdtsc_ordered();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ end = rdtsc_ordered();
+ trace_pseudo_lock_mem_latency((u32)(end - start));
+ }
+ wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
+ local_irq_enable();
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
+
+static int measure_cycles_perf_fn(void *_plr)
+{
+ unsigned long long l3_hits = 0, l3_miss = 0;
+ u64 l3_hit_bits = 0, l3_miss_bits = 0;
+ struct pseudo_lock_region *plr = _plr;
+ unsigned long long l2_hits, l2_miss;
+ u64 l2_hit_bits, l2_miss_bits;
+ unsigned long i;
+#ifdef CONFIG_KASAN
+ /*
+ * The registers used for local register variables are also used
+ * when KASAN is active. When KASAN is active we use regular variables
+ * at the cost of including cache access latency to these variables
+ * in the measurements.
+ */
+ unsigned int line_size;
+ unsigned int size;
+ void *mem_r;
+#else
+ register unsigned int line_size asm("esi");
+ register unsigned int size asm("edi");
+#ifdef CONFIG_X86_64
+ register void *mem_r asm("rbx");
+#else
+ register void *mem_r asm("ebx");
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_KASAN */
+
+ /*
+ * Non-architectural event for the Goldmont Microarchitecture
+ * from Intel x86 Architecture Software Developer Manual (SDM):
+ * MEM_LOAD_UOPS_RETIRED D1H (event number)
+ * Umask values:
+ * L1_HIT 01H
+ * L2_HIT 02H
+ * L1_MISS 08H
+ * L2_MISS 10H
+ *
+ * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event
+ * has two "no fix" errata associated with it: BDM35 and BDM100. On
+ * this platform we use the following events instead:
+ * L2_RQSTS 24H (Documented in https://download.01.org/perfmon/BDW/)
+ * REFERENCES FFH
+ * MISS 3FH
+ * LONGEST_LAT_CACHE 2EH (Documented in SDM)
+ * REFERENCE 4FH
+ * MISS 41H
+ */
+
+ /*
+ * Start by setting flags for IA32_PERFEVTSELx:
+ * OS (Operating system mode) 0x2
+ * INT (APIC interrupt enable) 0x10
+ * EN (Enable counter) 0x40
+ *
+ * Then add the Umask value and event number to select performance
+ * event.
+ */
+
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_ATOM_GOLDMONT:
+ case INTEL_FAM6_ATOM_GEMINI_LAKE:
+ l2_hit_bits = (0x52ULL << 16) | (0x2 << 8) | 0xd1;
+ l2_miss_bits = (0x52ULL << 16) | (0x10 << 8) | 0xd1;
+ break;
+ case INTEL_FAM6_BROADWELL_X:
+ /* On BDW the l2_hit_bits count references, not hits */
+ l2_hit_bits = (0x52ULL << 16) | (0xff << 8) | 0x24;
+ l2_miss_bits = (0x52ULL << 16) | (0x3f << 8) | 0x24;
+ /* On BDW the l3_hit_bits count references, not hits */
+ l3_hit_bits = (0x52ULL << 16) | (0x4f << 8) | 0x2e;
+ l3_miss_bits = (0x52ULL << 16) | (0x41 << 8) | 0x2e;
+ break;
+ default:
+ goto out;
+ }
+
+ local_irq_disable();
+ /*
+ * Call wrmsr direcly to avoid the local register variables from
+ * being overwritten due to reordering of their assignment with
+ * the wrmsr calls.
+ */
+ __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
+ /* Disable events and reset counters */
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, 0x0);
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x0);
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0, 0x0);
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 1, 0x0);
+ if (l3_hit_bits > 0) {
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x0);
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3, 0x0);
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 2, 0x0);
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 3, 0x0);
+ }
+ /* Set and enable the L2 counters */
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, l2_hit_bits);
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, l2_miss_bits);
+ if (l3_hit_bits > 0) {
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2,
+ l3_hit_bits);
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3,
+ l3_miss_bits);
+ }
+ mem_r = plr->kmem;
+ size = plr->size;
+ line_size = plr->line_size;
+ for (i = 0; i < size; i += line_size) {
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ }
+ /*
+ * Call wrmsr directly (no tracing) to not influence
+ * the cache access counters as they are disabled.
+ */
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0,
+ l2_hit_bits & ~(0x40ULL << 16));
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1,
+ l2_miss_bits & ~(0x40ULL << 16));
+ if (l3_hit_bits > 0) {
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2,
+ l3_hit_bits & ~(0x40ULL << 16));
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3,
+ l3_miss_bits & ~(0x40ULL << 16));
+ }
+ l2_hits = native_read_pmc(0);
+ l2_miss = native_read_pmc(1);
+ if (l3_hit_bits > 0) {
+ l3_hits = native_read_pmc(2);
+ l3_miss = native_read_pmc(3);
+ }
+ wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
+ local_irq_enable();
+ /*
+ * On BDW we count references and misses, need to adjust. Sometimes
+ * the "hits" counter is a bit more than the references, for
+ * example, x references but x + 1 hits. To not report invalid
+ * hit values in this case we treat that as misses eaqual to
+ * references.
+ */
+ if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X)
+ l2_hits -= (l2_miss > l2_hits ? l2_hits : l2_miss);
+ trace_pseudo_lock_l2(l2_hits, l2_miss);
+ if (l3_hit_bits > 0) {
+ if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X)
+ l3_hits -= (l3_miss > l3_hits ? l3_hits : l3_miss);
+ trace_pseudo_lock_l3(l3_hits, l3_miss);
+ }
+
+out:
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
+
+/**
+ * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region
+ *
+ * The measurement of latency to access a pseudo-locked region should be
+ * done from a cpu that is associated with that pseudo-locked region.
+ * Determine which cpu is associated with this region and start a thread on
+ * that cpu to perform the measurement, wait for that thread to complete.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
+{
+ struct pseudo_lock_region *plr = rdtgrp->plr;
+ struct task_struct *thread;
+ unsigned int cpu;
+ int ret = -1;
+
+ cpus_read_lock();
+ mutex_lock(&rdtgroup_mutex);
+
+ if (rdtgrp->flags & RDT_DELETED) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ plr->thread_done = 0;
+ cpu = cpumask_first(&plr->d->cpu_mask);
+ if (!cpu_online(cpu)) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ if (sel == 1)
+ thread = kthread_create_on_node(measure_cycles_lat_fn, plr,
+ cpu_to_node(cpu),
+ "pseudo_lock_measure/%u",
+ cpu);
+ else if (sel == 2)
+ thread = kthread_create_on_node(measure_cycles_perf_fn, plr,
+ cpu_to_node(cpu),
+ "pseudo_lock_measure/%u",
+ cpu);
+ else
+ goto out;
+
+ if (IS_ERR(thread)) {
+ ret = PTR_ERR(thread);
+ goto out;
+ }
+ kthread_bind(thread, cpu);
+ wake_up_process(thread);
+
+ ret = wait_event_interruptible(plr->lock_thread_wq,
+ plr->thread_done == 1);
+ if (ret < 0)
+ goto out;
+
+ ret = 0;
+
+out:
+ mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+ return ret;
+}
+
+static ssize_t pseudo_lock_measure_trigger(struct file *file,
+ const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct rdtgroup *rdtgrp = file->private_data;
+ size_t buf_size;
+ char buf[32];
+ int ret;
+ int sel;
+
+ buf_size = min(count, (sizeof(buf) - 1));
+ if (copy_from_user(buf, user_buf, buf_size))
+ return -EFAULT;
+
+ buf[buf_size] = '\0';
+ ret = kstrtoint(buf, 10, &sel);
+ if (ret == 0) {
+ if (sel != 1)
+ return -EINVAL;
+ ret = debugfs_file_get(file->f_path.dentry);
+ if (ret)
+ return ret;
+ ret = pseudo_lock_measure_cycles(rdtgrp, sel);
+ if (ret == 0)
+ ret = count;
+ debugfs_file_put(file->f_path.dentry);
+ }
+
+ return ret;
+}
+
+static const struct file_operations pseudo_measure_fops = {
+ .write = pseudo_lock_measure_trigger,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+/**
+ * rdtgroup_pseudo_lock_create - Create a pseudo-locked region
+ * @rdtgrp: resource group to which pseudo-lock region belongs
+ *
+ * Called when a resource group in the pseudo-locksetup mode receives a
+ * valid schemata that should be pseudo-locked. Since the resource group is
+ * in pseudo-locksetup mode the &struct pseudo_lock_region has already been
+ * allocated and initialized with the essential information. If a failure
+ * occurs the resource group remains in the pseudo-locksetup mode with the
+ * &struct pseudo_lock_region associated with it, but cleared from all
+ * information and ready for the user to re-attempt pseudo-locking by
+ * writing the schemata again.
+ *
+ * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0
+ * on failure. Descriptive error will be written to last_cmd_status buffer.
+ */
+int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
+{
+ struct pseudo_lock_region *plr = rdtgrp->plr;
+ struct task_struct *thread;
+ unsigned int new_minor;
+ struct device *dev;
+ int ret;
+
+ ret = pseudo_lock_region_alloc(plr);
+ if (ret < 0)
+ return ret;
+
+ ret = pseudo_lock_cstates_constrain(plr);
+ if (ret < 0) {
+ ret = -EINVAL;
+ goto out_region;
+ }
+
+ plr->thread_done = 0;
+
+ thread = kthread_create_on_node(pseudo_lock_fn, rdtgrp,
+ cpu_to_node(plr->cpu),
+ "pseudo_lock/%u", plr->cpu);
+ if (IS_ERR(thread)) {
+ ret = PTR_ERR(thread);
+ rdt_last_cmd_printf("locking thread returned error %d\n", ret);
+ goto out_cstates;
+ }
+
+ kthread_bind(thread, plr->cpu);
+ wake_up_process(thread);
+
+ ret = wait_event_interruptible(plr->lock_thread_wq,
+ plr->thread_done == 1);
+ if (ret < 0) {
+ /*
+ * If the thread does not get on the CPU for whatever
+ * reason and the process which sets up the region is
+ * interrupted then this will leave the thread in runnable
+ * state and once it gets on the CPU it will derefence
+ * the cleared, but not freed, plr struct resulting in an
+ * empty pseudo-locking loop.
+ */
+ rdt_last_cmd_puts("locking thread interrupted\n");
+ goto out_cstates;
+ }
+
+ ret = pseudo_lock_minor_get(&new_minor);
+ if (ret < 0) {
+ rdt_last_cmd_puts("unable to obtain a new minor number\n");
+ goto out_cstates;
+ }
+
+ /*
+ * Unlock access but do not release the reference. The
+ * pseudo-locked region will still be here on return.
+ *
+ * The mutex has to be released temporarily to avoid a potential
+ * deadlock with the mm->mmap_sem semaphore which is obtained in
+ * the device_create() and debugfs_create_dir() callpath below
+ * as well as before the mmap() callback is called.
+ */
+ mutex_unlock(&rdtgroup_mutex);
+
+ if (!IS_ERR_OR_NULL(debugfs_resctrl)) {
+ plr->debugfs_dir = debugfs_create_dir(rdtgrp->kn->name,
+ debugfs_resctrl);
+ if (!IS_ERR_OR_NULL(plr->debugfs_dir))
+ debugfs_create_file("pseudo_lock_measure", 0200,
+ plr->debugfs_dir, rdtgrp,
+ &pseudo_measure_fops);
+ }
+
+ dev = device_create(pseudo_lock_class, NULL,
+ MKDEV(pseudo_lock_major, new_minor),
+ rdtgrp, "%s", rdtgrp->kn->name);
+
+ mutex_lock(&rdtgroup_mutex);
+
+ if (IS_ERR(dev)) {
+ ret = PTR_ERR(dev);
+ rdt_last_cmd_printf("failed to create character device: %d\n",
+ ret);
+ goto out_debugfs;
+ }
+
+ /* We released the mutex - check if group was removed while we did so */
+ if (rdtgrp->flags & RDT_DELETED) {
+ ret = -ENODEV;
+ goto out_device;
+ }
+
+ plr->minor = new_minor;
+
+ rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED;
+ closid_free(rdtgrp->closid);
+ rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444);
+ rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444);
+
+ ret = 0;
+ goto out;
+
+out_device:
+ device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor));
+out_debugfs:
+ debugfs_remove_recursive(plr->debugfs_dir);
+ pseudo_lock_minor_release(new_minor);
+out_cstates:
+ pseudo_lock_cstates_relax(plr);
+out_region:
+ pseudo_lock_region_clear(plr);
+out:
+ return ret;
+}
+
+/**
+ * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region
+ * @rdtgrp: resource group to which the pseudo-locked region belongs
+ *
+ * The removal of a pseudo-locked region can be initiated when the resource
+ * group is removed from user space via a "rmdir" from userspace or the
+ * unmount of the resctrl filesystem. On removal the resource group does
+ * not go back to pseudo-locksetup mode before it is removed, instead it is
+ * removed directly. There is thus assymmetry with the creation where the
+ * &struct pseudo_lock_region is removed here while it was not created in
+ * rdtgroup_pseudo_lock_create().
+ *
+ * Return: void
+ */
+void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp)
+{
+ struct pseudo_lock_region *plr = rdtgrp->plr;
+
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ /*
+ * Default group cannot be a pseudo-locked region so we can
+ * free closid here.
+ */
+ closid_free(rdtgrp->closid);
+ goto free;
+ }
+
+ pseudo_lock_cstates_relax(plr);
+ debugfs_remove_recursive(rdtgrp->plr->debugfs_dir);
+ device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor));
+ pseudo_lock_minor_release(plr->minor);
+
+free:
+ pseudo_lock_free(rdtgrp);
+}
+
+static int pseudo_lock_dev_open(struct inode *inode, struct file *filp)
+{
+ struct rdtgroup *rdtgrp;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ rdtgrp = region_find_by_minor(iminor(inode));
+ if (!rdtgrp) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -ENODEV;
+ }
+
+ filp->private_data = rdtgrp;
+ atomic_inc(&rdtgrp->waitcount);
+ /* Perform a non-seekable open - llseek is not supported */
+ filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
+
+ mutex_unlock(&rdtgroup_mutex);
+
+ return 0;
+}
+
+static int pseudo_lock_dev_release(struct inode *inode, struct file *filp)
+{
+ struct rdtgroup *rdtgrp;
+
+ mutex_lock(&rdtgroup_mutex);
+ rdtgrp = filp->private_data;
+ WARN_ON(!rdtgrp);
+ if (!rdtgrp) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -ENODEV;
+ }
+ filp->private_data = NULL;
+ atomic_dec(&rdtgrp->waitcount);
+ mutex_unlock(&rdtgroup_mutex);
+ return 0;
+}
+
+static int pseudo_lock_dev_mremap(struct vm_area_struct *area)
+{
+ /* Not supported */
+ return -EINVAL;
+}
+
+static const struct vm_operations_struct pseudo_mmap_ops = {
+ .mremap = pseudo_lock_dev_mremap,
+};
+
+static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ unsigned long vsize = vma->vm_end - vma->vm_start;
+ unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
+ struct pseudo_lock_region *plr;
+ struct rdtgroup *rdtgrp;
+ unsigned long physical;
+ unsigned long psize;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ rdtgrp = filp->private_data;
+ WARN_ON(!rdtgrp);
+ if (!rdtgrp) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -ENODEV;
+ }
+
+ plr = rdtgrp->plr;
+
+ /*
+ * Task is required to run with affinity to the cpus associated
+ * with the pseudo-locked region. If this is not the case the task
+ * may be scheduled elsewhere and invalidate entries in the
+ * pseudo-locked region.
+ */
+ if (!cpumask_subset(&current->cpus_allowed, &plr->d->cpu_mask)) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -EINVAL;
+ }
+
+ physical = __pa(plr->kmem) >> PAGE_SHIFT;
+ psize = plr->size - off;
+
+ if (off > plr->size) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -ENOSPC;
+ }
+
+ /*
+ * Ensure changes are carried directly to the memory being mapped,
+ * do not allow copy-on-write mapping.
+ */
+ if (!(vma->vm_flags & VM_SHARED)) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -EINVAL;
+ }
+
+ if (vsize > psize) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -ENOSPC;
+ }
+
+ memset(plr->kmem + off, 0, vsize);
+
+ if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff,
+ vsize, vma->vm_page_prot)) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -EAGAIN;
+ }
+ vma->vm_ops = &pseudo_mmap_ops;
+ mutex_unlock(&rdtgroup_mutex);
+ return 0;
+}
+
+static const struct file_operations pseudo_lock_dev_fops = {
+ .owner = THIS_MODULE,
+ .llseek = no_llseek,
+ .read = NULL,
+ .write = NULL,
+ .open = pseudo_lock_dev_open,
+ .release = pseudo_lock_dev_release,
+ .mmap = pseudo_lock_dev_mmap,
+};
+
+static char *pseudo_lock_devnode(struct device *dev, umode_t *mode)
+{
+ struct rdtgroup *rdtgrp;
+
+ rdtgrp = dev_get_drvdata(dev);
+ if (mode)
+ *mode = 0600;
+ return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name);
+}
+
+int rdt_pseudo_lock_init(void)
+{
+ int ret;
+
+ ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops);
+ if (ret < 0)
+ return ret;
+
+ pseudo_lock_major = ret;
+
+ pseudo_lock_class = class_create(THIS_MODULE, "pseudo_lock");
+ if (IS_ERR(pseudo_lock_class)) {
+ ret = PTR_ERR(pseudo_lock_class);
+ unregister_chrdev(pseudo_lock_major, "pseudo_lock");
+ return ret;
+ }
+
+ pseudo_lock_class->devnode = pseudo_lock_devnode;
+ return 0;
+}
+
+void rdt_pseudo_lock_release(void)
+{
+ class_destroy(pseudo_lock_class);
+ pseudo_lock_class = NULL;
+ unregister_chrdev(pseudo_lock_major, "pseudo_lock");
+ pseudo_lock_major = 0;
+}
diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h
new file mode 100644
index 000000000000..2c041e6d9f05
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM resctrl
+
+#if !defined(_TRACE_PSEUDO_LOCK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PSEUDO_LOCK_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(pseudo_lock_mem_latency,
+ TP_PROTO(u32 latency),
+ TP_ARGS(latency),
+ TP_STRUCT__entry(__field(u32, latency)),
+ TP_fast_assign(__entry->latency = latency),
+ TP_printk("latency=%u", __entry->latency)
+ );
+
+TRACE_EVENT(pseudo_lock_l2,
+ TP_PROTO(u64 l2_hits, u64 l2_miss),
+ TP_ARGS(l2_hits, l2_miss),
+ TP_STRUCT__entry(__field(u64, l2_hits)
+ __field(u64, l2_miss)),
+ TP_fast_assign(__entry->l2_hits = l2_hits;
+ __entry->l2_miss = l2_miss;),
+ TP_printk("hits=%llu miss=%llu",
+ __entry->l2_hits, __entry->l2_miss));
+
+TRACE_EVENT(pseudo_lock_l3,
+ TP_PROTO(u64 l3_hits, u64 l3_miss),
+ TP_ARGS(l3_hits, l3_miss),
+ TP_STRUCT__entry(__field(u64, l3_hits)
+ __field(u64, l3_miss)),
+ TP_fast_assign(__entry->l3_hits = l3_hits;
+ __entry->l3_miss = l3_miss;),
+ TP_printk("hits=%llu miss=%llu",
+ __entry->l3_hits, __entry->l3_miss));
+
+#endif /* _TRACE_PSEUDO_LOCK_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE intel_rdt_pseudo_lock_event
+#include <trace/define_trace.h>
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index 749856a2e736..d6d7ea7349d0 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -20,7 +20,9 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/cacheinfo.h>
#include <linux/cpu.h>
+#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/sysfs.h>
#include <linux/kernfs.h>
@@ -55,6 +57,8 @@ static struct kernfs_node *kn_mondata;
static struct seq_buf last_cmd_status;
static char last_cmd_status_buf[512];
+struct dentry *debugfs_resctrl;
+
void rdt_last_cmd_clear(void)
{
lockdep_assert_held(&rdtgroup_mutex);
@@ -121,11 +125,65 @@ static int closid_alloc(void)
return closid;
}
-static void closid_free(int closid)
+void closid_free(int closid)
{
closid_free_map |= 1 << closid;
}
+/**
+ * closid_allocated - test if provided closid is in use
+ * @closid: closid to be tested
+ *
+ * Return: true if @closid is currently associated with a resource group,
+ * false if @closid is free
+ */
+static bool closid_allocated(unsigned int closid)
+{
+ return (closid_free_map & (1 << closid)) == 0;
+}
+
+/**
+ * rdtgroup_mode_by_closid - Return mode of resource group with closid
+ * @closid: closid if the resource group
+ *
+ * Each resource group is associated with a @closid. Here the mode
+ * of a resource group can be queried by searching for it using its closid.
+ *
+ * Return: mode as &enum rdtgrp_mode of resource group with closid @closid
+ */
+enum rdtgrp_mode rdtgroup_mode_by_closid(int closid)
+{
+ struct rdtgroup *rdtgrp;
+
+ list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
+ if (rdtgrp->closid == closid)
+ return rdtgrp->mode;
+ }
+
+ return RDT_NUM_MODES;
+}
+
+static const char * const rdt_mode_str[] = {
+ [RDT_MODE_SHAREABLE] = "shareable",
+ [RDT_MODE_EXCLUSIVE] = "exclusive",
+ [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup",
+ [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked",
+};
+
+/**
+ * rdtgroup_mode_str - Return the string representation of mode
+ * @mode: the resource group mode as &enum rdtgroup_mode
+ *
+ * Return: string representation of valid mode, "unknown" otherwise
+ */
+static const char *rdtgroup_mode_str(enum rdtgrp_mode mode)
+{
+ if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES)
+ return "unknown";
+
+ return rdt_mode_str[mode];
+}
+
/* set uid and gid of rdtgroup dirs and files to that of the creator */
static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
{
@@ -207,8 +265,12 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of,
rdtgrp = rdtgroup_kn_lock_live(of->kn);
if (rdtgrp) {
- seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
- cpumask_pr_args(&rdtgrp->cpu_mask));
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
+ seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
+ cpumask_pr_args(&rdtgrp->plr->d->cpu_mask));
+ else
+ seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
+ cpumask_pr_args(&rdtgrp->cpu_mask));
} else {
ret = -ENOENT;
}
@@ -394,6 +456,13 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
goto unlock;
}
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
+ rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ ret = -EINVAL;
+ rdt_last_cmd_puts("pseudo-locking in progress\n");
+ goto unlock;
+ }
+
if (is_cpu_list(of))
ret = cpulist_parse(buf, newmask);
else
@@ -509,6 +578,32 @@ static int __rdtgroup_move_task(struct task_struct *tsk,
return ret;
}
+/**
+ * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
+ * @r: Resource group
+ *
+ * Return: 1 if tasks have been assigned to @r, 0 otherwise
+ */
+int rdtgroup_tasks_assigned(struct rdtgroup *r)
+{
+ struct task_struct *p, *t;
+ int ret = 0;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ rcu_read_lock();
+ for_each_process_thread(p, t) {
+ if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) ||
+ (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) {
+ ret = 1;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
static int rdtgroup_task_write_permission(struct task_struct *task,
struct kernfs_open_file *of)
{
@@ -570,13 +665,22 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
return -EINVAL;
rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ rdtgroup_kn_unlock(of->kn);
+ return -ENOENT;
+ }
rdt_last_cmd_clear();
- if (rdtgrp)
- ret = rdtgroup_move_task(pid, rdtgrp, of);
- else
- ret = -ENOENT;
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
+ rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ ret = -EINVAL;
+ rdt_last_cmd_puts("pseudo-locking in progress\n");
+ goto unlock;
+ }
+ ret = rdtgroup_move_task(pid, rdtgrp, of);
+
+unlock:
rdtgroup_kn_unlock(of->kn);
return ret ?: nbytes;
@@ -662,6 +766,94 @@ static int rdt_shareable_bits_show(struct kernfs_open_file *of,
return 0;
}
+/**
+ * rdt_bit_usage_show - Display current usage of resources
+ *
+ * A domain is a shared resource that can now be allocated differently. Here
+ * we display the current regions of the domain as an annotated bitmask.
+ * For each domain of this resource its allocation bitmask
+ * is annotated as below to indicate the current usage of the corresponding bit:
+ * 0 - currently unused
+ * X - currently available for sharing and used by software and hardware
+ * H - currently used by hardware only but available for software use
+ * S - currently used and shareable by software only
+ * E - currently used exclusively by one resource group
+ * P - currently pseudo-locked by one resource group
+ */
+static int rdt_bit_usage_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+ u32 sw_shareable = 0, hw_shareable = 0;
+ u32 exclusive = 0, pseudo_locked = 0;
+ struct rdt_domain *dom;
+ int i, hwb, swb, excl, psl;
+ enum rdtgrp_mode mode;
+ bool sep = false;
+ u32 *ctrl;
+
+ mutex_lock(&rdtgroup_mutex);
+ hw_shareable = r->cache.shareable_bits;
+ list_for_each_entry(dom, &r->domains, list) {
+ if (sep)
+ seq_putc(seq, ';');
+ ctrl = dom->ctrl_val;
+ sw_shareable = 0;
+ exclusive = 0;
+ seq_printf(seq, "%d=", dom->id);
+ for (i = 0; i < r->num_closid; i++, ctrl++) {
+ if (!closid_allocated(i))
+ continue;
+ mode = rdtgroup_mode_by_closid(i);
+ switch (mode) {
+ case RDT_MODE_SHAREABLE:
+ sw_shareable |= *ctrl;
+ break;
+ case RDT_MODE_EXCLUSIVE:
+ exclusive |= *ctrl;
+ break;
+ case RDT_MODE_PSEUDO_LOCKSETUP:
+ /*
+ * RDT_MODE_PSEUDO_LOCKSETUP is possible
+ * here but not included since the CBM
+ * associated with this CLOSID in this mode
+ * is not initialized and no task or cpu can be
+ * assigned this CLOSID.
+ */
+ break;
+ case RDT_MODE_PSEUDO_LOCKED:
+ case RDT_NUM_MODES:
+ WARN(1,
+ "invalid mode for closid %d\n", i);
+ break;
+ }
+ }
+ for (i = r->cache.cbm_len - 1; i >= 0; i--) {
+ pseudo_locked = dom->plr ? dom->plr->cbm : 0;
+ hwb = test_bit(i, (unsigned long *)&hw_shareable);
+ swb = test_bit(i, (unsigned long *)&sw_shareable);
+ excl = test_bit(i, (unsigned long *)&exclusive);
+ psl = test_bit(i, (unsigned long *)&pseudo_locked);
+ if (hwb && swb)
+ seq_putc(seq, 'X');
+ else if (hwb && !swb)
+ seq_putc(seq, 'H');
+ else if (!hwb && swb)
+ seq_putc(seq, 'S');
+ else if (excl)
+ seq_putc(seq, 'E');
+ else if (psl)
+ seq_putc(seq, 'P');
+ else /* Unused bits remain */
+ seq_putc(seq, '0');
+ }
+ sep = true;
+ }
+ seq_putc(seq, '\n');
+ mutex_unlock(&rdtgroup_mutex);
+ return 0;
+}
+
static int rdt_min_bw_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
@@ -740,6 +932,269 @@ static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
return nbytes;
}
+/*
+ * rdtgroup_mode_show - Display mode of this resource group
+ */
+static int rdtgroup_mode_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ rdtgroup_kn_unlock(of->kn);
+ return -ENOENT;
+ }
+
+ seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode));
+
+ rdtgroup_kn_unlock(of->kn);
+ return 0;
+}
+
+/**
+ * rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
+ * @r: Resource to which domain instance @d belongs.
+ * @d: The domain instance for which @closid is being tested.
+ * @cbm: Capacity bitmask being tested.
+ * @closid: Intended closid for @cbm.
+ * @exclusive: Only check if overlaps with exclusive resource groups
+ *
+ * Checks if provided @cbm intended to be used for @closid on domain
+ * @d overlaps with any other closids or other hardware usage associated
+ * with this domain. If @exclusive is true then only overlaps with
+ * resource groups in exclusive mode will be considered. If @exclusive
+ * is false then overlaps with any resource group or hardware entities
+ * will be considered.
+ *
+ * Return: false if CBM does not overlap, true if it does.
+ */
+bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
+ u32 _cbm, int closid, bool exclusive)
+{
+ unsigned long *cbm = (unsigned long *)&_cbm;
+ unsigned long *ctrl_b;
+ enum rdtgrp_mode mode;
+ u32 *ctrl;
+ int i;
+
+ /* Check for any overlap with regions used by hardware directly */
+ if (!exclusive) {
+ if (bitmap_intersects(cbm,
+ (unsigned long *)&r->cache.shareable_bits,
+ r->cache.cbm_len))
+ return true;
+ }
+
+ /* Check for overlap with other resource groups */
+ ctrl = d->ctrl_val;
+ for (i = 0; i < r->num_closid; i++, ctrl++) {
+ ctrl_b = (unsigned long *)ctrl;
+ mode = rdtgroup_mode_by_closid(i);
+ if (closid_allocated(i) && i != closid &&
+ mode != RDT_MODE_PSEUDO_LOCKSETUP) {
+ if (bitmap_intersects(cbm, ctrl_b, r->cache.cbm_len)) {
+ if (exclusive) {
+ if (mode == RDT_MODE_EXCLUSIVE)
+ return true;
+ continue;
+ }
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/**
+ * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
+ *
+ * An exclusive resource group implies that there should be no sharing of
+ * its allocated resources. At the time this group is considered to be
+ * exclusive this test can determine if its current schemata supports this
+ * setting by testing for overlap with all other resource groups.
+ *
+ * Return: true if resource group can be exclusive, false if there is overlap
+ * with allocations of other resource groups and thus this resource group
+ * cannot be exclusive.
+ */
+static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
+{
+ int closid = rdtgrp->closid;
+ struct rdt_resource *r;
+ struct rdt_domain *d;
+
+ for_each_alloc_enabled_rdt_resource(r) {
+ list_for_each_entry(d, &r->domains, list) {
+ if (rdtgroup_cbm_overlaps(r, d, d->ctrl_val[closid],
+ rdtgrp->closid, false))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/**
+ * rdtgroup_mode_write - Modify the resource group's mode
+ *
+ */
+static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct rdtgroup *rdtgrp;
+ enum rdtgrp_mode mode;
+ int ret = 0;
+
+ /* Valid input requires a trailing newline */
+ if (nbytes == 0 || buf[nbytes - 1] != '\n')
+ return -EINVAL;
+ buf[nbytes - 1] = '\0';
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ rdtgroup_kn_unlock(of->kn);
+ return -ENOENT;
+ }
+
+ rdt_last_cmd_clear();
+
+ mode = rdtgrp->mode;
+
+ if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) ||
+ (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) ||
+ (!strcmp(buf, "pseudo-locksetup") &&
+ mode == RDT_MODE_PSEUDO_LOCKSETUP) ||
+ (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED))
+ goto out;
+
+ if (mode == RDT_MODE_PSEUDO_LOCKED) {
+ rdt_last_cmd_printf("cannot change pseudo-locked group\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (!strcmp(buf, "shareable")) {
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ ret = rdtgroup_locksetup_exit(rdtgrp);
+ if (ret)
+ goto out;
+ }
+ rdtgrp->mode = RDT_MODE_SHAREABLE;
+ } else if (!strcmp(buf, "exclusive")) {
+ if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
+ rdt_last_cmd_printf("schemata overlaps\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ ret = rdtgroup_locksetup_exit(rdtgrp);
+ if (ret)
+ goto out;
+ }
+ rdtgrp->mode = RDT_MODE_EXCLUSIVE;
+ } else if (!strcmp(buf, "pseudo-locksetup")) {
+ ret = rdtgroup_locksetup_enter(rdtgrp);
+ if (ret)
+ goto out;
+ rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP;
+ } else {
+ rdt_last_cmd_printf("unknown/unsupported mode\n");
+ ret = -EINVAL;
+ }
+
+out:
+ rdtgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
+}
+
+/**
+ * rdtgroup_cbm_to_size - Translate CBM to size in bytes
+ * @r: RDT resource to which @d belongs.
+ * @d: RDT domain instance.
+ * @cbm: bitmask for which the size should be computed.
+ *
+ * The bitmask provided associated with the RDT domain instance @d will be
+ * translated into how many bytes it represents. The size in bytes is
+ * computed by first dividing the total cache size by the CBM length to
+ * determine how many bytes each bit in the bitmask represents. The result
+ * is multiplied with the number of bits set in the bitmask.
+ */
+unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
+ struct rdt_domain *d, u32 cbm)
+{
+ struct cpu_cacheinfo *ci;
+ unsigned int size = 0;
+ int num_b, i;
+
+ num_b = bitmap_weight((unsigned long *)&cbm, r->cache.cbm_len);
+ ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask));
+ for (i = 0; i < ci->num_leaves; i++) {
+ if (ci->info_list[i].level == r->cache_level) {
+ size = ci->info_list[i].size / r->cache.cbm_len * num_b;
+ break;
+ }
+ }
+
+ return size;
+}
+
+/**
+ * rdtgroup_size_show - Display size in bytes of allocated regions
+ *
+ * The "size" file mirrors the layout of the "schemata" file, printing the
+ * size in bytes of each region instead of the capacity bitmask.
+ *
+ */
+static int rdtgroup_size_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ struct rdt_resource *r;
+ struct rdt_domain *d;
+ unsigned int size;
+ bool sep = false;
+ u32 cbm;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ rdtgroup_kn_unlock(of->kn);
+ return -ENOENT;
+ }
+
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
+ seq_printf(s, "%*s:", max_name_width, rdtgrp->plr->r->name);
+ size = rdtgroup_cbm_to_size(rdtgrp->plr->r,
+ rdtgrp->plr->d,
+ rdtgrp->plr->cbm);
+ seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size);
+ goto out;
+ }
+
+ for_each_alloc_enabled_rdt_resource(r) {
+ seq_printf(s, "%*s:", max_name_width, r->name);
+ list_for_each_entry(d, &r->domains, list) {
+ if (sep)
+ seq_putc(s, ';');
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ size = 0;
+ } else {
+ cbm = d->ctrl_val[rdtgrp->closid];
+ size = rdtgroup_cbm_to_size(r, d, cbm);
+ }
+ seq_printf(s, "%d=%u", d->id, size);
+ sep = true;
+ }
+ seq_putc(s, '\n');
+ }
+
+out:
+ rdtgroup_kn_unlock(of->kn);
+
+ return 0;
+}
+
/* rdtgroup information files for one cache resource. */
static struct rftype res_common_files[] = {
{
@@ -792,6 +1247,13 @@ static struct rftype res_common_files[] = {
.fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
},
{
+ .name = "bit_usage",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_bit_usage_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
+ },
+ {
.name = "min_bandwidth",
.mode = 0444,
.kf_ops = &rdtgroup_kf_single_ops,
@@ -853,6 +1315,22 @@ static struct rftype res_common_files[] = {
.seq_show = rdtgroup_schemata_show,
.fflags = RF_CTRL_BASE,
},
+ {
+ .name = "mode",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_mode_write,
+ .seq_show = rdtgroup_mode_show,
+ .fflags = RF_CTRL_BASE,
+ },
+ {
+ .name = "size",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdtgroup_size_show,
+ .fflags = RF_CTRL_BASE,
+ },
+
};
static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
@@ -883,6 +1361,103 @@ error:
return ret;
}
+/**
+ * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
+ * @r: The resource group with which the file is associated.
+ * @name: Name of the file
+ *
+ * The permissions of named resctrl file, directory, or link are modified
+ * to not allow read, write, or execute by any user.
+ *
+ * WARNING: This function is intended to communicate to the user that the
+ * resctrl file has been locked down - that it is not relevant to the
+ * particular state the system finds itself in. It should not be relied
+ * on to protect from user access because after the file's permissions
+ * are restricted the user can still change the permissions using chmod
+ * from the command line.
+ *
+ * Return: 0 on success, <0 on failure.
+ */
+int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name)
+{
+ struct iattr iattr = {.ia_valid = ATTR_MODE,};
+ struct kernfs_node *kn;
+ int ret = 0;
+
+ kn = kernfs_find_and_get_ns(r->kn, name, NULL);
+ if (!kn)
+ return -ENOENT;
+
+ switch (kernfs_type(kn)) {
+ case KERNFS_DIR:
+ iattr.ia_mode = S_IFDIR;
+ break;
+ case KERNFS_FILE:
+ iattr.ia_mode = S_IFREG;
+ break;
+ case KERNFS_LINK:
+ iattr.ia_mode = S_IFLNK;
+ break;
+ }
+
+ ret = kernfs_setattr(kn, &iattr);
+ kernfs_put(kn);
+ return ret;
+}
+
+/**
+ * rdtgroup_kn_mode_restore - Restore user access to named resctrl file
+ * @r: The resource group with which the file is associated.
+ * @name: Name of the file
+ * @mask: Mask of permissions that should be restored
+ *
+ * Restore the permissions of the named file. If @name is a directory the
+ * permissions of its parent will be used.
+ *
+ * Return: 0 on success, <0 on failure.
+ */
+int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
+ umode_t mask)
+{
+ struct iattr iattr = {.ia_valid = ATTR_MODE,};
+ struct kernfs_node *kn, *parent;
+ struct rftype *rfts, *rft;
+ int ret, len;
+
+ rfts = res_common_files;
+ len = ARRAY_SIZE(res_common_files);
+
+ for (rft = rfts; rft < rfts + len; rft++) {
+ if (!strcmp(rft->name, name))
+ iattr.ia_mode = rft->mode & mask;
+ }
+
+ kn = kernfs_find_and_get_ns(r->kn, name, NULL);
+ if (!kn)
+ return -ENOENT;
+
+ switch (kernfs_type(kn)) {
+ case KERNFS_DIR:
+ parent = kernfs_get_parent(kn);
+ if (parent) {
+ iattr.ia_mode |= parent->mode;
+ kernfs_put(parent);
+ }
+ iattr.ia_mode |= S_IFDIR;
+ break;
+ case KERNFS_FILE:
+ iattr.ia_mode |= S_IFREG;
+ break;
+ case KERNFS_LINK:
+ iattr.ia_mode |= S_IFLNK;
+ break;
+ }
+
+ ret = kernfs_setattr(kn, &iattr);
+ kernfs_put(kn);
+ return ret;
+}
+
static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name,
unsigned long fflags)
{
@@ -1224,6 +1799,9 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn)
if (atomic_dec_and_test(&rdtgrp->waitcount) &&
(rdtgrp->flags & RDT_DELETED)) {
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
+ rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
+ rdtgroup_pseudo_lock_remove(rdtgrp);
kernfs_unbreak_active_protection(kn);
kernfs_put(rdtgrp->kn);
kfree(rdtgrp);
@@ -1289,10 +1867,16 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
rdtgroup_default.mon.mon_data_kn = kn_mondata;
}
+ ret = rdt_pseudo_lock_init();
+ if (ret) {
+ dentry = ERR_PTR(ret);
+ goto out_mondata;
+ }
+
dentry = kernfs_mount(fs_type, flags, rdt_root,
RDTGROUP_SUPER_MAGIC, NULL);
if (IS_ERR(dentry))
- goto out_mondata;
+ goto out_psl;
if (rdt_alloc_capable)
static_branch_enable_cpuslocked(&rdt_alloc_enable_key);
@@ -1310,6 +1894,8 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
goto out;
+out_psl:
+ rdt_pseudo_lock_release();
out_mondata:
if (rdt_mon_capable)
kernfs_remove(kn_mondata);
@@ -1447,6 +2033,10 @@ static void rmdir_all_sub(void)
if (rdtgrp == &rdtgroup_default)
continue;
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
+ rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
+ rdtgroup_pseudo_lock_remove(rdtgrp);
+
/*
* Give any CPUs back to the default group. We cannot copy
* cpu_online_mask because a CPU might have executed the
@@ -1483,6 +2073,8 @@ static void rdt_kill_sb(struct super_block *sb)
reset_all_ctrls(r);
cdp_disable_all();
rmdir_all_sub();
+ rdt_pseudo_lock_release();
+ rdtgroup_default.mode = RDT_MODE_SHAREABLE;
static_branch_disable_cpuslocked(&rdt_alloc_enable_key);
static_branch_disable_cpuslocked(&rdt_mon_enable_key);
static_branch_disable_cpuslocked(&rdt_enable_key);
@@ -1682,6 +2274,114 @@ out_destroy:
return ret;
}
+/**
+ * cbm_ensure_valid - Enforce validity on provided CBM
+ * @_val: Candidate CBM
+ * @r: RDT resource to which the CBM belongs
+ *
+ * The provided CBM represents all cache portions available for use. This
+ * may be represented by a bitmap that does not consist of contiguous ones
+ * and thus be an invalid CBM.
+ * Here the provided CBM is forced to be a valid CBM by only considering
+ * the first set of contiguous bits as valid and clearing all bits.
+ * The intention here is to provide a valid default CBM with which a new
+ * resource group is initialized. The user can follow this with a
+ * modification to the CBM if the default does not satisfy the
+ * requirements.
+ */
+static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r)
+{
+ /*
+ * Convert the u32 _val to an unsigned long required by all the bit
+ * operations within this function. No more than 32 bits of this
+ * converted value can be accessed because all bit operations are
+ * additionally provided with cbm_len that is initialized during
+ * hardware enumeration using five bits from the EAX register and
+ * thus never can exceed 32 bits.
+ */
+ unsigned long *val = (unsigned long *)_val;
+ unsigned int cbm_len = r->cache.cbm_len;
+ unsigned long first_bit, zero_bit;
+
+ if (*val == 0)
+ return;
+
+ first_bit = find_first_bit(val, cbm_len);
+ zero_bit = find_next_zero_bit(val, cbm_len, first_bit);
+
+ /* Clear any remaining bits to ensure contiguous region */
+ bitmap_clear(val, zero_bit, cbm_len - zero_bit);
+}
+
+/**
+ * rdtgroup_init_alloc - Initialize the new RDT group's allocations
+ *
+ * A new RDT group is being created on an allocation capable (CAT)
+ * supporting system. Set this group up to start off with all usable
+ * allocations. That is, all shareable and unused bits.
+ *
+ * All-zero CBM is invalid. If there are no more shareable bits available
+ * on any domain then the entire allocation will fail.
+ */
+static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
+{
+ u32 used_b = 0, unused_b = 0;
+ u32 closid = rdtgrp->closid;
+ struct rdt_resource *r;
+ enum rdtgrp_mode mode;
+ struct rdt_domain *d;
+ int i, ret;
+ u32 *ctrl;
+
+ for_each_alloc_enabled_rdt_resource(r) {
+ list_for_each_entry(d, &r->domains, list) {
+ d->have_new_ctrl = false;
+ d->new_ctrl = r->cache.shareable_bits;
+ used_b = r->cache.shareable_bits;
+ ctrl = d->ctrl_val;
+ for (i = 0; i < r->num_closid; i++, ctrl++) {
+ if (closid_allocated(i) && i != closid) {
+ mode = rdtgroup_mode_by_closid(i);
+ if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
+ break;
+ used_b |= *ctrl;
+ if (mode == RDT_MODE_SHAREABLE)
+ d->new_ctrl |= *ctrl;
+ }
+ }
+ if (d->plr && d->plr->cbm > 0)
+ used_b |= d->plr->cbm;
+ unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1);
+ unused_b &= BIT_MASK(r->cache.cbm_len) - 1;
+ d->new_ctrl |= unused_b;
+ /*
+ * Force the initial CBM to be valid, user can
+ * modify the CBM based on system availability.
+ */
+ cbm_ensure_valid(&d->new_ctrl, r);
+ if (bitmap_weight((unsigned long *) &d->new_ctrl,
+ r->cache.cbm_len) <
+ r->cache.min_cbm_bits) {
+ rdt_last_cmd_printf("no space on %s:%d\n",
+ r->name, d->id);
+ return -ENOSPC;
+ }
+ d->have_new_ctrl = true;
+ }
+ }
+
+ for_each_alloc_enabled_rdt_resource(r) {
+ ret = update_domains(r, rdtgrp->closid);
+ if (ret < 0) {
+ rdt_last_cmd_puts("failed to initialize allocations\n");
+ return ret;
+ }
+ rdtgrp->mode = RDT_MODE_SHAREABLE;
+ }
+
+ return 0;
+}
+
static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
struct kernfs_node *prgrp_kn,
const char *name, umode_t mode,
@@ -1700,6 +2400,14 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
goto out_unlock;
}
+ if (rtype == RDTMON_GROUP &&
+ (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
+ prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) {
+ ret = -EINVAL;
+ rdt_last_cmd_puts("pseudo-locking in progress\n");
+ goto out_unlock;
+ }
+
/* allocate the rdtgroup. */
rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
if (!rdtgrp) {
@@ -1840,6 +2548,10 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
ret = 0;
rdtgrp->closid = closid;
+ ret = rdtgroup_init_alloc(rdtgrp);
+ if (ret < 0)
+ goto out_id_free;
+
list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
if (rdt_mon_capable) {
@@ -1850,15 +2562,16 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL);
if (ret) {
rdt_last_cmd_puts("kernfs subdir error\n");
- goto out_id_free;
+ goto out_del_list;
}
}
goto out_unlock;
+out_del_list:
+ list_del(&rdtgrp->rdtgroup_list);
out_id_free:
closid_free(closid);
- list_del(&rdtgrp->rdtgroup_list);
out_common_fail:
mkdir_rdt_prepare_clean(rdtgrp);
out_unlock:
@@ -1945,6 +2658,21 @@ static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
return 0;
}
+static int rdtgroup_ctrl_remove(struct kernfs_node *kn,
+ struct rdtgroup *rdtgrp)
+{
+ rdtgrp->flags = RDT_DELETED;
+ list_del(&rdtgrp->rdtgroup_list);
+
+ /*
+ * one extra hold on this, will drop when we kfree(rdtgrp)
+ * in rdtgroup_kn_unlock()
+ */
+ kernfs_get(kn);
+ kernfs_remove(rdtgrp->kn);
+ return 0;
+}
+
static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
cpumask_var_t tmpmask)
{
@@ -1970,7 +2698,6 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
update_closid_rmid(tmpmask, NULL);
- rdtgrp->flags = RDT_DELETED;
closid_free(rdtgrp->closid);
free_rmid(rdtgrp->mon.rmid);
@@ -1979,14 +2706,7 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
*/
free_all_child_rdtgrp(rdtgrp);
- list_del(&rdtgrp->rdtgroup_list);
-
- /*
- * one extra hold on this, will drop when we kfree(rdtgrp)
- * in rdtgroup_kn_unlock()
- */
- kernfs_get(kn);
- kernfs_remove(rdtgrp->kn);
+ rdtgroup_ctrl_remove(kn, rdtgrp);
return 0;
}
@@ -2014,13 +2734,19 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
* If the rdtgroup is a mon group and parent directory
* is a valid "mon_groups" directory, remove the mon group.
*/
- if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn)
- ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask);
- else if (rdtgrp->type == RDTMON_GROUP &&
- is_mon_groups(parent_kn, kn->name))
+ if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) {
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
+ rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
+ ret = rdtgroup_ctrl_remove(kn, rdtgrp);
+ } else {
+ ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask);
+ }
+ } else if (rdtgrp->type == RDTMON_GROUP &&
+ is_mon_groups(parent_kn, kn->name)) {
ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask);
- else
+ } else {
ret = -EPERM;
+ }
out:
rdtgroup_kn_unlock(kn);
@@ -2046,7 +2772,8 @@ static int __init rdtgroup_setup_root(void)
int ret;
rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
- KERNFS_ROOT_CREATE_DEACTIVATED,
+ KERNFS_ROOT_CREATE_DEACTIVATED |
+ KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
&rdtgroup_default);
if (IS_ERR(rdt_root))
return PTR_ERR(rdt_root);
@@ -2102,6 +2829,29 @@ int __init rdtgroup_init(void)
if (ret)
goto cleanup_mountpoint;
+ /*
+ * Adding the resctrl debugfs directory here may not be ideal since
+ * it would let the resctrl debugfs directory appear on the debugfs
+ * filesystem before the resctrl filesystem is mounted.
+ * It may also be ok since that would enable debugging of RDT before
+ * resctrl is mounted.
+ * The reason why the debugfs directory is created here and not in
+ * rdt_mount() is because rdt_mount() takes rdtgroup_mutex and
+ * during the debugfs directory creation also &sb->s_type->i_mutex_key
+ * (the lockdep class of inode->i_rwsem). Other filesystem
+ * interactions (eg. SyS_getdents) have the lock ordering:
+ * &sb->s_type->i_mutex_key --> &mm->mmap_sem
+ * During mmap(), called with &mm->mmap_sem, the rdtgroup_mutex
+ * is taken, thus creating dependency:
+ * &mm->mmap_sem --> rdtgroup_mutex for the latter that can cause
+ * issues considering the other two lock dependencies.
+ * By creating the debugfs directory here we avoid a dependency
+ * that may cause deadlock (even though file operations cannot
+ * occur until the filesystem is mounted, but I do not know how to
+ * tell lockdep that).
+ */
+ debugfs_resctrl = debugfs_create_dir("resctrl", NULL);
+
return 0;
cleanup_mountpoint:
@@ -2111,3 +2861,11 @@ cleanup_root:
return ret;
}
+
+void __exit rdtgroup_exit(void)
+{
+ debugfs_remove_recursive(debugfs_resctrl);
+ unregister_filesystem(&rdt_fs_type);
+ sysfs_remove_mount_point(fs_kobj, "resctrl");
+ kernfs_destroy_root(rdt_root);
+}
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8c50754c09c1..4b767284b7f5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -123,8 +123,8 @@ void mce_setup(struct mce *m)
{
memset(m, 0, sizeof(struct mce));
m->cpu = m->extcpu = smp_processor_id();
- /* We hope get_seconds stays lockless */
- m->time = get_seconds();
+ /* need the internal __ version to avoid deadlocks */
+ m->time = __ktime_get_real_seconds();
m->cpuvendor = boot_cpu_data.x86_vendor;
m->cpuid = cpuid_eax(1);
m->socketid = cpu_data(m->extcpu).phys_proc_id;
@@ -1104,6 +1104,101 @@ static void mce_unmap_kpfn(unsigned long pfn)
}
#endif
+
+/*
+ * Cases where we avoid rendezvous handler timeout:
+ * 1) If this CPU is offline.
+ *
+ * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
+ * skip those CPUs which remain looping in the 1st kernel - see
+ * crash_nmi_callback().
+ *
+ * Note: there still is a small window between kexec-ing and the new,
+ * kdump kernel establishing a new #MC handler where a broadcasted MCE
+ * might not get handled properly.
+ */
+static bool __mc_check_crashing_cpu(int cpu)
+{
+ if (cpu_is_offline(cpu) ||
+ (crashing_cpu != -1 && crashing_cpu != cpu)) {
+ u64 mcgstatus;
+
+ mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ if (mcgstatus & MCG_STATUS_RIPV) {
+ mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+ return true;
+ }
+ }
+ return false;
+}
+
+static void __mc_scan_banks(struct mce *m, struct mce *final,
+ unsigned long *toclear, unsigned long *valid_banks,
+ int no_way_out, int *worst)
+{
+ struct mca_config *cfg = &mca_cfg;
+ int severity, i;
+
+ for (i = 0; i < cfg->banks; i++) {
+ __clear_bit(i, toclear);
+ if (!test_bit(i, valid_banks))
+ continue;
+
+ if (!mce_banks[i].ctl)
+ continue;
+
+ m->misc = 0;
+ m->addr = 0;
+ m->bank = i;
+
+ m->status = mce_rdmsrl(msr_ops.status(i));
+ if (!(m->status & MCI_STATUS_VAL))
+ continue;
+
+ /*
+ * Corrected or non-signaled errors are handled by
+ * machine_check_poll(). Leave them alone, unless this panics.
+ */
+ if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
+ !no_way_out)
+ continue;
+
+ /* Set taint even when machine check was not enabled. */
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+ severity = mce_severity(m, cfg->tolerant, NULL, true);
+
+ /*
+ * When machine check was for corrected/deferred handler don't
+ * touch, unless we're panicking.
+ */
+ if ((severity == MCE_KEEP_SEVERITY ||
+ severity == MCE_UCNA_SEVERITY) && !no_way_out)
+ continue;
+
+ __set_bit(i, toclear);
+
+ /* Machine check event was not enabled. Clear, but ignore. */
+ if (severity == MCE_NO_SEVERITY)
+ continue;
+
+ mce_read_aux(m, i);
+
+ /* assuming valid severity level != 0 */
+ m->severity = severity;
+
+ mce_log(m);
+
+ if (severity > *worst) {
+ *final = *m;
+ *worst = severity;
+ }
+ }
+
+ /* mce_clear_state will clear *final, save locally for use later */
+ *m = *final;
+}
+
/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
@@ -1118,68 +1213,45 @@ static void mce_unmap_kpfn(unsigned long pfn)
*/
void do_machine_check(struct pt_regs *regs, long error_code)
{
+ DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
+ DECLARE_BITMAP(toclear, MAX_NR_BANKS);
struct mca_config *cfg = &mca_cfg;
+ int cpu = smp_processor_id();
+ char *msg = "Unknown";
struct mce m, *final;
- int i;
int worst = 0;
- int severity;
/*
* Establish sequential order between the CPUs entering the machine
* check handler.
*/
int order = -1;
+
/*
* If no_way_out gets set, there is no safe way to recover from this
* MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
*/
int no_way_out = 0;
+
/*
* If kill_it gets set, there might be a way to recover from this
* error.
*/
int kill_it = 0;
- DECLARE_BITMAP(toclear, MAX_NR_BANKS);
- DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
- char *msg = "Unknown";
/*
* MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
* on Intel.
*/
int lmce = 1;
- int cpu = smp_processor_id();
- /*
- * Cases where we avoid rendezvous handler timeout:
- * 1) If this CPU is offline.
- *
- * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
- * skip those CPUs which remain looping in the 1st kernel - see
- * crash_nmi_callback().
- *
- * Note: there still is a small window between kexec-ing and the new,
- * kdump kernel establishing a new #MC handler where a broadcasted MCE
- * might not get handled properly.
- */
- if (cpu_is_offline(cpu) ||
- (crashing_cpu != -1 && crashing_cpu != cpu)) {
- u64 mcgstatus;
-
- mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
- if (mcgstatus & MCG_STATUS_RIPV) {
- mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
- return;
- }
- }
+ if (__mc_check_crashing_cpu(cpu))
+ return;
ist_enter(regs);
this_cpu_inc(mce_exception_count);
- if (!cfg->banks)
- goto out;
-
mce_gather_info(&m, regs);
m.tsc = rdtsc();
@@ -1220,67 +1292,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
order = mce_start(&no_way_out);
}
- for (i = 0; i < cfg->banks; i++) {
- __clear_bit(i, toclear);
- if (!test_bit(i, valid_banks))
- continue;
- if (!mce_banks[i].ctl)
- continue;
-
- m.misc = 0;
- m.addr = 0;
- m.bank = i;
-
- m.status = mce_rdmsrl(msr_ops.status(i));
- if ((m.status & MCI_STATUS_VAL) == 0)
- continue;
-
- /*
- * Non uncorrected or non signaled errors are handled by
- * machine_check_poll. Leave them alone, unless this panics.
- */
- if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
- !no_way_out)
- continue;
-
- /*
- * Set taint even when machine check was not enabled.
- */
- add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
-
- severity = mce_severity(&m, cfg->tolerant, NULL, true);
-
- /*
- * When machine check was for corrected/deferred handler don't
- * touch, unless we're panicing.
- */
- if ((severity == MCE_KEEP_SEVERITY ||
- severity == MCE_UCNA_SEVERITY) && !no_way_out)
- continue;
- __set_bit(i, toclear);
- if (severity == MCE_NO_SEVERITY) {
- /*
- * Machine check event was not enabled. Clear, but
- * ignore.
- */
- continue;
- }
-
- mce_read_aux(&m, i);
-
- /* assuming valid severity level != 0 */
- m.severity = severity;
-
- mce_log(&m);
-
- if (severity > worst) {
- *final = m;
- worst = severity;
- }
- }
-
- /* mce_clear_state will clear *final, save locally for use later */
- m = *final;
+ __mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst);
if (!no_way_out)
mce_clear_state(toclear);
@@ -1319,7 +1331,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
if (worst > 0)
mce_report_event(regs);
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
-out:
+
sync_core();
if (worst != MCE_AR_SEVERITY && !kill_it)
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 08286269fd24..b9bc8a1a584e 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -509,12 +509,20 @@ static struct platform_device *microcode_pdev;
static int check_online_cpus(void)
{
- if (num_online_cpus() == num_present_cpus())
- return 0;
+ unsigned int cpu;
- pr_err("Not all CPUs online, aborting microcode update.\n");
+ /*
+ * Make sure all CPUs are online. It's fine for SMT to be disabled if
+ * all the primary threads are still online.
+ */
+ for_each_present_cpu(cpu) {
+ if (topology_is_primary_thread(cpu) && !cpu_online(cpu)) {
+ pr_err("Not all CPUs online, aborting microcode update.\n");
+ return -EINVAL;
+ }
+ }
- return -EINVAL;
+ return 0;
}
static atomic_t late_cpus_in;
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 81c0afb39d0a..71ca064e3794 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -22,18 +22,10 @@
#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff)
-/*
- * Check for extended topology enumeration cpuid leaf 0xb and if it
- * exists, use it for populating initial_apicid and cpu topology
- * detection.
- */
-int detect_extended_topology(struct cpuinfo_x86 *c)
+int detect_extended_topology_early(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
- unsigned int eax, ebx, ecx, edx, sub_index;
- unsigned int ht_mask_width, core_plus_mask_width;
- unsigned int core_select_mask, core_level_siblings;
- static bool printed;
+ unsigned int eax, ebx, ecx, edx;
if (c->cpuid_level < 0xb)
return -1;
@@ -52,10 +44,30 @@ int detect_extended_topology(struct cpuinfo_x86 *c)
* initial apic id, which also represents 32-bit extended x2apic id.
*/
c->initial_apicid = edx;
+ smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
+#endif
+ return 0;
+}
+
+/*
+ * Check for extended topology enumeration cpuid leaf 0xb and if it
+ * exists, use it for populating initial_apicid and cpu topology
+ * detection.
+ */
+int detect_extended_topology(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ unsigned int eax, ebx, ecx, edx, sub_index;
+ unsigned int ht_mask_width, core_plus_mask_width;
+ unsigned int core_select_mask, core_level_siblings;
+
+ if (detect_extended_topology_early(c) < 0)
+ return -1;
/*
* Populate HT related information from sub-leaf level 0.
*/
+ cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
@@ -86,15 +98,6 @@ int detect_extended_topology(struct cpuinfo_x86 *c)
c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
c->x86_max_cores = (core_level_siblings / smp_num_siblings);
-
- if (!printed) {
- pr_info("CPU: Physical Processor ID: %d\n",
- c->phys_proc_id);
- if (c->x86_max_cores > 1)
- pr_info("CPU: Processor Core ID: %d\n",
- c->cpu_core_id);
- printed = 1;
- }
#endif
return 0;
}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 666a284116ac..9c8652974f8e 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -22,8 +22,6 @@
#include <asm/stacktrace.h>
#include <asm/unwind.h>
-#define OPCODE_BUFSIZE 64
-
int panic_on_unrecovered_nmi;
int panic_on_io_nmi;
static int die_counter;
@@ -93,26 +91,18 @@ static void printk_stack_address(unsigned long address, int reliable,
*/
void show_opcodes(u8 *rip, const char *loglvl)
{
- unsigned int code_prologue = OPCODE_BUFSIZE * 2 / 3;
+#define PROLOGUE_SIZE 42
+#define EPILOGUE_SIZE 21
+#define OPCODE_BUFSIZE (PROLOGUE_SIZE + 1 + EPILOGUE_SIZE)
u8 opcodes[OPCODE_BUFSIZE];
- u8 *ip;
- int i;
-
- printk("%sCode: ", loglvl);
-
- ip = (u8 *)rip - code_prologue;
- if (probe_kernel_read(opcodes, ip, OPCODE_BUFSIZE)) {
- pr_cont("Bad RIP value.\n");
- return;
- }
- for (i = 0; i < OPCODE_BUFSIZE; i++, ip++) {
- if (ip == rip)
- pr_cont("<%02x> ", opcodes[i]);
- else
- pr_cont("%02x ", opcodes[i]);
+ if (probe_kernel_read(opcodes, rip - PROLOGUE_SIZE, OPCODE_BUFSIZE)) {
+ printk("%sCode: Bad RIP value.\n", loglvl);
+ } else {
+ printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %"
+ __stringify(EPILOGUE_SIZE) "ph\n", loglvl, opcodes,
+ opcodes[PROLOGUE_SIZE], opcodes + PROLOGUE_SIZE + 1);
}
- pr_cont("\n");
}
void show_ip(struct pt_regs *regs, const char *loglvl)
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index f92a6593de1e..2ea85b32421a 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -10,6 +10,7 @@
#include <asm/fpu/signal.h>
#include <asm/fpu/types.h>
#include <asm/traps.h>
+#include <asm/irq_regs.h>
#include <linux/hardirq.h>
#include <linux/pkeys.h>
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index abe6df15a8fb..30f9cb2c0b55 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -512,11 +512,18 @@ ENTRY(initial_code)
ENTRY(setup_once_ref)
.long setup_once
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#define PGD_ALIGN (2 * PAGE_SIZE)
+#define PTI_USER_PGD_FILL 1024
+#else
+#define PGD_ALIGN (PAGE_SIZE)
+#define PTI_USER_PGD_FILL 0
+#endif
/*
* BSS section
*/
__PAGE_ALIGNED_BSS
- .align PAGE_SIZE
+ .align PGD_ALIGN
#ifdef CONFIG_X86_PAE
.globl initial_pg_pmd
initial_pg_pmd:
@@ -526,14 +533,17 @@ initial_pg_pmd:
initial_page_table:
.fill 1024,4,0
#endif
+ .align PGD_ALIGN
initial_pg_fixmap:
.fill 1024,4,0
-.globl empty_zero_page
-empty_zero_page:
- .fill 4096,1,0
.globl swapper_pg_dir
+ .align PGD_ALIGN
swapper_pg_dir:
.fill 1024,4,0
+ .fill PTI_USER_PGD_FILL,4,0
+.globl empty_zero_page
+empty_zero_page:
+ .fill 4096,1,0
EXPORT_SYMBOL(empty_zero_page)
/*
@@ -542,7 +552,7 @@ EXPORT_SYMBOL(empty_zero_page)
#ifdef CONFIG_X86_PAE
__PAGE_ALIGNED_DATA
/* Page-aligned for the benefit of paravirt? */
- .align PAGE_SIZE
+ .align PGD_ALIGN
ENTRY(initial_page_table)
.long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
# if KPMDS == 3
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 8344dd2f310a..15ebc2fc166e 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -235,7 +235,7 @@ ENTRY(secondary_startup_64)
* address given in m16:64.
*/
pushq $.Lafter_lret # put return address on stack for unwinder
- xorq %rbp, %rbp # clear frame pointer
+ xorl %ebp, %ebp # clear frame pointer
movq initial_code(%rip), %rax
pushq $__KERNEL_CS # set correct cs
pushq %rax # target address in negative space
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 346b24883911..b0acb22e5a46 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -1,6 +1,7 @@
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <linux/errno.h>
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 8771766d46b6..34a5c1715148 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -169,28 +169,29 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
set_dr_addr_mask(0, i);
}
-/*
- * Check for virtual address in kernel space.
- */
-int arch_check_bp_in_kernelspace(struct perf_event *bp)
+static int arch_bp_generic_len(int x86_len)
{
- unsigned int len;
- unsigned long va;
- struct arch_hw_breakpoint *info = counter_arch_bp(bp);
-
- va = info->address;
- len = bp->attr.bp_len;
-
- /*
- * We don't need to worry about va + len - 1 overflowing:
- * we already require that va is aligned to a multiple of len.
- */
- return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX);
+ switch (x86_len) {
+ case X86_BREAKPOINT_LEN_1:
+ return HW_BREAKPOINT_LEN_1;
+ case X86_BREAKPOINT_LEN_2:
+ return HW_BREAKPOINT_LEN_2;
+ case X86_BREAKPOINT_LEN_4:
+ return HW_BREAKPOINT_LEN_4;
+#ifdef CONFIG_X86_64
+ case X86_BREAKPOINT_LEN_8:
+ return HW_BREAKPOINT_LEN_8;
+#endif
+ default:
+ return -EINVAL;
+ }
}
int arch_bp_generic_fields(int x86_len, int x86_type,
int *gen_len, int *gen_type)
{
+ int len;
+
/* Type */
switch (x86_type) {
case X86_BREAKPOINT_EXECUTE:
@@ -211,42 +212,47 @@ int arch_bp_generic_fields(int x86_len, int x86_type,
}
/* Len */
- switch (x86_len) {
- case X86_BREAKPOINT_LEN_1:
- *gen_len = HW_BREAKPOINT_LEN_1;
- break;
- case X86_BREAKPOINT_LEN_2:
- *gen_len = HW_BREAKPOINT_LEN_2;
- break;
- case X86_BREAKPOINT_LEN_4:
- *gen_len = HW_BREAKPOINT_LEN_4;
- break;
-#ifdef CONFIG_X86_64
- case X86_BREAKPOINT_LEN_8:
- *gen_len = HW_BREAKPOINT_LEN_8;
- break;
-#endif
- default:
+ len = arch_bp_generic_len(x86_len);
+ if (len < 0)
return -EINVAL;
- }
+ *gen_len = len;
return 0;
}
-
-static int arch_build_bp_info(struct perf_event *bp)
+/*
+ * Check for virtual address in kernel space.
+ */
+int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw)
{
- struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+ unsigned long va;
+ int len;
- info->address = bp->attr.bp_addr;
+ va = hw->address;
+ len = arch_bp_generic_len(hw->len);
+ WARN_ON_ONCE(len < 0);
+
+ /*
+ * We don't need to worry about va + len - 1 overflowing:
+ * we already require that va is aligned to a multiple of len.
+ */
+ return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX);
+}
+
+static int arch_build_bp_info(struct perf_event *bp,
+ const struct perf_event_attr *attr,
+ struct arch_hw_breakpoint *hw)
+{
+ hw->address = attr->bp_addr;
+ hw->mask = 0;
/* Type */
- switch (bp->attr.bp_type) {
+ switch (attr->bp_type) {
case HW_BREAKPOINT_W:
- info->type = X86_BREAKPOINT_WRITE;
+ hw->type = X86_BREAKPOINT_WRITE;
break;
case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
- info->type = X86_BREAKPOINT_RW;
+ hw->type = X86_BREAKPOINT_RW;
break;
case HW_BREAKPOINT_X:
/*
@@ -254,23 +260,23 @@ static int arch_build_bp_info(struct perf_event *bp)
* acceptable for kprobes. On non-kprobes kernels, we don't
* allow kernel breakpoints at all.
*/
- if (bp->attr.bp_addr >= TASK_SIZE_MAX) {
+ if (attr->bp_addr >= TASK_SIZE_MAX) {
#ifdef CONFIG_KPROBES
- if (within_kprobe_blacklist(bp->attr.bp_addr))
+ if (within_kprobe_blacklist(attr->bp_addr))
return -EINVAL;
#else
return -EINVAL;
#endif
}
- info->type = X86_BREAKPOINT_EXECUTE;
+ hw->type = X86_BREAKPOINT_EXECUTE;
/*
* x86 inst breakpoints need to have a specific undefined len.
* But we still need to check userspace is not trying to setup
* an unsupported length, to get a range breakpoint for example.
*/
- if (bp->attr.bp_len == sizeof(long)) {
- info->len = X86_BREAKPOINT_LEN_X;
+ if (attr->bp_len == sizeof(long)) {
+ hw->len = X86_BREAKPOINT_LEN_X;
return 0;
}
default:
@@ -278,28 +284,26 @@ static int arch_build_bp_info(struct perf_event *bp)
}
/* Len */
- info->mask = 0;
-
- switch (bp->attr.bp_len) {
+ switch (attr->bp_len) {
case HW_BREAKPOINT_LEN_1:
- info->len = X86_BREAKPOINT_LEN_1;
+ hw->len = X86_BREAKPOINT_LEN_1;
break;
case HW_BREAKPOINT_LEN_2:
- info->len = X86_BREAKPOINT_LEN_2;
+ hw->len = X86_BREAKPOINT_LEN_2;
break;
case HW_BREAKPOINT_LEN_4:
- info->len = X86_BREAKPOINT_LEN_4;
+ hw->len = X86_BREAKPOINT_LEN_4;
break;
#ifdef CONFIG_X86_64
case HW_BREAKPOINT_LEN_8:
- info->len = X86_BREAKPOINT_LEN_8;
+ hw->len = X86_BREAKPOINT_LEN_8;
break;
#endif
default:
/* AMD range breakpoint */
- if (!is_power_of_2(bp->attr.bp_len))
+ if (!is_power_of_2(attr->bp_len))
return -EINVAL;
- if (bp->attr.bp_addr & (bp->attr.bp_len - 1))
+ if (attr->bp_addr & (attr->bp_len - 1))
return -EINVAL;
if (!boot_cpu_has(X86_FEATURE_BPEXT))
@@ -312,8 +316,8 @@ static int arch_build_bp_info(struct perf_event *bp)
* breakpoints, then we'll have to check for kprobe-blacklisted
* addresses anywhere in the range.
*/
- info->mask = bp->attr.bp_len - 1;
- info->len = X86_BREAKPOINT_LEN_1;
+ hw->mask = attr->bp_len - 1;
+ hw->len = X86_BREAKPOINT_LEN_1;
}
return 0;
@@ -322,22 +326,23 @@ static int arch_build_bp_info(struct perf_event *bp)
/*
* Validate the arch-specific HW Breakpoint register settings
*/
-int arch_validate_hwbkpt_settings(struct perf_event *bp)
+int hw_breakpoint_arch_parse(struct perf_event *bp,
+ const struct perf_event_attr *attr,
+ struct arch_hw_breakpoint *hw)
{
- struct arch_hw_breakpoint *info = counter_arch_bp(bp);
unsigned int align;
int ret;
- ret = arch_build_bp_info(bp);
+ ret = arch_build_bp_info(bp, attr, hw);
if (ret)
return ret;
- switch (info->len) {
+ switch (hw->len) {
case X86_BREAKPOINT_LEN_1:
align = 0;
- if (info->mask)
- align = info->mask;
+ if (hw->mask)
+ align = hw->mask;
break;
case X86_BREAKPOINT_LEN_2:
align = 1;
@@ -358,7 +363,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
* Check that the low-order bits of the address are appropriate
* for the alignment implied by len.
*/
- if (info->address & align)
+ if (hw->address & align)
return -EINVAL;
return 0;
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 86c4439f9d74..519649ddf100 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -5,6 +5,7 @@
#include <linux/sched.h>
#include <linux/ioport.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/timex.h>
#include <linux/random.h>
#include <linux/init.h>
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 74383a3780dc..01adea278a71 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -8,6 +8,7 @@
#include <asm/traps.h>
#include <asm/proto.h>
#include <asm/desc.h>
+#include <asm/hw_irq.h>
struct idt_data {
unsigned int vector;
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 328d027d829d..59b5f2ea7c2f 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -10,6 +10,7 @@
#include <linux/ftrace.h>
#include <linux/delay.h>
#include <linux/export.h>
+#include <linux/irq.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index c1bdbd3d3232..95600a99ae93 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -11,6 +11,7 @@
#include <linux/seq_file.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/kernel_stat.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index d86e344f5b3d..0469cd078db1 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -11,6 +11,7 @@
#include <linux/kernel_stat.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/seq_file.h>
#include <linux/delay.h>
#include <linux/ftrace.h>
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 772196c1b8c4..a0693b71cfc1 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -5,6 +5,7 @@
#include <linux/sched.h>
#include <linux/ioport.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/timex.h>
#include <linux/random.h>
#include <linux/kprobes.h>
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index e56c95be2808..eeea935e9bb5 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -37,15 +37,18 @@ static void bug_at(unsigned char *ip, int line)
BUG();
}
-static void __jump_label_transform(struct jump_entry *entry,
- enum jump_label_type type,
- void *(*poker)(void *, const void *, size_t),
- int init)
+static void __ref __jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type,
+ void *(*poker)(void *, const void *, size_t),
+ int init)
{
union jump_code_union code;
const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
+ if (early_boot_irqs_disabled)
+ poker = text_poke_early;
+
if (type == JUMP_LABEL_JMP) {
if (init) {
/*
diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h
index ae38dccf0c8f..2b949f4fd4d8 100644
--- a/arch/x86/kernel/kprobes/common.h
+++ b/arch/x86/kernel/kprobes/common.h
@@ -105,14 +105,4 @@ static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsig
}
#endif
-#ifdef CONFIG_KPROBES_ON_FTRACE
-extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb);
-#else
-static inline int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
-{
- return 0;
-}
-#endif
#endif
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 6f4d42377fe5..b0d1e81c96bb 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -66,8 +66,6 @@
#include "common.h"
-void jprobe_return_end(void);
-
DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
@@ -395,8 +393,6 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
- (u8 *) real;
if ((s64) (s32) newdisp != newdisp) {
pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp);
- pr_err("\tSrc: %p, Dest: %p, old disp: %x\n",
- src, real, insn->displacement.value);
return 0;
}
disp = (u8 *) dest + insn_offset_displacement(insn);
@@ -596,7 +592,6 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
* stepping.
*/
regs->ip = (unsigned long)p->ainsn.insn;
- preempt_enable_no_resched();
return;
}
#endif
@@ -640,8 +635,7 @@ static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
* Raise a BUG or we'll continue in an endless reentering loop
* and eventually a stack overflow.
*/
- printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n",
- p->addr);
+ pr_err("Unrecoverable kprobe detected.\n");
dump_kprobe(p);
BUG();
default:
@@ -669,12 +663,10 @@ int kprobe_int3_handler(struct pt_regs *regs)
addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
/*
- * We don't want to be preempted for the entire
- * duration of kprobe processing. We conditionally
- * re-enable preemption at the end of this function,
- * and also in reenter_kprobe() and setup_singlestep().
+ * We don't want to be preempted for the entire duration of kprobe
+ * processing. Since int3 and debug trap disables irqs and we clear
+ * IF while singlestepping, it must be no preemptible.
*/
- preempt_disable();
kcb = get_kprobe_ctlblk();
p = get_kprobe(addr);
@@ -690,13 +682,14 @@ int kprobe_int3_handler(struct pt_regs *regs)
/*
* If we have no pre-handler or it returned 0, we
* continue with normal processing. If we have a
- * pre-handler and it returned non-zero, it prepped
- * for calling the break_handler below on re-entry
- * for jprobe processing, so get out doing nothing
- * more here.
+ * pre-handler and it returned non-zero, that means
+ * user handler setup registers to exit to another
+ * instruction, we must skip the single stepping.
*/
if (!p->pre_handler || !p->pre_handler(p, regs))
setup_singlestep(p, regs, kcb, 0);
+ else
+ reset_current_kprobe();
return 1;
}
} else if (*addr != BREAKPOINT_INSTRUCTION) {
@@ -710,18 +703,9 @@ int kprobe_int3_handler(struct pt_regs *regs)
* the original instruction.
*/
regs->ip = (unsigned long)addr;
- preempt_enable_no_resched();
return 1;
- } else if (kprobe_running()) {
- p = __this_cpu_read(current_kprobe);
- if (p->break_handler && p->break_handler(p, regs)) {
- if (!skip_singlestep(p, regs, kcb))
- setup_singlestep(p, regs, kcb, 0);
- return 1;
- }
} /* else: not a kprobe fault; let the kernel handle it */
- preempt_enable_no_resched();
return 0;
}
NOKPROBE_SYMBOL(kprobe_int3_handler);
@@ -972,8 +956,6 @@ int kprobe_debug_handler(struct pt_regs *regs)
}
reset_current_kprobe();
out:
- preempt_enable_no_resched();
-
/*
* if somebody else is singlestepping across a probe point, flags
* will have TF set, in which case, continue the remaining processing
@@ -1020,7 +1002,6 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
restore_previous_kprobe(kcb);
else
reset_current_kprobe();
- preempt_enable_no_resched();
} else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE ||
kcb->kprobe_status == KPROBE_HIT_SSDONE) {
/*
@@ -1083,93 +1064,6 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
}
NOKPROBE_SYMBOL(kprobe_exceptions_notify);
-int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
-{
- struct jprobe *jp = container_of(p, struct jprobe, kp);
- unsigned long addr;
- struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
- kcb->jprobe_saved_regs = *regs;
- kcb->jprobe_saved_sp = stack_addr(regs);
- addr = (unsigned long)(kcb->jprobe_saved_sp);
-
- /*
- * As Linus pointed out, gcc assumes that the callee
- * owns the argument space and could overwrite it, e.g.
- * tailcall optimization. So, to be absolutely safe
- * we also save and restore enough stack bytes to cover
- * the argument area.
- * Use __memcpy() to avoid KASAN stack out-of-bounds reports as we copy
- * raw stack chunk with redzones:
- */
- __memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr));
- regs->ip = (unsigned long)(jp->entry);
-
- /*
- * jprobes use jprobe_return() which skips the normal return
- * path of the function, and this messes up the accounting of the
- * function graph tracer to get messed up.
- *
- * Pause function graph tracing while performing the jprobe function.
- */
- pause_graph_tracing();
- return 1;
-}
-NOKPROBE_SYMBOL(setjmp_pre_handler);
-
-void jprobe_return(void)
-{
- struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
- /* Unpoison stack redzones in the frames we are going to jump over. */
- kasan_unpoison_stack_above_sp_to(kcb->jprobe_saved_sp);
-
- asm volatile (
-#ifdef CONFIG_X86_64
- " xchg %%rbx,%%rsp \n"
-#else
- " xchgl %%ebx,%%esp \n"
-#endif
- " int3 \n"
- " .globl jprobe_return_end\n"
- " jprobe_return_end: \n"
- " nop \n"::"b"
- (kcb->jprobe_saved_sp):"memory");
-}
-NOKPROBE_SYMBOL(jprobe_return);
-NOKPROBE_SYMBOL(jprobe_return_end);
-
-int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
-{
- struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
- u8 *addr = (u8 *) (regs->ip - 1);
- struct jprobe *jp = container_of(p, struct jprobe, kp);
- void *saved_sp = kcb->jprobe_saved_sp;
-
- if ((addr > (u8 *) jprobe_return) &&
- (addr < (u8 *) jprobe_return_end)) {
- if (stack_addr(regs) != saved_sp) {
- struct pt_regs *saved_regs = &kcb->jprobe_saved_regs;
- printk(KERN_ERR
- "current sp %p does not match saved sp %p\n",
- stack_addr(regs), saved_sp);
- printk(KERN_ERR "Saved registers for jprobe %p\n", jp);
- show_regs(saved_regs);
- printk(KERN_ERR "Current registers\n");
- show_regs(regs);
- BUG();
- }
- /* It's OK to start function graph tracing again */
- unpause_graph_tracing();
- *regs = kcb->jprobe_saved_regs;
- __memcpy(saved_sp, kcb->jprobes_stack, MIN_STACK_SIZE(saved_sp));
- preempt_enable_no_resched();
- return 1;
- }
- return 0;
-}
-NOKPROBE_SYMBOL(longjmp_break_handler);
-
bool arch_within_kprobe_blacklist(unsigned long addr)
{
bool is_in_entry_trampoline_section = false;
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 8dc0161cec8f..ef819e19650b 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -25,36 +25,6 @@
#include "common.h"
-static nokprobe_inline
-void __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb, unsigned long orig_ip)
-{
- /*
- * Emulate singlestep (and also recover regs->ip)
- * as if there is a 5byte nop
- */
- regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE;
- if (unlikely(p->post_handler)) {
- kcb->kprobe_status = KPROBE_HIT_SSDONE;
- p->post_handler(p, regs, 0);
- }
- __this_cpu_write(current_kprobe, NULL);
- if (orig_ip)
- regs->ip = orig_ip;
-}
-
-int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
-{
- if (kprobe_ftrace(p)) {
- __skip_singlestep(p, regs, kcb, 0);
- preempt_enable_no_resched();
- return 1;
- }
- return 0;
-}
-NOKPROBE_SYMBOL(skip_singlestep);
-
/* Ftrace callback handler for kprobes -- called under preepmt disabed */
void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct pt_regs *regs)
@@ -75,18 +45,25 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
/* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */
regs->ip = ip + sizeof(kprobe_opcode_t);
- /* To emulate trap based kprobes, preempt_disable here */
- preempt_disable();
__this_cpu_write(current_kprobe, p);
kcb->kprobe_status = KPROBE_HIT_ACTIVE;
if (!p->pre_handler || !p->pre_handler(p, regs)) {
- __skip_singlestep(p, regs, kcb, orig_ip);
- preempt_enable_no_resched();
+ /*
+ * Emulate singlestep (and also recover regs->ip)
+ * as if there is a 5byte nop
+ */
+ regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE;
+ if (unlikely(p->post_handler)) {
+ kcb->kprobe_status = KPROBE_HIT_SSDONE;
+ p->post_handler(p, regs, 0);
+ }
+ regs->ip = orig_ip;
}
/*
- * If pre_handler returns !0, it sets regs->ip and
- * resets current kprobe, and keep preempt count +1.
+ * If pre_handler returns !0, it changes regs->ip. We have to
+ * skip emulating post_handler.
*/
+ __this_cpu_write(current_kprobe, NULL);
}
}
NOKPROBE_SYMBOL(kprobe_ftrace_handler);
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 203d398802a3..eaf02f2e7300 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -491,7 +491,6 @@ int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
if (!reenter)
reset_current_kprobe();
- preempt_enable_no_resched();
return 1;
}
return 0;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 5b2300b818af..09aaabb2bbf1 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -45,7 +45,6 @@
#include <asm/apic.h>
#include <asm/apicdef.h>
#include <asm/hypervisor.h>
-#include <asm/kvm_guest.h>
static int kvmapf = 1;
@@ -66,15 +65,6 @@ static int __init parse_no_stealacc(char *arg)
early_param("no-steal-acc", parse_no_stealacc);
-static int kvmclock_vsyscall = 1;
-static int __init parse_no_kvmclock_vsyscall(char *arg)
-{
- kvmclock_vsyscall = 0;
- return 0;
-}
-
-early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
-
static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64);
static int has_steal_clock = 0;
@@ -154,7 +144,7 @@ void kvm_async_pf_task_wait(u32 token, int interrupt_kernel)
for (;;) {
if (!n.halted)
- prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
if (hlist_unhashed(&n.link))
break;
@@ -188,7 +178,7 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n)
if (n->halted)
smp_send_reschedule(n->cpu);
else if (swq_has_sleeper(&n->wq))
- swake_up(&n->wq);
+ swake_up_one(&n->wq);
}
static void apf_task_wake_all(void)
@@ -560,9 +550,6 @@ static void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
apic_set_eoi_write(kvm_guest_apic_eoi_write);
- if (kvmclock_vsyscall)
- kvm_setup_vsyscall_timeinfo();
-
#ifdef CONFIG_SMP
smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
@@ -628,6 +615,7 @@ const __initconst struct hypervisor_x86 x86_hyper_kvm = {
.name = "KVM",
.detect = kvm_detect,
.type = X86_HYPER_KVM,
+ .init.init_platform = kvmclock_init,
.init.guest_late_init = kvm_guest_init,
.init.x2apic_available = kvm_para_available,
};
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 3b8e7c13c614..1e6764648af3 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -23,30 +23,57 @@
#include <asm/apic.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
-#include <linux/memblock.h>
+#include <linux/cpuhotplug.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <asm/hypervisor.h>
#include <asm/mem_encrypt.h>
#include <asm/x86_init.h>
#include <asm/reboot.h>
#include <asm/kvmclock.h>
-static int kvmclock __ro_after_init = 1;
-static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
-static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
-static u64 kvm_sched_clock_offset;
+static int kvmclock __initdata = 1;
+static int kvmclock_vsyscall __initdata = 1;
+static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME;
+static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK;
+static u64 kvm_sched_clock_offset __ro_after_init;
-static int parse_no_kvmclock(char *arg)
+static int __init parse_no_kvmclock(char *arg)
{
kvmclock = 0;
return 0;
}
early_param("no-kvmclock", parse_no_kvmclock);
-/* The hypervisor will put information about time periodically here */
-static struct pvclock_vsyscall_time_info *hv_clock;
-static struct pvclock_wall_clock *wall_clock;
+static int __init parse_no_kvmclock_vsyscall(char *arg)
+{
+ kvmclock_vsyscall = 0;
+ return 0;
+}
+early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
+
+/* Aligned to page sizes to match whats mapped via vsyscalls to userspace */
+#define HV_CLOCK_SIZE (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS)
+#define HVC_BOOT_ARRAY_SIZE \
+ (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
+
+static struct pvclock_vsyscall_time_info
+ hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE);
+static struct pvclock_wall_clock wall_clock;
+static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
+
+static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
+{
+ return &this_cpu_read(hv_clock_per_cpu)->pvti;
+}
+
+static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void)
+{
+ return this_cpu_read(hv_clock_per_cpu);
+}
/*
* The wallclock is the time of day when we booted. Since then, some time may
@@ -55,21 +82,10 @@ static struct pvclock_wall_clock *wall_clock;
*/
static void kvm_get_wallclock(struct timespec64 *now)
{
- struct pvclock_vcpu_time_info *vcpu_time;
- int low, high;
- int cpu;
-
- low = (int)slow_virt_to_phys(wall_clock);
- high = ((u64)slow_virt_to_phys(wall_clock) >> 32);
-
- native_write_msr(msr_kvm_wall_clock, low, high);
-
- cpu = get_cpu();
-
- vcpu_time = &hv_clock[cpu].pvti;
- pvclock_read_wallclock(wall_clock, vcpu_time, now);
-
- put_cpu();
+ wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock));
+ preempt_disable();
+ pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now);
+ preempt_enable();
}
static int kvm_set_wallclock(const struct timespec64 *now)
@@ -79,14 +95,10 @@ static int kvm_set_wallclock(const struct timespec64 *now)
static u64 kvm_clock_read(void)
{
- struct pvclock_vcpu_time_info *src;
u64 ret;
- int cpu;
preempt_disable_notrace();
- cpu = smp_processor_id();
- src = &hv_clock[cpu].pvti;
- ret = pvclock_clocksource_read(src);
+ ret = pvclock_clocksource_read(this_cpu_pvti());
preempt_enable_notrace();
return ret;
}
@@ -112,11 +124,11 @@ static inline void kvm_sched_clock_init(bool stable)
kvm_sched_clock_offset = kvm_clock_read();
pv_time_ops.sched_clock = kvm_sched_clock_read;
- printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n",
- kvm_sched_clock_offset);
+ pr_info("kvm-clock: using sched offset of %llu cycles",
+ kvm_sched_clock_offset);
BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
- sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
+ sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
}
/*
@@ -130,19 +142,11 @@ static inline void kvm_sched_clock_init(bool stable)
*/
static unsigned long kvm_get_tsc_khz(void)
{
- struct pvclock_vcpu_time_info *src;
- int cpu;
- unsigned long tsc_khz;
-
- cpu = get_cpu();
- src = &hv_clock[cpu].pvti;
- tsc_khz = pvclock_tsc_khz(src);
- put_cpu();
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
- return tsc_khz;
+ return pvclock_tsc_khz(this_cpu_pvti());
}
-static void kvm_get_preset_lpj(void)
+static void __init kvm_get_preset_lpj(void)
{
unsigned long khz;
u64 lpj;
@@ -156,49 +160,40 @@ static void kvm_get_preset_lpj(void)
bool kvm_check_and_clear_guest_paused(void)
{
+ struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
bool ret = false;
- struct pvclock_vcpu_time_info *src;
- int cpu = smp_processor_id();
- if (!hv_clock)
+ if (!src)
return ret;
- src = &hv_clock[cpu].pvti;
- if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
- src->flags &= ~PVCLOCK_GUEST_STOPPED;
+ if ((src->pvti.flags & PVCLOCK_GUEST_STOPPED) != 0) {
+ src->pvti.flags &= ~PVCLOCK_GUEST_STOPPED;
pvclock_touch_watchdogs();
ret = true;
}
-
return ret;
}
struct clocksource kvm_clock = {
- .name = "kvm-clock",
- .read = kvm_clock_get_cycles,
- .rating = 400,
- .mask = CLOCKSOURCE_MASK(64),
- .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+ .name = "kvm-clock",
+ .read = kvm_clock_get_cycles,
+ .rating = 400,
+ .mask = CLOCKSOURCE_MASK(64),
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
EXPORT_SYMBOL_GPL(kvm_clock);
-int kvm_register_clock(char *txt)
+static void kvm_register_clock(char *txt)
{
- int cpu = smp_processor_id();
- int low, high, ret;
- struct pvclock_vcpu_time_info *src;
-
- if (!hv_clock)
- return 0;
+ struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
+ u64 pa;
- src = &hv_clock[cpu].pvti;
- low = (int)slow_virt_to_phys(src) | 1;
- high = ((u64)slow_virt_to_phys(src) >> 32);
- ret = native_write_msr_safe(msr_kvm_system_time, low, high);
- printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
- cpu, high, low, txt);
+ if (!src)
+ return;
- return ret;
+ pa = slow_virt_to_phys(&src->pvti) | 0x01ULL;
+ wrmsrl(msr_kvm_system_time, pa);
+ pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
}
static void kvm_save_sched_clock_state(void)
@@ -213,11 +208,7 @@ static void kvm_restore_sched_clock_state(void)
#ifdef CONFIG_X86_LOCAL_APIC
static void kvm_setup_secondary_clock(void)
{
- /*
- * Now that the first cpu already had this clocksource initialized,
- * we shouldn't fail.
- */
- WARN_ON(kvm_register_clock("secondary cpu clock"));
+ kvm_register_clock("secondary cpu clock");
}
#endif
@@ -245,100 +236,84 @@ static void kvm_shutdown(void)
native_machine_shutdown();
}
-static phys_addr_t __init kvm_memblock_alloc(phys_addr_t size,
- phys_addr_t align)
+static int __init kvm_setup_vsyscall_timeinfo(void)
{
- phys_addr_t mem;
+#ifdef CONFIG_X86_64
+ u8 flags;
- mem = memblock_alloc(size, align);
- if (!mem)
+ if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall)
return 0;
- if (sev_active()) {
- if (early_set_memory_decrypted((unsigned long)__va(mem), size))
- goto e_free;
- }
+ flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
+ if (!(flags & PVCLOCK_TSC_STABLE_BIT))
+ return 0;
- return mem;
-e_free:
- memblock_free(mem, size);
+ kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
+#endif
return 0;
}
+early_initcall(kvm_setup_vsyscall_timeinfo);
-static void __init kvm_memblock_free(phys_addr_t addr, phys_addr_t size)
+static int kvmclock_setup_percpu(unsigned int cpu)
{
- if (sev_active())
- early_set_memory_encrypted((unsigned long)__va(addr), size);
+ struct pvclock_vsyscall_time_info *p = per_cpu(hv_clock_per_cpu, cpu);
- memblock_free(addr, size);
+ /*
+ * The per cpu area setup replicates CPU0 data to all cpu
+ * pointers. So carefully check. CPU0 has been set up in init
+ * already.
+ */
+ if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0)))
+ return 0;
+
+ /* Use the static page for the first CPUs, allocate otherwise */
+ if (cpu < HVC_BOOT_ARRAY_SIZE)
+ p = &hv_clock_boot[cpu];
+ else
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+
+ per_cpu(hv_clock_per_cpu, cpu) = p;
+ return p ? 0 : -ENOMEM;
}
void __init kvmclock_init(void)
{
- struct pvclock_vcpu_time_info *vcpu_time;
- unsigned long mem, mem_wall_clock;
- int size, cpu, wall_clock_size;
u8 flags;
- size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
-
- if (!kvm_para_available())
+ if (!kvm_para_available() || !kvmclock)
return;
- if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
+ if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
- } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)))
- return;
-
- wall_clock_size = PAGE_ALIGN(sizeof(struct pvclock_wall_clock));
- mem_wall_clock = kvm_memblock_alloc(wall_clock_size, PAGE_SIZE);
- if (!mem_wall_clock)
- return;
-
- wall_clock = __va(mem_wall_clock);
- memset(wall_clock, 0, wall_clock_size);
-
- mem = kvm_memblock_alloc(size, PAGE_SIZE);
- if (!mem) {
- kvm_memblock_free(mem_wall_clock, wall_clock_size);
- wall_clock = NULL;
+ } else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
return;
}
- hv_clock = __va(mem);
- memset(hv_clock, 0, size);
-
- if (kvm_register_clock("primary cpu clock")) {
- hv_clock = NULL;
- kvm_memblock_free(mem, size);
- kvm_memblock_free(mem_wall_clock, wall_clock_size);
- wall_clock = NULL;
+ if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu",
+ kvmclock_setup_percpu, NULL) < 0) {
return;
}
- printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
+ pr_info("kvm-clock: Using msrs %x and %x",
msr_kvm_system_time, msr_kvm_wall_clock);
- pvclock_set_pvti_cpu0_va(hv_clock);
+ this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]);
+ kvm_register_clock("primary cpu clock");
+ pvclock_set_pvti_cpu0_va(hv_clock_boot);
if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
- cpu = get_cpu();
- vcpu_time = &hv_clock[cpu].pvti;
- flags = pvclock_read_flags(vcpu_time);
-
+ flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
- put_cpu();
x86_platform.calibrate_tsc = kvm_get_tsc_khz;
x86_platform.calibrate_cpu = kvm_get_tsc_khz;
x86_platform.get_wallclock = kvm_get_wallclock;
x86_platform.set_wallclock = kvm_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
- x86_cpuinit.early_percpu_clock_init =
- kvm_setup_secondary_clock;
+ x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock;
#endif
x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
@@ -350,31 +325,3 @@ void __init kvmclock_init(void)
clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
pv_info.name = "KVM";
}
-
-int __init kvm_setup_vsyscall_timeinfo(void)
-{
-#ifdef CONFIG_X86_64
- int cpu;
- u8 flags;
- struct pvclock_vcpu_time_info *vcpu_time;
- unsigned int size;
-
- if (!hv_clock)
- return 0;
-
- size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
-
- cpu = get_cpu();
-
- vcpu_time = &hv_clock[cpu].pvti;
- flags = pvclock_read_flags(vcpu_time);
-
- put_cpu();
-
- if (!(flags & PVCLOCK_TSC_STABLE_BIT))
- return 1;
-
- kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
-#endif
- return 0;
-}
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index c9b14020f4dd..733e6ace0fa4 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -100,6 +100,102 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
return new_ldt;
}
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+
+static void do_sanity_check(struct mm_struct *mm,
+ bool had_kernel_mapping,
+ bool had_user_mapping)
+{
+ if (mm->context.ldt) {
+ /*
+ * We already had an LDT. The top-level entry should already
+ * have been allocated and synchronized with the usermode
+ * tables.
+ */
+ WARN_ON(!had_kernel_mapping);
+ if (static_cpu_has(X86_FEATURE_PTI))
+ WARN_ON(!had_user_mapping);
+ } else {
+ /*
+ * This is the first time we're mapping an LDT for this process.
+ * Sync the pgd to the usermode tables.
+ */
+ WARN_ON(had_kernel_mapping);
+ if (static_cpu_has(X86_FEATURE_PTI))
+ WARN_ON(had_user_mapping);
+ }
+}
+
+#ifdef CONFIG_X86_PAE
+
+static pmd_t *pgd_to_pmd_walk(pgd_t *pgd, unsigned long va)
+{
+ p4d_t *p4d;
+ pud_t *pud;
+
+ if (pgd->pgd == 0)
+ return NULL;
+
+ p4d = p4d_offset(pgd, va);
+ if (p4d_none(*p4d))
+ return NULL;
+
+ pud = pud_offset(p4d, va);
+ if (pud_none(*pud))
+ return NULL;
+
+ return pmd_offset(pud, va);
+}
+
+static void map_ldt_struct_to_user(struct mm_struct *mm)
+{
+ pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR);
+ pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
+ pmd_t *k_pmd, *u_pmd;
+
+ k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR);
+ u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR);
+
+ if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt)
+ set_pmd(u_pmd, *k_pmd);
+}
+
+static void sanity_check_ldt_mapping(struct mm_struct *mm)
+{
+ pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR);
+ pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
+ bool had_kernel, had_user;
+ pmd_t *k_pmd, *u_pmd;
+
+ k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR);
+ u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR);
+ had_kernel = (k_pmd->pmd != 0);
+ had_user = (u_pmd->pmd != 0);
+
+ do_sanity_check(mm, had_kernel, had_user);
+}
+
+#else /* !CONFIG_X86_PAE */
+
+static void map_ldt_struct_to_user(struct mm_struct *mm)
+{
+ pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR);
+
+ if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt)
+ set_pgd(kernel_to_user_pgdp(pgd), *pgd);
+}
+
+static void sanity_check_ldt_mapping(struct mm_struct *mm)
+{
+ pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR);
+ bool had_kernel = (pgd->pgd != 0);
+ bool had_user = (kernel_to_user_pgdp(pgd)->pgd != 0);
+
+ do_sanity_check(mm, had_kernel, had_user);
+}
+
+#endif /* CONFIG_X86_PAE */
+
/*
* If PTI is enabled, this maps the LDT into the kernelmode and
* usermode tables for the given mm.
@@ -115,9 +211,8 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
static int
map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
{
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
- bool is_vmalloc, had_top_level_entry;
unsigned long va;
+ bool is_vmalloc;
spinlock_t *ptl;
pgd_t *pgd;
int i;
@@ -131,13 +226,15 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
*/
WARN_ON(ldt->slot != -1);
+ /* Check if the current mappings are sane */
+ sanity_check_ldt_mapping(mm);
+
/*
* Did we already have the top level entry allocated? We can't
* use pgd_none() for this because it doens't do anything on
* 4-level page table kernels.
*/
pgd = pgd_offset(mm, LDT_BASE_ADDR);
- had_top_level_entry = (pgd->pgd != 0);
is_vmalloc = is_vmalloc_addr(ldt->entries);
@@ -172,41 +269,31 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
pte_unmap_unlock(ptep, ptl);
}
- if (mm->context.ldt) {
- /*
- * We already had an LDT. The top-level entry should already
- * have been allocated and synchronized with the usermode
- * tables.
- */
- WARN_ON(!had_top_level_entry);
- if (static_cpu_has(X86_FEATURE_PTI))
- WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
- } else {
- /*
- * This is the first time we're mapping an LDT for this process.
- * Sync the pgd to the usermode tables.
- */
- WARN_ON(had_top_level_entry);
- if (static_cpu_has(X86_FEATURE_PTI)) {
- WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
- set_pgd(kernel_to_user_pgdp(pgd), *pgd);
- }
- }
+ /* Propagate LDT mapping to the user page-table */
+ map_ldt_struct_to_user(mm);
va = (unsigned long)ldt_slot_va(slot);
flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
ldt->slot = slot;
-#endif
return 0;
}
+#else /* !CONFIG_PAGE_TABLE_ISOLATION */
+
+static int
+map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
+ return 0;
+}
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
static void free_ldt_pgtables(struct mm_struct *mm)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
struct mmu_gather tlb;
unsigned long start = LDT_BASE_ADDR;
- unsigned long end = start + (1UL << PGDIR_SHIFT);
+ unsigned long end = LDT_END_ADDR;
if (!static_cpu_has(X86_FEATURE_PTI))
return;
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index d1ab07ec8c9a..5409c2800ab5 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -56,7 +56,7 @@ static void load_segments(void)
static void machine_kexec_free_page_tables(struct kimage *image)
{
- free_page((unsigned long)image->arch.pgd);
+ free_pages((unsigned long)image->arch.pgd, PGD_ALLOCATION_ORDER);
image->arch.pgd = NULL;
#ifdef CONFIG_X86_PAE
free_page((unsigned long)image->arch.pmd0);
@@ -72,7 +72,8 @@ static void machine_kexec_free_page_tables(struct kimage *image)
static int machine_kexec_alloc_page_tables(struct kimage *image)
{
- image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL);
+ image->arch.pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ PGD_ALLOCATION_ORDER);
#ifdef CONFIG_X86_PAE
image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 99dc79e76bdc..930c88341e4e 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -88,10 +88,12 @@ unsigned paravirt_patch_call(void *insnbuf,
struct branch *b = insnbuf;
unsigned long delta = (unsigned long)target - (addr+5);
- if (tgt_clobbers & ~site_clobbers)
- return len; /* target would clobber too much for this site */
- if (len < 5)
+ if (len < 5) {
+#ifdef CONFIG_RETPOLINE
+ WARN_ONCE("Failing to patch indirect CALL in %ps\n", (void *)addr);
+#endif
return len; /* call too long for patch site */
+ }
b->opcode = 0xe8; /* call */
b->delta = delta;
@@ -106,8 +108,12 @@ unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
struct branch *b = insnbuf;
unsigned long delta = (unsigned long)target - (addr+5);
- if (len < 5)
+ if (len < 5) {
+#ifdef CONFIG_RETPOLINE
+ WARN_ONCE("Failing to patch indirect JMP in %ps\n", (void *)addr);
+#endif
return len; /* call too long for patch site */
+ }
b->opcode = 0xe9; /* jmp */
b->delta = delta;
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 9edadabf04f6..9cb98f7b07c9 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -20,7 +20,7 @@ DEF_NATIVE(, mov64, "mov %rdi, %rax");
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
-DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %rax, %rax");
+DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax");
#endif
unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index ab5d9dd668d2..80f9fe8d27d0 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -175,7 +175,7 @@ rootfs_initcall(pci_iommu_init);
static int via_no_dac_cb(struct pci_dev *pdev, void *data)
{
- pdev->dev.dma_32bit_limit = true;
+ pdev->dev.bus_dma_mask = DMA_BIT_MASK(32);
return 0;
}
diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c
index 4dfd90a75e63..2e9006c1e240 100644
--- a/arch/x86/kernel/pci-iommu_table.c
+++ b/arch/x86/kernel/pci-iommu_table.c
@@ -60,7 +60,7 @@ void __init check_iommu_entries(struct iommu_table_entry *start,
printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. BREAKING IT.\n",
p->detect, q->detect);
/* Heavy handed way..*/
- x->depend = 0;
+ x->depend = NULL;
}
}
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c
index da5190a1ea16..4a710ffffd9a 100644
--- a/arch/x86/kernel/pcspeaker.c
+++ b/arch/x86/kernel/pcspeaker.c
@@ -9,6 +9,6 @@ static __init int add_pcspkr(void)
pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
- return IS_ERR(pd) ? PTR_ERR(pd) : 0;
+ return PTR_ERR_OR_ZERO(pd);
}
device_initcall(add_pcspkr);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 30ca2d1a9231..c93fcfdf1673 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -57,14 +57,12 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
*/
.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
-#ifdef CONFIG_X86_64
/*
* .sp1 is cpu_current_top_of_stack. The init task never
* runs user code, but cpu_current_top_of_stack should still
* be well defined before the first context switch.
*/
.sp1 = TOP_OF_INIT_STACK,
-#endif
#ifdef CONFIG_X86_32
.ss0 = __KERNEL_DS,
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0ae659de21eb..2924fd447e61 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -285,7 +285,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* current_thread_info(). Refresh the SYSENTER configuration in
* case prev or next is vm86.
*/
- update_sp0(next_p);
+ update_task_stack(next_p);
refresh_sysenter_cs(next);
this_cpu_write(cpu_current_top_of_stack,
(unsigned long)task_stack_page(next_p) +
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 12bb445fb98d..476e3ddf8890 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -478,7 +478,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
/* Reload sp0. */
- update_sp0(next_p);
+ update_task_stack(next_p);
/*
* Now maybe reload the debug registers and handle I/O bitmaps
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 2f86d883dd95..8fe740e22030 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -823,6 +823,12 @@ void __init setup_arch(char **cmdline_p)
memblock_reserve(__pa_symbol(_text),
(unsigned long)__bss_stop - (unsigned long)_text);
+ /*
+ * Make sure page 0 is always reserved because on systems with
+ * L1TF its contents can be leaked to user processes.
+ */
+ memblock_reserve(0, PAGE_SIZE);
+
early_reserve_initrd();
/*
@@ -866,6 +872,8 @@ void __init setup_arch(char **cmdline_p)
idt_setup_early_traps();
early_cpu_init();
+ arch_init_ideal_nops();
+ jump_label_init();
early_ioremap_init();
setup_olpc_ofw_pgd();
@@ -1012,6 +1020,7 @@ void __init setup_arch(char **cmdline_p)
*/
init_hypervisor_platform();
+ tsc_early_init();
x86_init.resources.probe_roms();
/* after parse_early_param, so could debug it */
@@ -1197,11 +1206,6 @@ void __init setup_arch(char **cmdline_p)
memblock_find_dma_reserve();
-#ifdef CONFIG_KVM_GUEST
- kvmclock_init();
-#endif
-
- tsc_early_delay_calibrate();
if (!early_xdbc_setup_hardware())
early_xdbc_register_console();
@@ -1272,8 +1276,6 @@ void __init setup_arch(char **cmdline_p)
mcheck_init();
- arch_init_ideal_nops();
-
register_refined_jiffies(CLOCK_TICK_RATE);
#ifdef CONFIG_EFI
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 5c574dff4c1a..04adc8d60aed 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -261,6 +261,7 @@ __visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs)
{
ack_APIC_irq();
inc_irq_stat(irq_resched_count);
+ kvm_set_cpu_l1tf_flush_l1d();
if (trace_resched_ipi_enabled()) {
/*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index db9656e13ea0..f02ecaf97904 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -80,6 +80,7 @@
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#include <asm/spec-ctrl.h>
+#include <asm/hw_irq.h>
/* representing HT siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
@@ -271,6 +272,23 @@ static void notrace start_secondary(void *unused)
}
/**
+ * topology_is_primary_thread - Check whether CPU is the primary SMT thread
+ * @cpu: CPU to check
+ */
+bool topology_is_primary_thread(unsigned int cpu)
+{
+ return apic_id_is_primary_thread(per_cpu(x86_cpu_to_apicid, cpu));
+}
+
+/**
+ * topology_smt_supported - Check whether SMT is supported by the CPUs
+ */
+bool topology_smt_supported(void)
+{
+ return smp_num_siblings > 1;
+}
+
+/**
* topology_phys_to_logical_pkg - Map a physical package id to a logical
*
* Returns logical package id or -1 if not found
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 093f2ea5dd56..7627455047c2 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -81,16 +81,6 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE
-#define STACKTRACE_DUMP_ONCE(task) ({ \
- static bool __section(.data.unlikely) __dumped; \
- \
- if (!__dumped) { \
- __dumped = true; \
- WARN_ON(1); \
- show_stack(task, NULL); \
- } \
-})
-
static int __always_inline
__save_stack_trace_reliable(struct stack_trace *trace,
struct task_struct *task)
@@ -99,30 +89,25 @@ __save_stack_trace_reliable(struct stack_trace *trace,
struct pt_regs *regs;
unsigned long addr;
- for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state);
+ for (unwind_start(&state, task, NULL, NULL);
+ !unwind_done(&state) && !unwind_error(&state);
unwind_next_frame(&state)) {
regs = unwind_get_entry_regs(&state, NULL);
if (regs) {
+ /* Success path for user tasks */
+ if (user_mode(regs))
+ goto success;
+
/*
* Kernel mode registers on the stack indicate an
* in-kernel interrupt or exception (e.g., preemption
* or a page fault), which can make frame pointers
* unreliable.
*/
- if (!user_mode(regs))
- return -EINVAL;
- /*
- * The last frame contains the user mode syscall
- * pt_regs. Skip it and finish the unwind.
- */
- unwind_next_frame(&state);
- if (!unwind_done(&state)) {
- STACKTRACE_DUMP_ONCE(task);
+ if (IS_ENABLED(CONFIG_FRAME_POINTER))
return -EINVAL;
- }
- break;
}
addr = unwind_get_return_address(&state);
@@ -132,21 +117,22 @@ __save_stack_trace_reliable(struct stack_trace *trace,
* generated code which __kernel_text_address() doesn't know
* about.
*/
- if (!addr) {
- STACKTRACE_DUMP_ONCE(task);
+ if (!addr)
return -EINVAL;
- }
if (save_stack_address(trace, addr, false))
return -EINVAL;
}
/* Check for stack corruption */
- if (unwind_error(&state)) {
- STACKTRACE_DUMP_ONCE(task);
+ if (unwind_error(&state))
+ return -EINVAL;
+
+ /* Success path for non-user tasks, i.e. kthreads and idle tasks */
+ if (!(task->flags & (PF_KTHREAD | PF_IDLE)))
return -EINVAL;
- }
+success:
if (trace->nr_entries < trace->max_entries)
trace->entries[trace->nr_entries++] = ULONG_MAX;
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 774ebafa97c4..be01328eb755 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -12,6 +12,7 @@
#include <linux/clockchips.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/i8253.h>
#include <linux/time.h>
#include <linux/export.h>
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 74392d9d51e0..1463468ba9a0 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -33,16 +33,13 @@ EXPORT_SYMBOL(cpu_khz);
unsigned int __read_mostly tsc_khz;
EXPORT_SYMBOL(tsc_khz);
+#define KHZ 1000
+
/*
* TSC can be unstable due to cpufreq or due to unsynced TSCs
*/
static int __read_mostly tsc_unstable;
-/* native_sched_clock() is called before tsc_init(), so
- we must start with the TSC soft disabled to prevent
- erroneous rdtsc usage on !boot_cpu_has(X86_FEATURE_TSC) processors */
-static int __read_mostly tsc_disabled = -1;
-
static DEFINE_STATIC_KEY_FALSE(__use_tsc);
int tsc_clocksource_reliable;
@@ -106,23 +103,6 @@ void cyc2ns_read_end(void)
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
*/
-static void cyc2ns_data_init(struct cyc2ns_data *data)
-{
- data->cyc2ns_mul = 0;
- data->cyc2ns_shift = 0;
- data->cyc2ns_offset = 0;
-}
-
-static void __init cyc2ns_init(int cpu)
-{
- struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
-
- cyc2ns_data_init(&c2n->data[0]);
- cyc2ns_data_init(&c2n->data[1]);
-
- seqcount_init(&c2n->seq);
-}
-
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
struct cyc2ns_data data;
@@ -138,18 +118,11 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
return ns;
}
-static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
+static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
{
unsigned long long ns_now;
struct cyc2ns_data data;
struct cyc2ns *c2n;
- unsigned long flags;
-
- local_irq_save(flags);
- sched_clock_idle_sleep_event();
-
- if (!khz)
- goto done;
ns_now = cycles_2_ns(tsc_now);
@@ -181,13 +154,56 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_
c2n->data[0] = data;
raw_write_seqcount_latch(&c2n->seq);
c2n->data[1] = data;
+}
+
+static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ sched_clock_idle_sleep_event();
+
+ if (khz)
+ __set_cyc2ns_scale(khz, cpu, tsc_now);
-done:
sched_clock_idle_wakeup_event();
local_irq_restore(flags);
}
/*
+ * Initialize cyc2ns for boot cpu
+ */
+static void __init cyc2ns_init_boot_cpu(void)
+{
+ struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns);
+
+ seqcount_init(&c2n->seq);
+ __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc());
+}
+
+/*
+ * Secondary CPUs do not run through tsc_init(), so set up
+ * all the scale factors for all CPUs, assuming the same
+ * speed as the bootup CPU. (cpufreq notifiers will fix this
+ * up if their speed diverges)
+ */
+static void __init cyc2ns_init_secondary_cpus(void)
+{
+ unsigned int cpu, this_cpu = smp_processor_id();
+ struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns);
+ struct cyc2ns_data *data = c2n->data;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu != this_cpu) {
+ seqcount_init(&c2n->seq);
+ c2n = per_cpu_ptr(&cyc2ns, cpu);
+ c2n->data[0] = data[0];
+ c2n->data[1] = data[1];
+ }
+ }
+}
+
+/*
* Scheduler clock - returns current time in nanosec units.
*/
u64 native_sched_clock(void)
@@ -248,8 +264,7 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable);
#ifdef CONFIG_X86_TSC
int __init notsc_setup(char *str)
{
- pr_warn("Kernel compiled with CONFIG_X86_TSC, cannot disable TSC completely\n");
- tsc_disabled = 1;
+ mark_tsc_unstable("boot parameter notsc");
return 1;
}
#else
@@ -665,30 +680,17 @@ static unsigned long cpu_khz_from_cpuid(void)
return eax_base_mhz * 1000;
}
-/**
- * native_calibrate_cpu - calibrate the cpu on boot
+/*
+ * calibrate cpu using pit, hpet, and ptimer methods. They are available
+ * later in boot after acpi is initialized.
*/
-unsigned long native_calibrate_cpu(void)
+static unsigned long pit_hpet_ptimer_calibrate_cpu(void)
{
u64 tsc1, tsc2, delta, ref1, ref2;
unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
- unsigned long flags, latch, ms, fast_calibrate;
+ unsigned long flags, latch, ms;
int hpet = is_hpet_enabled(), i, loopmin;
- fast_calibrate = cpu_khz_from_cpuid();
- if (fast_calibrate)
- return fast_calibrate;
-
- fast_calibrate = cpu_khz_from_msr();
- if (fast_calibrate)
- return fast_calibrate;
-
- local_irq_save(flags);
- fast_calibrate = quick_pit_calibrate();
- local_irq_restore(flags);
- if (fast_calibrate)
- return fast_calibrate;
-
/*
* Run 5 calibration loops to get the lowest frequency value
* (the best estimate). We use two different calibration modes
@@ -831,6 +833,37 @@ unsigned long native_calibrate_cpu(void)
return tsc_pit_min;
}
+/**
+ * native_calibrate_cpu_early - can calibrate the cpu early in boot
+ */
+unsigned long native_calibrate_cpu_early(void)
+{
+ unsigned long flags, fast_calibrate = cpu_khz_from_cpuid();
+
+ if (!fast_calibrate)
+ fast_calibrate = cpu_khz_from_msr();
+ if (!fast_calibrate) {
+ local_irq_save(flags);
+ fast_calibrate = quick_pit_calibrate();
+ local_irq_restore(flags);
+ }
+ return fast_calibrate;
+}
+
+
+/**
+ * native_calibrate_cpu - calibrate the cpu
+ */
+static unsigned long native_calibrate_cpu(void)
+{
+ unsigned long tsc_freq = native_calibrate_cpu_early();
+
+ if (!tsc_freq)
+ tsc_freq = pit_hpet_ptimer_calibrate_cpu();
+
+ return tsc_freq;
+}
+
void recalibrate_cpu_khz(void)
{
#ifndef CONFIG_SMP
@@ -1307,7 +1340,7 @@ unreg:
static int __init init_tsc_clocksource(void)
{
- if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_disabled > 0 || !tsc_khz)
+ if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz)
return 0;
if (tsc_unstable)
@@ -1341,40 +1374,22 @@ unreg:
*/
device_initcall(init_tsc_clocksource);
-void __init tsc_early_delay_calibrate(void)
+static bool __init determine_cpu_tsc_frequencies(bool early)
{
- unsigned long lpj;
-
- if (!boot_cpu_has(X86_FEATURE_TSC))
- return;
-
- cpu_khz = x86_platform.calibrate_cpu();
- tsc_khz = x86_platform.calibrate_tsc();
-
- tsc_khz = tsc_khz ? : cpu_khz;
- if (!tsc_khz)
- return;
-
- lpj = tsc_khz * 1000;
- do_div(lpj, HZ);
- loops_per_jiffy = lpj;
-}
-
-void __init tsc_init(void)
-{
- u64 lpj, cyc;
- int cpu;
-
- if (!boot_cpu_has(X86_FEATURE_TSC)) {
- setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
- return;
+ /* Make sure that cpu and tsc are not already calibrated */
+ WARN_ON(cpu_khz || tsc_khz);
+
+ if (early) {
+ cpu_khz = x86_platform.calibrate_cpu();
+ tsc_khz = x86_platform.calibrate_tsc();
+ } else {
+ /* We should not be here with non-native cpu calibration */
+ WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu);
+ cpu_khz = pit_hpet_ptimer_calibrate_cpu();
}
- cpu_khz = x86_platform.calibrate_cpu();
- tsc_khz = x86_platform.calibrate_tsc();
-
/*
- * Trust non-zero tsc_khz as authorative,
+ * Trust non-zero tsc_khz as authoritative,
* and use it to sanity check cpu_khz,
* which will be off if system timer is off.
*/
@@ -1383,52 +1398,78 @@ void __init tsc_init(void)
else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
cpu_khz = tsc_khz;
- if (!tsc_khz) {
- mark_tsc_unstable("could not calculate TSC khz");
- setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
- return;
- }
+ if (tsc_khz == 0)
+ return false;
pr_info("Detected %lu.%03lu MHz processor\n",
- (unsigned long)cpu_khz / 1000,
- (unsigned long)cpu_khz % 1000);
+ (unsigned long)cpu_khz / KHZ,
+ (unsigned long)cpu_khz % KHZ);
if (cpu_khz != tsc_khz) {
pr_info("Detected %lu.%03lu MHz TSC",
- (unsigned long)tsc_khz / 1000,
- (unsigned long)tsc_khz % 1000);
+ (unsigned long)tsc_khz / KHZ,
+ (unsigned long)tsc_khz % KHZ);
}
+ return true;
+}
+
+static unsigned long __init get_loops_per_jiffy(void)
+{
+ unsigned long lpj = tsc_khz * KHZ;
+ do_div(lpj, HZ);
+ return lpj;
+}
+
+static void __init tsc_enable_sched_clock(void)
+{
/* Sanitize TSC ADJUST before cyc2ns gets initialized */
tsc_store_and_check_tsc_adjust(true);
+ cyc2ns_init_boot_cpu();
+ static_branch_enable(&__use_tsc);
+}
+
+void __init tsc_early_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_TSC))
+ return;
+ if (!determine_cpu_tsc_frequencies(true))
+ return;
+ loops_per_jiffy = get_loops_per_jiffy();
+ tsc_enable_sched_clock();
+}
+
+void __init tsc_init(void)
+{
/*
- * Secondary CPUs do not run through tsc_init(), so set up
- * all the scale factors for all CPUs, assuming the same
- * speed as the bootup CPU. (cpufreq notifiers will fix this
- * up if their speed diverges)
+ * native_calibrate_cpu_early can only calibrate using methods that are
+ * available early in boot.
*/
- cyc = rdtsc();
- for_each_possible_cpu(cpu) {
- cyc2ns_init(cpu);
- set_cyc2ns_scale(tsc_khz, cpu, cyc);
- }
+ if (x86_platform.calibrate_cpu == native_calibrate_cpu_early)
+ x86_platform.calibrate_cpu = native_calibrate_cpu;
- if (tsc_disabled > 0)
+ if (!boot_cpu_has(X86_FEATURE_TSC)) {
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
return;
+ }
- /* now allow native_sched_clock() to use rdtsc */
+ if (!tsc_khz) {
+ /* We failed to determine frequencies earlier, try again */
+ if (!determine_cpu_tsc_frequencies(false)) {
+ mark_tsc_unstable("could not calculate TSC khz");
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
+ return;
+ }
+ tsc_enable_sched_clock();
+ }
- tsc_disabled = 0;
- static_branch_enable(&__use_tsc);
+ cyc2ns_init_secondary_cpus();
if (!no_sched_irq_time)
enable_sched_clock_irqtime();
- lpj = ((u64)tsc_khz * 1000);
- do_div(lpj, HZ);
- lpj_fine = lpj;
-
+ lpj_fine = get_loops_per_jiffy();
use_tsc_delay();
check_system_tsc_reliable();
@@ -1455,7 +1496,7 @@ unsigned long calibrate_delay_is_known(void)
int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC);
const struct cpumask *mask = topology_core_cpumask(cpu);
- if (tsc_disabled || !constant_tsc || !mask)
+ if (!constant_tsc || !mask)
return 0;
sibling = cpumask_any_but(mask, cpu);
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
index 19afdbd7d0a7..27ef714d886c 100644
--- a/arch/x86/kernel/tsc_msr.c
+++ b/arch/x86/kernel/tsc_msr.c
@@ -1,17 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * tsc_msr.c - TSC frequency enumeration via MSR
+ * TSC frequency enumeration via MSR
*
- * Copyright (C) 2013 Intel Corporation
+ * Copyright (C) 2013, 2018 Intel Corporation
* Author: Bin Gao <bin.gao@intel.com>
- *
- * This file is released under the GPLv2.
*/
#include <linux/kernel.h>
-#include <asm/processor.h>
-#include <asm/setup.h>
+
#include <asm/apic.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+#include <asm/msr.h>
#include <asm/param.h>
+#include <asm/tsc.h>
#define MAX_NUM_FREQS 9
@@ -23,44 +25,48 @@
* field msr_plat does.
*/
struct freq_desc {
- u8 x86_family; /* CPU family */
- u8 x86_model; /* model */
u8 msr_plat; /* 1: use MSR_PLATFORM_INFO, 0: MSR_IA32_PERF_STATUS */
u32 freqs[MAX_NUM_FREQS];
};
-static struct freq_desc freq_desc_tables[] = {
- /* PNW */
- { 6, 0x27, 0, { 0, 0, 0, 0, 0, 99840, 0, 83200 } },
- /* CLV+ */
- { 6, 0x35, 0, { 0, 133200, 0, 0, 0, 99840, 0, 83200 } },
- /* TNG - Intel Atom processor Z3400 series */
- { 6, 0x4a, 1, { 0, 100000, 133300, 0, 0, 0, 0, 0 } },
- /* VLV2 - Intel Atom processor E3000, Z3600, Z3700 series */
- { 6, 0x37, 1, { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 } },
- /* ANN - Intel Atom processor Z3500 series */
- { 6, 0x5a, 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } },
- /* AMT - Intel Atom processor X7-Z8000 and X5-Z8000 series */
- { 6, 0x4c, 1, { 83300, 100000, 133300, 116700,
- 80000, 93300, 90000, 88900, 87500 } },
+/*
+ * Penwell and Clovertrail use spread spectrum clock,
+ * so the freq number is not exactly the same as reported
+ * by MSR based on SDM.
+ */
+static const struct freq_desc freq_desc_pnw = {
+ 0, { 0, 0, 0, 0, 0, 99840, 0, 83200 }
};
-static int match_cpu(u8 family, u8 model)
-{
- int i;
+static const struct freq_desc freq_desc_clv = {
+ 0, { 0, 133200, 0, 0, 0, 99840, 0, 83200 }
+};
- for (i = 0; i < ARRAY_SIZE(freq_desc_tables); i++) {
- if ((family == freq_desc_tables[i].x86_family) &&
- (model == freq_desc_tables[i].x86_model))
- return i;
- }
+static const struct freq_desc freq_desc_byt = {
+ 1, { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 }
+};
- return -1;
-}
+static const struct freq_desc freq_desc_cht = {
+ 1, { 83300, 100000, 133300, 116700, 80000, 93300, 90000, 88900, 87500 }
+};
-/* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */
-#define id_to_freq(cpu_index, freq_id) \
- (freq_desc_tables[cpu_index].freqs[freq_id])
+static const struct freq_desc freq_desc_tng = {
+ 1, { 0, 100000, 133300, 0, 0, 0, 0, 0 }
+};
+
+static const struct freq_desc freq_desc_ann = {
+ 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 }
+};
+
+static const struct x86_cpu_id tsc_msr_cpu_ids[] = {
+ INTEL_CPU_FAM6(ATOM_PENWELL, freq_desc_pnw),
+ INTEL_CPU_FAM6(ATOM_CLOVERVIEW, freq_desc_clv),
+ INTEL_CPU_FAM6(ATOM_SILVERMONT1, freq_desc_byt),
+ INTEL_CPU_FAM6(ATOM_AIRMONT, freq_desc_cht),
+ INTEL_CPU_FAM6(ATOM_MERRIFIELD, freq_desc_tng),
+ INTEL_CPU_FAM6(ATOM_MOOREFIELD, freq_desc_ann),
+ {}
+};
/*
* MSR-based CPU/TSC frequency discovery for certain CPUs.
@@ -70,18 +76,17 @@ static int match_cpu(u8 family, u8 model)
*/
unsigned long cpu_khz_from_msr(void)
{
- u32 lo, hi, ratio, freq_id, freq;
+ u32 lo, hi, ratio, freq;
+ const struct freq_desc *freq_desc;
+ const struct x86_cpu_id *id;
unsigned long res;
- int cpu_index;
-
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
- return 0;
- cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model);
- if (cpu_index < 0)
+ id = x86_match_cpu(tsc_msr_cpu_ids);
+ if (!id)
return 0;
- if (freq_desc_tables[cpu_index].msr_plat) {
+ freq_desc = (struct freq_desc *)id->driver_data;
+ if (freq_desc->msr_plat) {
rdmsr(MSR_PLATFORM_INFO, lo, hi);
ratio = (lo >> 8) & 0xff;
} else {
@@ -91,8 +96,9 @@ unsigned long cpu_khz_from_msr(void)
/* Get FSB FREQ ID */
rdmsr(MSR_FSB_FREQ, lo, hi);
- freq_id = lo & 0x7;
- freq = id_to_freq(cpu_index, freq_id);
+
+ /* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */
+ freq = freq_desc->freqs[lo & 0x7];
/* TSC frequency = maximum resolved freq * maximum resolved bus ratio */
res = freq * ratio;
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index feb28fee6cea..26038eacf74a 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -198,7 +198,7 @@ static int orc_sort_cmp(const void *_a, const void *_b)
* whitelisted .o files which didn't get objtool generation.
*/
orc_a = cur_orc_table + (a - cur_orc_ip_table);
- return orc_a->sp_reg == ORC_REG_UNDEFINED ? -1 : 1;
+ return orc_a->sp_reg == ORC_REG_UNDEFINED && !orc_a->end ? -1 : 1;
}
#ifdef CONFIG_MODULES
@@ -352,7 +352,7 @@ static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr
bool unwind_next_frame(struct unwind_state *state)
{
- unsigned long ip_p, sp, orig_ip, prev_sp = state->sp;
+ unsigned long ip_p, sp, orig_ip = state->ip, prev_sp = state->sp;
enum stack_type prev_type = state->stack_info.type;
struct orc_entry *orc;
bool indirect = false;
@@ -363,9 +363,9 @@ bool unwind_next_frame(struct unwind_state *state)
/* Don't let modules unload while we're reading their ORC data. */
preempt_disable();
- /* Have we reached the end? */
+ /* End-of-stack check for user tasks: */
if (state->regs && user_mode(state->regs))
- goto done;
+ goto the_end;
/*
* Find the orc_entry associated with the text address.
@@ -374,9 +374,16 @@ bool unwind_next_frame(struct unwind_state *state)
* calls and calls to noreturn functions.
*/
orc = orc_find(state->signal ? state->ip : state->ip - 1);
- if (!orc || orc->sp_reg == ORC_REG_UNDEFINED)
- goto done;
- orig_ip = state->ip;
+ if (!orc)
+ goto err;
+
+ /* End-of-stack check for kernel threads: */
+ if (orc->sp_reg == ORC_REG_UNDEFINED) {
+ if (!orc->end)
+ goto err;
+
+ goto the_end;
+ }
/* Find the previous frame's stack: */
switch (orc->sp_reg) {
@@ -402,7 +409,7 @@ bool unwind_next_frame(struct unwind_state *state)
if (!state->regs || !state->full_regs) {
orc_warn("missing regs for base reg R10 at ip %pB\n",
(void *)state->ip);
- goto done;
+ goto err;
}
sp = state->regs->r10;
break;
@@ -411,7 +418,7 @@ bool unwind_next_frame(struct unwind_state *state)
if (!state->regs || !state->full_regs) {
orc_warn("missing regs for base reg R13 at ip %pB\n",
(void *)state->ip);
- goto done;
+ goto err;
}
sp = state->regs->r13;
break;
@@ -420,7 +427,7 @@ bool unwind_next_frame(struct unwind_state *state)
if (!state->regs || !state->full_regs) {
orc_warn("missing regs for base reg DI at ip %pB\n",
(void *)state->ip);
- goto done;
+ goto err;
}
sp = state->regs->di;
break;
@@ -429,7 +436,7 @@ bool unwind_next_frame(struct unwind_state *state)
if (!state->regs || !state->full_regs) {
orc_warn("missing regs for base reg DX at ip %pB\n",
(void *)state->ip);
- goto done;
+ goto err;
}
sp = state->regs->dx;
break;
@@ -437,12 +444,12 @@ bool unwind_next_frame(struct unwind_state *state)
default:
orc_warn("unknown SP base reg %d for ip %pB\n",
orc->sp_reg, (void *)state->ip);
- goto done;
+ goto err;
}
if (indirect) {
if (!deref_stack_reg(state, sp, &sp))
- goto done;
+ goto err;
}
/* Find IP, SP and possibly regs: */
@@ -451,7 +458,7 @@ bool unwind_next_frame(struct unwind_state *state)
ip_p = sp - sizeof(long);
if (!deref_stack_reg(state, ip_p, &state->ip))
- goto done;
+ goto err;
state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx,
state->ip, (void *)ip_p);
@@ -465,7 +472,7 @@ bool unwind_next_frame(struct unwind_state *state)
if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
orc_warn("can't dereference registers at %p for ip %pB\n",
(void *)sp, (void *)orig_ip);
- goto done;
+ goto err;
}
state->regs = (struct pt_regs *)sp;
@@ -477,7 +484,7 @@ bool unwind_next_frame(struct unwind_state *state)
if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
orc_warn("can't dereference iret registers at %p for ip %pB\n",
(void *)sp, (void *)orig_ip);
- goto done;
+ goto err;
}
state->regs = (void *)sp - IRET_FRAME_OFFSET;
@@ -500,18 +507,18 @@ bool unwind_next_frame(struct unwind_state *state)
case ORC_REG_PREV_SP:
if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp))
- goto done;
+ goto err;
break;
case ORC_REG_BP:
if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp))
- goto done;
+ goto err;
break;
default:
orc_warn("unknown BP base reg %d for ip %pB\n",
orc->bp_reg, (void *)orig_ip);
- goto done;
+ goto err;
}
/* Prevent a recursive loop due to bad ORC data: */
@@ -520,13 +527,16 @@ bool unwind_next_frame(struct unwind_state *state)
state->sp <= prev_sp) {
orc_warn("stack going in the wrong direction? ip=%pB\n",
(void *)orig_ip);
- goto done;
+ goto err;
}
preempt_enable();
return true;
-done:
+err:
+ state->error = true;
+
+the_end:
preempt_enable();
state->stack_info.type = STACK_TYPE_UNKNOWN;
return false;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 9d0b5af7db91..1c03e4aa6474 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -149,7 +149,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
preempt_disable();
tsk->thread.sp0 = vm86->saved_sp0;
tsk->thread.sysenter_cs = __KERNEL_CS;
- update_sp0(tsk);
+ update_task_stack(tsk);
refresh_sysenter_cs(&tsk->thread);
vm86->saved_sp0 = 0;
preempt_enable();
@@ -374,7 +374,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
refresh_sysenter_cs(&tsk->thread);
}
- update_sp0(tsk);
+ update_task_stack(tsk);
preempt_enable();
if (vm86->flags & VM86_SCREEN_BITMAP)
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 5e1458f609a1..8bde0a419f86 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -55,19 +55,22 @@ jiffies_64 = jiffies;
* so we can enable protection checks as well as retain 2MB large page
* mappings for kernel text.
*/
-#define X64_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
+#define X86_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
-#define X64_ALIGN_RODATA_END \
+#define X86_ALIGN_RODATA_END \
. = ALIGN(HPAGE_SIZE); \
- __end_rodata_hpage_align = .;
+ __end_rodata_hpage_align = .; \
+ __end_rodata_aligned = .;
#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE);
#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE);
#else
-#define X64_ALIGN_RODATA_BEGIN
-#define X64_ALIGN_RODATA_END
+#define X86_ALIGN_RODATA_BEGIN
+#define X86_ALIGN_RODATA_END \
+ . = ALIGN(PAGE_SIZE); \
+ __end_rodata_aligned = .;
#define ALIGN_ENTRY_TEXT_BEGIN
#define ALIGN_ENTRY_TEXT_END
@@ -141,9 +144,9 @@ SECTIONS
/* .text should occupy whole number of pages */
. = ALIGN(PAGE_SIZE);
- X64_ALIGN_RODATA_BEGIN
+ X86_ALIGN_RODATA_BEGIN
RO_DATA(PAGE_SIZE)
- X64_ALIGN_RODATA_END
+ X86_ALIGN_RODATA_END
/* Data */
.data : AT(ADDR(.data) - LOAD_OFFSET) {
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 3ab867603e81..2792b5573818 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -109,7 +109,7 @@ struct x86_cpuinit_ops x86_cpuinit = {
static void default_nmi_init(void) { };
struct x86_platform_ops x86_platform __ro_after_init = {
- .calibrate_cpu = native_calibrate_cpu,
+ .calibrate_cpu = native_calibrate_cpu_early,
.calibrate_tsc = native_calibrate_tsc,
.get_wallclock = mach_get_cmos_time,
.set_wallclock = mach_set_rtc_mmss,
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index b5cd8465d44f..d536d457517b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1379,7 +1379,7 @@ static void apic_timer_expired(struct kvm_lapic *apic)
* using swait_active() is safe.
*/
if (swait_active(q))
- swake_up(q);
+ swake_up_one(q);
if (apic_lvtt_tscdeadline(apic))
ktimer->expired_tscdeadline = ktimer->tscdeadline;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6b8f11521c41..a44e568363a4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3840,6 +3840,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
{
int r = 1;
+ vcpu->arch.l1tf_flush_l1d = true;
switch (vcpu->arch.apf.host_apf_reason) {
default:
trace_kvm_page_fault(fault_address, error_code);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e30da9a2430c..46b428c0990e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -188,6 +188,150 @@ module_param(ple_window_max, uint, 0444);
extern const ulong vmx_return;
+static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
+static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
+static DEFINE_MUTEX(vmx_l1d_flush_mutex);
+
+/* Storage for pre module init parameter parsing */
+static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
+
+static const struct {
+ const char *option;
+ enum vmx_l1d_flush_state cmd;
+} vmentry_l1d_param[] = {
+ {"auto", VMENTER_L1D_FLUSH_AUTO},
+ {"never", VMENTER_L1D_FLUSH_NEVER},
+ {"cond", VMENTER_L1D_FLUSH_COND},
+ {"always", VMENTER_L1D_FLUSH_ALWAYS},
+};
+
+#define L1D_CACHE_ORDER 4
+static void *vmx_l1d_flush_pages;
+
+static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
+{
+ struct page *page;
+ unsigned int i;
+
+ if (!enable_ept) {
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
+ return 0;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
+ u64 msr;
+
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
+ if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
+ return 0;
+ }
+ }
+
+ /* If set to auto use the default l1tf mitigation method */
+ if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
+ switch (l1tf_mitigation) {
+ case L1TF_MITIGATION_OFF:
+ l1tf = VMENTER_L1D_FLUSH_NEVER;
+ break;
+ case L1TF_MITIGATION_FLUSH_NOWARN:
+ case L1TF_MITIGATION_FLUSH:
+ case L1TF_MITIGATION_FLUSH_NOSMT:
+ l1tf = VMENTER_L1D_FLUSH_COND;
+ break;
+ case L1TF_MITIGATION_FULL:
+ case L1TF_MITIGATION_FULL_FORCE:
+ l1tf = VMENTER_L1D_FLUSH_ALWAYS;
+ break;
+ }
+ } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
+ l1tf = VMENTER_L1D_FLUSH_ALWAYS;
+ }
+
+ if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
+ !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
+ if (!page)
+ return -ENOMEM;
+ vmx_l1d_flush_pages = page_address(page);
+
+ /*
+ * Initialize each page with a different pattern in
+ * order to protect against KSM in the nested
+ * virtualization case.
+ */
+ for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
+ memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
+ PAGE_SIZE);
+ }
+ }
+
+ l1tf_vmx_mitigation = l1tf;
+
+ if (l1tf != VMENTER_L1D_FLUSH_NEVER)
+ static_branch_enable(&vmx_l1d_should_flush);
+ else
+ static_branch_disable(&vmx_l1d_should_flush);
+
+ if (l1tf == VMENTER_L1D_FLUSH_COND)
+ static_branch_enable(&vmx_l1d_flush_cond);
+ else
+ static_branch_disable(&vmx_l1d_flush_cond);
+ return 0;
+}
+
+static int vmentry_l1d_flush_parse(const char *s)
+{
+ unsigned int i;
+
+ if (s) {
+ for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
+ if (sysfs_streq(s, vmentry_l1d_param[i].option))
+ return vmentry_l1d_param[i].cmd;
+ }
+ }
+ return -EINVAL;
+}
+
+static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
+{
+ int l1tf, ret;
+
+ if (!boot_cpu_has(X86_BUG_L1TF))
+ return 0;
+
+ l1tf = vmentry_l1d_flush_parse(s);
+ if (l1tf < 0)
+ return l1tf;
+
+ /*
+ * Has vmx_init() run already? If not then this is the pre init
+ * parameter parsing. In that case just store the value and let
+ * vmx_init() do the proper setup after enable_ept has been
+ * established.
+ */
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
+ vmentry_l1d_flush_param = l1tf;
+ return 0;
+ }
+
+ mutex_lock(&vmx_l1d_flush_mutex);
+ ret = vmx_setup_l1d_flush(l1tf);
+ mutex_unlock(&vmx_l1d_flush_mutex);
+ return ret;
+}
+
+static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
+{
+ return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
+}
+
+static const struct kernel_param_ops vmentry_l1d_flush_ops = {
+ .set = vmentry_l1d_flush_set,
+ .get = vmentry_l1d_flush_get,
+};
+module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
+
struct kvm_vmx {
struct kvm kvm;
@@ -757,6 +901,11 @@ static inline int pi_test_sn(struct pi_desc *pi_desc)
(unsigned long *)&pi_desc->control);
}
+struct vmx_msrs {
+ unsigned int nr;
+ struct vmx_msr_entry val[NR_AUTOLOAD_MSRS];
+};
+
struct vcpu_vmx {
struct kvm_vcpu vcpu;
unsigned long host_rsp;
@@ -790,9 +939,8 @@ struct vcpu_vmx {
struct loaded_vmcs *loaded_vmcs;
bool __launched; /* temporary, used in vmx_vcpu_run */
struct msr_autoload {
- unsigned nr;
- struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
- struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
+ struct vmx_msrs guest;
+ struct vmx_msrs host;
} msr_autoload;
struct {
int loaded;
@@ -2377,9 +2525,20 @@ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
vm_exit_controls_clearbit(vmx, exit);
}
+static int find_msr(struct vmx_msrs *m, unsigned int msr)
+{
+ unsigned int i;
+
+ for (i = 0; i < m->nr; ++i) {
+ if (m->val[i].index == msr)
+ return i;
+ }
+ return -ENOENT;
+}
+
static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
{
- unsigned i;
+ int i;
struct msr_autoload *m = &vmx->msr_autoload;
switch (msr) {
@@ -2400,18 +2559,21 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
}
break;
}
+ i = find_msr(&m->guest, msr);
+ if (i < 0)
+ goto skip_guest;
+ --m->guest.nr;
+ m->guest.val[i] = m->guest.val[m->guest.nr];
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
- for (i = 0; i < m->nr; ++i)
- if (m->guest[i].index == msr)
- break;
-
- if (i == m->nr)
+skip_guest:
+ i = find_msr(&m->host, msr);
+ if (i < 0)
return;
- --m->nr;
- m->guest[i] = m->guest[m->nr];
- m->host[i] = m->host[m->nr];
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
+
+ --m->host.nr;
+ m->host.val[i] = m->host.val[m->host.nr];
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
}
static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
@@ -2426,9 +2588,9 @@ static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
}
static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
- u64 guest_val, u64 host_val)
+ u64 guest_val, u64 host_val, bool entry_only)
{
- unsigned i;
+ int i, j = 0;
struct msr_autoload *m = &vmx->msr_autoload;
switch (msr) {
@@ -2463,24 +2625,31 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
}
- for (i = 0; i < m->nr; ++i)
- if (m->guest[i].index == msr)
- break;
+ i = find_msr(&m->guest, msr);
+ if (!entry_only)
+ j = find_msr(&m->host, msr);
- if (i == NR_AUTOLOAD_MSRS) {
+ if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) {
printk_once(KERN_WARNING "Not enough msr switch entries. "
"Can't add msr %x\n", msr);
return;
- } else if (i == m->nr) {
- ++m->nr;
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
}
+ if (i < 0) {
+ i = m->guest.nr++;
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
+ }
+ m->guest.val[i].index = msr;
+ m->guest.val[i].value = guest_val;
- m->guest[i].index = msr;
- m->guest[i].value = guest_val;
- m->host[i].index = msr;
- m->host[i].value = host_val;
+ if (entry_only)
+ return;
+
+ if (j < 0) {
+ j = m->host.nr++;
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
+ }
+ m->host.val[j].index = msr;
+ m->host.val[j].value = host_val;
}
static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
@@ -2524,7 +2693,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
guest_efer &= ~EFER_LME;
if (guest_efer != host_efer)
add_atomic_switch_msr(vmx, MSR_EFER,
- guest_efer, host_efer);
+ guest_efer, host_efer, false);
return false;
} else {
guest_efer &= ~ignore_bits;
@@ -3987,7 +4156,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.ia32_xss = data;
if (vcpu->arch.ia32_xss != host_xss)
add_atomic_switch_msr(vmx, MSR_IA32_XSS,
- vcpu->arch.ia32_xss, host_xss);
+ vcpu->arch.ia32_xss, host_xss, false);
else
clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
break;
@@ -6274,9 +6443,9 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
- vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
+ vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
- vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
+ vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
@@ -6296,8 +6465,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
++vmx->nmsrs;
}
- if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);
+ vmx->arch_capabilities = kvm_get_arch_capabilities();
vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
@@ -7893,6 +8061,8 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
HRTIMER_MODE_REL_PINNED);
vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
+ vmx->nested.vpid02 = allocate_vpid();
+
vmx->nested.vmxon = true;
return 0;
@@ -8480,21 +8650,20 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
/* Emulate the VMPTRST instruction */
static int handle_vmptrst(struct kvm_vcpu *vcpu)
{
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
- gva_t vmcs_gva;
+ unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION);
+ u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
struct x86_exception e;
+ gva_t gva;
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (get_vmx_mem_address(vcpu, exit_qualification,
- vmx_instruction_info, true, &vmcs_gva))
+ if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva))
return 1;
/* *_system ok, nested_vmx_check_permission has verified cpl=0 */
- if (kvm_write_guest_virt_system(vcpu, vmcs_gva,
- (void *)&to_vmx(vcpu)->nested.current_vmptr,
- sizeof(u64), &e)) {
+ if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
+ sizeof(gpa_t), &e)) {
kvm_inject_page_fault(vcpu, &e);
return 1;
}
@@ -9547,6 +9716,79 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
}
}
+/*
+ * Software based L1D cache flush which is used when microcode providing
+ * the cache control MSR is not loaded.
+ *
+ * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
+ * flush it is required to read in 64 KiB because the replacement algorithm
+ * is not exactly LRU. This could be sized at runtime via topology
+ * information but as all relevant affected CPUs have 32KiB L1D cache size
+ * there is no point in doing so.
+ */
+#define L1D_CACHE_ORDER 4
+static void *vmx_l1d_flush_pages;
+
+static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+{
+ int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+ /*
+ * This code is only executed when the the flush mode is 'cond' or
+ * 'always'
+ */
+ if (static_branch_likely(&vmx_l1d_flush_cond)) {
+ bool flush_l1d;
+
+ /*
+ * Clear the per-vcpu flush bit, it gets set again
+ * either from vcpu_run() or from one of the unsafe
+ * VMEXIT handlers.
+ */
+ flush_l1d = vcpu->arch.l1tf_flush_l1d;
+ vcpu->arch.l1tf_flush_l1d = false;
+
+ /*
+ * Clear the per-cpu flush bit, it gets set again from
+ * the interrupt handlers.
+ */
+ flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
+ kvm_clear_cpu_l1tf_flush_l1d();
+
+ if (!flush_l1d)
+ return;
+ }
+
+ vcpu->stat.l1d_flush++;
+
+ if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ return;
+ }
+
+ asm volatile(
+ /* First ensure the pages are in the TLB */
+ "xorl %%eax, %%eax\n"
+ ".Lpopulate_tlb:\n\t"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $4096, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lpopulate_tlb\n\t"
+ "xorl %%eax, %%eax\n\t"
+ "cpuid\n\t"
+ /* Now fill the cache */
+ "xorl %%eax, %%eax\n"
+ ".Lfill_cache:\n"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $64, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lfill_cache\n\t"
+ "lfence\n"
+ :: [flush_pages] "r" (vmx_l1d_flush_pages),
+ [size] "r" (size)
+ : "eax", "ebx", "ecx", "edx");
+}
+
static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -9948,7 +10190,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
clear_atomic_switch_msr(vmx, msrs[i].msr);
else
add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
- msrs[i].host);
+ msrs[i].host, false);
}
static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
@@ -10043,6 +10285,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
(unsigned long)&current_evmcs->host_rsp : 0;
+ if (static_branch_unlikely(&vmx_l1d_should_flush))
+ vmx_l1d_flush(vcpu);
+
asm(
/* Store host registers */
"push %%" _ASM_DX "; push %%" _ASM_BP ";"
@@ -10370,11 +10615,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
goto free_vmcs;
}
- if (nested) {
+ if (nested)
nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
kvm_vcpu_apicv_active(&vmx->vcpu));
- vmx->nested.vpid02 = allocate_vpid();
- }
vmx->nested.posted_intr_nv = -1;
vmx->nested.current_vmptr = -1ull;
@@ -10391,7 +10634,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
return &vmx->vcpu;
free_vmcs:
- free_vpid(vmx->nested.vpid02);
free_loaded_vmcs(vmx->loaded_vmcs);
free_msrs:
kfree(vmx->guest_msrs);
@@ -10405,10 +10647,37 @@ free_vcpu:
return ERR_PTR(err);
}
+#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
+#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
+
static int vmx_vm_init(struct kvm *kvm)
{
if (!ple_gap)
kvm->arch.pause_in_guest = true;
+
+ if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
+ switch (l1tf_mitigation) {
+ case L1TF_MITIGATION_OFF:
+ case L1TF_MITIGATION_FLUSH_NOWARN:
+ /* 'I explicitly don't care' is set */
+ break;
+ case L1TF_MITIGATION_FLUSH:
+ case L1TF_MITIGATION_FLUSH_NOSMT:
+ case L1TF_MITIGATION_FULL:
+ /*
+ * Warn upon starting the first VM in a potentially
+ * insecure environment.
+ */
+ if (cpu_smt_control == CPU_SMT_ENABLED)
+ pr_warn_once(L1TF_MSG_SMT);
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
+ pr_warn_once(L1TF_MSG_L1D);
+ break;
+ case L1TF_MITIGATION_FULL_FORCE:
+ /* Flush is enforced */
+ break;
+ }
+ }
return 0;
}
@@ -11262,10 +11531,10 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
* Set the MSR load/store lists to match L0's settings.
*/
vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
- vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
- vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+ vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
+ vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
set_cr4_guest_host_mask(vmx);
@@ -11901,6 +12170,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
return ret;
}
+ /* Hide L1D cache contents from the nested guest. */
+ vmx->vcpu.arch.l1tf_flush_l1d = true;
+
/*
* If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
* by event injection, halt vcpu.
@@ -12421,8 +12693,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmx_segment_cache_clear(vmx);
/* Update any VMCS fields that might have changed while L2 ran */
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
if (vmx->hv_deadline_tsc == -1)
vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
@@ -13139,6 +13411,51 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.enable_smi_window = enable_smi_window,
};
+static void vmx_cleanup_l1d_flush(void)
+{
+ if (vmx_l1d_flush_pages) {
+ free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
+ vmx_l1d_flush_pages = NULL;
+ }
+ /* Restore state so sysfs ignores VMX */
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+}
+
+static void vmx_exit(void)
+{
+#ifdef CONFIG_KEXEC_CORE
+ RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
+ synchronize_rcu();
+#endif
+
+ kvm_exit();
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (static_branch_unlikely(&enable_evmcs)) {
+ int cpu;
+ struct hv_vp_assist_page *vp_ap;
+ /*
+ * Reset everything to support using non-enlightened VMCS
+ * access later (e.g. when we reload the module with
+ * enlightened_vmcs=0)
+ */
+ for_each_online_cpu(cpu) {
+ vp_ap = hv_get_vp_assist_page(cpu);
+
+ if (!vp_ap)
+ continue;
+
+ vp_ap->current_nested_vmcs = 0;
+ vp_ap->enlighten_vmentry = 0;
+ }
+
+ static_branch_disable(&enable_evmcs);
+ }
+#endif
+ vmx_cleanup_l1d_flush();
+}
+module_exit(vmx_exit);
+
static int __init vmx_init(void)
{
int r;
@@ -13173,10 +13490,25 @@ static int __init vmx_init(void)
#endif
r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
- __alignof__(struct vcpu_vmx), THIS_MODULE);
+ __alignof__(struct vcpu_vmx), THIS_MODULE);
if (r)
return r;
+ /*
+ * Must be called after kvm_init() so enable_ept is properly set
+ * up. Hand the parameter mitigation value in which was stored in
+ * the pre module init parser. If no parameter was given, it will
+ * contain 'auto' which will be turned into the default 'cond'
+ * mitigation mode.
+ */
+ if (boot_cpu_has(X86_BUG_L1TF)) {
+ r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
+ if (r) {
+ vmx_exit();
+ return r;
+ }
+ }
+
#ifdef CONFIG_KEXEC_CORE
rcu_assign_pointer(crash_vmclear_loaded_vmcss,
crash_vmclear_local_loaded_vmcss);
@@ -13185,39 +13517,4 @@ static int __init vmx_init(void)
return 0;
}
-
-static void __exit vmx_exit(void)
-{
-#ifdef CONFIG_KEXEC_CORE
- RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
- synchronize_rcu();
-#endif
-
- kvm_exit();
-
-#if IS_ENABLED(CONFIG_HYPERV)
- if (static_branch_unlikely(&enable_evmcs)) {
- int cpu;
- struct hv_vp_assist_page *vp_ap;
- /*
- * Reset everything to support using non-enlightened VMCS
- * access later (e.g. when we reload the module with
- * enlightened_vmcs=0)
- */
- for_each_online_cpu(cpu) {
- vp_ap = hv_get_vp_assist_page(cpu);
-
- if (!vp_ap)
- continue;
-
- vp_ap->current_nested_vmcs = 0;
- vp_ap->enlighten_vmentry = 0;
- }
-
- static_branch_disable(&enable_evmcs);
- }
-#endif
-}
-
-module_init(vmx_init)
-module_exit(vmx_exit)
+module_init(vmx_init);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2b812b3c5088..a5caa5e5480c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -195,6 +195,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "irq_injections", VCPU_STAT(irq_injections) },
{ "nmi_injections", VCPU_STAT(nmi_injections) },
{ "req_event", VCPU_STAT(req_event) },
+ { "l1d_flush", VCPU_STAT(l1d_flush) },
{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -1102,11 +1103,35 @@ static u32 msr_based_features[] = {
static unsigned int num_msr_based_features;
+u64 kvm_get_arch_capabilities(void)
+{
+ u64 data;
+
+ rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
+
+ /*
+ * If we're doing cache flushes (either "always" or "cond")
+ * we will do one whenever the guest does a vmlaunch/vmresume.
+ * If an outer hypervisor is doing the cache flush for us
+ * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
+ * capability to the guest too, and if EPT is disabled we're not
+ * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will
+ * require a nested hypervisor to do a flush of its own.
+ */
+ if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
+ data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
+
+ return data;
+}
+EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
+
static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
{
switch (msr->index) {
- case MSR_IA32_UCODE_REV:
case MSR_IA32_ARCH_CAPABILITIES:
+ msr->data = kvm_get_arch_capabilities();
+ break;
+ case MSR_IA32_UCODE_REV:
rdmsrl_safe(msr->index, &msr->data);
break;
default:
@@ -4876,6 +4901,9 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v
int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
unsigned int bytes, struct x86_exception *exception)
{
+ /* kvm_write_guest_virt_system can pull in tons of pages. */
+ vcpu->arch.l1tf_flush_l1d = true;
+
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
PFERR_WRITE_MASK, exception);
}
@@ -6052,6 +6080,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
bool writeback = true;
bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
+ vcpu->arch.l1tf_flush_l1d = true;
+
/*
* Clear write_fault_to_shadow_pgtable here to ensure it is
* never reused.
@@ -7581,6 +7611,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
struct kvm *kvm = vcpu->kvm;
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+ vcpu->arch.l1tf_flush_l1d = true;
for (;;) {
if (kvm_vcpu_running(vcpu)) {
@@ -8700,6 +8731,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
+ vcpu->arch.l1tf_flush_l1d = true;
kvm_x86_ops->sched_in(vcpu, cpu);
}
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 298ef1479240..3b24dc05251c 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -256,7 +256,7 @@ ENTRY(__memcpy_mcsafe)
/* Copy successful. Return zero */
.L_done_memcpy_trap:
- xorq %rax, %rax
+ xorl %eax, %eax
ret
ENDPROC(__memcpy_mcsafe)
EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 2f3c9196b834..a12afff146d1 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -111,6 +111,8 @@ static struct addr_marker address_markers[] = {
[END_OF_SPACE_NR] = { -1, NULL }
};
+#define INIT_PGD ((pgd_t *) &init_top_pgt)
+
#else /* CONFIG_X86_64 */
enum address_markers_idx {
@@ -121,6 +123,9 @@ enum address_markers_idx {
#ifdef CONFIG_HIGHMEM
PKMAP_BASE_NR,
#endif
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ LDT_NR,
+#endif
CPU_ENTRY_AREA_NR,
FIXADDR_START_NR,
END_OF_SPACE_NR,
@@ -134,11 +139,16 @@ static struct addr_marker address_markers[] = {
#ifdef CONFIG_HIGHMEM
[PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" },
#endif
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ [LDT_NR] = { 0UL, "LDT remap" },
+#endif
[CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" },
[FIXADDR_START_NR] = { 0UL, "Fixmap area" },
[END_OF_SPACE_NR] = { -1, NULL }
};
+#define INIT_PGD (swapper_pg_dir)
+
#endif /* !CONFIG_X86_64 */
/* Multipliers for offsets within the PTEs */
@@ -496,11 +506,7 @@ static inline bool is_hypervisor_range(int idx)
static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
bool checkwx, bool dmesg)
{
-#ifdef CONFIG_X86_64
- pgd_t *start = (pgd_t *) &init_top_pgt;
-#else
- pgd_t *start = swapper_pg_dir;
-#endif
+ pgd_t *start = INIT_PGD;
pgprotval_t prot, eff;
int i;
struct pg_state st = {};
@@ -563,12 +569,13 @@ void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
}
EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
-static void ptdump_walk_user_pgd_level_checkwx(void)
+void ptdump_walk_user_pgd_level_checkwx(void)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
- pgd_t *pgd = (pgd_t *) &init_top_pgt;
+ pgd_t *pgd = INIT_PGD;
- if (!static_cpu_has(X86_FEATURE_PTI))
+ if (!(__supported_pte_mask & _PAGE_NX) ||
+ !static_cpu_has(X86_FEATURE_PTI))
return;
pr_info("x86/mm: Checking user space page tables\n");
@@ -580,7 +587,6 @@ static void ptdump_walk_user_pgd_level_checkwx(void)
void ptdump_walk_pgd_level_checkwx(void)
{
ptdump_walk_pgd_level_core(NULL, NULL, true, false);
- ptdump_walk_user_pgd_level_checkwx();
}
static int __init pt_dump_init(void)
@@ -609,6 +615,9 @@ static int __init pt_dump_init(void)
# endif
address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
+# ifdef CONFIG_MODIFY_LDT_SYSCALL
+ address_markers[LDT_NR].start_address = LDT_BASE_ADDR;
+# endif
#endif
return 0;
}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2aafa6ab6103..db1c042e9853 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -317,8 +317,6 @@ static noinline int vmalloc_fault(unsigned long address)
if (!(address >= VMALLOC_START && address < VMALLOC_END))
return -1;
- WARN_ON_ONCE(in_nmi());
-
/*
* Synchronize this task's top level page-table
* with the 'reference' page table.
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index cee58a972cb2..acfab322fbe0 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -4,6 +4,8 @@
#include <linux/swap.h>
#include <linux/memblock.h>
#include <linux/bootmem.h> /* for max_low_pfn */
+#include <linux/swapfile.h>
+#include <linux/swapops.h>
#include <asm/set_memory.h>
#include <asm/e820/api.h>
@@ -773,13 +775,44 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
}
}
+/*
+ * begin/end can be in the direct map or the "high kernel mapping"
+ * used for the kernel image only. free_init_pages() will do the
+ * right thing for either kind of address.
+ */
+void free_kernel_image_pages(void *begin, void *end)
+{
+ unsigned long begin_ul = (unsigned long)begin;
+ unsigned long end_ul = (unsigned long)end;
+ unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT;
+
+
+ free_init_pages("unused kernel image", begin_ul, end_ul);
+
+ /*
+ * PTI maps some of the kernel into userspace. For performance,
+ * this includes some kernel areas that do not contain secrets.
+ * Those areas might be adjacent to the parts of the kernel image
+ * being freed, which may contain secrets. Remove the "high kernel
+ * image mapping" for these freed areas, ensuring they are not even
+ * potentially vulnerable to Meltdown regardless of the specific
+ * optimizations PTI is currently using.
+ *
+ * The "noalias" prevents unmapping the direct map alias which is
+ * needed to access the freed pages.
+ *
+ * This is only valid for 64bit kernels. 32bit has only one mapping
+ * which can't be treated in this way for obvious reasons.
+ */
+ if (IS_ENABLED(CONFIG_X86_64) && cpu_feature_enabled(X86_FEATURE_PTI))
+ set_memory_np_noalias(begin_ul, len_pages);
+}
+
void __ref free_initmem(void)
{
e820__reallocate_tables();
- free_init_pages("unused kernel",
- (unsigned long)(&__init_begin),
- (unsigned long)(&__init_end));
+ free_kernel_image_pages(&__init_begin, &__init_end);
}
#ifdef CONFIG_BLK_DEV_INITRD
@@ -880,3 +913,26 @@ void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
__cachemode2pte_tbl[cache] = __cm_idx2pte(entry);
__pte2cachemode_tbl[entry] = cache;
}
+
+#ifdef CONFIG_SWAP
+unsigned long max_swapfile_size(void)
+{
+ unsigned long pages;
+
+ pages = generic_max_swapfile_size();
+
+ if (boot_cpu_has_bug(X86_BUG_L1TF)) {
+ /* Limit the swap file size to MAX_PA/2 for L1TF workaround */
+ unsigned long l1tf_limit = l1tf_pfn_limit() + 1;
+ /*
+ * We encode swap offsets also with 3 bits below those for pfn
+ * which makes the usable limit higher.
+ */
+#if CONFIG_PGTABLE_LEVELS > 2
+ l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT;
+#endif
+ pages = min_t(unsigned long, l1tf_limit, pages);
+ }
+ return pages;
+}
+#endif
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a688617c727e..dd519f372169 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1283,20 +1283,10 @@ void mark_rodata_ro(void)
set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
- free_init_pages("unused kernel",
- (unsigned long) __va(__pa_symbol(text_end)),
- (unsigned long) __va(__pa_symbol(rodata_start)));
- free_init_pages("unused kernel",
- (unsigned long) __va(__pa_symbol(rodata_end)),
- (unsigned long) __va(__pa_symbol(_sdata)));
+ free_kernel_image_pages((void *)text_end, (void *)rodata_start);
+ free_kernel_image_pages((void *)rodata_end, (void *)_sdata);
debug_checkwx();
-
- /*
- * Do this after all of the manipulation of the
- * kernel text page tables are complete.
- */
- pti_clone_kernel_text();
}
int kern_addr_valid(unsigned long addr)
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 7c8686709636..79eb55ce69a9 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -126,24 +126,29 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr)
static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
{
+ pmd_t new_pmd;
pmdval_t v = pmd_val(*pmd);
if (clear) {
- *old = v & _PAGE_PRESENT;
- v &= ~_PAGE_PRESENT;
- } else /* presume this has been called with clear==true previously */
- v |= *old;
- set_pmd(pmd, __pmd(v));
+ *old = v;
+ new_pmd = pmd_mknotpresent(*pmd);
+ } else {
+ /* Presume this has been called with clear==true previously */
+ new_pmd = __pmd(*old);
+ }
+ set_pmd(pmd, new_pmd);
}
static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
{
pteval_t v = pte_val(*pte);
if (clear) {
- *old = v & _PAGE_PRESENT;
- v &= ~_PAGE_PRESENT;
- } else /* presume this has been called with clear==true previously */
- v |= *old;
- set_pte_atomic(pte, __pte(v));
+ *old = v;
+ /* Nothing should care about address */
+ pte_clear(&init_mm, 0, pte);
+ } else {
+ /* Presume this has been called with clear==true previously */
+ set_pte_atomic(pte, __pte(*old));
+ }
}
static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 48c591251600..f40ab8185d94 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -240,3 +240,24 @@ int valid_mmap_phys_addr_range(unsigned long pfn, size_t count)
return phys_addr_valid(addr + count - 1);
}
+
+/*
+ * Only allow root to set high MMIO mappings to PROT_NONE.
+ * This prevents an unpriv. user to set them to PROT_NONE and invert
+ * them, then pointing to valid memory for L1TF speculation.
+ *
+ * Note: for locked down kernels may want to disable the root override.
+ */
+bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
+{
+ if (!boot_cpu_has_bug(X86_BUG_L1TF))
+ return true;
+ if (!__pte_needs_invert(pgprot_val(prot)))
+ return true;
+ /* If it's real memory always allow */
+ if (pfn_valid(pfn))
+ return true;
+ if (pfn > l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN))
+ return false;
+ return true;
+}
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 34a2a3bfde9c..b54d52a2d00a 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -61,7 +61,7 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
eb->nid = nid;
if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
- emu_nid_to_phys[nid] = nid;
+ emu_nid_to_phys[nid] = pb->nid;
pb->start += size;
if (pb->start >= pb->end) {
@@ -198,40 +198,73 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
return end;
}
+static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes)
+{
+ unsigned long max_pfn = PHYS_PFN(max_addr);
+ unsigned long base_pfn = PHYS_PFN(base);
+ unsigned long hole_pfns = PHYS_PFN(hole);
+
+ return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes);
+}
+
/*
* Sets up fake nodes of `size' interleaved over physical nodes ranging from
* `addr' to `max_addr'.
*
* Returns zero on success or negative on error.
*/
-static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
+static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
struct numa_meminfo *pi,
- u64 addr, u64 max_addr, u64 size)
+ u64 addr, u64 max_addr, u64 size,
+ int nr_nodes, struct numa_memblk *pblk,
+ int nid)
{
nodemask_t physnode_mask = numa_nodes_parsed;
+ int i, ret, uniform = 0;
u64 min_size;
- int nid = 0;
- int i, ret;
- if (!size)
+ if ((!size && !nr_nodes) || (nr_nodes && !pblk))
return -1;
+
/*
- * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
- * increased accordingly if the requested size is too small. This
- * creates a uniform distribution of node sizes across the entire
- * machine (but not necessarily over physical nodes).
+ * In the 'uniform' case split the passed in physical node by
+ * nr_nodes, in the non-uniform case, ignore the passed in
+ * physical block and try to create nodes of at least size
+ * @size.
+ *
+ * In the uniform case, split the nodes strictly by physical
+ * capacity, i.e. ignore holes. In the non-uniform case account
+ * for holes and treat @size as a minimum floor.
*/
- min_size = (max_addr - addr - mem_hole_size(addr, max_addr)) / MAX_NUMNODES;
- min_size = max(min_size, FAKE_NODE_MIN_SIZE);
- if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
- min_size = (min_size + FAKE_NODE_MIN_SIZE) &
- FAKE_NODE_MIN_HASH_MASK;
+ if (!nr_nodes)
+ nr_nodes = MAX_NUMNODES;
+ else {
+ nodes_clear(physnode_mask);
+ node_set(pblk->nid, physnode_mask);
+ uniform = 1;
+ }
+
+ if (uniform) {
+ min_size = uniform_size(max_addr, addr, 0, nr_nodes);
+ size = min_size;
+ } else {
+ /*
+ * The limit on emulated nodes is MAX_NUMNODES, so the
+ * size per node is increased accordingly if the
+ * requested size is too small. This creates a uniform
+ * distribution of node sizes across the entire machine
+ * (but not necessarily over physical nodes).
+ */
+ min_size = uniform_size(max_addr, addr,
+ mem_hole_size(addr, max_addr), nr_nodes);
+ }
+ min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE);
if (size < min_size) {
pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
size >> 20, min_size >> 20);
size = min_size;
}
- size &= FAKE_NODE_MIN_HASH_MASK;
+ size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE);
/*
* Fill physical nodes with fake nodes of size until there is no memory
@@ -248,10 +281,14 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
node_clear(i, physnode_mask);
continue;
}
+
start = pi->blk[phys_blk].start;
limit = pi->blk[phys_blk].end;
- end = find_end_of_node(start, limit, size);
+ if (uniform)
+ end = start + size;
+ else
+ end = find_end_of_node(start, limit, size);
/*
* If there won't be at least FAKE_NODE_MIN_SIZE of
* non-reserved memory in ZONE_DMA32 for the next node,
@@ -266,7 +303,8 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
* next node, this one must extend to the end of the
* physical node.
*/
- if (limit - end - mem_hole_size(end, limit) < size)
+ if ((limit - end - mem_hole_size(end, limit) < size)
+ && !uniform)
end = limit;
ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
@@ -276,7 +314,15 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
return ret;
}
}
- return 0;
+ return nid;
+}
+
+static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
+ struct numa_meminfo *pi,
+ u64 addr, u64 max_addr, u64 size)
+{
+ return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
+ 0, NULL, NUMA_NO_NODE);
}
int __init setup_emu2phys_nid(int *dfl_phys_nid)
@@ -346,7 +392,28 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
* the fixed node size. Otherwise, if it is just a single number N,
* split the system RAM into N fake nodes.
*/
- if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
+ if (strchr(emu_cmdline, 'U')) {
+ nodemask_t physnode_mask = numa_nodes_parsed;
+ unsigned long n;
+ int nid = 0;
+
+ n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
+ ret = -1;
+ for_each_node_mask(i, physnode_mask) {
+ ret = split_nodes_size_interleave_uniform(&ei, &pi,
+ pi.blk[i].start, pi.blk[i].end, 0,
+ n, &pi.blk[i], nid);
+ if (ret < 0)
+ break;
+ if (ret < n) {
+ pr_info("%s: phys: %d only got %d of %ld nodes, failing\n",
+ __func__, i, ret, n);
+ ret = -1;
+ break;
+ }
+ nid = ret;
+ }
+ } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
u64 size;
size = memparse(emu_cmdline, &emu_cmdline);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 3bded76e8d5c..8d6c34fe49be 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -53,6 +53,7 @@ static DEFINE_SPINLOCK(cpa_lock);
#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
#define CPA_PAGES_ARRAY 4
+#define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */
#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -1014,8 +1015,8 @@ static long populate_pmd(struct cpa_data *cpa,
pmd = pmd_offset(pud, start);
- set_pmd(pmd, __pmd(cpa->pfn << PAGE_SHIFT | _PAGE_PSE |
- massage_pgprot(pmd_pgprot)));
+ set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn,
+ canon_pgprot(pmd_pgprot))));
start += PMD_SIZE;
cpa->pfn += PMD_SIZE >> PAGE_SHIFT;
@@ -1087,8 +1088,8 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
* Map everything starting from the Gb boundary, possibly with 1G pages
*/
while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) {
- set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE |
- massage_pgprot(pud_pgprot)));
+ set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn,
+ canon_pgprot(pud_pgprot))));
start += PUD_SIZE;
cpa->pfn += PUD_SIZE >> PAGE_SHIFT;
@@ -1486,6 +1487,9 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
/* No alias checking for _NX bit modifications */
checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
+ /* Has caller explicitly disabled alias checking? */
+ if (in_flag & CPA_NO_CHECK_ALIAS)
+ checkalias = 0;
ret = __change_page_attr_set_clr(&cpa, checkalias);
@@ -1772,6 +1776,15 @@ int set_memory_np(unsigned long addr, int numpages)
return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
}
+int set_memory_np_noalias(unsigned long addr, int numpages)
+{
+ int cpa_flags = CPA_NO_CHECK_ALIAS;
+
+ return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
+ __pgprot(_PAGE_PRESENT), 0,
+ cpa_flags, NULL);
+}
+
int set_memory_4k(unsigned long addr, int numpages)
{
return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
@@ -1784,6 +1797,12 @@ int set_memory_nonglobal(unsigned long addr, int numpages)
__pgprot(_PAGE_GLOBAL), 0);
}
+int set_memory_global(unsigned long addr, int numpages)
+{
+ return change_page_attr_set(&addr, numpages,
+ __pgprot(_PAGE_GLOBAL), 0);
+}
+
static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
{
struct cpa_data cpa;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 47b5951e592b..3ef095c70ae3 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -182,6 +182,14 @@ static void pgd_dtor(pgd_t *pgd)
*/
#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
+/*
+ * We allocate separate PMDs for the kernel part of the user page-table
+ * when PTI is enabled. We need them to map the per-process LDT into the
+ * user-space page-table.
+ */
+#define PREALLOCATED_USER_PMDS (static_cpu_has(X86_FEATURE_PTI) ? \
+ KERNEL_PGD_PTRS : 0)
+
void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
@@ -202,14 +210,14 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
/* No need to prepopulate any pagetable entries in non-PAE modes. */
#define PREALLOCATED_PMDS 0
-
+#define PREALLOCATED_USER_PMDS 0
#endif /* CONFIG_X86_PAE */
-static void free_pmds(struct mm_struct *mm, pmd_t *pmds[])
+static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
int i;
- for(i = 0; i < PREALLOCATED_PMDS; i++)
+ for (i = 0; i < count; i++)
if (pmds[i]) {
pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
free_page((unsigned long)pmds[i]);
@@ -217,7 +225,7 @@ static void free_pmds(struct mm_struct *mm, pmd_t *pmds[])
}
}
-static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
+static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
int i;
bool failed = false;
@@ -226,7 +234,7 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
if (mm == &init_mm)
gfp &= ~__GFP_ACCOUNT;
- for(i = 0; i < PREALLOCATED_PMDS; i++) {
+ for (i = 0; i < count; i++) {
pmd_t *pmd = (pmd_t *)__get_free_page(gfp);
if (!pmd)
failed = true;
@@ -241,7 +249,7 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
}
if (failed) {
- free_pmds(mm, pmds);
+ free_pmds(mm, pmds, count);
return -ENOMEM;
}
@@ -254,23 +262,38 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
* preallocate which never got a corresponding vma will need to be
* freed manually.
*/
+static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
+{
+ pgd_t pgd = *pgdp;
+
+ if (pgd_val(pgd) != 0) {
+ pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+
+ *pgdp = native_make_pgd(0);
+
+ paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
+ pmd_free(mm, pmd);
+ mm_dec_nr_pmds(mm);
+ }
+}
+
static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
int i;
- for(i = 0; i < PREALLOCATED_PMDS; i++) {
- pgd_t pgd = pgdp[i];
+ for (i = 0; i < PREALLOCATED_PMDS; i++)
+ mop_up_one_pmd(mm, &pgdp[i]);
- if (pgd_val(pgd) != 0) {
- pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
- pgdp[i] = native_make_pgd(0);
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
- paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
- pmd_free(mm, pmd);
- mm_dec_nr_pmds(mm);
- }
- }
+ pgdp = kernel_to_user_pgdp(pgdp);
+
+ for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
+ mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]);
+#endif
}
static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
@@ -296,6 +319,38 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
}
}
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
+ pgd_t *k_pgd, pmd_t *pmds[])
+{
+ pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
+ pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
+ p4d_t *u_p4d;
+ pud_t *u_pud;
+ int i;
+
+ u_p4d = p4d_offset(u_pgd, 0);
+ u_pud = pud_offset(u_p4d, 0);
+
+ s_pgd += KERNEL_PGD_BOUNDARY;
+ u_pud += KERNEL_PGD_BOUNDARY;
+
+ for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) {
+ pmd_t *pmd = pmds[i];
+
+ memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd),
+ sizeof(pmd_t) * PTRS_PER_PMD);
+
+ pud_populate(mm, u_pud, pmd);
+ }
+
+}
+#else
+static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
+ pgd_t *k_pgd, pmd_t *pmds[])
+{
+}
+#endif
/*
* Xen paravirt assumes pgd table should be in one page. 64 bit kernel also
* assumes that pgd should be in one page.
@@ -329,9 +384,6 @@ static int __init pgd_cache_init(void)
*/
pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
SLAB_PANIC, NULL);
- if (!pgd_cache)
- return -ENOMEM;
-
return 0;
}
core_initcall(pgd_cache_init);
@@ -343,7 +395,8 @@ static inline pgd_t *_pgd_alloc(void)
* We allocate one page for pgd.
*/
if (!SHARED_KERNEL_PMD)
- return (pgd_t *)__get_free_page(PGALLOC_GFP);
+ return (pgd_t *)__get_free_pages(PGALLOC_GFP,
+ PGD_ALLOCATION_ORDER);
/*
* Now PAE kernel is not running as a Xen domain. We can allocate
@@ -355,7 +408,7 @@ static inline pgd_t *_pgd_alloc(void)
static inline void _pgd_free(pgd_t *pgd)
{
if (!SHARED_KERNEL_PMD)
- free_page((unsigned long)pgd);
+ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
else
kmem_cache_free(pgd_cache, pgd);
}
@@ -375,6 +428,7 @@ static inline void _pgd_free(pgd_t *pgd)
pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *pgd;
+ pmd_t *u_pmds[PREALLOCATED_USER_PMDS];
pmd_t *pmds[PREALLOCATED_PMDS];
pgd = _pgd_alloc();
@@ -384,12 +438,15 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
mm->pgd = pgd;
- if (preallocate_pmds(mm, pmds) != 0)
+ if (preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
goto out_free_pgd;
- if (paravirt_pgd_alloc(mm) != 0)
+ if (preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
goto out_free_pmds;
+ if (paravirt_pgd_alloc(mm) != 0)
+ goto out_free_user_pmds;
+
/*
* Make sure that pre-populating the pmds is atomic with
* respect to anything walking the pgd_list, so that they
@@ -399,13 +456,16 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
pgd_ctor(mm, pgd);
pgd_prepopulate_pmd(mm, pgd, pmds);
+ pgd_prepopulate_user_pmd(mm, pgd, u_pmds);
spin_unlock(&pgd_lock);
return pgd;
+out_free_user_pmds:
+ free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
out_free_pmds:
- free_pmds(mm, pmds);
+ free_pmds(mm, pmds, PREALLOCATED_PMDS);
out_free_pgd:
_pgd_free(pgd);
out:
@@ -719,28 +779,50 @@ int pmd_clear_huge(pmd_t *pmd)
return 0;
}
+#ifdef CONFIG_X86_64
/**
* pud_free_pmd_page - Clear pud entry and free pmd page.
* @pud: Pointer to a PUD.
+ * @addr: Virtual address associated with pud.
*
- * Context: The pud range has been unmaped and TLB purged.
+ * Context: The pud range has been unmapped and TLB purged.
* Return: 1 if clearing the entry succeeded. 0 otherwise.
+ *
+ * NOTE: Callers must allow a single page allocation.
*/
-int pud_free_pmd_page(pud_t *pud)
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
- pmd_t *pmd;
+ pmd_t *pmd, *pmd_sv;
+ pte_t *pte;
int i;
if (pud_none(*pud))
return 1;
pmd = (pmd_t *)pud_page_vaddr(*pud);
+ pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
+ if (!pmd_sv)
+ return 0;
- for (i = 0; i < PTRS_PER_PMD; i++)
- if (!pmd_free_pte_page(&pmd[i]))
- return 0;
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd_sv[i] = pmd[i];
+ if (!pmd_none(pmd[i]))
+ pmd_clear(&pmd[i]);
+ }
pud_clear(pud);
+
+ /* INVLPG to clear all paging-structure caches */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (!pmd_none(pmd_sv[i])) {
+ pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
+ free_page((unsigned long)pte);
+ }
+ }
+
+ free_page((unsigned long)pmd_sv);
free_page((unsigned long)pmd);
return 1;
@@ -749,11 +831,12 @@ int pud_free_pmd_page(pud_t *pud)
/**
* pmd_free_pte_page - Clear pmd entry and free pte page.
* @pmd: Pointer to a PMD.
+ * @addr: Virtual address associated with pmd.
*
- * Context: The pmd range has been unmaped and TLB purged.
+ * Context: The pmd range has been unmapped and TLB purged.
* Return: 1 if clearing the entry succeeded. 0 otherwise.
*/
-int pmd_free_pte_page(pmd_t *pmd)
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
pte_t *pte;
@@ -762,8 +845,30 @@ int pmd_free_pte_page(pmd_t *pmd)
pte = (pte_t *)pmd_page_vaddr(*pmd);
pmd_clear(pmd);
+
+ /* INVLPG to clear all paging-structure caches */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
free_page((unsigned long)pte);
return 1;
}
+
+#else /* !CONFIG_X86_64 */
+
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
+{
+ return pud_none(*pud);
+}
+
+/*
+ * Disable free page handling on x86-PAE. This assures that ioremap()
+ * does not update sync'd pmd entries. See vmalloc_sync_one().
+ */
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
+{
+ return pmd_none(*pmd);
+}
+
+#endif /* CONFIG_X86_64 */
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 4d418e705878..31341ae7309f 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -45,6 +45,7 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
+#include <asm/sections.h>
#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
@@ -54,6 +55,16 @@
#define __GFP_NOTRACK 0
#endif
+/*
+ * Define the page-table levels we clone for user-space on 32
+ * and 64 bit.
+ */
+#ifdef CONFIG_X86_64
+#define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PMD
+#else
+#define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PTE
+#endif
+
static void __init pti_print_if_insecure(const char *reason)
{
if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
@@ -117,7 +128,7 @@ enable:
setup_force_cpu_cap(X86_FEATURE_PTI);
}
-pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
{
/*
* Changes to the high (kernel) portion of the kernelmode page
@@ -176,7 +187,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
if (pgd_none(*pgd)) {
unsigned long new_p4d_page = __get_free_page(gfp);
- if (!new_p4d_page)
+ if (WARN_ON_ONCE(!new_p4d_page))
return NULL;
set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
@@ -195,13 +206,17 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
{
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
- p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
+ p4d_t *p4d;
pud_t *pud;
+ p4d = pti_user_pagetable_walk_p4d(address);
+ if (!p4d)
+ return NULL;
+
BUILD_BUG_ON(p4d_large(*p4d) != 0);
if (p4d_none(*p4d)) {
unsigned long new_pud_page = __get_free_page(gfp);
- if (!new_pud_page)
+ if (WARN_ON_ONCE(!new_pud_page))
return NULL;
set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
@@ -215,7 +230,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
}
if (pud_none(*pud)) {
unsigned long new_pmd_page = __get_free_page(gfp);
- if (!new_pmd_page)
+ if (WARN_ON_ONCE(!new_pmd_page))
return NULL;
set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
@@ -224,7 +239,6 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
return pmd_offset(pud, address);
}
-#ifdef CONFIG_X86_VSYSCALL_EMULATION
/*
* Walk the shadow copy of the page tables (optionally) trying to allocate
* page table pages on the way down. Does not support large pages.
@@ -237,9 +251,13 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
{
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
- pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
+ pmd_t *pmd;
pte_t *pte;
+ pmd = pti_user_pagetable_walk_pmd(address);
+ if (!pmd)
+ return NULL;
+
/* We can't do anything sensible if we hit a large mapping. */
if (pmd_large(*pmd)) {
WARN_ON(1);
@@ -262,6 +280,7 @@ static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
return pte;
}
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
static void __init pti_setup_vsyscall(void)
{
pte_t *pte, *target_pte;
@@ -282,8 +301,14 @@ static void __init pti_setup_vsyscall(void)
static void __init pti_setup_vsyscall(void) { }
#endif
+enum pti_clone_level {
+ PTI_CLONE_PMD,
+ PTI_CLONE_PTE,
+};
+
static void
-pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
+pti_clone_pgtable(unsigned long start, unsigned long end,
+ enum pti_clone_level level)
{
unsigned long addr;
@@ -291,59 +316,105 @@ pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
* Clone the populated PMDs which cover start to end. These PMD areas
* can have holes.
*/
- for (addr = start; addr < end; addr += PMD_SIZE) {
+ for (addr = start; addr < end;) {
+ pte_t *pte, *target_pte;
pmd_t *pmd, *target_pmd;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
+ /* Overflow check */
+ if (addr < start)
+ break;
+
pgd = pgd_offset_k(addr);
if (WARN_ON(pgd_none(*pgd)))
return;
p4d = p4d_offset(pgd, addr);
if (WARN_ON(p4d_none(*p4d)))
return;
+
pud = pud_offset(p4d, addr);
- if (pud_none(*pud))
+ if (pud_none(*pud)) {
+ addr += PUD_SIZE;
continue;
+ }
+
pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd))
+ if (pmd_none(*pmd)) {
+ addr += PMD_SIZE;
continue;
+ }
- target_pmd = pti_user_pagetable_walk_pmd(addr);
- if (WARN_ON(!target_pmd))
- return;
-
- /*
- * Only clone present PMDs. This ensures only setting
- * _PAGE_GLOBAL on present PMDs. This should only be
- * called on well-known addresses anyway, so a non-
- * present PMD would be a surprise.
- */
- if (WARN_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT)))
- return;
-
- /*
- * Setting 'target_pmd' below creates a mapping in both
- * the user and kernel page tables. It is effectively
- * global, so set it as global in both copies. Note:
- * the X86_FEATURE_PGE check is not _required_ because
- * the CPU ignores _PAGE_GLOBAL when PGE is not
- * supported. The check keeps consistentency with
- * code that only set this bit when supported.
- */
- if (boot_cpu_has(X86_FEATURE_PGE))
- *pmd = pmd_set_flags(*pmd, _PAGE_GLOBAL);
-
- /*
- * Copy the PMD. That is, the kernelmode and usermode
- * tables will share the last-level page tables of this
- * address range
- */
- *target_pmd = pmd_clear_flags(*pmd, clear);
+ if (pmd_large(*pmd) || level == PTI_CLONE_PMD) {
+ target_pmd = pti_user_pagetable_walk_pmd(addr);
+ if (WARN_ON(!target_pmd))
+ return;
+
+ /*
+ * Only clone present PMDs. This ensures only setting
+ * _PAGE_GLOBAL on present PMDs. This should only be
+ * called on well-known addresses anyway, so a non-
+ * present PMD would be a surprise.
+ */
+ if (WARN_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT)))
+ return;
+
+ /*
+ * Setting 'target_pmd' below creates a mapping in both
+ * the user and kernel page tables. It is effectively
+ * global, so set it as global in both copies. Note:
+ * the X86_FEATURE_PGE check is not _required_ because
+ * the CPU ignores _PAGE_GLOBAL when PGE is not
+ * supported. The check keeps consistentency with
+ * code that only set this bit when supported.
+ */
+ if (boot_cpu_has(X86_FEATURE_PGE))
+ *pmd = pmd_set_flags(*pmd, _PAGE_GLOBAL);
+
+ /*
+ * Copy the PMD. That is, the kernelmode and usermode
+ * tables will share the last-level page tables of this
+ * address range
+ */
+ *target_pmd = *pmd;
+
+ addr += PMD_SIZE;
+
+ } else if (level == PTI_CLONE_PTE) {
+
+ /* Walk the page-table down to the pte level */
+ pte = pte_offset_kernel(pmd, addr);
+ if (pte_none(*pte)) {
+ addr += PAGE_SIZE;
+ continue;
+ }
+
+ /* Only clone present PTEs */
+ if (WARN_ON(!(pte_flags(*pte) & _PAGE_PRESENT)))
+ return;
+
+ /* Allocate PTE in the user page-table */
+ target_pte = pti_user_pagetable_walk_pte(addr);
+ if (WARN_ON(!target_pte))
+ return;
+
+ /* Set GLOBAL bit in both PTEs */
+ if (boot_cpu_has(X86_FEATURE_PGE))
+ *pte = pte_set_flags(*pte, _PAGE_GLOBAL);
+
+ /* Clone the PTE */
+ *target_pte = *pte;
+
+ addr += PAGE_SIZE;
+
+ } else {
+ BUG();
+ }
}
}
+#ifdef CONFIG_X86_64
/*
* Clone a single p4d (i.e. a top-level entry on 4-level systems and a
* next-level entry on 5-level systems.
@@ -354,6 +425,9 @@ static void __init pti_clone_p4d(unsigned long addr)
pgd_t *kernel_pgd;
user_p4d = pti_user_pagetable_walk_p4d(addr);
+ if (!user_p4d)
+ return;
+
kernel_pgd = pgd_offset_k(addr);
kernel_p4d = p4d_offset(kernel_pgd, addr);
*user_p4d = *kernel_p4d;
@@ -367,6 +441,25 @@ static void __init pti_clone_user_shared(void)
pti_clone_p4d(CPU_ENTRY_AREA_BASE);
}
+#else /* CONFIG_X86_64 */
+
+/*
+ * On 32 bit PAE systems with 1GB of Kernel address space there is only
+ * one pgd/p4d for the whole kernel. Cloning that would map the whole
+ * address space into the user page-tables, making PTI useless. So clone
+ * the page-table on the PMD level to prevent that.
+ */
+static void __init pti_clone_user_shared(void)
+{
+ unsigned long start, end;
+
+ start = CPU_ENTRY_AREA_BASE;
+ end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES);
+
+ pti_clone_pgtable(start, end, PTI_CLONE_PMD);
+}
+#endif /* CONFIG_X86_64 */
+
/*
* Clone the ESPFIX P4D into the user space visible page table
*/
@@ -380,11 +473,11 @@ static void __init pti_setup_espfix64(void)
/*
* Clone the populated PMDs of the entry and irqentry text and force it RO.
*/
-static void __init pti_clone_entry_text(void)
+static void pti_clone_entry_text(void)
{
- pti_clone_pmds((unsigned long) __entry_text_start,
- (unsigned long) __irqentry_text_end,
- _PAGE_RW);
+ pti_clone_pgtable((unsigned long) __entry_text_start,
+ (unsigned long) __irqentry_text_end,
+ PTI_CLONE_PMD);
}
/*
@@ -435,10 +528,17 @@ static inline bool pti_kernel_image_global_ok(void)
}
/*
+ * This is the only user for these and it is not arch-generic
+ * like the other set_memory.h functions. Just extern them.
+ */
+extern int set_memory_nonglobal(unsigned long addr, int numpages);
+extern int set_memory_global(unsigned long addr, int numpages);
+
+/*
* For some configurations, map all of kernel text into the user page
* tables. This reduces TLB misses, especially on non-PCID systems.
*/
-void pti_clone_kernel_text(void)
+static void pti_clone_kernel_text(void)
{
/*
* rodata is part of the kernel image and is normally
@@ -446,7 +546,8 @@ void pti_clone_kernel_text(void)
* clone the areas past rodata, they might contain secrets.
*/
unsigned long start = PFN_ALIGN(_text);
- unsigned long end = (unsigned long)__end_rodata_hpage_align;
+ unsigned long end_clone = (unsigned long)__end_rodata_aligned;
+ unsigned long end_global = PFN_ALIGN((unsigned long)__stop___ex_table);
if (!pti_kernel_image_global_ok())
return;
@@ -458,14 +559,18 @@ void pti_clone_kernel_text(void)
* pti_set_kernel_image_nonglobal() did to clear the
* global bit.
*/
- pti_clone_pmds(start, end, _PAGE_RW);
+ pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE);
+
+ /*
+ * pti_clone_pgtable() will set the global bit in any PMDs
+ * that it clones, but we also need to get any PTEs in
+ * the last level for areas that are not huge-page-aligned.
+ */
+
+ /* Set the global bit for normal non-__init kernel text: */
+ set_memory_global(start, (end_global - start) >> PAGE_SHIFT);
}
-/*
- * This is the only user for it and it is not arch-generic like
- * the other set_memory.h functions. Just extern it.
- */
-extern int set_memory_nonglobal(unsigned long addr, int numpages);
void pti_set_kernel_image_nonglobal(void)
{
/*
@@ -477,9 +582,11 @@ void pti_set_kernel_image_nonglobal(void)
unsigned long start = PFN_ALIGN(_text);
unsigned long end = ALIGN((unsigned long)_end, PMD_PAGE_SIZE);
- if (pti_kernel_image_global_ok())
- return;
-
+ /*
+ * This clears _PAGE_GLOBAL from the entire kernel image.
+ * pti_clone_kernel_text() map put _PAGE_GLOBAL back for
+ * areas that are mapped to userspace.
+ */
set_memory_nonglobal(start, (end - start) >> PAGE_SHIFT);
}
@@ -493,6 +600,28 @@ void __init pti_init(void)
pr_info("enabled\n");
+#ifdef CONFIG_X86_32
+ /*
+ * We check for X86_FEATURE_PCID here. But the init-code will
+ * clear the feature flag on 32 bit because the feature is not
+ * supported on 32 bit anyway. To print the warning we need to
+ * check with cpuid directly again.
+ */
+ if (cpuid_ecx(0x1) & BIT(17)) {
+ /* Use printk to work around pr_fmt() */
+ printk(KERN_WARNING "\n");
+ printk(KERN_WARNING "************************************************************\n");
+ printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n");
+ printk(KERN_WARNING "** **\n");
+ printk(KERN_WARNING "** You are using 32-bit PTI on a 64-bit PCID-capable CPU. **\n");
+ printk(KERN_WARNING "** Your performance will increase dramatically if you **\n");
+ printk(KERN_WARNING "** switch to a 64-bit kernel! **\n");
+ printk(KERN_WARNING "** **\n");
+ printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n");
+ printk(KERN_WARNING "************************************************************\n");
+ }
+#endif
+
pti_clone_user_shared();
/* Undo all global bits from the init pagetables in head_64.S: */
@@ -502,3 +631,22 @@ void __init pti_init(void)
pti_setup_espfix64();
pti_setup_vsyscall();
}
+
+/*
+ * Finalize the kernel mappings in the userspace page-table. Some of the
+ * mappings for the kernel image might have changed since pti_init()
+ * cloned them. This is because parts of the kernel image have been
+ * mapped RO and/or NX. These changes need to be cloned again to the
+ * userspace page-table.
+ */
+void pti_finalize(void)
+{
+ /*
+ * We need to clone everything (again) that maps parts of the
+ * kernel image.
+ */
+ pti_clone_entry_text();
+ pti_clone_kernel_text();
+
+ debug_checkwx_user();
+}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6eb1f34c3c85..752dbf4e0e50 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>
+#include <linux/gfp.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -35,7 +36,7 @@
* necessary invalidation by clearing out the 'ctx_id' which
* forces a TLB flush when the context is loaded.
*/
-void clear_asid_other(void)
+static void clear_asid_other(void)
{
u16 asid;
@@ -185,8 +186,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
+ bool need_flush;
+ u16 new_asid;
/*
* NB: The scheduler will call us with prev == next when switching
@@ -240,20 +244,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
next->context.ctx_id);
/*
- * We don't currently support having a real mm loaded without
- * our cpu set in mm_cpumask(). We have all the bookkeeping
- * in place to figure out whether we would need to flush
- * if our cpu were cleared in mm_cpumask(), but we don't
- * currently use it.
+ * Even in lazy TLB mode, the CPU should stay set in the
+ * mm_cpumask. The TLB shootdown code can figure out from
+ * from cpu_tlbstate.is_lazy whether or not to send an IPI.
*/
if (WARN_ON_ONCE(real_prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
- return;
+ /*
+ * If the CPU is not in lazy TLB mode, we are just switching
+ * from one thread in a process to another thread in the same
+ * process. No TLB flush required.
+ */
+ if (!was_lazy)
+ return;
+
+ /*
+ * Read the tlb_gen to check whether a flush is needed.
+ * If the TLB is up to date, just use it.
+ * The barrier synchronizes with the tlb_gen increment in
+ * the TLB shootdown code.
+ */
+ smp_mb();
+ next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+ if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
+ next_tlb_gen)
+ return;
+
+ /*
+ * TLB contents went out of date while we were in lazy
+ * mode. Fall through to the TLB switching code below.
+ */
+ new_asid = prev_asid;
+ need_flush = true;
} else {
- u16 new_asid;
- bool need_flush;
u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
/*
@@ -285,53 +310,60 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
sync_current_stack_to_mm(next);
}
- /* Stop remote flushes for the previous mm */
- VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
- real_prev != &init_mm);
- cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+ /*
+ * Stop remote flushes for the previous mm.
+ * Skip kernel threads; we never send init_mm TLB flushing IPIs,
+ * but the bitmap manipulation can cause cache line contention.
+ */
+ if (real_prev != &init_mm) {
+ VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
+ mm_cpumask(real_prev)));
+ cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+ }
/*
* Start remote flushes and then read tlb_gen.
*/
- cpumask_set_cpu(cpu, mm_cpumask(next));
+ if (next != &init_mm)
+ cpumask_set_cpu(cpu, mm_cpumask(next));
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+ }
- if (need_flush) {
- this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
- this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
- load_new_mm_cr3(next->pgd, new_asid, true);
-
- /*
- * NB: This gets called via leave_mm() in the idle path
- * where RCU functions differently. Tracing normally
- * uses RCU, so we need to use the _rcuidle variant.
- *
- * (There is no good reason for this. The idle code should
- * be rearranged to call this before rcu_idle_enter().)
- */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
- } else {
- /* The new ASID is already up to date. */
- load_new_mm_cr3(next->pgd, new_asid, false);
-
- /* See above wrt _rcuidle. */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
- }
+ if (need_flush) {
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+ load_new_mm_cr3(next->pgd, new_asid, true);
/*
- * Record last user mm's context id, so we can avoid
- * flushing branch buffer with IBPB if we switch back
- * to the same user.
+ * NB: This gets called via leave_mm() in the idle path
+ * where RCU functions differently. Tracing normally
+ * uses RCU, so we need to use the _rcuidle variant.
+ *
+ * (There is no good reason for this. The idle code should
+ * be rearranged to call this before rcu_idle_enter().)
*/
- if (next != &init_mm)
- this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ } else {
+ /* The new ASID is already up to date. */
+ load_new_mm_cr3(next->pgd, new_asid, false);
- this_cpu_write(cpu_tlbstate.loaded_mm, next);
- this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+ /* See above wrt _rcuidle. */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
+ /*
+ * Record last user mm's context id, so we can avoid
+ * flushing branch buffer with IBPB if we switch back
+ * to the same user.
+ */
+ if (next != &init_mm)
+ this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
+
+ this_cpu_write(cpu_tlbstate.loaded_mm, next);
+ this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+
load_mm_cr4(next);
switch_ldt(real_prev, next);
}
@@ -354,20 +386,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
- if (tlb_defer_switch_to_init_mm()) {
- /*
- * There's a significant optimization that may be possible
- * here. We have accurate enough TLB flush tracking that we
- * don't need to maintain coherence of TLB per se when we're
- * lazy. We do, however, need to maintain coherence of
- * paging-structure caches. We could, in principle, leave our
- * old mm loaded and only switch to init_mm when
- * tlb_remove_page() happens.
- */
- this_cpu_write(cpu_tlbstate.is_lazy, true);
- } else {
- switch_mm(NULL, &init_mm, NULL);
- }
+ this_cpu_write(cpu_tlbstate.is_lazy, true);
}
/*
@@ -454,6 +473,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* paging-structure cache to avoid speculatively reading
* garbage into our TLB. Since switching to init_mm is barely
* slower than a minimal flush, just switch to init_mm.
+ *
+ * This should be rare, with native_flush_tlb_others skipping
+ * IPIs to lazy TLB mode CPUs.
*/
switch_mm_irqs_off(NULL, &init_mm, NULL);
return;
@@ -560,6 +582,9 @@ static void flush_tlb_func_remote(void *info)
void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
+ cpumask_var_t lazymask;
+ unsigned int cpu;
+
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
if (info->end == TLB_FLUSH_ALL)
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
@@ -583,8 +608,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
* that UV should be updated so that smp_call_function_many(),
* etc, are optimal on UV.
*/
- unsigned int cpu;
-
cpu = smp_processor_id();
cpumask = uv_flush_tlb_others(cpumask, info);
if (cpumask)
@@ -592,8 +615,29 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
(void *)info, 1);
return;
}
- smp_call_function_many(cpumask, flush_tlb_func_remote,
+
+ /*
+ * A temporary cpumask is used in order to skip sending IPIs
+ * to CPUs in lazy TLB state, while keeping them in mm_cpumask(mm).
+ * If the allocation fails, simply IPI every CPU in mm_cpumask.
+ */
+ if (!alloc_cpumask_var(&lazymask, GFP_ATOMIC)) {
+ smp_call_function_many(cpumask, flush_tlb_func_remote,
(void *)info, 1);
+ return;
+ }
+
+ cpumask_copy(lazymask, cpumask);
+
+ for_each_cpu(cpu, lazymask) {
+ if (per_cpu(cpu_tlbstate.is_lazy, cpu))
+ cpumask_clear_cpu(cpu, lazymask);
+ }
+
+ smp_call_function_many(lazymask, flush_tlb_func_remote,
+ (void *)info, 1);
+
+ free_cpumask_var(lazymask);
}
/*
@@ -646,6 +690,68 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
put_cpu();
}
+void tlb_flush_remove_tables_local(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm &&
+ this_cpu_read(cpu_tlbstate.is_lazy)) {
+ /*
+ * We're in lazy mode. We need to at least flush our
+ * paging-structure cache to avoid speculatively reading
+ * garbage into our TLB. Since switching to init_mm is barely
+ * slower than a minimal flush, just switch to init_mm.
+ */
+ switch_mm_irqs_off(NULL, &init_mm, NULL);
+ }
+}
+
+static void mm_fill_lazy_tlb_cpu_mask(struct mm_struct *mm,
+ struct cpumask *lazy_cpus)
+{
+ int cpu;
+
+ for_each_cpu(cpu, mm_cpumask(mm)) {
+ if (!per_cpu(cpu_tlbstate.is_lazy, cpu))
+ cpumask_set_cpu(cpu, lazy_cpus);
+ }
+}
+
+void tlb_flush_remove_tables(struct mm_struct *mm)
+{
+ int cpu = get_cpu();
+ cpumask_var_t lazy_cpus;
+
+ if (cpumask_any_but(mm_cpumask(mm), cpu) >= nr_cpu_ids) {
+ put_cpu();
+ return;
+ }
+
+ if (!zalloc_cpumask_var(&lazy_cpus, GFP_ATOMIC)) {
+ /*
+ * If the cpumask allocation fails, do a brute force flush
+ * on all the CPUs that have this mm loaded.
+ */
+ smp_call_function_many(mm_cpumask(mm),
+ tlb_flush_remove_tables_local, (void *)mm, 1);
+ put_cpu();
+ return;
+ }
+
+ /*
+ * CPUs with !is_lazy either received a TLB flush IPI while the user
+ * pages in this address range were unmapped, or have context switched
+ * and reloaded %CR3 since then.
+ *
+ * Shootdown IPIs at page table freeing time only need to be sent to
+ * CPUs that may have out of date TLB contents.
+ */
+ mm_fill_lazy_tlb_cpu_mask(mm, lazy_cpus);
+ smp_call_function_many(lazy_cpus,
+ tlb_flush_remove_tables_local, (void *)mm, 1);
+ free_cpumask_var(lazy_cpus);
+ put_cpu();
+}
static void do_flush_tlb_all(void *info)
{
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index 55799873ebe5..8f6cc71e0848 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -1441,8 +1441,8 @@ static void emit_prologue(u8 **pprog, u32 stack_depth)
/* sub esp,STACK_SIZE */
EMIT2_off32(0x81, 0xEC, STACK_SIZE);
- /* sub ebp,SCRATCH_SIZE+4+12*/
- EMIT3(0x83, add_1reg(0xE8, IA32_EBP), SCRATCH_SIZE + 16);
+ /* sub ebp,SCRATCH_SIZE+12*/
+ EMIT3(0x83, add_1reg(0xE8, IA32_EBP), SCRATCH_SIZE + 12);
/* xor ebx,ebx */
EMIT2(0x31, add_2reg(0xC0, IA32_EBX, IA32_EBX));
@@ -1475,8 +1475,8 @@ static void emit_epilogue(u8 **pprog, u32 stack_depth)
/* mov edx,dword ptr [ebp+off]*/
EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(r0[1]));
- /* add ebp,SCRATCH_SIZE+4+12*/
- EMIT3(0x83, add_1reg(0xC0, IA32_EBP), SCRATCH_SIZE + 16);
+ /* add ebp,SCRATCH_SIZE+12*/
+ EMIT3(0x83, add_1reg(0xC0, IA32_EBP), SCRATCH_SIZE + 12);
/* mov ebx,dword ptr [ebp-12]*/
EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), -12);
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 77873ce700ae..ee5d08f25ce4 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -417,7 +417,7 @@ static void __init __map_region(efi_memory_desc_t *md, u64 va)
if (!(md->attribute & EFI_MEMORY_WB))
flags |= _PAGE_PCD;
- if (sev_active())
+ if (sev_active() && md->type != EFI_MEMORY_MAPPED_IO)
flags |= _PAGE_ENC;
pfn = md->phys_addr >> PAGE_SHIFT;
@@ -636,6 +636,8 @@ void efi_switch_mm(struct mm_struct *mm)
#ifdef CONFIG_EFI_MIXED
extern efi_status_t efi64_thunk(u32, ...);
+static DEFINE_SPINLOCK(efi_runtime_lock);
+
#define runtime_service32(func) \
({ \
u32 table = (u32)(unsigned long)efi.systab; \
@@ -657,17 +659,14 @@ extern efi_status_t efi64_thunk(u32, ...);
#define efi_thunk(f, ...) \
({ \
efi_status_t __s; \
- unsigned long __flags; \
u32 __func; \
\
- local_irq_save(__flags); \
arch_efi_call_virt_setup(); \
\
__func = runtime_service32(f); \
__s = efi64_thunk(__func, __VA_ARGS__); \
\
arch_efi_call_virt_teardown(); \
- local_irq_restore(__flags); \
\
__s; \
})
@@ -702,14 +701,17 @@ static efi_status_t efi_thunk_get_time(efi_time_t *tm, efi_time_cap_t *tc)
{
efi_status_t status;
u32 phys_tm, phys_tc;
+ unsigned long flags;
spin_lock(&rtc_lock);
+ spin_lock_irqsave(&efi_runtime_lock, flags);
phys_tm = virt_to_phys_or_null(tm);
phys_tc = virt_to_phys_or_null(tc);
status = efi_thunk(get_time, phys_tm, phys_tc);
+ spin_unlock_irqrestore(&efi_runtime_lock, flags);
spin_unlock(&rtc_lock);
return status;
@@ -719,13 +721,16 @@ static efi_status_t efi_thunk_set_time(efi_time_t *tm)
{
efi_status_t status;
u32 phys_tm;
+ unsigned long flags;
spin_lock(&rtc_lock);
+ spin_lock_irqsave(&efi_runtime_lock, flags);
phys_tm = virt_to_phys_or_null(tm);
status = efi_thunk(set_time, phys_tm);
+ spin_unlock_irqrestore(&efi_runtime_lock, flags);
spin_unlock(&rtc_lock);
return status;
@@ -737,8 +742,10 @@ efi_thunk_get_wakeup_time(efi_bool_t *enabled, efi_bool_t *pending,
{
efi_status_t status;
u32 phys_enabled, phys_pending, phys_tm;
+ unsigned long flags;
spin_lock(&rtc_lock);
+ spin_lock_irqsave(&efi_runtime_lock, flags);
phys_enabled = virt_to_phys_or_null(enabled);
phys_pending = virt_to_phys_or_null(pending);
@@ -747,6 +754,7 @@ efi_thunk_get_wakeup_time(efi_bool_t *enabled, efi_bool_t *pending,
status = efi_thunk(get_wakeup_time, phys_enabled,
phys_pending, phys_tm);
+ spin_unlock_irqrestore(&efi_runtime_lock, flags);
spin_unlock(&rtc_lock);
return status;
@@ -757,13 +765,16 @@ efi_thunk_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
{
efi_status_t status;
u32 phys_tm;
+ unsigned long flags;
spin_lock(&rtc_lock);
+ spin_lock_irqsave(&efi_runtime_lock, flags);
phys_tm = virt_to_phys_or_null(tm);
status = efi_thunk(set_wakeup_time, enabled, phys_tm);
+ spin_unlock_irqrestore(&efi_runtime_lock, flags);
spin_unlock(&rtc_lock);
return status;
@@ -781,6 +792,9 @@ efi_thunk_get_variable(efi_char16_t *name, efi_guid_t *vendor,
efi_status_t status;
u32 phys_name, phys_vendor, phys_attr;
u32 phys_data_size, phys_data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&efi_runtime_lock, flags);
phys_data_size = virt_to_phys_or_null(data_size);
phys_vendor = virt_to_phys_or_null(vendor);
@@ -791,6 +805,8 @@ efi_thunk_get_variable(efi_char16_t *name, efi_guid_t *vendor,
status = efi_thunk(get_variable, phys_name, phys_vendor,
phys_attr, phys_data_size, phys_data);
+ spin_unlock_irqrestore(&efi_runtime_lock, flags);
+
return status;
}
@@ -800,6 +816,34 @@ efi_thunk_set_variable(efi_char16_t *name, efi_guid_t *vendor,
{
u32 phys_name, phys_vendor, phys_data;
efi_status_t status;
+ unsigned long flags;
+
+ spin_lock_irqsave(&efi_runtime_lock, flags);
+
+ phys_name = virt_to_phys_or_null_size(name, efi_name_size(name));
+ phys_vendor = virt_to_phys_or_null(vendor);
+ phys_data = virt_to_phys_or_null_size(data, data_size);
+
+ /* If data_size is > sizeof(u32) we've got problems */
+ status = efi_thunk(set_variable, phys_name, phys_vendor,
+ attr, data_size, phys_data);
+
+ spin_unlock_irqrestore(&efi_runtime_lock, flags);
+
+ return status;
+}
+
+static efi_status_t
+efi_thunk_set_variable_nonblocking(efi_char16_t *name, efi_guid_t *vendor,
+ u32 attr, unsigned long data_size,
+ void *data)
+{
+ u32 phys_name, phys_vendor, phys_data;
+ efi_status_t status;
+ unsigned long flags;
+
+ if (!spin_trylock_irqsave(&efi_runtime_lock, flags))
+ return EFI_NOT_READY;
phys_name = virt_to_phys_or_null_size(name, efi_name_size(name));
phys_vendor = virt_to_phys_or_null(vendor);
@@ -809,6 +853,8 @@ efi_thunk_set_variable(efi_char16_t *name, efi_guid_t *vendor,
status = efi_thunk(set_variable, phys_name, phys_vendor,
attr, data_size, phys_data);
+ spin_unlock_irqrestore(&efi_runtime_lock, flags);
+
return status;
}
@@ -819,6 +865,9 @@ efi_thunk_get_next_variable(unsigned long *name_size,
{
efi_status_t status;
u32 phys_name_size, phys_name, phys_vendor;
+ unsigned long flags;
+
+ spin_lock_irqsave(&efi_runtime_lock, flags);
phys_name_size = virt_to_phys_or_null(name_size);
phys_vendor = virt_to_phys_or_null(vendor);
@@ -827,6 +876,8 @@ efi_thunk_get_next_variable(unsigned long *name_size,
status = efi_thunk(get_next_variable, phys_name_size,
phys_name, phys_vendor);
+ spin_unlock_irqrestore(&efi_runtime_lock, flags);
+
return status;
}
@@ -835,10 +886,15 @@ efi_thunk_get_next_high_mono_count(u32 *count)
{
efi_status_t status;
u32 phys_count;
+ unsigned long flags;
+
+ spin_lock_irqsave(&efi_runtime_lock, flags);
phys_count = virt_to_phys_or_null(count);
status = efi_thunk(get_next_high_mono_count, phys_count);
+ spin_unlock_irqrestore(&efi_runtime_lock, flags);
+
return status;
}
@@ -847,10 +903,15 @@ efi_thunk_reset_system(int reset_type, efi_status_t status,
unsigned long data_size, efi_char16_t *data)
{
u32 phys_data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&efi_runtime_lock, flags);
phys_data = virt_to_phys_or_null_size(data, data_size);
efi_thunk(reset_system, reset_type, status, data_size, phys_data);
+
+ spin_unlock_irqrestore(&efi_runtime_lock, flags);
}
static efi_status_t
@@ -872,10 +933,40 @@ efi_thunk_query_variable_info(u32 attr, u64 *storage_space,
{
efi_status_t status;
u32 phys_storage, phys_remaining, phys_max;
+ unsigned long flags;
+
+ if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+ return EFI_UNSUPPORTED;
+
+ spin_lock_irqsave(&efi_runtime_lock, flags);
+
+ phys_storage = virt_to_phys_or_null(storage_space);
+ phys_remaining = virt_to_phys_or_null(remaining_space);
+ phys_max = virt_to_phys_or_null(max_variable_size);
+
+ status = efi_thunk(query_variable_info, attr, phys_storage,
+ phys_remaining, phys_max);
+
+ spin_unlock_irqrestore(&efi_runtime_lock, flags);
+
+ return status;
+}
+
+static efi_status_t
+efi_thunk_query_variable_info_nonblocking(u32 attr, u64 *storage_space,
+ u64 *remaining_space,
+ u64 *max_variable_size)
+{
+ efi_status_t status;
+ u32 phys_storage, phys_remaining, phys_max;
+ unsigned long flags;
if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
return EFI_UNSUPPORTED;
+ if (!spin_trylock_irqsave(&efi_runtime_lock, flags))
+ return EFI_NOT_READY;
+
phys_storage = virt_to_phys_or_null(storage_space);
phys_remaining = virt_to_phys_or_null(remaining_space);
phys_max = virt_to_phys_or_null(max_variable_size);
@@ -883,6 +974,8 @@ efi_thunk_query_variable_info(u32 attr, u64 *storage_space,
status = efi_thunk(query_variable_info, attr, phys_storage,
phys_remaining, phys_max);
+ spin_unlock_irqrestore(&efi_runtime_lock, flags);
+
return status;
}
@@ -908,9 +1001,11 @@ void efi_thunk_runtime_setup(void)
efi.get_variable = efi_thunk_get_variable;
efi.get_next_variable = efi_thunk_get_next_variable;
efi.set_variable = efi_thunk_set_variable;
+ efi.set_variable_nonblocking = efi_thunk_set_variable_nonblocking;
efi.get_next_high_mono_count = efi_thunk_get_next_high_mono_count;
efi.reset_system = efi_thunk_reset_system;
efi.query_variable_info = efi_thunk_query_variable_info;
+ efi.query_variable_info_nonblocking = efi_thunk_query_variable_info_nonblocking;
efi.update_capsule = efi_thunk_update_capsule;
efi.query_capsule_caps = efi_thunk_query_capsule_caps;
}
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 36c1f8b9f7e0..844d31cb8a0c 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -105,12 +105,11 @@ early_param("efi_no_storage_paranoia", setup_storage_paranoia);
*/
void efi_delete_dummy_variable(void)
{
- efi.set_variable((efi_char16_t *)efi_dummy_name,
- &EFI_DUMMY_GUID,
- EFI_VARIABLE_NON_VOLATILE |
- EFI_VARIABLE_BOOTSERVICE_ACCESS |
- EFI_VARIABLE_RUNTIME_ACCESS,
- 0, NULL);
+ efi.set_variable_nonblocking((efi_char16_t *)efi_dummy_name,
+ &EFI_DUMMY_GUID,
+ EFI_VARIABLE_NON_VOLATILE |
+ EFI_VARIABLE_BOOTSERVICE_ACCESS |
+ EFI_VARIABLE_RUNTIME_ACCESS, 0, NULL);
}
/*
@@ -249,7 +248,8 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size)
int num_entries;
void *new;
- if (efi_mem_desc_lookup(addr, &md)) {
+ if (efi_mem_desc_lookup(addr, &md) ||
+ md.type != EFI_BOOT_SERVICES_DATA) {
pr_err("Failed to lookup EFI memory descriptor for %pa\n", &addr);
return;
}
diff --git a/arch/x86/platform/intel-mid/Makefile b/arch/x86/platform/intel-mid/Makefile
index fa021dfab088..5cf886c867c2 100644
--- a/arch/x86/platform/intel-mid/Makefile
+++ b/arch/x86/platform/intel-mid/Makefile
@@ -1,4 +1,4 @@
-obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o mfld.o mrfld.o pwr.o
+obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o pwr.o
# SFI specific code
ifdef CONFIG_X86_INTEL_MID
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c
index 4f5fa65a1011..2acd6be13375 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c
@@ -18,6 +18,7 @@
#include <asm/intel-mid.h>
#include <asm/intel_scu_ipc.h>
#include <asm/io_apic.h>
+#include <asm/hw_irq.h>
#define TANGIER_EXT_TIMER0_MSI 12
diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c
index 2ebdf31d9996..56f66eafb94f 100644
--- a/arch/x86/platform/intel-mid/intel-mid.c
+++ b/arch/x86/platform/intel-mid/intel-mid.c
@@ -36,8 +36,6 @@
#include <asm/apb_timer.h>
#include <asm/reboot.h>
-#include "intel_mid_weak_decls.h"
-
/*
* the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock,
* cmdline option x86_intel_mid_timer can be used to override the configuration
@@ -61,10 +59,6 @@
enum intel_mid_timer_options intel_mid_timer_options;
-/* intel_mid_ops to store sub arch ops */
-static struct intel_mid_ops *intel_mid_ops;
-/* getter function for sub arch ops*/
-static void *(*get_intel_mid_ops[])(void) = INTEL_MID_OPS_INIT;
enum intel_mid_cpu_type __intel_mid_cpu_chip;
EXPORT_SYMBOL_GPL(__intel_mid_cpu_chip);
@@ -82,11 +76,6 @@ static void intel_mid_reboot(void)
intel_scu_ipc_simple_command(IPCMSG_COLD_RESET, 0);
}
-static unsigned long __init intel_mid_calibrate_tsc(void)
-{
- return 0;
-}
-
static void __init intel_mid_setup_bp_timer(void)
{
apbt_time_init();
@@ -133,6 +122,7 @@ static void intel_mid_arch_setup(void)
case 0x3C:
case 0x4A:
__intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_TANGIER;
+ x86_platform.legacy.rtc = 1;
break;
case 0x27:
default:
@@ -140,17 +130,7 @@ static void intel_mid_arch_setup(void)
break;
}
- if (__intel_mid_cpu_chip < MAX_CPU_OPS(get_intel_mid_ops))
- intel_mid_ops = get_intel_mid_ops[__intel_mid_cpu_chip]();
- else {
- intel_mid_ops = get_intel_mid_ops[INTEL_MID_CPU_CHIP_PENWELL]();
- pr_info("ARCH: Unknown SoC, assuming Penwell!\n");
- }
-
out:
- if (intel_mid_ops->arch_setup)
- intel_mid_ops->arch_setup();
-
/*
* Intel MID platforms are using explicitly defined regulators.
*
@@ -191,7 +171,6 @@ void __init x86_intel_mid_early_setup(void)
x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock;
- x86_platform.calibrate_tsc = intel_mid_calibrate_tsc;
x86_platform.get_nmi_reason = intel_mid_get_nmi_reason;
x86_init.pci.arch_init = intel_mid_pci_init;
diff --git a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h
deleted file mode 100644
index 3c1c3866d82b..000000000000
--- a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * intel_mid_weak_decls.h: Weak declarations of intel-mid.c
- *
- * (C) Copyright 2013 Intel Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-
-
-/* For every CPU addition a new get_<cpuname>_ops interface needs
- * to be added.
- */
-extern void *get_penwell_ops(void);
-extern void *get_cloverview_ops(void);
-extern void *get_tangier_ops(void);
diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c
deleted file mode 100644
index e42978d4deaf..000000000000
--- a/arch/x86/platform/intel-mid/mfld.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * mfld.c: Intel Medfield platform setup code
- *
- * (C) Copyright 2013 Intel Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-
-#include <linux/init.h>
-
-#include <asm/apic.h>
-#include <asm/intel-mid.h>
-#include <asm/intel_mid_vrtc.h>
-
-#include "intel_mid_weak_decls.h"
-
-static unsigned long __init mfld_calibrate_tsc(void)
-{
- unsigned long fast_calibrate;
- u32 lo, hi, ratio, fsb;
-
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi);
- ratio = (hi >> 8) & 0x1f;
- pr_debug("ratio is %d\n", ratio);
- if (!ratio) {
- pr_err("read a zero ratio, should be incorrect!\n");
- pr_err("force tsc ratio to 16 ...\n");
- ratio = 16;
- }
- rdmsr(MSR_FSB_FREQ, lo, hi);
- if ((lo & 0x7) == 0x7)
- fsb = FSB_FREQ_83SKU;
- else
- fsb = FSB_FREQ_100SKU;
- fast_calibrate = ratio * fsb;
- pr_debug("read penwell tsc %lu khz\n", fast_calibrate);
- lapic_timer_frequency = fsb * 1000 / HZ;
-
- /*
- * TSC on Intel Atom SoCs is reliable and of known frequency.
- * See tsc_msr.c for details.
- */
- setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
- setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
-
- return fast_calibrate;
-}
-
-static void __init penwell_arch_setup(void)
-{
- x86_platform.calibrate_tsc = mfld_calibrate_tsc;
-}
-
-static struct intel_mid_ops penwell_ops = {
- .arch_setup = penwell_arch_setup,
-};
-
-void *get_penwell_ops(void)
-{
- return &penwell_ops;
-}
-
-void *get_cloverview_ops(void)
-{
- return &penwell_ops;
-}
diff --git a/arch/x86/platform/intel-mid/mrfld.c b/arch/x86/platform/intel-mid/mrfld.c
deleted file mode 100644
index ae7bdeb0e507..000000000000
--- a/arch/x86/platform/intel-mid/mrfld.c
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Intel Merrifield platform specific setup code
- *
- * (C) Copyright 2013 Intel Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-
-#include <linux/init.h>
-
-#include <asm/apic.h>
-#include <asm/intel-mid.h>
-
-#include "intel_mid_weak_decls.h"
-
-static unsigned long __init tangier_calibrate_tsc(void)
-{
- unsigned long fast_calibrate;
- u32 lo, hi, ratio, fsb, bus_freq;
-
- /* *********************** */
- /* Compute TSC:Ratio * FSB */
- /* *********************** */
-
- /* Compute Ratio */
- rdmsr(MSR_PLATFORM_INFO, lo, hi);
- pr_debug("IA32 PLATFORM_INFO is 0x%x : %x\n", hi, lo);
-
- ratio = (lo >> 8) & 0xFF;
- pr_debug("ratio is %d\n", ratio);
- if (!ratio) {
- pr_err("Read a zero ratio, force tsc ratio to 4 ...\n");
- ratio = 4;
- }
-
- /* Compute FSB */
- rdmsr(MSR_FSB_FREQ, lo, hi);
- pr_debug("Actual FSB frequency detected by SOC 0x%x : %x\n",
- hi, lo);
-
- bus_freq = lo & 0x7;
- pr_debug("bus_freq = 0x%x\n", bus_freq);
-
- if (bus_freq == 0)
- fsb = FSB_FREQ_100SKU;
- else if (bus_freq == 1)
- fsb = FSB_FREQ_100SKU;
- else if (bus_freq == 2)
- fsb = FSB_FREQ_133SKU;
- else if (bus_freq == 3)
- fsb = FSB_FREQ_167SKU;
- else if (bus_freq == 4)
- fsb = FSB_FREQ_83SKU;
- else if (bus_freq == 5)
- fsb = FSB_FREQ_400SKU;
- else if (bus_freq == 6)
- fsb = FSB_FREQ_267SKU;
- else if (bus_freq == 7)
- fsb = FSB_FREQ_333SKU;
- else {
- BUG();
- pr_err("Invalid bus_freq! Setting to minimal value!\n");
- fsb = FSB_FREQ_100SKU;
- }
-
- /* TSC = FSB Freq * Resolved HFM Ratio */
- fast_calibrate = ratio * fsb;
- pr_debug("calculate tangier tsc %lu KHz\n", fast_calibrate);
-
- /* ************************************ */
- /* Calculate Local APIC Timer Frequency */
- /* ************************************ */
- lapic_timer_frequency = (fsb * 1000) / HZ;
-
- pr_debug("Setting lapic_timer_frequency = %d\n",
- lapic_timer_frequency);
-
- /*
- * TSC on Intel Atom SoCs is reliable and of known frequency.
- * See tsc_msr.c for details.
- */
- setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
- setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
-
- return fast_calibrate;
-}
-
-static void __init tangier_arch_setup(void)
-{
- x86_platform.calibrate_tsc = tangier_calibrate_tsc;
- x86_platform.legacy.rtc = 1;
-}
-
-/* tangier arch ops */
-static struct intel_mid_ops tangier_ops = {
- .arch_setup = tangier_arch_setup,
-};
-
-void *get_tangier_ops(void)
-{
- return &tangier_ops;
-}
diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c
index 7c3077e58fa0..f0e920fb98ad 100644
--- a/arch/x86/platform/olpc/olpc.c
+++ b/arch/x86/platform/olpc/olpc.c
@@ -311,10 +311,8 @@ static int __init add_xo1_platform_devices(void)
return PTR_ERR(pdev);
pdev = platform_device_register_simple("olpc-xo1", -1, NULL, 0);
- if (IS_ERR(pdev))
- return PTR_ERR(pdev);
- return 0;
+ return PTR_ERR_OR_ZERO(pdev);
}
static int olpc_xo1_ec_probe(struct platform_device *pdev)
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index ca446da48fd2..a4130b84d1ff 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1285,6 +1285,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
struct msg_desc msgdesc;
ack_APIC_irq();
+ kvm_set_cpu_l1tf_flush_l1d();
time_start = get_cycles();
bcp = &per_cpu(bau_control, smp_processor_id());
@@ -1607,8 +1608,6 @@ static int parse_tunables_write(struct bau_control *bcp, char *instr,
*tunables[cnt].tunp = val;
continue;
}
- if (q == p)
- break;
}
return 0;
}
diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S
index ce8da3a0412c..fd369a6e9ff8 100644
--- a/arch/x86/power/hibernate_asm_64.S
+++ b/arch/x86/power/hibernate_asm_64.S
@@ -137,7 +137,7 @@ ENTRY(restore_registers)
/* Saved in save_processor_state. */
lgdt saved_context_gdt_desc(%rax)
- xorq %rax, %rax
+ xorl %eax, %eax
/* tell the hibernation core that we've just restored the memory */
movq %rax, in_suspend(%rip)
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 220e97841e49..3a6c8ebc8032 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -67,6 +67,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
"__tracedata_(start|end)|"
"__(start|stop)_notes|"
"__end_rodata|"
+ "__end_rodata_aligned|"
"__initramfs_start|"
"(jiffies|jiffies_64)|"
#if ELF_BITS == 64
diff --git a/arch/x86/um/vdso/.gitignore b/arch/x86/um/vdso/.gitignore
index 9cac6d072199..f8b69d84238e 100644
--- a/arch/x86/um/vdso/.gitignore
+++ b/arch/x86/um/vdso/.gitignore
@@ -1,2 +1 @@
-vdso-syms.lds
vdso.lds
diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile
index b2d6967262b2..822ccdba93ad 100644
--- a/arch/x86/um/vdso/Makefile
+++ b/arch/x86/um/vdso/Makefile
@@ -53,22 +53,6 @@ $(vobjs): KBUILD_CFLAGS += $(CFL)
CFLAGS_REMOVE_vdso-note.o = -pg -fprofile-arcs -ftest-coverage
CFLAGS_REMOVE_um_vdso.o = -pg -fprofile-arcs -ftest-coverage
-targets += vdso-syms.lds
-extra-$(VDSO64-y) += vdso-syms.lds
-
-#
-# Match symbols in the DSO that look like VDSO*; produce a file of constants.
-#
-sed-vdsosym := -e 's/^00*/0/' \
- -e 's/^\([0-9a-fA-F]*\) . \(VDSO[a-zA-Z0-9_]*\)$$/\2 = 0x\1;/p'
-quiet_cmd_vdsosym = VDSOSYM $@
-define cmd_vdsosym
- $(NM) $< | LC_ALL=C sed -n $(sed-vdsosym) | LC_ALL=C sort > $@
-endef
-
-$(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
- $(call if_changed,vdsosym)
-
#
# The DSO images are built using a special linker script.
#
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 3b5318505c69..2eeddd814653 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -3,6 +3,7 @@
#endif
#include <linux/cpu.h>
#include <linux/kexec.h>
+#include <linux/slab.h>
#include <xen/features.h>
#include <xen/page.h>
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 439a94bf89ad..105a57d73701 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -119,6 +119,27 @@ static void __init xen_banner(void)
version >> 16, version & 0xffff, extra.extraversion,
xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
}
+
+static void __init xen_pv_init_platform(void)
+{
+ set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info);
+ HYPERVISOR_shared_info = (void *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
+
+ /* xen clock uses per-cpu vcpu_info, need to init it for boot cpu */
+ xen_vcpu_info_reset(0);
+
+ /* pvclock is in shared info area */
+ xen_init_time_ops();
+}
+
+static void __init xen_pv_guest_late_init(void)
+{
+#ifndef CONFIG_SMP
+ /* Setup shared vcpu info for non-smp configurations */
+ xen_setup_vcpu_info_placement();
+#endif
+}
+
/* Check if running on Xen version (major, minor) or later */
bool
xen_running_on_version_or_later(unsigned int major, unsigned int minor)
@@ -947,34 +968,8 @@ static void xen_write_msr(unsigned int msr, unsigned low, unsigned high)
xen_write_msr_safe(msr, low, high);
}
-void xen_setup_shared_info(void)
-{
- set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info);
-
- HYPERVISOR_shared_info =
- (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
-
- xen_setup_mfn_list_list();
-
- if (system_state == SYSTEM_BOOTING) {
-#ifndef CONFIG_SMP
- /*
- * In UP this is as good a place as any to set up shared info.
- * Limit this to boot only, at restore vcpu setup is done via
- * xen_vcpu_restore().
- */
- xen_setup_vcpu_info_placement();
-#endif
- /*
- * Now that shared info is set up we can start using routines
- * that point to pvclock area.
- */
- xen_init_time_ops();
- }
-}
-
/* This is called once we have the cpu_possible_mask */
-void __ref xen_setup_vcpu_info_placement(void)
+void __init xen_setup_vcpu_info_placement(void)
{
int cpu;
@@ -1228,6 +1223,8 @@ asmlinkage __visible void __init xen_start_kernel(void)
x86_init.irqs.intr_mode_init = x86_init_noop;
x86_init.oem.arch_setup = xen_arch_setup;
x86_init.oem.banner = xen_banner;
+ x86_init.hyper.init_platform = xen_pv_init_platform;
+ x86_init.hyper.guest_late_init = xen_pv_guest_late_init;
/*
* Set up some pagetable state before starting to set any ptes.
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 2c30cabfda90..52206ad81e4b 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1230,8 +1230,7 @@ static void __init xen_pagetable_p2m_free(void)
* We roundup to the PMD, which means that if anybody at this stage is
* using the __ka address of xen_start_info or
* xen_start_info->shared_info they are in going to crash. Fortunatly
- * we have already revectored in xen_setup_kernel_pagetable and in
- * xen_setup_shared_info.
+ * we have already revectored in xen_setup_kernel_pagetable.
*/
size = roundup(size, PMD_SIZE);
@@ -1292,8 +1291,7 @@ static void __init xen_pagetable_init(void)
/* Remap memory freed due to conflicts with E820 map */
xen_remap_memory();
-
- xen_setup_shared_info();
+ xen_setup_mfn_list_list();
}
static void xen_write_cr2(unsigned long cr2)
{
diff --git a/arch/x86/xen/suspend_pv.c b/arch/x86/xen/suspend_pv.c
index a2e0f110af56..8303b58c79a9 100644
--- a/arch/x86/xen/suspend_pv.c
+++ b/arch/x86/xen/suspend_pv.c
@@ -27,8 +27,9 @@ void xen_pv_pre_suspend(void)
void xen_pv_post_suspend(int suspend_cancelled)
{
xen_build_mfn_list_list();
-
- xen_setup_shared_info();
+ set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info);
+ HYPERVISOR_shared_info = (void *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
+ xen_setup_mfn_list_list();
if (suspend_cancelled) {
xen_start_info->store_mfn =
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index e0f1bcf01d63..c84f1e039d84 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -31,6 +31,8 @@
/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP 100000
+static u64 xen_sched_clock_offset __read_mostly;
+
/* Get the TSC speed from Xen */
static unsigned long xen_tsc_khz(void)
{
@@ -40,7 +42,7 @@ static unsigned long xen_tsc_khz(void)
return pvclock_tsc_khz(info);
}
-u64 xen_clocksource_read(void)
+static u64 xen_clocksource_read(void)
{
struct pvclock_vcpu_time_info *src;
u64 ret;
@@ -57,6 +59,11 @@ static u64 xen_clocksource_get_cycles(struct clocksource *cs)
return xen_clocksource_read();
}
+static u64 xen_sched_clock(void)
+{
+ return xen_clocksource_read() - xen_sched_clock_offset;
+}
+
static void xen_read_wallclock(struct timespec64 *ts)
{
struct shared_info *s = HYPERVISOR_shared_info;
@@ -367,7 +374,7 @@ void xen_timer_resume(void)
}
static const struct pv_time_ops xen_time_ops __initconst = {
- .sched_clock = xen_clocksource_read,
+ .sched_clock = xen_sched_clock,
.steal_clock = xen_steal_clock,
};
@@ -503,8 +510,9 @@ static void __init xen_time_init(void)
pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
}
-void __ref xen_init_time_ops(void)
+void __init xen_init_time_ops(void)
{
+ xen_sched_clock_offset = xen_clocksource_read();
pv_time_ops = xen_time_ops;
x86_init.timers.timer_init = xen_time_init;
@@ -542,11 +550,11 @@ void __init xen_hvm_init_time_ops(void)
return;
if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
- printk(KERN_INFO "Xen doesn't support pvclock on HVM,"
- "disable pv timer\n");
+ pr_info("Xen doesn't support pvclock on HVM, disable pv timer");
return;
}
+ xen_sched_clock_offset = xen_clocksource_read();
pv_time_ops = xen_time_ops;
x86_init.timers.setup_percpu_clockev = xen_time_init;
x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 3b34745d0a52..e78684597f57 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -31,7 +31,6 @@ extern struct shared_info xen_dummy_shared_info;
extern struct shared_info *HYPERVISOR_shared_info;
void xen_setup_mfn_list_list(void);
-void xen_setup_shared_info(void);
void xen_build_mfn_list_list(void);
void xen_setup_machphys_mapping(void);
void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
@@ -68,12 +67,11 @@ void xen_init_irq_ops(void);
void xen_setup_timer(int cpu);
void xen_setup_runstate_info(int cpu);
void xen_teardown_timer(int cpu);
-u64 xen_clocksource_read(void);
void xen_setup_cpu_clockevents(void);
void xen_save_time_memory_area(void);
void xen_restore_time_memory_area(void);
-void __ref xen_init_time_ops(void);
-void __init xen_hvm_init_time_ops(void);
+void xen_init_time_ops(void);
+void xen_hvm_init_time_ops(void);
irqreturn_t xen_debug_interrupt(int irq, void *dev_id);