diff options
Diffstat (limited to 'arch/x86/kernel')
47 files changed, 725 insertions, 585 deletions
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 158ad1483c43..cb6e076a6d39 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -51,6 +51,18 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, if (c->x86_vendor == X86_VENDOR_INTEL && (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f))) flags->bm_control = 0; + /* + * For all recent Centaur CPUs, the ucode will make sure that each + * core can keep cache coherence with each other while entering C3 + * type state. So, set bm_check to 1 to indicate that the kernel + * doesn't need to execute a cache flush operation (WBINVD) when + * entering C3 type state. + */ + if (c->x86_vendor == X86_VENDOR_CENTAUR) { + if (c->x86 > 6 || (c->x86 == 6 && c->x86_model == 0x0f && + c->x86_stepping >= 0x0e)) + flags->bm_check = 1; + } } EXPORT_SYMBOL(acpi_processor_power_init_bm_check); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 9a79c7808f9c..7b9b49dfc05a 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -12,6 +12,7 @@ #include <linux/slab.h> #include <linux/kdebug.h> #include <linux/kprobes.h> +#include <linux/mmu_context.h> #include <asm/text-patching.h> #include <asm/alternative.h> #include <asm/sections.h> @@ -264,7 +265,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; -void *text_poke_early(void *addr, const void *opcode, size_t len); +void text_poke_early(void *addr, const void *opcode, size_t len); /* * Are we looking at a near JMP with a 1 or 4-byte displacement. @@ -666,16 +667,136 @@ void __init alternative_instructions(void) * instructions. And on the local CPU you need to be protected again NMI or MCE * handlers seeing an inconsistent instruction while you patch. */ -void *__init_or_module text_poke_early(void *addr, const void *opcode, - size_t len) +void __init_or_module text_poke_early(void *addr, const void *opcode, + size_t len) { unsigned long flags; + + if (boot_cpu_has(X86_FEATURE_NX) && + is_module_text_address((unsigned long)addr)) { + /* + * Modules text is marked initially as non-executable, so the + * code cannot be running and speculative code-fetches are + * prevented. Just change the code. + */ + memcpy(addr, opcode, len); + } else { + local_irq_save(flags); + memcpy(addr, opcode, len); + local_irq_restore(flags); + sync_core(); + + /* + * Could also do a CLFLUSH here to speed up CPU recovery; but + * that causes hangs on some VIA CPUs. + */ + } +} + +__ro_after_init struct mm_struct *poking_mm; +__ro_after_init unsigned long poking_addr; + +static void *__text_poke(void *addr, const void *opcode, size_t len) +{ + bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; + struct page *pages[2] = {NULL}; + temp_mm_state_t prev; + unsigned long flags; + pte_t pte, *ptep; + spinlock_t *ptl; + pgprot_t pgprot; + + /* + * While boot memory allocator is running we cannot use struct pages as + * they are not yet initialized. There is no way to recover. + */ + BUG_ON(!after_bootmem); + + if (!core_kernel_text((unsigned long)addr)) { + pages[0] = vmalloc_to_page(addr); + if (cross_page_boundary) + pages[1] = vmalloc_to_page(addr + PAGE_SIZE); + } else { + pages[0] = virt_to_page(addr); + WARN_ON(!PageReserved(pages[0])); + if (cross_page_boundary) + pages[1] = virt_to_page(addr + PAGE_SIZE); + } + /* + * If something went wrong, crash and burn since recovery paths are not + * implemented. + */ + BUG_ON(!pages[0] || (cross_page_boundary && !pages[1])); + local_irq_save(flags); - memcpy(addr, opcode, len); + + /* + * Map the page without the global bit, as TLB flushing is done with + * flush_tlb_mm_range(), which is intended for non-global PTEs. + */ + pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL); + + /* + * The lock is not really needed, but this allows to avoid open-coding. + */ + ptep = get_locked_pte(poking_mm, poking_addr, &ptl); + + /* + * This must not fail; preallocated in poking_init(). + */ + VM_BUG_ON(!ptep); + + pte = mk_pte(pages[0], pgprot); + set_pte_at(poking_mm, poking_addr, ptep, pte); + + if (cross_page_boundary) { + pte = mk_pte(pages[1], pgprot); + set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); + } + + /* + * Loading the temporary mm behaves as a compiler barrier, which + * guarantees that the PTE will be set at the time memcpy() is done. + */ + prev = use_temporary_mm(poking_mm); + + kasan_disable_current(); + memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len); + kasan_enable_current(); + + /* + * Ensure that the PTE is only cleared after the instructions of memcpy + * were issued by using a compiler barrier. + */ + barrier(); + + pte_clear(poking_mm, poking_addr, ptep); + if (cross_page_boundary) + pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1); + + /* + * Loading the previous page-table hierarchy requires a serializing + * instruction that already allows the core to see the updated version. + * Xen-PV is assumed to serialize execution in a similar manner. + */ + unuse_temporary_mm(prev); + + /* + * Flushing the TLB might involve IPIs, which would require enabled + * IRQs, but not if the mm is not used, as it is in this point. + */ + flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + + (cross_page_boundary ? 2 : 1) * PAGE_SIZE, + PAGE_SHIFT, false); + + /* + * If the text does not match what we just wrote then something is + * fundamentally screwy; there's nothing we can really do about that. + */ + BUG_ON(memcmp(addr, opcode, len)); + + pte_unmap_unlock(ptep, ptl); local_irq_restore(flags); - sync_core(); - /* Could also do a CLFLUSH here to speed up CPU recovery; but - that causes hangs on some VIA CPUs. */ return addr; } @@ -689,48 +810,36 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode, * It means the size must be writable atomically and the address must be aligned * in a way that permits an atomic write. It also makes sure we fit on a single * page. + * + * Note that the caller must ensure that if the modified code is part of a + * module, the module would not be removed during poking. This can be achieved + * by registering a module notifier, and ordering module removal and patching + * trough a mutex. */ void *text_poke(void *addr, const void *opcode, size_t len) { - unsigned long flags; - char *vaddr; - struct page *pages[2]; - int i; - - /* - * While boot memory allocator is runnig we cannot use struct - * pages as they are not yet initialized. - */ - BUG_ON(!after_bootmem); - lockdep_assert_held(&text_mutex); - if (!core_kernel_text((unsigned long)addr)) { - pages[0] = vmalloc_to_page(addr); - pages[1] = vmalloc_to_page(addr + PAGE_SIZE); - } else { - pages[0] = virt_to_page(addr); - WARN_ON(!PageReserved(pages[0])); - pages[1] = virt_to_page(addr + PAGE_SIZE); - } - BUG_ON(!pages[0]); - local_irq_save(flags); - set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0])); - if (pages[1]) - set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1])); - vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0); - memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); - clear_fixmap(FIX_TEXT_POKE0); - if (pages[1]) - clear_fixmap(FIX_TEXT_POKE1); - local_flush_tlb(); - sync_core(); - /* Could also do a CLFLUSH here to speed up CPU recovery; but - that causes hangs on some VIA CPUs. */ - for (i = 0; i < len; i++) - BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]); - local_irq_restore(flags); - return addr; + return __text_poke(addr, opcode, len); +} + +/** + * text_poke_kgdb - Update instructions on a live kernel by kgdb + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy + * + * Only atomic text poke/set should be allowed when not doing early patching. + * It means the size must be writable atomically and the address must be aligned + * in a way that permits an atomic write. It also makes sure we fit on a single + * page. + * + * Context: should only be used by kgdb, which ensures no other core is running, + * despite the fact it does not hold the text_mutex. + */ +void *text_poke_kgdb(void *addr, const void *opcode, size_t len) +{ + return __text_poke(addr, opcode, len); } static void do_sync_core(void *info) @@ -788,7 +897,7 @@ NOKPROBE_SYMBOL(poke_int3_handler); * replacing opcode * - sync cores */ -void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) +void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) { unsigned char int3 = 0xcc; @@ -830,7 +939,5 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) * the writing of the new instruction. */ bp_patching_in_progress = false; - - return addr; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b7bcdd781651..ab6af775f06c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -802,6 +802,24 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) return 0; } +static int __init lapic_init_clockevent(void) +{ + if (!lapic_timer_frequency) + return -1; + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR, + TICK_NSEC, lapic_clockevent.shift); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent); + lapic_clockevent.max_delta_ticks = 0x7FFFFFFF; + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + lapic_clockevent.min_delta_ticks = 0xF; + + return 0; +} + static int __init calibrate_APIC_clock(void) { struct clock_event_device *levt = this_cpu_ptr(&lapic_events); @@ -810,25 +828,21 @@ static int __init calibrate_APIC_clock(void) long delta, deltatsc; int pm_referenced = 0; - /** - * check if lapic timer has already been calibrated by platform - * specific routine, such as tsc calibration code. if so, we just fill + if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return 0; + + /* + * Check if lapic timer has already been calibrated by platform + * specific routine, such as tsc calibration code. If so just fill * in the clockevent structure and return. */ - - if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { - return 0; - } else if (lapic_timer_frequency) { + if (!lapic_init_clockevent()) { apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n", - lapic_timer_frequency); - lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR, - TICK_NSEC, lapic_clockevent.shift); - lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); - lapic_clockevent.max_delta_ticks = 0x7FFFFF; - lapic_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &lapic_clockevent); - lapic_clockevent.min_delta_ticks = 0xF; + lapic_timer_frequency); + /* + * Direct calibration methods must have an always running + * local APIC timer, no need for broadcast timer. + */ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; return 0; } @@ -869,17 +883,8 @@ static int __init calibrate_APIC_clock(void) pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1, &delta, &deltatsc); - /* Calculate the scaled math multiplication factor */ - lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, - lapic_clockevent.shift); - lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent); - lapic_clockevent.max_delta_ticks = 0x7FFFFFFF; - lapic_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &lapic_clockevent); - lapic_clockevent.min_delta_ticks = 0xF; - lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; + lapic_init_clockevent(); apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult); diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 78778b54f904..a5464b8b6c46 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -175,7 +175,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) this_cpu_write(cpu_llc_id, node); /* Account for nodes per socket in multi-core-module processors */ - if (static_cpu_has(X86_FEATURE_NODEID_MSR)) { + if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { rdmsrl(MSR_FAM10H_NODE_ID, val); nodes = ((val >> 3) & 7) + 1; } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index ddced33184b5..d3d075226c0a 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -68,10 +68,12 @@ int main(void) #undef ENTRY OFFSET(TSS_ist, tss_struct, x86_tss.ist); + DEFINE(DB_STACK_OFFSET, offsetof(struct cea_exception_stacks, DB_stack) - + offsetof(struct cea_exception_stacks, DB1_stack)); BLANK(); #ifdef CONFIG_STACKPROTECTOR - DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary)); + DEFINE(stack_canary_offset, offsetof(struct fixed_percpu_data, stack_canary)); BLANK(); #endif diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 01004bfb1a1b..fb6a64bd765f 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -82,11 +82,14 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) * performance at the same time.. */ +#ifdef CONFIG_X86_32 extern __visible void vide(void); -__asm__(".globl vide\n" +__asm__(".text\n" + ".globl vide\n" ".type vide, @function\n" ".align 4\n" "vide: ret\n"); +#endif static void init_amd_k5(struct cpuinfo_x86 *c) { diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 804c49493938..64d5aec24203 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -83,7 +83,7 @@ unsigned int aperfmperf_get_khz(int cpu) if (!cpu_khz) return 0; - if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return 0; aperfmperf_snapshot_cpu(cpu, ktime_get(), true); @@ -99,7 +99,7 @@ void arch_freq_prepare_all(void) if (!cpu_khz) return; - if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return; for_each_online_cpu(cpu) @@ -115,7 +115,7 @@ unsigned int arch_freq_get_on_cpu(int cpu) if (!cpu_khz) return 0; - if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return 0; if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true)) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index b91b3bfa5cfb..29630393f300 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -440,7 +440,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) char arg[20]; int ret, i; - if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) + if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") || + cpu_mitigations_off()) return SPECTRE_V2_CMD_NONE; ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); @@ -672,7 +673,8 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) char arg[20]; int ret, i; - if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) { + if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") || + cpu_mitigations_off()) { return SPEC_STORE_BYPASS_CMD_NONE; } else { ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", @@ -1008,6 +1010,11 @@ static void __init l1tf_select_mitigation(void) if (!boot_cpu_has_bug(X86_BUG_L1TF)) return; + if (cpu_mitigations_off()) + l1tf_mitigation = L1TF_MITIGATION_OFF; + else if (cpu_mitigations_auto_nosmt()) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; + override_cache_bits(&boot_cpu_data); switch (l1tf_mitigation) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 5e37dfa4d9df..37640544e12f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -507,19 +507,6 @@ void load_percpu_segment(int cpu) DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); #endif -#ifdef CONFIG_X86_64 -/* - * Special IST stacks which the CPU switches to when it calls - * an IST-marked descriptor entry. Up to 7 stacks (hardware - * limit), all of them are 4K, except the debug stack which - * is 8K. - */ -static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [DEBUG_STACK - 1] = DEBUG_STKSZ -}; -#endif - /* Load the original GDT from the per-cpu structure */ void load_direct_gdt(int cpu) { @@ -1511,9 +1498,9 @@ static __init int setup_clearcpuid(char *arg) __setup("clearcpuid=", setup_clearcpuid); #ifdef CONFIG_X86_64 -DEFINE_PER_CPU_FIRST(union irq_stack_union, - irq_stack_union) __aligned(PAGE_SIZE) __visible; -EXPORT_PER_CPU_SYMBOL_GPL(irq_stack_union); +DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, + fixed_percpu_data) __aligned(PAGE_SIZE) __visible; +EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); /* * The following percpu variables are hot. Align current_task to @@ -1523,9 +1510,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(char *, irq_stack_ptr) = - init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE; - +DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; @@ -1562,23 +1547,7 @@ void syscall_init(void) X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT); } -/* - * Copies of the original ist values from the tss are only accessed during - * debugging, no special alignment required. - */ -DEFINE_PER_CPU(struct orig_ist, orig_ist); - -static DEFINE_PER_CPU(unsigned long, debug_stack_addr); DEFINE_PER_CPU(int, debug_stack_usage); - -int is_debug_stack(unsigned long addr) -{ - return __this_cpu_read(debug_stack_usage) || - (addr <= __this_cpu_read(debug_stack_addr) && - addr > (__this_cpu_read(debug_stack_addr) - DEBUG_STKSZ)); -} -NOKPROBE_SYMBOL(is_debug_stack); - DEFINE_PER_CPU(u32, debug_idt_ctr); void debug_stack_set_zero(void) @@ -1668,7 +1637,7 @@ static void setup_getcpu(int cpu) unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu)); struct desc_struct d = { }; - if (static_cpu_has(X86_FEATURE_RDTSCP)) + if (boot_cpu_has(X86_FEATURE_RDTSCP)) write_rdtscp_aux(cpudata); /* Store CPU and node number in limit. */ @@ -1690,17 +1659,14 @@ static void setup_getcpu(int cpu) * initialized (naturally) in the bootstrap process, such as the GDT * and IDT. We reload them nevertheless, this function acts as a * 'CPU state barrier', nothing should get across. - * A lot of state is already set up in PDA init for 64 bit */ #ifdef CONFIG_X86_64 void cpu_init(void) { - struct orig_ist *oist; + int cpu = raw_smp_processor_id(); struct task_struct *me; struct tss_struct *t; - unsigned long v; - int cpu = raw_smp_processor_id(); int i; wait_for_master_cpu(cpu); @@ -1715,7 +1681,6 @@ void cpu_init(void) load_ucode_ap(); t = &per_cpu(cpu_tss_rw, cpu); - oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA if (this_cpu_read(numa_node) == 0 && @@ -1753,16 +1718,11 @@ void cpu_init(void) /* * set up and load the per-CPU TSS */ - if (!oist->ist[0]) { - char *estacks = get_cpu_entry_area(cpu)->exception_stacks; - - for (v = 0; v < N_EXCEPTION_STACKS; v++) { - estacks += exception_stack_sizes[v]; - oist->ist[v] = t->x86_tss.ist[v] = - (unsigned long)estacks; - if (v == DEBUG_STACK-1) - per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks; - } + if (!t->x86_tss.ist[0]) { + t->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF); + t->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI); + t->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB); + t->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); } t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index cf25405444ab..415621ddb8a2 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -19,6 +19,8 @@ #include "cpu.h" +#define APICID_SOCKET_ID_BIT 6 + /* * nodes_per_socket: Stores the number of nodes per socket. * Refer to CPUID Fn8000_001E_ECX Node Identifiers[10:8] @@ -87,6 +89,9 @@ static void hygon_get_topology(struct cpuinfo_x86 *c) if (!err) c->x86_coreid_bits = get_count_order(c->x86_max_cores); + /* Socket ID is ApicId[6] for these processors. */ + c->phys_proc_id = c->apicid >> APICID_SOCKET_ID_BIT; + cacheinfo_hygon_init_llc_id(c, cpu, node_id); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 8492ef7d9015..3da9a8823e47 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -528,7 +528,7 @@ static void do_inject(void) * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for * Fam10h and later BKDGs. */ - if (static_cpu_has(X86_FEATURE_AMD_DCM) && + if (boot_cpu_has(X86_FEATURE_AMD_DCM) && b == 4 && boot_cpu_data.x86 < 0x17) { toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 5260185cbf7b..8a4a7823451a 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -418,8 +418,9 @@ static int do_microcode_update(const void __user *buf, size_t size) if (ustate == UCODE_ERROR) { error = -1; break; - } else if (ustate == UCODE_OK) + } else if (ustate == UCODE_NEW) { apply_microcode_on_target(cpu); + } } return error; diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 16936a24795c..a44bdbe7c55e 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -31,6 +31,7 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/cpu.h> +#include <linux/uio.h> #include <linux/mm.h> #include <asm/microcode_intel.h> @@ -861,32 +862,33 @@ out: return ret; } -static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, - int (*get_ucode_data)(void *, const void *, size_t)) +static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL; - int new_rev = uci->cpu_sig.rev; - unsigned int leftover = size; unsigned int curr_mc_size = 0, new_mc_size = 0; - unsigned int csig, cpf; enum ucode_state ret = UCODE_OK; + int new_rev = uci->cpu_sig.rev; + u8 *new_mc = NULL, *mc = NULL; + unsigned int csig, cpf; - while (leftover) { + while (iov_iter_count(iter)) { struct microcode_header_intel mc_header; - unsigned int mc_size; + unsigned int mc_size, data_size; + u8 *data; - if (leftover < sizeof(mc_header)) { - pr_err("error! Truncated header in microcode data file\n"); + if (!copy_from_iter_full(&mc_header, sizeof(mc_header), iter)) { + pr_err("error! Truncated or inaccessible header in microcode data file\n"); break; } - if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header))) - break; - mc_size = get_totalsize(&mc_header); - if (!mc_size || mc_size > leftover) { - pr_err("error! Bad data in microcode data file\n"); + if (mc_size < sizeof(mc_header)) { + pr_err("error! Bad data in microcode data file (totalsize too small)\n"); + break; + } + data_size = mc_size - sizeof(mc_header); + if (data_size > iov_iter_count(iter)) { + pr_err("error! Bad data in microcode data file (truncated file?)\n"); break; } @@ -899,7 +901,9 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, curr_mc_size = mc_size; } - if (get_ucode_data(mc, ucode_ptr, mc_size) || + memcpy(mc, &mc_header, sizeof(mc_header)); + data = mc + sizeof(mc_header); + if (!copy_from_iter_full(data, data_size, iter) || microcode_sanity_check(mc, 1) < 0) { break; } @@ -914,14 +918,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, mc = NULL; /* trigger new vmalloc */ ret = UCODE_NEW; } - - ucode_ptr += mc_size; - leftover -= mc_size; } vfree(mc); - if (leftover) { + if (iov_iter_count(iter)) { vfree(new_mc); return UCODE_ERROR; } @@ -945,12 +946,6 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, return ret; } -static int get_ucode_fw(void *to, const void *from, size_t n) -{ - memcpy(to, from, n); - return 0; -} - static bool is_blacklisted(unsigned int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -977,10 +972,12 @@ static bool is_blacklisted(unsigned int cpu) static enum ucode_state request_microcode_fw(int cpu, struct device *device, bool refresh_fw) { - char name[30]; struct cpuinfo_x86 *c = &cpu_data(cpu); const struct firmware *firmware; + struct iov_iter iter; enum ucode_state ret; + struct kvec kvec; + char name[30]; if (is_blacklisted(cpu)) return UCODE_NFOUND; @@ -993,26 +990,30 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device, return UCODE_NFOUND; } - ret = generic_load_microcode(cpu, (void *)firmware->data, - firmware->size, &get_ucode_fw); + kvec.iov_base = (void *)firmware->data; + kvec.iov_len = firmware->size; + iov_iter_kvec(&iter, WRITE, &kvec, 1, firmware->size); + ret = generic_load_microcode(cpu, &iter); release_firmware(firmware); return ret; } -static int get_ucode_user(void *to, const void *from, size_t n) -{ - return copy_from_user(to, from, n); -} - static enum ucode_state request_microcode_user(int cpu, const void __user *buf, size_t size) { + struct iov_iter iter; + struct iovec iov; + if (is_blacklisted(cpu)) return UCODE_NFOUND; - return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); + iov.iov_base = (void __user *)buf; + iov.iov_len = size; + iov_iter_init(&iter, WRITE, &iov, 1, size); + + return generic_load_microcode(cpu, &iter); } static struct microcode_ops microcode_intel_ops = { diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 2c8522a39ed5..cb2e49810d68 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -35,11 +35,11 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) "fpu_exception\t: %s\n" "cpuid level\t: %d\n" "wp\t\t: yes\n", - static_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no", - static_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no", - static_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no", - static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", - static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", + boot_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no", + boot_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no", + boot_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no", + boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", + boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", c->cpuid_level); } #else diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 2dbd990a2eb7..89320c0396b1 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -342,10 +342,10 @@ int update_domains(struct rdt_resource *r, int closid) if (cpumask_empty(cpu_mask) || mba_sc) goto done; cpu = get_cpu(); - /* Update CBM on this cpu if it's in cpu_mask. */ + /* Update resource control msr on this CPU if it's in cpu_mask. */ if (cpumask_test_cpu(cpu, cpu_mask)) rdt_ctrl_update(&msr_param); - /* Update CBM on other cpus. */ + /* Update resource control msr on other CPUs. */ smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1); put_cpu(); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 85212a32b54d..333c177a2471 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2516,100 +2516,127 @@ static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r) bitmap_clear(val, zero_bit, cbm_len - zero_bit); } -/** - * rdtgroup_init_alloc - Initialize the new RDT group's allocations - * - * A new RDT group is being created on an allocation capable (CAT) - * supporting system. Set this group up to start off with all usable - * allocations. That is, all shareable and unused bits. +/* + * Initialize cache resources per RDT domain * - * All-zero CBM is invalid. If there are no more shareable bits available - * on any domain then the entire allocation will fail. + * Set the RDT domain up to start off with all usable allocations. That is, + * all shareable and unused bits. All-zero CBM is invalid. */ -static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) +static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r, + u32 closid) { struct rdt_resource *r_cdp = NULL; struct rdt_domain *d_cdp = NULL; u32 used_b = 0, unused_b = 0; - u32 closid = rdtgrp->closid; - struct rdt_resource *r; unsigned long tmp_cbm; enum rdtgrp_mode mode; - struct rdt_domain *d; u32 peer_ctl, *ctrl; - int i, ret; + int i; - for_each_alloc_enabled_rdt_resource(r) { - /* - * Only initialize default allocations for CBM cache - * resources - */ - if (r->rid == RDT_RESOURCE_MBA) - continue; - list_for_each_entry(d, &r->domains, list) { - rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp); - d->have_new_ctrl = false; - d->new_ctrl = r->cache.shareable_bits; - used_b = r->cache.shareable_bits; - ctrl = d->ctrl_val; - for (i = 0; i < closids_supported(); i++, ctrl++) { - if (closid_allocated(i) && i != closid) { - mode = rdtgroup_mode_by_closid(i); - if (mode == RDT_MODE_PSEUDO_LOCKSETUP) - break; - /* - * If CDP is active include peer - * domain's usage to ensure there - * is no overlap with an exclusive - * group. - */ - if (d_cdp) - peer_ctl = d_cdp->ctrl_val[i]; - else - peer_ctl = 0; - used_b |= *ctrl | peer_ctl; - if (mode == RDT_MODE_SHAREABLE) - d->new_ctrl |= *ctrl | peer_ctl; - } - } - if (d->plr && d->plr->cbm > 0) - used_b |= d->plr->cbm; - unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); - unused_b &= BIT_MASK(r->cache.cbm_len) - 1; - d->new_ctrl |= unused_b; - /* - * Force the initial CBM to be valid, user can - * modify the CBM based on system availability. - */ - cbm_ensure_valid(&d->new_ctrl, r); + rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp); + d->have_new_ctrl = false; + d->new_ctrl = r->cache.shareable_bits; + used_b = r->cache.shareable_bits; + ctrl = d->ctrl_val; + for (i = 0; i < closids_supported(); i++, ctrl++) { + if (closid_allocated(i) && i != closid) { + mode = rdtgroup_mode_by_closid(i); + if (mode == RDT_MODE_PSEUDO_LOCKSETUP) + break; /* - * Assign the u32 CBM to an unsigned long to ensure - * that bitmap_weight() does not access out-of-bound - * memory. + * If CDP is active include peer domain's + * usage to ensure there is no overlap + * with an exclusive group. */ - tmp_cbm = d->new_ctrl; - if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < - r->cache.min_cbm_bits) { - rdt_last_cmd_printf("No space on %s:%d\n", - r->name, d->id); - return -ENOSPC; - } - d->have_new_ctrl = true; + if (d_cdp) + peer_ctl = d_cdp->ctrl_val[i]; + else + peer_ctl = 0; + used_b |= *ctrl | peer_ctl; + if (mode == RDT_MODE_SHAREABLE) + d->new_ctrl |= *ctrl | peer_ctl; } } + if (d->plr && d->plr->cbm > 0) + used_b |= d->plr->cbm; + unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); + unused_b &= BIT_MASK(r->cache.cbm_len) - 1; + d->new_ctrl |= unused_b; + /* + * Force the initial CBM to be valid, user can + * modify the CBM based on system availability. + */ + cbm_ensure_valid(&d->new_ctrl, r); + /* + * Assign the u32 CBM to an unsigned long to ensure that + * bitmap_weight() does not access out-of-bound memory. + */ + tmp_cbm = d->new_ctrl; + if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { + rdt_last_cmd_printf("No space on %s:%d\n", r->name, d->id); + return -ENOSPC; + } + d->have_new_ctrl = true; + + return 0; +} + +/* + * Initialize cache resources with default values. + * + * A new RDT group is being created on an allocation capable (CAT) + * supporting system. Set this group up to start off with all usable + * allocations. + * + * If there are no more shareable bits available on any domain then + * the entire allocation will fail. + */ +static int rdtgroup_init_cat(struct rdt_resource *r, u32 closid) +{ + struct rdt_domain *d; + int ret; + + list_for_each_entry(d, &r->domains, list) { + ret = __init_one_rdt_domain(d, r, closid); + if (ret < 0) + return ret; + } + + return 0; +} + +/* Initialize MBA resource with default values. */ +static void rdtgroup_init_mba(struct rdt_resource *r) +{ + struct rdt_domain *d; + + list_for_each_entry(d, &r->domains, list) { + d->new_ctrl = is_mba_sc(r) ? MBA_MAX_MBPS : r->default_ctrl; + d->have_new_ctrl = true; + } +} + +/* Initialize the RDT group's allocations. */ +static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) +{ + struct rdt_resource *r; + int ret; for_each_alloc_enabled_rdt_resource(r) { - /* - * Only initialize default allocations for CBM cache - * resources - */ - if (r->rid == RDT_RESOURCE_MBA) - continue; + if (r->rid == RDT_RESOURCE_MBA) { + rdtgroup_init_mba(r); + } else { + ret = rdtgroup_init_cat(r, rdtgrp->closid); + if (ret < 0) + return ret; + } + ret = update_domains(r, rdtgrp->closid); if (ret < 0) { rdt_last_cmd_puts("Failed to initialize allocations\n"); return ret; } + } rdtgrp->mode = RDT_MODE_SHAREABLE; diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 17ffc869cab8..a96ca8584803 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -204,8 +204,7 @@ static struct crash_mem *fill_up_crash_elf_data(void) * another range split. So add extra two slots here. */ nr_ranges += 2; - cmem = vzalloc(sizeof(struct crash_mem) + - sizeof(struct crash_mem_range) * nr_ranges); + cmem = vzalloc(struct_size(cmem, ranges, nr_ranges)); if (!cmem) return NULL; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index cd53f3030e40..64a59d726639 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -34,14 +34,14 @@ const char *stack_type_name(enum stack_type type) static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack); + unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack <= begin || stack > end) + if (stack < begin || stack > end) return false; info->type = STACK_TYPE_IRQ; @@ -59,14 +59,14 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack); + unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack <= begin || stack > end) + if (stack < begin || stack > end) return false; info->type = STACK_TYPE_SOFTIRQ; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 5cdb9e84da57..753b8cfe8b8a 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -16,23 +16,21 @@ #include <linux/bug.h> #include <linux/nmi.h> +#include <asm/cpu_entry_area.h> #include <asm/stacktrace.h> -static char *exception_stack_names[N_EXCEPTION_STACKS] = { - [ DOUBLEFAULT_STACK-1 ] = "#DF", - [ NMI_STACK-1 ] = "NMI", - [ DEBUG_STACK-1 ] = "#DB", - [ MCE_STACK-1 ] = "#MC", -}; - -static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [DEBUG_STACK - 1] = DEBUG_STKSZ +static const char * const exception_stack_names[] = { + [ ESTACK_DF ] = "#DF", + [ ESTACK_NMI ] = "NMI", + [ ESTACK_DB2 ] = "#DB2", + [ ESTACK_DB1 ] = "#DB1", + [ ESTACK_DB ] = "#DB", + [ ESTACK_MCE ] = "#MC", }; const char *stack_type_name(enum stack_type type) { - BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); if (type == STACK_TYPE_IRQ) return "IRQ"; @@ -52,43 +50,84 @@ const char *stack_type_name(enum stack_type type) return NULL; } +/** + * struct estack_pages - Page descriptor for exception stacks + * @offs: Offset from the start of the exception stack area + * @size: Size of the exception stack + * @type: Type to store in the stack_info struct + */ +struct estack_pages { + u32 offs; + u16 size; + u16 type; +}; + +#define EPAGERANGE(st) \ + [PFN_DOWN(CEA_ESTACK_OFFS(st)) ... \ + PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = { \ + .offs = CEA_ESTACK_OFFS(st), \ + .size = CEA_ESTACK_SIZE(st), \ + .type = STACK_TYPE_EXCEPTION + ESTACK_ ##st, } + +/* + * Array of exception stack page descriptors. If the stack is larger than + * PAGE_SIZE, all pages covering a particular stack will have the same + * info. The guard pages including the not mapped DB2 stack are zeroed + * out. + */ +static const +struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = { + EPAGERANGE(DF), + EPAGERANGE(NMI), + EPAGERANGE(DB1), + EPAGERANGE(DB), + EPAGERANGE(MCE), +}; + static bool in_exception_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin, *end; + unsigned long begin, end, stk = (unsigned long)stack; + const struct estack_pages *ep; struct pt_regs *regs; - unsigned k; + unsigned int k; - BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); - for (k = 0; k < N_EXCEPTION_STACKS; k++) { - end = (unsigned long *)raw_cpu_ptr(&orig_ist)->ist[k]; - begin = end - (exception_stack_sizes[k] / sizeof(long)); - regs = (struct pt_regs *)end - 1; - - if (stack <= begin || stack >= end) - continue; + begin = (unsigned long)__this_cpu_read(cea_exception_stacks); + end = begin + sizeof(struct cea_exception_stacks); + /* Bail if @stack is outside the exception stack area. */ + if (stk < begin || stk >= end) + return false; - info->type = STACK_TYPE_EXCEPTION + k; - info->begin = begin; - info->end = end; - info->next_sp = (unsigned long *)regs->sp; + /* Calc page offset from start of exception stacks */ + k = (stk - begin) >> PAGE_SHIFT; + /* Lookup the page descriptor */ + ep = &estack_pages[k]; + /* Guard page? */ + if (!ep->size) + return false; - return true; - } + begin += (unsigned long)ep->offs; + end = begin + (unsigned long)ep->size; + regs = (struct pt_regs *)end - 1; - return false; + info->type = ep->type; + info->begin = (unsigned long *)begin; + info->end = (unsigned long *)end; + info->next_sp = (unsigned long *)regs->sp; + return true; } static bool in_irq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *end = (unsigned long *)this_cpu_read(irq_stack_ptr); + unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long)); /* * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack <= begin || stack > end) + if (stack < begin || stack >= end) return false; info->type = STACK_TYPE_IRQ; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index ef49517f6bb2..0caf8122d680 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -678,12 +678,8 @@ static inline void *alloc_tramp(unsigned long size) { return module_alloc(size); } -static inline void tramp_free(void *tramp, int size) +static inline void tramp_free(void *tramp) { - int npages = PAGE_ALIGN(size) >> PAGE_SHIFT; - - set_memory_nx((unsigned long)tramp, npages); - set_memory_rw((unsigned long)tramp, npages); module_memfree(tramp); } #else @@ -692,7 +688,7 @@ static inline void *alloc_tramp(unsigned long size) { return NULL; } -static inline void tramp_free(void *tramp, int size) { } +static inline void tramp_free(void *tramp) { } #endif /* Defined as markers to the end of the ftrace default trampolines */ @@ -730,6 +726,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned long end_offset; unsigned long op_offset; unsigned long offset; + unsigned long npages; unsigned long size; unsigned long retq; unsigned long *ptr; @@ -762,6 +759,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) return 0; *tramp_size = size + RET_SIZE + sizeof(void *); + npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE); /* Copy ftrace_caller onto the trampoline memory */ ret = probe_kernel_read(trampoline, (void *)start_offset, size); @@ -806,9 +804,17 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* ALLOC_TRAMP flags lets us know we created it */ ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP; + set_vm_flush_reset_perms(trampoline); + + /* + * Module allocation needs to be completed by making the page + * executable. The page is still writable, which is a security hazard, + * but anyhow ftrace breaks W^X completely. + */ + set_memory_x((unsigned long)trampoline, npages); return (unsigned long)trampoline; fail: - tramp_free(trampoline, *tramp_size); + tramp_free(trampoline); return 0; } @@ -939,7 +945,7 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops) if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) return; - tramp_free((void *)ops->trampoline, ops->trampoline_size); + tramp_free((void *)ops->trampoline); ops->trampoline = 0; } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d1dbe8e4eb82..bcd206c8ac90 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -265,7 +265,7 @@ ENDPROC(start_cpu0) GLOBAL(initial_code) .quad x86_64_start_kernel GLOBAL(initial_gs) - .quad INIT_PER_CPU_VAR(irq_stack_union) + .quad INIT_PER_CPU_VAR(fixed_percpu_data) GLOBAL(initial_stack) /* * The SIZEOF_PTREGS gap is a convention which helps the in-kernel diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 01adea278a71..6d8917875f44 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -41,13 +41,12 @@ struct idt_data { #define SYSG(_vector, _addr) \ G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS) -/* Interrupt gate with interrupt stack */ +/* + * Interrupt gate with interrupt stack. The _ist index is the index in + * the tss.ist[] array, but for the descriptor it needs to start at 1. + */ #define ISTG(_vector, _addr, _ist) \ - G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS) - -/* System interrupt gate with interrupt stack */ -#define SISTG(_vector, _addr, _ist) \ - G(_vector, _addr, _ist, GATE_INTERRUPT, DPL3, __KERNEL_CS) + G(_vector, _addr, _ist + 1, GATE_INTERRUPT, DPL0, __KERNEL_CS) /* Task gate */ #define TSKG(_vector, _gdt) \ @@ -184,11 +183,11 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; * cpu_init() when the TSS has been initialized. */ static const __initconst struct idt_data ist_idts[] = { - ISTG(X86_TRAP_DB, debug, DEBUG_STACK), - ISTG(X86_TRAP_NMI, nmi, NMI_STACK), - ISTG(X86_TRAP_DF, double_fault, DOUBLEFAULT_STACK), + ISTG(X86_TRAP_DB, debug, IST_INDEX_DB), + ISTG(X86_TRAP_NMI, nmi, IST_INDEX_NMI), + ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF), #ifdef CONFIG_X86_MCE - ISTG(X86_TRAP_MC, &machine_check, MCE_STACK), + ISTG(X86_TRAP_MC, &machine_check, IST_INDEX_MCE), #endif }; diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 95600a99ae93..fc34816c6f04 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -51,8 +51,8 @@ static inline int check_stack_overflow(void) { return 0; } static inline void print_stack_overflow(void) { } #endif -DEFINE_PER_CPU(struct irq_stack *, hardirq_stack); -DEFINE_PER_CPU(struct irq_stack *, softirq_stack); +DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); +DEFINE_PER_CPU(struct irq_stack *, softirq_stack_ptr); static void call_on_stack(void *func, void *stack) { @@ -76,7 +76,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) u32 *isp, *prev_esp, arg1; curstk = (struct irq_stack *) current_stack(); - irqstk = __this_cpu_read(hardirq_stack); + irqstk = __this_cpu_read(hardirq_stack_ptr); /* * this is where we switch to the IRQ stack. However, if we are @@ -107,27 +107,28 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) } /* - * allocate per-cpu stacks for hardirq and for softirq processing + * Allocate per-cpu stacks for hardirq and softirq processing */ -void irq_ctx_init(int cpu) +int irq_init_percpu_irqstack(unsigned int cpu) { - struct irq_stack *irqstk; - - if (per_cpu(hardirq_stack, cpu)) - return; + int node = cpu_to_node(cpu); + struct page *ph, *ps; - irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), - THREADINFO_GFP, - THREAD_SIZE_ORDER)); - per_cpu(hardirq_stack, cpu) = irqstk; + if (per_cpu(hardirq_stack_ptr, cpu)) + return 0; - irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), - THREADINFO_GFP, - THREAD_SIZE_ORDER)); - per_cpu(softirq_stack, cpu) = irqstk; + ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); + if (!ph) + return -ENOMEM; + ps = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); + if (!ps) { + __free_pages(ph, THREAD_SIZE_ORDER); + return -ENOMEM; + } - printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", - cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu)); + per_cpu(hardirq_stack_ptr, cpu) = page_address(ph); + per_cpu(softirq_stack_ptr, cpu) = page_address(ps); + return 0; } void do_softirq_own_stack(void) @@ -135,7 +136,7 @@ void do_softirq_own_stack(void) struct irq_stack *irqstk; u32 *isp, *prev_esp; - irqstk = __this_cpu_read(softirq_stack); + irqstk = __this_cpu_read(softirq_stack_ptr); /* build the stack frame on the softirq stack */ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 0469cd078db1..6bf6517a05bb 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -18,63 +18,64 @@ #include <linux/uaccess.h> #include <linux/smp.h> #include <linux/sched/task_stack.h> + +#include <asm/cpu_entry_area.h> #include <asm/io_apic.h> #include <asm/apic.h> -int sysctl_panic_on_stackoverflow; +DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible; +DECLARE_INIT_PER_CPU(irq_stack_backing_store); -/* - * Probabilistic stack overflow check: - * - * Only check the stack in process context, because everything else - * runs on the big interrupt stacks. Checking reliably is too expensive, - * so we just check from interrupts. - */ -static inline void stack_overflow_check(struct pt_regs *regs) +bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) { -#ifdef CONFIG_DEBUG_STACKOVERFLOW -#define STACK_TOP_MARGIN 128 - struct orig_ist *oist; - u64 irq_stack_top, irq_stack_bottom; - u64 estack_top, estack_bottom; - u64 curbase = (u64)task_stack_page(current); + if (IS_ERR_OR_NULL(desc)) + return false; - if (user_mode(regs)) - return; + generic_handle_irq_desc(desc); + return true; +} - if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_TOP_MARGIN && - regs->sp <= curbase + THREAD_SIZE) - return; +#ifdef CONFIG_VMAP_STACK +/* + * VMAP the backing store with guard pages + */ +static int map_irq_stack(unsigned int cpu) +{ + char *stack = (char *)per_cpu_ptr(&irq_stack_backing_store, cpu); + struct page *pages[IRQ_STACK_SIZE / PAGE_SIZE]; + void *va; + int i; - irq_stack_top = (u64)this_cpu_ptr(irq_stack_union.irq_stack) + - STACK_TOP_MARGIN; - irq_stack_bottom = (u64)__this_cpu_read(irq_stack_ptr); - if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom) - return; + for (i = 0; i < IRQ_STACK_SIZE / PAGE_SIZE; i++) { + phys_addr_t pa = per_cpu_ptr_to_phys(stack + (i << PAGE_SHIFT)); - oist = this_cpu_ptr(&orig_ist); - estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN; - estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1]; - if (regs->sp >= estack_top && regs->sp <= estack_bottom) - return; + pages[i] = pfn_to_page(pa >> PAGE_SHIFT); + } - WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n", - current->comm, curbase, regs->sp, - irq_stack_top, irq_stack_bottom, - estack_top, estack_bottom, (void *)regs->ip); + va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); + if (!va) + return -ENOMEM; - if (sysctl_panic_on_stackoverflow) - panic("low stack detected by irq handler - check messages\n"); -#endif + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; + return 0; } - -bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) +#else +/* + * If VMAP stacks are disabled due to KASAN, just use the per cpu + * backing store without guard pages. + */ +static int map_irq_stack(unsigned int cpu) { - stack_overflow_check(regs); + void *va = per_cpu_ptr(&irq_stack_backing_store, cpu); - if (IS_ERR_OR_NULL(desc)) - return false; + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; + return 0; +} +#endif - generic_handle_irq_desc(desc); - return true; +int irq_init_percpu_irqstack(unsigned int cpu) +{ + if (per_cpu(hardirq_stack_ptr, cpu)) + return 0; + return map_irq_stack(cpu); } diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index a0693b71cfc1..16919a9671fa 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -91,6 +91,8 @@ void __init init_IRQ(void) for (i = 0; i < nr_legacy_irqs(); i++) per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i); + BUG_ON(irq_init_percpu_irqstack(smp_processor_id())); + x86_init.irqs.intr_init(); } @@ -104,6 +106,4 @@ void __init native_init_IRQ(void) if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) setup_irq(2, &irq2); - - irq_ctx_init(smp_processor_id()); } diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index f99bd26bd3f1..e631c358f7f4 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -37,7 +37,6 @@ static void bug_at(unsigned char *ip, int line) static void __ref __jump_label_transform(struct jump_entry *entry, enum jump_label_type type, - void *(*poker)(void *, const void *, size_t), int init) { union jump_code_union jmp; @@ -50,9 +49,6 @@ static void __ref __jump_label_transform(struct jump_entry *entry, jmp.offset = jump_entry_target(entry) - (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); - if (early_boot_irqs_disabled) - poker = text_poke_early; - if (type == JUMP_LABEL_JMP) { if (init) { expect = default_nop; line = __LINE__; @@ -75,16 +71,19 @@ static void __ref __jump_label_transform(struct jump_entry *entry, bug_at((void *)jump_entry_code(entry), line); /* - * Make text_poke_bp() a default fallback poker. + * As long as only a single processor is running and the code is still + * not marked as RO, text_poke_early() can be used; Checking that + * system_state is SYSTEM_BOOTING guarantees it. It will be set to + * SYSTEM_SCHEDULING before other cores are awaken and before the + * code is write-protected. * * At the time the change is being done, just ignore whether we * are doing nop -> jump or jump -> nop transition, and assume * always nop being the 'currently valid' instruction - * */ - if (poker) { - (*poker)((void *)jump_entry_code(entry), code, - JUMP_LABEL_NOP_SIZE); + if (init || system_state == SYSTEM_BOOTING) { + text_poke_early((void *)jump_entry_code(entry), code, + JUMP_LABEL_NOP_SIZE); return; } @@ -96,7 +95,7 @@ void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { mutex_lock(&text_mutex); - __jump_label_transform(entry, type, NULL, 0); + __jump_label_transform(entry, type, 0); mutex_unlock(&text_mutex); } @@ -126,5 +125,5 @@ __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, jlstate = JL_STATE_NO_UPDATE; } if (jlstate == JL_STATE_UPDATE) - __jump_label_transform(entry, type, text_poke_early, 1); + __jump_label_transform(entry, type, 1); } diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 4ff6b4cdb941..13b13311b792 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -747,7 +747,6 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip) int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) { int err; - char opc[BREAK_INSTR_SIZE]; bpt->type = BP_BREAKPOINT; err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, @@ -759,18 +758,13 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) if (!err) return err; /* - * It is safe to call text_poke() because normal kernel execution + * It is safe to call text_poke_kgdb() because normal kernel execution * is stopped on all cores, so long as the text_mutex is not locked. */ if (mutex_is_locked(&text_mutex)) return -EBUSY; - text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, - BREAK_INSTR_SIZE); - err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE); - if (err) - return err; - if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE)) - return -EINVAL; + text_poke_kgdb((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, + BREAK_INSTR_SIZE); bpt->type = BP_POKE_BREAKPOINT; return err; @@ -778,22 +772,17 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) { - int err; - char opc[BREAK_INSTR_SIZE]; - if (bpt->type != BP_POKE_BREAKPOINT) goto knl_write; /* - * It is safe to call text_poke() because normal kernel execution + * It is safe to call text_poke_kgdb() because normal kernel execution * is stopped on all cores, so long as the text_mutex is not locked. */ if (mutex_is_locked(&text_mutex)) goto knl_write; - text_poke((void *)bpt->bpt_addr, bpt->saved_instr, BREAK_INSTR_SIZE); - err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE); - if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE)) - goto knl_write; - return err; + text_poke_kgdb((void *)bpt->bpt_addr, bpt->saved_instr, + BREAK_INSTR_SIZE); + return 0; knl_write: return probe_kernel_write((char *)bpt->bpt_addr, diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index fed46ddb1eef..cf52ee0d8711 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -431,8 +431,21 @@ void *alloc_insn_page(void) void *page; page = module_alloc(PAGE_SIZE); - if (page) - set_memory_ro((unsigned long)page & PAGE_MASK, 1); + if (!page) + return NULL; + + set_vm_flush_reset_perms(page); + /* + * First make the page read-only, and only then make it executable to + * prevent it from being W+X in between. + */ + set_memory_ro((unsigned long)page, 1); + + /* + * TODO: Once additional kernel code protection mechanisms are set, ensure + * that the page was not maliciously altered and it is still zeroed. + */ + set_memory_x((unsigned long)page, 1); return page; } @@ -440,8 +453,6 @@ void *alloc_insn_page(void) /* Recover page to RW mode before releasing it */ void free_insn_page(void *page) { - set_memory_nx((unsigned long)page & PAGE_MASK, 1); - set_memory_rw((unsigned long)page & PAGE_MASK, 1); module_memfree(page); } @@ -716,6 +727,7 @@ NOKPROBE_SYMBOL(kprobe_int3_handler); * calls trampoline_handler() runs, which calls the kretprobe's handler. */ asm( + ".text\n" ".global kretprobe_trampoline\n" ".type kretprobe_trampoline, @function\n" "kretprobe_trampoline:\n" diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5c93a65ee1e5..3f0cc828cc36 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -67,7 +67,7 @@ static int __init parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); -static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64); +DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible; static int has_steal_clock = 0; /* diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 6135ae8ce036..b2463fcb20a8 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -113,7 +113,7 @@ static void do_sanity_check(struct mm_struct *mm, * tables. */ WARN_ON(!had_kernel_mapping); - if (static_cpu_has(X86_FEATURE_PTI)) + if (boot_cpu_has(X86_FEATURE_PTI)) WARN_ON(!had_user_mapping); } else { /* @@ -121,7 +121,7 @@ static void do_sanity_check(struct mm_struct *mm, * Sync the pgd to the usermode tables. */ WARN_ON(had_kernel_mapping); - if (static_cpu_has(X86_FEATURE_PTI)) + if (boot_cpu_has(X86_FEATURE_PTI)) WARN_ON(had_user_mapping); } } @@ -156,7 +156,7 @@ static void map_ldt_struct_to_user(struct mm_struct *mm) k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR); u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR); - if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) + if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) set_pmd(u_pmd, *k_pmd); } @@ -181,7 +181,7 @@ static void map_ldt_struct_to_user(struct mm_struct *mm) { pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR); - if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) + if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) set_pgd(kernel_to_user_pgdp(pgd), *pgd); } @@ -208,7 +208,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) spinlock_t *ptl; int i, nr_pages; - if (!static_cpu_has(X86_FEATURE_PTI)) + if (!boot_cpu_has(X86_FEATURE_PTI)) return 0; /* @@ -271,7 +271,7 @@ static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt) return; /* LDT map/unmap is only required for PTI */ - if (!static_cpu_has(X86_FEATURE_PTI)) + if (!boot_cpu_has(X86_FEATURE_PTI)) return; nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE); @@ -311,7 +311,7 @@ static void free_ldt_pgtables(struct mm_struct *mm) unsigned long start = LDT_BASE_ADDR; unsigned long end = LDT_END_ADDR; - if (!static_cpu_has(X86_FEATURE_PTI)) + if (!boot_cpu_has(X86_FEATURE_PTI)) return; tlb_gather_mmu(&tlb, mm, start, end); diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index b052e883dd8c..cfa3106faee4 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -87,7 +87,7 @@ void *module_alloc(unsigned long size) p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR + get_module_load_offset(), MODULES_END, GFP_KERNEL, - PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, + PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); if (p && (kasan_module_alloc(p, size) < 0)) { vfree(p); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 18bc9b51ac9b..3755d0310026 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -21,13 +21,14 @@ #include <linux/ratelimit.h> #include <linux/slab.h> #include <linux/export.h> +#include <linux/atomic.h> #include <linux/sched/clock.h> #if defined(CONFIG_EDAC) #include <linux/edac.h> #endif -#include <linux/atomic.h> +#include <asm/cpu_entry_area.h> #include <asm/traps.h> #include <asm/mach_traps.h> #include <asm/nmi.h> @@ -487,6 +488,23 @@ static DEFINE_PER_CPU(unsigned long, nmi_cr2); * switch back to the original IDT. */ static DEFINE_PER_CPU(int, update_debug_stack); + +static bool notrace is_debug_stack(unsigned long addr) +{ + struct cea_exception_stacks *cs = __this_cpu_read(cea_exception_stacks); + unsigned long top = CEA_ESTACK_TOP(cs, DB); + unsigned long bot = CEA_ESTACK_BOT(cs, DB1); + + if (__this_cpu_read(debug_stack_usage)) + return true; + /* + * Note, this covers the guard page between DB and DB1 as well to + * avoid two checks. But by all means @addr can never point into + * the guard page. + */ + return addr >= bot && addr < top; +} +NOKPROBE_SYMBOL(is_debug_stack); #endif dotraplinkage notrace void diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c0e0101133f3..7bbaa6baf37f 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -121,7 +121,7 @@ DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); void __init native_pv_lock_init(void) { - if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) + if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) static_branch_disable(&virt_spin_lock_key); } diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index c06c4c16c6b6..07c30ee17425 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -59,18 +59,34 @@ static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = { u64 perf_reg_value(struct pt_regs *regs, int idx) { + struct x86_perf_regs *perf_regs; + + if (idx >= PERF_REG_X86_XMM0 && idx < PERF_REG_X86_XMM_MAX) { + perf_regs = container_of(regs, struct x86_perf_regs, regs); + if (!perf_regs->xmm_regs) + return 0; + return perf_regs->xmm_regs[idx - PERF_REG_X86_XMM0]; + } + if (WARN_ON_ONCE(idx >= ARRAY_SIZE(pt_regs_offset))) return 0; return regs_get_register(regs, pt_regs_offset[idx]); } -#define REG_RESERVED (~((1ULL << PERF_REG_X86_MAX) - 1ULL)) - #ifdef CONFIG_X86_32 +#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_R8) | \ + (1ULL << PERF_REG_X86_R9) | \ + (1ULL << PERF_REG_X86_R10) | \ + (1ULL << PERF_REG_X86_R11) | \ + (1ULL << PERF_REG_X86_R12) | \ + (1ULL << PERF_REG_X86_R13) | \ + (1ULL << PERF_REG_X86_R14) | \ + (1ULL << PERF_REG_X86_R15)) + int perf_reg_validate(u64 mask) { - if (!mask || mask & REG_RESERVED) + if (!mask || (mask & REG_NOSUPPORT)) return -EINVAL; return 0; @@ -96,10 +112,7 @@ void perf_get_regs_user(struct perf_regs *regs_user, int perf_reg_validate(u64 mask) { - if (!mask || mask & REG_RESERVED) - return -EINVAL; - - if (mask & REG_NOSUPPORT) + if (!mask || (mask & REG_NOSUPPORT)) return -EINVAL; return 0; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 957eae13b370..d1d312d012a6 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -236,7 +236,7 @@ static int get_cpuid_mode(void) static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled) { - if (!static_cpu_has(X86_FEATURE_CPUID_FAULT)) + if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT)) return -ENODEV; if (cpuid_enabled) @@ -670,7 +670,7 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) if (c->x86_vendor != X86_VENDOR_INTEL) return 0; - if (!cpu_has(c, X86_FEATURE_MWAIT) || static_cpu_has_bug(X86_BUG_MONITOR)) + if (!cpu_has(c, X86_FEATURE_MWAIT) || boot_cpu_has_bug(X86_BUG_MONITOR)) return 0; return 1; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index e471d8e6f0b2..70933193878c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -127,6 +127,13 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, struct task_struct *tsk; int err; + /* + * For a new task use the RESET flags value since there is no before. + * All the status flags are zero; DF and all the system flags must also + * be 0, specifically IF must be 0 because we context switch to the new + * task with interrupts disabled. + */ + frame->flags = X86_EFLAGS_FIXED; frame->bp = 0; frame->ret_addr = (unsigned long) ret_from_fork; p->thread.sp = (unsigned long) fork_frame; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6a62f4af9fcf..844a28b29967 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -392,6 +392,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, childregs = task_pt_regs(p); fork_frame = container_of(childregs, struct fork_frame, regs); frame = &fork_frame->frame; + frame->bp = 0; frame->ret_addr = (unsigned long) ret_from_fork; p->thread.sp = (unsigned long) fork_frame; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 8fd3cedd9acc..09d6bded3c1e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -121,7 +121,7 @@ void __noreturn machine_real_restart(unsigned int type) write_cr3(real_mode_header->trampoline_pgd); /* Exiting long mode will fail if CR4.PCIDE is set. */ - if (static_cpu_has(X86_FEATURE_PCID)) + if (boot_cpu_has(X86_FEATURE_PCID)) cr4_clear_bits(X86_CR4_PCIDE); #endif diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3d872a527cd9..905dae880563 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -71,6 +71,7 @@ #include <linux/tboot.h> #include <linux/jiffies.h> #include <linux/mem_encrypt.h> +#include <linux/sizes.h> #include <linux/usb/xhci-dbgp.h> #include <video/edid.h> @@ -448,18 +449,17 @@ static void __init memblock_x86_reserve_range_setup_data(void) #ifdef CONFIG_KEXEC_CORE /* 16M alignment for crash kernel regions */ -#define CRASH_ALIGN (16 << 20) +#define CRASH_ALIGN SZ_16M /* * Keep the crash kernel below this limit. On 32 bits earlier kernels * would limit the kernel to the low 512 MiB due to mapping restrictions. - * On 64bit, old kexec-tools need to under 896MiB. */ #ifdef CONFIG_X86_32 -# define CRASH_ADDR_LOW_MAX (512 << 20) -# define CRASH_ADDR_HIGH_MAX (512 << 20) +# define CRASH_ADDR_LOW_MAX SZ_512M +# define CRASH_ADDR_HIGH_MAX SZ_512M #else -# define CRASH_ADDR_LOW_MAX (896UL << 20) +# define CRASH_ADDR_LOW_MAX SZ_4G # define CRASH_ADDR_HIGH_MAX MAXMEM #endif @@ -541,21 +541,27 @@ static void __init reserve_crashkernel(void) } /* 0 means: find the address automatically */ - if (crash_base <= 0) { + if (!crash_base) { /* * Set CRASH_ADDR_LOW_MAX upper bound for crash memory, - * as old kexec-tools loads bzImage below that, unless - * "crashkernel=size[KMG],high" is specified. + * crashkernel=x,high reserves memory over 4G, also allocates + * 256M extra low memory for DMA buffers and swiotlb. + * But the extra memory is not required for all machines. + * So try low memory first and fall back to high memory + * unless "crashkernel=size[KMG],high" is specified. */ - crash_base = memblock_find_in_range(CRASH_ALIGN, - high ? CRASH_ADDR_HIGH_MAX - : CRASH_ADDR_LOW_MAX, - crash_size, CRASH_ALIGN); + if (!high) + crash_base = memblock_find_in_range(CRASH_ALIGN, + CRASH_ADDR_LOW_MAX, + crash_size, CRASH_ALIGN); + if (!crash_base) + crash_base = memblock_find_in_range(CRASH_ALIGN, + CRASH_ADDR_HIGH_MAX, + crash_size, CRASH_ALIGN); if (!crash_base) { pr_info("crashkernel reservation failed - No suitable area found.\n"); return; } - } else { unsigned long long start; @@ -1005,13 +1011,11 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled(EFI_BOOT)) efi_init(); - dmi_scan_machine(); - dmi_memdev_walk(); - dmi_set_dump_stack_arch_desc(); + dmi_setup(); /* * VMware detection requires dmi to be available, so this - * needs to be done after dmi_scan_machine(), for the boot CPU. + * needs to be done after dmi_setup(), for the boot CPU. */ init_hypervisor_platform(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 4bf46575568a..86663874ef04 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -244,11 +244,6 @@ void __init setup_per_cpu_areas(void) per_cpu(x86_cpu_to_logical_apicid, cpu) = early_per_cpu_map(x86_cpu_to_logical_apicid, cpu); #endif -#ifdef CONFIG_X86_64 - per_cpu(irq_stack_ptr, cpu) = - per_cpu(irq_stack_union.irq_stack, cpu) + - IRQ_STACK_SIZE; -#endif #ifdef CONFIG_NUMA per_cpu(x86_cpu_to_node_map, cpu) = early_per_cpu_map(x86_cpu_to_node_map, cpu); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 08dfd4c1a4f9..dff90fb6a9af 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -132,16 +132,6 @@ static int restore_sigcontext(struct pt_regs *regs, COPY_SEG_CPL3(cs); COPY_SEG_CPL3(ss); -#ifdef CONFIG_X86_64 - /* - * Fix up SS if needed for the benefit of old DOSEMU and - * CRIU. - */ - if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && - user_64bit_mode(regs))) - force_valid_ss(regs); -#endif - get_user_ex(tmpflags, &sc->flags); regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); regs->orig_ax = -1; /* disable syscall checks */ @@ -150,6 +140,15 @@ static int restore_sigcontext(struct pt_regs *regs, buf = (void __user *)buf_val; } get_user_catch(err); +#ifdef CONFIG_X86_64 + /* + * Fix up SS if needed for the benefit of old DOSEMU and + * CRIU. + */ + if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs))) + force_valid_ss(regs); +#endif + err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32)); force_iret(); @@ -461,6 +460,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, { struct rt_sigframe __user *frame; void __user *fp = NULL; + unsigned long uc_flags; int err = 0; frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp); @@ -473,9 +473,11 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, return -EFAULT; } + uc_flags = frame_uc_flags(regs); + put_user_try { /* Create the ucontext. */ - put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags); + put_user_ex(uc_flags, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); save_altstack_ex(&frame->uc.uc_stack, regs->sp); @@ -541,6 +543,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, { #ifdef CONFIG_X86_X32_ABI struct rt_sigframe_x32 __user *frame; + unsigned long uc_flags; void __user *restorer; int err = 0; void __user *fpstate = NULL; @@ -555,9 +558,11 @@ static int x32_setup_rt_frame(struct ksignal *ksig, return -EFAULT; } + uc_flags = frame_uc_flags(regs); + put_user_try { /* Create the ucontext. */ - put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags); + put_user_ex(uc_flags, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); put_user_ex(0, &frame->uc.uc__pad0); @@ -688,10 +693,7 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) sigset_t *set = sigmask_to_save(); compat_sigset_t *cset = (compat_sigset_t *) set; - /* - * Increment event counter and perform fixup for the pre-signal - * frame. - */ + /* Perform fixup for the pre-signal frame. */ rseq_signal_deliver(ksig, regs); /* Set up the stack frame */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ce1a67b70168..73e69aaaa117 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -455,7 +455,7 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) * multicore group inside a NUMA node. If this happens, we will * discard the MC level of the topology later. */ -static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { if (c->phys_proc_id == o->phys_proc_id) return true; @@ -546,7 +546,7 @@ void set_cpu_sibling_map(int cpu) for_each_cpu(i, cpu_sibling_setup_mask) { o = &cpu_data(i); - if ((i == cpu) || (has_mp && match_die(c, o))) { + if ((i == cpu) || (has_mp && match_pkg(c, o))) { link_mask(topology_core_cpumask, cpu, i); /* @@ -570,7 +570,7 @@ void set_cpu_sibling_map(int cpu) } else if (i != cpu && !c->booted_cores) c->booted_cores = cpu_data(i).booted_cores; } - if (match_die(c, o) && !topology_same_node(c, o)) + if (match_pkg(c, o) && !topology_same_node(c, o)) x86_has_numa_in_package = true; } @@ -935,20 +935,27 @@ out: return boot_error; } -void common_cpu_up(unsigned int cpu, struct task_struct *idle) +int common_cpu_up(unsigned int cpu, struct task_struct *idle) { + int ret; + /* Just in case we booted with a single CPU. */ alternatives_enable_smp(); per_cpu(current_task, cpu) = idle; + /* Initialize the interrupt stack(s) */ + ret = irq_init_percpu_irqstack(cpu); + if (ret) + return ret; + #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ - irq_ctx_init(cpu); per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); #else initial_gs = per_cpu_offset(cpu); #endif + return 0; } /* @@ -1106,7 +1113,9 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) /* the FPU context is blank, nobody can own it */ per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; - common_cpu_up(cpu, tidle); + err = common_cpu_up(cpu, tidle); + if (err) + return err; err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered); if (err) { diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 5c2d71a1dc06..2abf27d7df6b 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -12,78 +12,31 @@ #include <asm/stacktrace.h> #include <asm/unwind.h> -static int save_stack_address(struct stack_trace *trace, unsigned long addr, - bool nosched) -{ - if (nosched && in_sched_functions(addr)) - return 0; - - if (trace->skip > 0) { - trace->skip--; - return 0; - } - - if (trace->nr_entries >= trace->max_entries) - return -1; - - trace->entries[trace->nr_entries++] = addr; - return 0; -} - -static void noinline __save_stack_trace(struct stack_trace *trace, - struct task_struct *task, struct pt_regs *regs, - bool nosched) +void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, + struct task_struct *task, struct pt_regs *regs) { struct unwind_state state; unsigned long addr; - if (regs) - save_stack_address(trace, regs->ip, nosched); + if (regs && !consume_entry(cookie, regs->ip, false)) + return; for (unwind_start(&state, task, regs, NULL); !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); - if (!addr || save_stack_address(trace, addr, nosched)) + if (!addr || !consume_entry(cookie, addr, false)) break; } - - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } /* - * Save stack-backtrace addresses into a stack_trace buffer. + * This function returns an error if it detects any unreliable features of the + * stack. Otherwise it guarantees that the stack trace is reliable. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. */ -void save_stack_trace(struct stack_trace *trace) -{ - trace->skip++; - __save_stack_trace(trace, current, NULL, false); -} -EXPORT_SYMBOL_GPL(save_stack_trace); - -void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) -{ - __save_stack_trace(trace, current, regs, false); -} - -void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) -{ - if (!try_get_task_stack(tsk)) - return; - - if (tsk == current) - trace->skip++; - __save_stack_trace(trace, tsk, NULL, true); - - put_task_stack(tsk); -} -EXPORT_SYMBOL_GPL(save_stack_trace_tsk); - -#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE - -static int __always_inline -__save_stack_trace_reliable(struct stack_trace *trace, - struct task_struct *task) +int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, + void *cookie, struct task_struct *task) { struct unwind_state state; struct pt_regs *regs; @@ -97,7 +50,7 @@ __save_stack_trace_reliable(struct stack_trace *trace, if (regs) { /* Success path for user tasks */ if (user_mode(regs)) - goto success; + return 0; /* * Kernel mode registers on the stack indicate an @@ -120,7 +73,7 @@ __save_stack_trace_reliable(struct stack_trace *trace, if (!addr) return -EINVAL; - if (save_stack_address(trace, addr, false)) + if (!consume_entry(cookie, addr, false)) return -EINVAL; } @@ -132,39 +85,9 @@ __save_stack_trace_reliable(struct stack_trace *trace, if (!(task->flags & (PF_KTHREAD | PF_IDLE))) return -EINVAL; -success: - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; - return 0; } -/* - * This function returns an error if it detects any unreliable features of the - * stack. Otherwise it guarantees that the stack trace is reliable. - * - * If the task is not 'current', the caller *must* ensure the task is inactive. - */ -int save_stack_trace_tsk_reliable(struct task_struct *tsk, - struct stack_trace *trace) -{ - int ret; - - /* - * If the task doesn't have a stack (e.g., a zombie), the stack is - * "reliably" empty. - */ - if (!try_get_task_stack(tsk)) - return 0; - - ret = __save_stack_trace_reliable(trace, tsk); - - put_task_stack(tsk); - - return ret; -} -#endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */ - /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ struct stack_frame_user { @@ -189,15 +112,15 @@ copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) return ret; } -static inline void __save_stack_trace_user(struct stack_trace *trace) +void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, + const struct pt_regs *regs) { - const struct pt_regs *regs = task_pt_regs(current); const void __user *fp = (const void __user *)regs->bp; - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = regs->ip; + if (!consume_entry(cookie, regs->ip, false)) + return; - while (trace->nr_entries < trace->max_entries) { + while (1) { struct stack_frame_user frame; frame.next_fp = NULL; @@ -207,8 +130,8 @@ static inline void __save_stack_trace_user(struct stack_trace *trace) if ((unsigned long)fp < regs->sp) break; if (frame.ret_addr) { - trace->entries[trace->nr_entries++] = - frame.ret_addr; + if (!consume_entry(cookie, frame.ret_addr, false)) + return; } if (fp == frame.next_fp) break; @@ -216,14 +139,3 @@ static inline void __save_stack_trace_user(struct stack_trace *trace) } } -void save_stack_trace_user(struct stack_trace *trace) -{ - /* - * Trace user stack if we are not a kernel thread - */ - if (current->mm) { - __save_stack_trace_user(trace); - } - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; -} diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 738bf42b0218..be5bc2e47c71 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -71,7 +71,7 @@ int _debug_hotplug_cpu(int cpu, int action) case 0: ret = cpu_down(cpu); if (!ret) { - pr_info("CPU %u is now offline\n", cpu); + pr_info("DEBUG_HOTPLUG_CPU0: CPU %u is now offline\n", cpu); dev->offline = true; kobject_uevent(&dev->kobj, KOBJ_OFFLINE); } else diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index cc6df5c6d7b3..15b5e98a86f9 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -282,6 +282,7 @@ int __init notsc_setup(char *str) __setup("notsc", notsc_setup); static int no_sched_irq_time; +static int no_tsc_watchdog; static int __init tsc_setup(char *str) { @@ -291,6 +292,8 @@ static int __init tsc_setup(char *str) no_sched_irq_time = 1; if (!strcmp(str, "unstable")) mark_tsc_unstable("boot parameter"); + if (!strcmp(str, "nowatchdog")) + no_tsc_watchdog = 1; return 1; } @@ -1348,7 +1351,7 @@ static int __init init_tsc_clocksource(void) if (tsc_unstable) goto unreg; - if (tsc_clocksource_reliable) + if (tsc_clocksource_reliable || no_tsc_watchdog) clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index a092b6b40c6b..6a38717d179c 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -369,7 +369,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) preempt_disable(); tsk->thread.sp0 += 16; - if (static_cpu_has(X86_FEATURE_SEP)) { + if (boot_cpu_has(X86_FEATURE_SEP)) { tsk->thread.sysenter_cs = 0; refresh_sysenter_cs(&tsk->thread); } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index a5127b2c195f..0850b5149345 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -141,11 +141,11 @@ SECTIONS *(.text.__x86.indirect_thunk) __indirect_thunk_end = .; #endif - - /* End of text section */ - _etext = .; } :text = 0x9090 + /* End of text section */ + _etext = .; + NOTES :text :note EXCEPTION_TABLE(16) :text = 0x9090 @@ -403,7 +403,8 @@ SECTIONS */ #define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load INIT_PER_CPU(gdt_page); -INIT_PER_CPU(irq_stack_union); +INIT_PER_CPU(fixed_percpu_data); +INIT_PER_CPU(irq_stack_backing_store); /* * Build-time check on the image size: @@ -412,8 +413,8 @@ INIT_PER_CPU(irq_stack_union); "kernel image bigger than KERNEL_IMAGE_SIZE"); #ifdef CONFIG_SMP -. = ASSERT((irq_stack_union == 0), - "irq_stack_union is not at start of per-cpu area"); +. = ASSERT((fixed_percpu_data == 0), + "fixed_percpu_data is not at start of per-cpu area"); #endif #endif /* CONFIG_X86_32 */ |