diff options
Diffstat (limited to 'arch/x86/kernel')
50 files changed, 1454 insertions, 1109 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index e77261db2391..de09af019e23 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -68,6 +68,7 @@ obj-y += tsc.o tsc_msr.o io_delay.o rtc.o obj-y += pci-iommu_table.o obj-y += resource.o obj-y += irqflags.o +obj-y += static_call.o obj-y += process.o obj-y += fpu/ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index cdaab30880b9..4adbe65afe23 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1103,6 +1103,10 @@ noinstr int poke_int3_handler(struct pt_regs *regs) */ goto out_put; + case RET_INSN_OPCODE: + int3_emulate_ret(regs); + break; + case CALL_INSN_OPCODE: int3_emulate_call(regs, (long)ip + tp->rel32); break; @@ -1277,6 +1281,7 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, switch (tp->opcode) { case INT3_INSN_OPCODE: + case RET_INSN_OPCODE: break; case CALL_INSN_OPCODE: diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 5f943b938167..b3eef1d5c903 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1429,6 +1429,9 @@ void __init apic_intr_mode_init(void) break; } + if (x86_platform.apic_post_init) + x86_platform.apic_post_init(); + apic_bsp_setup(upmode); } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 21f9c7f11779..7b3c7e0d4a09 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -860,10 +860,10 @@ void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, { init_irq_alloc_info(info, NULL); info->type = X86_IRQ_ALLOC_TYPE_IOAPIC; - info->ioapic_node = node; - info->ioapic_trigger = trigger; - info->ioapic_polarity = polarity; - info->ioapic_valid = 1; + info->ioapic.node = node; + info->ioapic.trigger = trigger; + info->ioapic.polarity = polarity; + info->ioapic.valid = 1; } #ifndef CONFIG_ACPI @@ -878,32 +878,32 @@ static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst, copy_irq_alloc_info(dst, src); dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC; - dst->ioapic_id = mpc_ioapic_id(ioapic_idx); - dst->ioapic_pin = pin; - dst->ioapic_valid = 1; - if (src && src->ioapic_valid) { - dst->ioapic_node = src->ioapic_node; - dst->ioapic_trigger = src->ioapic_trigger; - dst->ioapic_polarity = src->ioapic_polarity; + dst->devid = mpc_ioapic_id(ioapic_idx); + dst->ioapic.pin = pin; + dst->ioapic.valid = 1; + if (src && src->ioapic.valid) { + dst->ioapic.node = src->ioapic.node; + dst->ioapic.trigger = src->ioapic.trigger; + dst->ioapic.polarity = src->ioapic.polarity; } else { - dst->ioapic_node = NUMA_NO_NODE; + dst->ioapic.node = NUMA_NO_NODE; if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) { - dst->ioapic_trigger = trigger; - dst->ioapic_polarity = polarity; + dst->ioapic.trigger = trigger; + dst->ioapic.polarity = polarity; } else { /* * PCI interrupts are always active low level * triggered. */ - dst->ioapic_trigger = IOAPIC_LEVEL; - dst->ioapic_polarity = IOAPIC_POL_LOW; + dst->ioapic.trigger = IOAPIC_LEVEL; + dst->ioapic.polarity = IOAPIC_POL_LOW; } } } static int ioapic_alloc_attr_node(struct irq_alloc_info *info) { - return (info && info->ioapic_valid) ? info->ioapic_node : NUMA_NO_NODE; + return (info && info->ioapic.valid) ? info->ioapic.node : NUMA_NO_NODE; } static void mp_register_handler(unsigned int irq, unsigned long trigger) @@ -933,14 +933,14 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) * pin with real trigger and polarity attributes. */ if (irq < nr_legacy_irqs() && data->count == 1) { - if (info->ioapic_trigger != data->trigger) - mp_register_handler(irq, info->ioapic_trigger); - data->entry.trigger = data->trigger = info->ioapic_trigger; - data->entry.polarity = data->polarity = info->ioapic_polarity; + if (info->ioapic.trigger != data->trigger) + mp_register_handler(irq, info->ioapic.trigger); + data->entry.trigger = data->trigger = info->ioapic.trigger; + data->entry.polarity = data->polarity = info->ioapic.polarity; } - return data->trigger == info->ioapic_trigger && - data->polarity == info->ioapic_polarity; + return data->trigger == info->ioapic.trigger && + data->polarity == info->ioapic.polarity; } static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, @@ -1002,7 +1002,7 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, if (!mp_check_pin_attr(irq, info)) return -EBUSY; if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic, - info->ioapic_pin)) + info->ioapic.pin)) return -ENOMEM; } else { info->flags |= X86_IRQ_ALLOC_LEGACY; @@ -2092,8 +2092,8 @@ static int mp_alloc_timer_irq(int ioapic, int pin) struct irq_alloc_info info; ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0); - info.ioapic_id = mpc_ioapic_id(ioapic); - info.ioapic_pin = pin; + info.devid = mpc_ioapic_id(ioapic); + info.ioapic.pin = pin; mutex_lock(&ioapic_mutex); irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info); mutex_unlock(&ioapic_mutex); @@ -2297,9 +2297,9 @@ static int mp_irqdomain_create(int ioapic) return 0; init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_IOAPIC; - info.ioapic_id = mpc_ioapic_id(ioapic); - parent = irq_remapping_get_ir_irq_domain(&info); + info.type = X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT; + info.devid = mpc_ioapic_id(ioapic); + parent = irq_remapping_get_irq_domain(&info); if (!parent) parent = x86_vector_domain; else @@ -2933,9 +2933,9 @@ int mp_ioapic_registered(u32 gsi_base) static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, struct irq_alloc_info *info) { - if (info && info->ioapic_valid) { - data->trigger = info->ioapic_trigger; - data->polarity = info->ioapic_polarity; + if (info && info->ioapic.valid) { + data->trigger = info->ioapic.trigger; + data->polarity = info->ioapic.polarity; } else if (acpi_get_override_irq(gsi, &data->trigger, &data->polarity) < 0) { /* PCI interrupts are always active low level triggered. */ @@ -2981,7 +2981,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, return -EINVAL; ioapic = mp_irqdomain_ioapic_idx(domain); - pin = info->ioapic_pin; + pin = info->ioapic.pin; if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0) return -EEXIST; @@ -2989,7 +2989,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, if (!data) return -ENOMEM; - info->ioapic_entry = &data->entry; + info->ioapic.entry = &data->entry; ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); if (ret < 0) { kfree(data); @@ -2997,7 +2997,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, } INIT_LIST_HEAD(&data->irq_2_pin); - irq_data->hwirq = info->ioapic_pin; + irq_data->hwirq = info->ioapic.pin; irq_data->chip = (domain->parent == x86_vector_domain) ? &ioapic_chip : &ioapic_ir_chip; irq_data->chip_data = data; @@ -3007,8 +3007,8 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin); local_irq_save(flags); - if (info->ioapic_entry) - mp_setup_entry(cfg, data, info->ioapic_entry); + if (info->ioapic.entry) + mp_setup_entry(cfg, data, info->ioapic.entry); mp_register_handler(virq, data->trigger); if (virq < nr_legacy_irqs()) legacy_pic->mask(virq); diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index c2b2911feeef..6313f0a05db7 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -21,7 +21,7 @@ #include <asm/apic.h> #include <asm/irq_remapping.h> -static struct irq_domain *msi_default_domain; +struct irq_domain *x86_pci_msi_default_domain __ro_after_init; static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg) { @@ -45,7 +45,7 @@ static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg) MSI_DATA_VECTOR(cfg->vector); } -static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +void x86_vector_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) { __irq_msi_compose_msg(irqd_cfg(data), msg); } @@ -177,40 +177,10 @@ static struct irq_chip pci_msi_controller = { .irq_mask = pci_msi_mask_irq, .irq_ack = irq_chip_ack_parent, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_compose_msi_msg = irq_msi_compose_msg, .irq_set_affinity = msi_set_affinity, .flags = IRQCHIP_SKIP_SET_WAKE, }; -int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) -{ - struct irq_domain *domain; - struct irq_alloc_info info; - - init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_MSI; - info.msi_dev = dev; - - domain = irq_remapping_get_irq_domain(&info); - if (domain == NULL) - domain = msi_default_domain; - if (domain == NULL) - return -ENOSYS; - - return msi_domain_alloc_irqs(domain, &dev->dev, nvec); -} - -void native_teardown_msi_irq(unsigned int irq) -{ - irq_domain_free_irqs(irq, 1); -} - -static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info, - msi_alloc_info_t *arg) -{ - return arg->msi_hwirq; -} - int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *arg) { @@ -218,11 +188,10 @@ int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, struct msi_desc *desc = first_pci_msi_entry(pdev); init_irq_alloc_info(arg, NULL); - arg->msi_dev = pdev; if (desc->msi_attrib.is_msix) { - arg->type = X86_IRQ_ALLOC_TYPE_MSIX; + arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX; } else { - arg->type = X86_IRQ_ALLOC_TYPE_MSI; + arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI; arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; } @@ -230,16 +199,8 @@ int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, } EXPORT_SYMBOL_GPL(pci_msi_prepare); -void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) -{ - arg->msi_hwirq = pci_msi_domain_calc_hwirq(arg->msi_dev, desc); -} -EXPORT_SYMBOL_GPL(pci_msi_set_desc); - static struct msi_domain_ops pci_msi_domain_ops = { - .get_hwirq = pci_msi_get_hwirq, .msi_prepare = pci_msi_prepare, - .set_desc = pci_msi_set_desc, }; static struct msi_domain_info pci_msi_domain_info = { @@ -251,25 +212,32 @@ static struct msi_domain_info pci_msi_domain_info = { .handler_name = "edge", }; -void __init arch_init_msi_domain(struct irq_domain *parent) +struct irq_domain * __init native_create_pci_msi_domain(void) { struct fwnode_handle *fn; + struct irq_domain *d; if (disable_apic) - return; + return NULL; fn = irq_domain_alloc_named_fwnode("PCI-MSI"); - if (fn) { - msi_default_domain = - pci_msi_create_irq_domain(fn, &pci_msi_domain_info, - parent); - } - if (!msi_default_domain) { + if (!fn) + return NULL; + + d = pci_msi_create_irq_domain(fn, &pci_msi_domain_info, + x86_vector_domain); + if (!d) { irq_domain_free_fwnode(fn); - pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n"); + pr_warn("Failed to initialize PCI-MSI irqdomain.\n"); } else { - msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK; + d->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK; } + return d; +} + +void __init x86_create_pci_msi_domain(void) +{ + x86_pci_msi_default_domain = x86_init.irqs.create_pci_msi_domain(); } #ifdef CONFIG_IRQ_REMAP @@ -279,7 +247,6 @@ static struct irq_chip pci_msi_ir_controller = { .irq_mask = pci_msi_mask_irq, .irq_ack = irq_chip_ack_parent, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_set_vcpu_affinity = irq_chip_set_vcpu_affinity_parent, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -321,35 +288,28 @@ static struct irq_chip dmar_msi_controller = { .irq_ack = irq_chip_ack_parent, .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_compose_msi_msg = irq_msi_compose_msg, .irq_write_msi_msg = dmar_msi_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; -static irq_hw_number_t dmar_msi_get_hwirq(struct msi_domain_info *info, - msi_alloc_info_t *arg) -{ - return arg->dmar_id; -} - static int dmar_msi_init(struct irq_domain *domain, struct msi_domain_info *info, unsigned int virq, irq_hw_number_t hwirq, msi_alloc_info_t *arg) { - irq_domain_set_info(domain, virq, arg->dmar_id, info->chip, NULL, - handle_edge_irq, arg->dmar_data, "edge"); + irq_domain_set_info(domain, virq, arg->devid, info->chip, NULL, + handle_edge_irq, arg->data, "edge"); return 0; } static struct msi_domain_ops dmar_msi_domain_ops = { - .get_hwirq = dmar_msi_get_hwirq, .msi_init = dmar_msi_init, }; static struct msi_domain_info dmar_msi_domain_info = { .ops = &dmar_msi_domain_ops, .chip = &dmar_msi_controller, + .flags = MSI_FLAG_USE_DEF_DOM_OPS, }; static struct irq_domain *dmar_get_irq_domain(void) @@ -384,8 +344,9 @@ int dmar_alloc_hwirq(int id, int node, void *arg) init_irq_alloc_info(&info, NULL); info.type = X86_IRQ_ALLOC_TYPE_DMAR; - info.dmar_id = id; - info.dmar_data = arg; + info.devid = id; + info.hwirq = id; + info.data = arg; return irq_domain_alloc_irqs(domain, 1, node, &info); } @@ -419,24 +380,17 @@ static struct irq_chip hpet_msi_controller __ro_after_init = { .irq_ack = irq_chip_ack_parent, .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_compose_msi_msg = irq_msi_compose_msg, .irq_write_msi_msg = hpet_msi_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; -static irq_hw_number_t hpet_msi_get_hwirq(struct msi_domain_info *info, - msi_alloc_info_t *arg) -{ - return arg->hpet_index; -} - static int hpet_msi_init(struct irq_domain *domain, struct msi_domain_info *info, unsigned int virq, irq_hw_number_t hwirq, msi_alloc_info_t *arg) { irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); - irq_domain_set_info(domain, virq, arg->hpet_index, info->chip, NULL, - handle_edge_irq, arg->hpet_data, "edge"); + irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL, + handle_edge_irq, arg->data, "edge"); return 0; } @@ -448,7 +402,6 @@ static void hpet_msi_free(struct irq_domain *domain, } static struct msi_domain_ops hpet_msi_domain_ops = { - .get_hwirq = hpet_msi_get_hwirq, .msi_init = hpet_msi_init, .msi_free = hpet_msi_free, }; @@ -456,6 +409,7 @@ static struct msi_domain_ops hpet_msi_domain_ops = { static struct msi_domain_info hpet_msi_domain_info = { .ops = &hpet_msi_domain_ops, .chip = &hpet_msi_controller, + .flags = MSI_FLAG_USE_DEF_DOM_OPS, }; struct irq_domain *hpet_create_irq_domain(int hpet_id) @@ -476,9 +430,9 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id) domain_info->data = (void *)(long)hpet_id; init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_HPET; - info.hpet_id = hpet_id; - parent = irq_remapping_get_ir_irq_domain(&info); + info.type = X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT; + info.devid = hpet_id; + parent = irq_remapping_get_irq_domain(&info); if (parent == NULL) parent = x86_vector_domain; else @@ -506,9 +460,9 @@ int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc, init_irq_alloc_info(&info, NULL); info.type = X86_IRQ_ALLOC_TYPE_HPET; - info.hpet_data = hc; - info.hpet_id = hpet_dev_id(domain); - info.hpet_index = dev_num; + info.data = hc; + info.devid = hpet_dev_id(domain); + info.hwirq = dev_num; return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); } diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 99ee61c9ba54..67b6f7c049ec 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -170,9 +170,6 @@ void __init default_setup_apic_routing(void) if (apic->setup_apic_routing) apic->setup_apic_routing(); - - if (x86_platform.apic_post_init) - x86_platform.apic_post_init(); } void __init generic_apic_probe(void) diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index bd3835d6b535..c46720f185c0 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -32,9 +32,6 @@ void __init default_setup_apic_routing(void) break; } } - - if (x86_platform.apic_post_init) - x86_platform.apic_post_init(); } int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index f8a56b5dc29f..1eac53632786 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -714,8 +714,6 @@ int __init arch_early_irq_init(void) BUG_ON(x86_vector_domain == NULL); irq_set_default_host(x86_vector_domain); - arch_init_msi_domain(x86_vector_domain); - BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL)); /* @@ -824,6 +822,7 @@ static struct irq_chip lapic_controller = { .name = "APIC", .irq_ack = apic_ack_edge, .irq_set_affinity = apic_set_affinity, + .irq_compose_msi_msg = x86_vector_msi_compose_msg, .irq_retrigger = apic_retrigger_irq, }; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 0b6eea3f54e6..714233cee0b5 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -5,6 +5,7 @@ * * SGI UV APIC functions (note: not an Intel compatible APIC) * + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved. */ #include <linux/crash_dump.h> @@ -29,19 +30,24 @@ static int uv_hubbed_system; static int uv_hubless_system; static u64 gru_start_paddr, gru_end_paddr; static union uvh_apicid uvh_apicid; +static int uv_node_id; -/* Unpack OEM/TABLE ID's to be NULL terminated strings */ +/* Unpack AT/OEM/TABLE ID's to be NULL terminated strings */ +static u8 uv_archtype[UV_AT_SIZE]; static u8 oem_id[ACPI_OEM_ID_SIZE + 1]; static u8 oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1]; -/* Information derived from CPUID: */ +/* Information derived from CPUID and some UV MMRs */ static struct { unsigned int apicid_shift; unsigned int apicid_mask; unsigned int socketid_shift; /* aka pnode_shift for UV2/3 */ unsigned int pnode_mask; + unsigned int nasid_shift; unsigned int gpa_shift; unsigned int gnode_shift; + unsigned int m_skt; + unsigned int n_skt; } uv_cpuid; static int uv_min_hub_revision_id; @@ -77,6 +83,9 @@ static unsigned long __init uv_early_read_mmr(unsigned long addr) static inline bool is_GRU_range(u64 start, u64 end) { + if (!gru_start_paddr) + return false; + return start >= gru_start_paddr && end <= gru_end_paddr; } @@ -85,43 +94,102 @@ static bool uv_is_untracked_pat_range(u64 start, u64 end) return is_ISA_range(start, end) || is_GRU_range(start, end); } -static int __init early_get_pnodeid(void) +static void __init early_get_pnodeid(void) { - union uvh_node_id_u node_id; - union uvh_rh_gam_config_mmr_u m_n_config; int pnode; - /* Currently, all blades have same revision number */ + uv_cpuid.m_skt = 0; + if (UVH_RH10_GAM_ADDR_MAP_CONFIG) { + union uvh_rh10_gam_addr_map_config_u m_n_config; + + m_n_config.v = uv_early_read_mmr(UVH_RH10_GAM_ADDR_MAP_CONFIG); + uv_cpuid.n_skt = m_n_config.s.n_skt; + uv_cpuid.nasid_shift = 0; + } else if (UVH_RH_GAM_ADDR_MAP_CONFIG) { + union uvh_rh_gam_addr_map_config_u m_n_config; + + m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_ADDR_MAP_CONFIG); + uv_cpuid.n_skt = m_n_config.s.n_skt; + if (is_uv(UV3)) + uv_cpuid.m_skt = m_n_config.s3.m_skt; + if (is_uv(UV2)) + uv_cpuid.m_skt = m_n_config.s2.m_skt; + uv_cpuid.nasid_shift = 1; + } else { + unsigned long GAM_ADDR_MAP_CONFIG = 0; + + WARN(GAM_ADDR_MAP_CONFIG == 0, + "UV: WARN: GAM_ADDR_MAP_CONFIG is not available\n"); + uv_cpuid.n_skt = 0; + uv_cpuid.nasid_shift = 0; + } + + if (is_uv(UV4|UVY)) + uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */ + + uv_cpuid.pnode_mask = (1 << uv_cpuid.n_skt) - 1; + pnode = (uv_node_id >> uv_cpuid.nasid_shift) & uv_cpuid.pnode_mask; + uv_cpuid.gpa_shift = 46; /* Default unless changed */ + + pr_info("UV: n_skt:%d pnmsk:%x pn:%x\n", + uv_cpuid.n_skt, uv_cpuid.pnode_mask, pnode); +} + +/* Running on a UV Hubbed system, determine which UV Hub Type it is */ +static int __init early_set_hub_type(void) +{ + union uvh_node_id_u node_id; + + /* + * The NODE_ID MMR is always at offset 0. + * Contains the chip part # + revision. + * Node_id field started with 15 bits, + * ... now 7 but upper 8 are masked to 0. + * All blades/nodes have the same part # and hub revision. + */ node_id.v = uv_early_read_mmr(UVH_NODE_ID); - m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR); - uv_min_hub_revision_id = node_id.s.revision; + uv_node_id = node_id.sx.node_id; switch (node_id.s.part_number) { - case UV2_HUB_PART_NUMBER: - case UV2_HUB_PART_NUMBER_X: - uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1; + + case UV5_HUB_PART_NUMBER: + uv_min_hub_revision_id = node_id.s.revision + + UV5_HUB_REVISION_BASE; + uv_hub_type_set(UV5); break; + + /* UV4/4A only have a revision difference */ + case UV4_HUB_PART_NUMBER: + uv_min_hub_revision_id = node_id.s.revision + + UV4_HUB_REVISION_BASE; + uv_hub_type_set(UV4); + if (uv_min_hub_revision_id == UV4A_HUB_REVISION_BASE) + uv_hub_type_set(UV4|UV4A); + break; + case UV3_HUB_PART_NUMBER: case UV3_HUB_PART_NUMBER_X: - uv_min_hub_revision_id += UV3_HUB_REVISION_BASE; + uv_min_hub_revision_id = node_id.s.revision + + UV3_HUB_REVISION_BASE; + uv_hub_type_set(UV3); break; - /* Update: UV4A has only a modified revision to indicate HUB fixes */ - case UV4_HUB_PART_NUMBER: - uv_min_hub_revision_id += UV4_HUB_REVISION_BASE - 1; - uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */ + case UV2_HUB_PART_NUMBER: + case UV2_HUB_PART_NUMBER_X: + uv_min_hub_revision_id = node_id.s.revision + + UV2_HUB_REVISION_BASE - 1; + uv_hub_type_set(UV2); break; + + default: + return 0; } - uv_hub_info->hub_revision = uv_min_hub_revision_id; - uv_cpuid.pnode_mask = (1 << m_n_config.s.n_skt) - 1; - pnode = (node_id.s.node_id >> 1) & uv_cpuid.pnode_mask; - uv_cpuid.gpa_shift = 46; /* Default unless changed */ + pr_info("UV: part#:%x rev:%d rev_id:%d UVtype:0x%x\n", + node_id.s.part_number, node_id.s.revision, + uv_min_hub_revision_id, is_uv(~0)); - pr_info("UV: rev:%d part#:%x nodeid:%04x n_skt:%d pnmsk:%x pn:%x\n", - node_id.s.revision, node_id.s.part_number, node_id.s.node_id, - m_n_config.s.n_skt, uv_cpuid.pnode_mask, pnode); - return pnode; + return 1; } static void __init uv_tsc_check_sync(void) @@ -130,38 +198,41 @@ static void __init uv_tsc_check_sync(void) int sync_state; int mmr_shift; char *state; - bool valid; - /* Accommodate different UV arch BIOSes */ + /* Different returns from different UV BIOS versions */ mmr = uv_early_read_mmr(UVH_TSC_SYNC_MMR); mmr_shift = is_uv2_hub() ? UVH_TSC_SYNC_SHIFT_UV2K : UVH_TSC_SYNC_SHIFT; sync_state = (mmr >> mmr_shift) & UVH_TSC_SYNC_MASK; + /* Check if TSC is valid for all sockets */ switch (sync_state) { case UVH_TSC_SYNC_VALID: state = "in sync"; - valid = true; + mark_tsc_async_resets("UV BIOS"); break; - case UVH_TSC_SYNC_INVALID: - state = "unstable"; - valid = false; + /* If BIOS state unknown, don't do anything */ + case UVH_TSC_SYNC_UNKNOWN: + state = "unknown"; break; + + /* Otherwise, BIOS indicates problem with TSC */ default: - state = "unknown: assuming valid"; - valid = true; + state = "unstable"; + mark_tsc_unstable("UV BIOS"); break; } pr_info("UV: TSC sync state from BIOS:0%d(%s)\n", sync_state, state); - - /* Mark flag that says TSC != 0 is valid for socket 0 */ - if (valid) - mark_tsc_async_resets("UV BIOS"); - else - mark_tsc_unstable("UV BIOS"); } +/* Selector for (4|4A|5) structs */ +#define uvxy_field(sname, field, undef) ( \ + is_uv(UV4A) ? sname.s4a.field : \ + is_uv(UV4) ? sname.s4.field : \ + is_uv(UV3) ? sname.s3.field : \ + undef) + /* [Copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */ #define SMT_LEVEL 0 /* Leaf 0xb SMT level */ @@ -221,29 +292,110 @@ static void __init uv_stringify(int len, char *to, char *from) strncpy(to, from, len-1); } -static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) +/* Find UV arch type entry in UVsystab */ +static unsigned long __init early_find_archtype(struct uv_systab *st) +{ + int i; + + for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) { + unsigned long ptr = st->entry[i].offset; + + if (!ptr) + continue; + ptr += (unsigned long)st; + if (st->entry[i].type == UV_SYSTAB_TYPE_ARCH_TYPE) + return ptr; + } + return 0; +} + +/* Validate UV arch type field in UVsystab */ +static int __init decode_arch_type(unsigned long ptr) +{ + struct uv_arch_type_entry *uv_ate = (struct uv_arch_type_entry *)ptr; + int n = strlen(uv_ate->archtype); + + if (n > 0 && n < sizeof(uv_ate->archtype)) { + pr_info("UV: UVarchtype received from BIOS\n"); + uv_stringify(UV_AT_SIZE, uv_archtype, uv_ate->archtype); + return 1; + } + return 0; +} + +/* Determine if UV arch type entry might exist in UVsystab */ +static int __init early_get_arch_type(void) { - int pnodeid; - int uv_apic; + unsigned long uvst_physaddr, uvst_size, ptr; + struct uv_systab *st; + u32 rev; + int ret; + + uvst_physaddr = get_uv_systab_phys(0); + if (!uvst_physaddr) + return 0; + + st = early_memremap_ro(uvst_physaddr, sizeof(struct uv_systab)); + if (!st) { + pr_err("UV: Cannot access UVsystab, remap failed\n"); + return 0; + } + rev = st->revision; + if (rev < UV_SYSTAB_VERSION_UV5) { + early_memunmap(st, sizeof(struct uv_systab)); + return 0; + } + + uvst_size = st->size; + early_memunmap(st, sizeof(struct uv_systab)); + st = early_memremap_ro(uvst_physaddr, uvst_size); + if (!st) { + pr_err("UV: Cannot access UVarchtype, remap failed\n"); + return 0; + } + + ptr = early_find_archtype(st); + if (!ptr) { + early_memunmap(st, uvst_size); + return 0; + } + + ret = decode_arch_type(ptr); + early_memunmap(st, uvst_size); + return ret; +} + +static int __init uv_set_system_type(char *_oem_id) +{ + /* Save OEM_ID passed from ACPI MADT */ uv_stringify(sizeof(oem_id), oem_id, _oem_id); - uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); - if (strncmp(oem_id, "SGI", 3) != 0) { - if (strncmp(oem_id, "NSGI", 4) != 0) + /* Check if BIOS sent us a UVarchtype */ + if (!early_get_arch_type()) + + /* If not use OEM ID for UVarchtype */ + uv_stringify(UV_AT_SIZE, uv_archtype, _oem_id); + + /* Check if not hubbed */ + if (strncmp(uv_archtype, "SGI", 3) != 0) { + + /* (Not hubbed), check if not hubless */ + if (strncmp(uv_archtype, "NSGI", 4) != 0) + + /* (Not hubless), not a UV */ return 0; - /* UV4 Hubless, CH, (0x11:UV4+Any) */ - if (strncmp(oem_id, "NSGI4", 5) == 0) + /* UV4 Hubless: CH */ + if (strncmp(uv_archtype, "NSGI4", 5) == 0) uv_hubless_system = 0x11; - /* UV3 Hubless, UV300/MC990X w/o hub (0x9:UV3+Any) */ + /* UV3 Hubless: UV300/MC990X w/o hub */ else uv_hubless_system = 0x9; - pr_info("UV: OEM IDs %s/%s, HUBLESS(0x%x)\n", - oem_id, oem_table_id, uv_hubless_system); - + pr_info("UV: OEM IDs %s/%s, SystemType %d, HUBLESS ID %x\n", + oem_id, oem_table_id, uv_system_type, uv_hubless_system); return 0; } @@ -252,60 +404,83 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) return 0; } - /* Set up early hub type field in uv_hub_info for Node 0 */ - uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; + /* Set hubbed type if true */ + uv_hub_info->hub_revision = + !strncmp(uv_archtype, "SGI5", 4) ? UV5_HUB_REVISION_BASE : + !strncmp(uv_archtype, "SGI4", 4) ? UV4_HUB_REVISION_BASE : + !strncmp(uv_archtype, "SGI3", 4) ? UV3_HUB_REVISION_BASE : + !strcmp(uv_archtype, "SGI2") ? UV2_HUB_REVISION_BASE : 0; + + switch (uv_hub_info->hub_revision) { + case UV5_HUB_REVISION_BASE: + uv_hubbed_system = 0x21; + uv_hub_type_set(UV5); + break; - /* - * Determine UV arch type. - * SGI2: UV2000/3000 - * SGI3: UV300 (truncated to 4 chars because of different varieties) - * SGI4: UV400 (truncated to 4 chars because of different varieties) - */ - if (!strncmp(oem_id, "SGI4", 4)) { - uv_hub_info->hub_revision = UV4_HUB_REVISION_BASE; + case UV4_HUB_REVISION_BASE: uv_hubbed_system = 0x11; + uv_hub_type_set(UV4); + break; - } else if (!strncmp(oem_id, "SGI3", 4)) { - uv_hub_info->hub_revision = UV3_HUB_REVISION_BASE; + case UV3_HUB_REVISION_BASE: uv_hubbed_system = 0x9; + uv_hub_type_set(UV3); + break; - } else if (!strcmp(oem_id, "SGI2")) { - uv_hub_info->hub_revision = UV2_HUB_REVISION_BASE; + case UV2_HUB_REVISION_BASE: uv_hubbed_system = 0x5; + uv_hub_type_set(UV2); + break; - } else { - uv_hub_info->hub_revision = 0; - goto badbios; + default: + return 0; } - pnodeid = early_get_pnodeid(); - early_get_apic_socketid_shift(); + /* Get UV hub chip part number & revision */ + early_set_hub_type(); + /* Other UV setup functions */ + early_get_pnodeid(); + early_get_apic_socketid_shift(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; x86_platform.nmi_init = uv_nmi_init; + uv_tsc_check_sync(); + + return 1; +} + +/* Called early to probe for the correct APIC driver */ +static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) +{ + /* Set up early hub info fields for Node 0 */ + uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; + + /* If not UV, return. */ + if (likely(uv_set_system_type(_oem_id) == 0)) + return 0; + + /* Save and Decode OEM Table ID */ + uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); - if (!strcmp(oem_table_id, "UVX")) { - /* This is the most common hardware variant: */ + /* This is the most common hardware variant, x2apic mode */ + if (!strcmp(oem_table_id, "UVX")) uv_system_type = UV_X2APIC; - uv_apic = 0; - } else if (!strcmp(oem_table_id, "UVL")) { - /* Only used for very small systems: */ + /* Only used for very small systems, usually 1 chassis, legacy mode */ + else if (!strcmp(oem_table_id, "UVL")) uv_system_type = UV_LEGACY_APIC; - uv_apic = 0; - } else { + else goto badbios; - } - pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n", oem_id, oem_table_id, uv_system_type, uv_min_hub_revision_id, uv_apic); - uv_tsc_check_sync(); + pr_info("UV: OEM IDs %s/%s, System/UVType %d/0x%x, HUB RevID %d\n", + oem_id, oem_table_id, uv_system_type, is_uv(UV_ANY), + uv_min_hub_revision_id); - return uv_apic; + return 0; badbios: - pr_err("UV: OEM_ID:%s OEM_TABLE_ID:%s\n", oem_id, oem_table_id); - pr_err("Current UV Type or BIOS not supported\n"); + pr_err("UV: UVarchtype:%s not supported\n", uv_archtype); BUG(); } @@ -673,12 +848,12 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { }; #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH 3 -#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT +#define DEST_SHIFT UVXH_RH_GAM_ALIAS_0_REDIRECT_CONFIG_DEST_BASE_SHFT static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) { - union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias; - union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect; + union uvh_rh_gam_alias_2_overlay_config_u alias; + union uvh_rh_gam_alias_2_redirect_config_u redirect; unsigned long m_redirect; unsigned long m_overlay; int i; @@ -686,16 +861,16 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) for (i = 0; i < UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH; i++) { switch (i) { case 0: - m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR; - m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR; + m_redirect = UVH_RH_GAM_ALIAS_0_REDIRECT_CONFIG; + m_overlay = UVH_RH_GAM_ALIAS_0_OVERLAY_CONFIG; break; case 1: - m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR; - m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR; + m_redirect = UVH_RH_GAM_ALIAS_1_REDIRECT_CONFIG; + m_overlay = UVH_RH_GAM_ALIAS_1_OVERLAY_CONFIG; break; case 2: - m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR; - m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR; + m_redirect = UVH_RH_GAM_ALIAS_2_REDIRECT_CONFIG; + m_overlay = UVH_RH_GAM_ALIAS_2_OVERLAY_CONFIG; break; } alias.v = uv_read_local_mmr(m_overlay); @@ -710,6 +885,7 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) } enum map_type {map_wb, map_uc}; +static const char * const mt[] = { "WB", "UC" }; static __init void map_high(char *id, unsigned long base, int pshift, int bshift, int max_pnode, enum map_type map_type) { @@ -721,23 +897,36 @@ static __init void map_high(char *id, unsigned long base, int pshift, int bshift pr_info("UV: Map %s_HI base address NULL\n", id); return; } - pr_debug("UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, paddr + bytes); if (map_type == map_uc) init_extra_mapping_uc(paddr, bytes); else init_extra_mapping_wb(paddr, bytes); + + pr_info("UV: Map %s_HI 0x%lx - 0x%lx %s (%d segments)\n", + id, paddr, paddr + bytes, mt[map_type], max_pnode + 1); } static __init void map_gru_high(int max_pnode) { - union uvh_rh_gam_gru_overlay_config_mmr_u gru; - int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; - unsigned long mask = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK; - unsigned long base; + union uvh_rh_gam_gru_overlay_config_u gru; + unsigned long mask, base; + int shift; + + if (UVH_RH_GAM_GRU_OVERLAY_CONFIG) { + gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG); + shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT; + mask = UVH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_MASK; + } else if (UVH_RH10_GAM_GRU_OVERLAY_CONFIG) { + gru.v = uv_read_local_mmr(UVH_RH10_GAM_GRU_OVERLAY_CONFIG); + shift = UVH_RH10_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT; + mask = UVH_RH10_GAM_GRU_OVERLAY_CONFIG_BASE_MASK; + } else { + pr_err("UV: GRU unavailable (no MMR)\n"); + return; + } - gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); if (!gru.s.enable) { - pr_info("UV: GRU disabled\n"); + pr_info("UV: GRU disabled (by BIOS)\n"); return; } @@ -749,62 +938,104 @@ static __init void map_gru_high(int max_pnode) static __init void map_mmr_high(int max_pnode) { - union uvh_rh_gam_mmr_overlay_config_mmr_u mmr; - int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT; + unsigned long base; + int shift; + bool enable; + + if (UVH_RH10_GAM_MMR_OVERLAY_CONFIG) { + union uvh_rh10_gam_mmr_overlay_config_u mmr; + + mmr.v = uv_read_local_mmr(UVH_RH10_GAM_MMR_OVERLAY_CONFIG); + enable = mmr.s.enable; + base = mmr.s.base; + shift = UVH_RH10_GAM_MMR_OVERLAY_CONFIG_BASE_SHFT; + } else if (UVH_RH_GAM_MMR_OVERLAY_CONFIG) { + union uvh_rh_gam_mmr_overlay_config_u mmr; + + mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG); + enable = mmr.s.enable; + base = mmr.s.base; + shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_BASE_SHFT; + } else { + pr_err("UV:%s:RH_GAM_MMR_OVERLAY_CONFIG MMR undefined?\n", + __func__); + return; + } - mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); - if (mmr.s.enable) - map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc); + if (enable) + map_high("MMR", base, shift, shift, max_pnode, map_uc); else pr_info("UV: MMR disabled\n"); } -/* UV3/4 have identical MMIOH overlay configs, UV4A is slightly different */ -static __init void map_mmioh_high_uv34(int index, int min_pnode, int max_pnode) -{ - unsigned long overlay; - unsigned long mmr; - unsigned long base; - unsigned long nasid_mask; - unsigned long m_overlay; - int i, n, shift, m_io, max_io; - int nasid, lnasid, fi, li; - char *id; - - if (index == 0) { - id = "MMIOH0"; - m_overlay = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR; - overlay = uv_read_local_mmr(m_overlay); - base = overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK; - mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR; - m_io = (overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK) - >> UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT; - shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT; - n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH; - nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK; - } else { - id = "MMIOH1"; - m_overlay = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR; - overlay = uv_read_local_mmr(m_overlay); - base = overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK; - mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR; - m_io = (overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK) - >> UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT; - shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT; - n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH; - nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK; +/* Arch specific ENUM cases */ +enum mmioh_arch { + UV2_MMIOH = -1, + UVY_MMIOH0, UVY_MMIOH1, + UVX_MMIOH0, UVX_MMIOH1, +}; + +/* Calculate and Map MMIOH Regions */ +static void __init calc_mmioh_map(enum mmioh_arch index, + int min_pnode, int max_pnode, + int shift, unsigned long base, int m_io, int n_io) +{ + unsigned long mmr, nasid_mask; + int nasid, min_nasid, max_nasid, lnasid, mapped; + int i, fi, li, n, max_io; + char id[8]; + + /* One (UV2) mapping */ + if (index == UV2_MMIOH) { + strncpy(id, "MMIOH", sizeof(id)); + max_io = max_pnode; + mapped = 0; + goto map_exit; } - pr_info("UV: %s overlay 0x%lx base:0x%lx m_io:%d\n", id, overlay, base, m_io); - if (!(overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK)) { - pr_info("UV: %s disabled\n", id); + + /* small and large MMIOH mappings */ + switch (index) { + case UVY_MMIOH0: + mmr = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0; + nasid_mask = UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK; + n = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH; + min_nasid = min_pnode; + max_nasid = max_pnode; + mapped = 1; + break; + case UVY_MMIOH1: + mmr = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1; + nasid_mask = UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK; + n = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH; + min_nasid = min_pnode; + max_nasid = max_pnode; + mapped = 1; + break; + case UVX_MMIOH0: + mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0; + nasid_mask = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK; + n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH; + min_nasid = min_pnode * 2; + max_nasid = max_pnode * 2; + mapped = 1; + break; + case UVX_MMIOH1: + mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1; + nasid_mask = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK; + n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH; + min_nasid = min_pnode * 2; + max_nasid = max_pnode * 2; + mapped = 1; + break; + default: + pr_err("UV:%s:Invalid mapping type:%d\n", __func__, index); return; } - /* Convert to NASID: */ - min_pnode *= 2; - max_pnode *= 2; - max_io = lnasid = fi = li = -1; + /* enum values chosen so (index mod 2) is MMIOH 0/1 (low/high) */ + snprintf(id, sizeof(id), "MMIOH%d", index%2); + max_io = lnasid = fi = li = -1; for (i = 0; i < n; i++) { unsigned long m_redirect = mmr + i * 8; unsigned long redirect = uv_read_local_mmr(m_redirect); @@ -814,9 +1045,12 @@ static __init void map_mmioh_high_uv34(int index, int min_pnode, int max_pnode) pr_info("UV: %s redirect base 0x%lx(@0x%lx) 0x%04x\n", id, redirect, m_redirect, nasid); - /* Invalid NASID: */ - if (nasid < min_pnode || max_pnode < nasid) + /* Invalid NASID check */ + if (nasid < min_nasid || max_nasid < nasid) { + pr_err("UV:%s:Invalid NASID:%x (range:%x..%x)\n", + __func__, index, min_nasid, max_nasid); nasid = -1; + } if (nasid == lnasid) { li = i; @@ -839,7 +1073,8 @@ static __init void map_mmioh_high_uv34(int index, int min_pnode, int max_pnode) } addr1 = (base << shift) + f * (1ULL << m_io); addr2 = (base << shift) + (l + 1) * (1ULL << m_io); - pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", id, fi, li, lnasid, addr1, addr2); + pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", + id, fi, li, lnasid, addr1, addr2); if (max_io < l) max_io = l; } @@ -847,49 +1082,93 @@ static __init void map_mmioh_high_uv34(int index, int min_pnode, int max_pnode) lnasid = nasid; } - pr_info("UV: %s base:0x%lx shift:%d M_IO:%d MAX_IO:%d\n", id, base, shift, m_io, max_io); +map_exit: + pr_info("UV: %s base:0x%lx shift:%d m_io:%d max_io:%d max_pnode:0x%x\n", + id, base, shift, m_io, max_io, max_pnode); - if (max_io >= 0) + if (max_io >= 0 && !mapped) map_high(id, base, shift, m_io, max_io, map_uc); } static __init void map_mmioh_high(int min_pnode, int max_pnode) { - union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; - unsigned long mmr, base; - int shift, enable, m_io, n_io; + /* UVY flavor */ + if (UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0) { + union uvh_rh10_gam_mmioh_overlay_config0_u mmioh0; + union uvh_rh10_gam_mmioh_overlay_config1_u mmioh1; + + mmioh0.v = uv_read_local_mmr(UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0); + if (unlikely(mmioh0.s.enable == 0)) + pr_info("UV: MMIOH0 disabled\n"); + else + calc_mmioh_map(UVY_MMIOH0, min_pnode, max_pnode, + UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT, + mmioh0.s.base, mmioh0.s.m_io, mmioh0.s.n_io); - if (is_uv3_hub() || is_uv4_hub()) { - /* Map both MMIOH regions: */ - map_mmioh_high_uv34(0, min_pnode, max_pnode); - map_mmioh_high_uv34(1, min_pnode, max_pnode); + mmioh1.v = uv_read_local_mmr(UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1); + if (unlikely(mmioh1.s.enable == 0)) + pr_info("UV: MMIOH1 disabled\n"); + else + calc_mmioh_map(UVY_MMIOH1, min_pnode, max_pnode, + UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT, + mmioh1.s.base, mmioh1.s.m_io, mmioh1.s.n_io); return; } + /* UVX flavor */ + if (UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0) { + union uvh_rh_gam_mmioh_overlay_config0_u mmioh0; + union uvh_rh_gam_mmioh_overlay_config1_u mmioh1; + + mmioh0.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0); + if (unlikely(mmioh0.s.enable == 0)) + pr_info("UV: MMIOH0 disabled\n"); + else { + unsigned long base = uvxy_field(mmioh0, base, 0); + int m_io = uvxy_field(mmioh0, m_io, 0); + int n_io = uvxy_field(mmioh0, n_io, 0); + + calc_mmioh_map(UVX_MMIOH0, min_pnode, max_pnode, + UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT, + base, m_io, n_io); + } - if (is_uv2_hub()) { - mmr = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; - shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; - mmioh.v = uv_read_local_mmr(mmr); - enable = !!mmioh.s2.enable; - base = mmioh.s2.base; - m_io = mmioh.s2.m_io; - n_io = mmioh.s2.n_io; - - if (enable) { - max_pnode &= (1 << n_io) - 1; - pr_info("UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n", - base, shift, m_io, n_io, max_pnode); - map_high("MMIOH", base, shift, m_io, max_pnode, map_uc); - } else { - pr_info("UV: MMIOH disabled\n"); + mmioh1.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1); + if (unlikely(mmioh1.s.enable == 0)) + pr_info("UV: MMIOH1 disabled\n"); + else { + unsigned long base = uvxy_field(mmioh1, base, 0); + int m_io = uvxy_field(mmioh1, m_io, 0); + int n_io = uvxy_field(mmioh1, n_io, 0); + + calc_mmioh_map(UVX_MMIOH1, min_pnode, max_pnode, + UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT, + base, m_io, n_io); } + return; + } + + /* UV2 flavor */ + if (UVH_RH_GAM_MMIOH_OVERLAY_CONFIG) { + union uvh_rh_gam_mmioh_overlay_config_u mmioh; + + mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG); + if (unlikely(mmioh.s2.enable == 0)) + pr_info("UV: MMIOH disabled\n"); + else + calc_mmioh_map(UV2_MMIOH, min_pnode, max_pnode, + UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_BASE_SHFT, + mmioh.s2.base, mmioh.s2.m_io, mmioh.s2.n_io); + return; } } static __init void map_low_mmrs(void) { - init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); - init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); + if (UV_GLOBAL_MMR32_BASE) + init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); + + if (UV_LOCAL_MMR_BASE) + init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); } static __init void uv_rtc_init(void) @@ -909,85 +1188,6 @@ static __init void uv_rtc_init(void) } } -/* - * percpu heartbeat timer - */ -static void uv_heartbeat(struct timer_list *timer) -{ - unsigned char bits = uv_scir_info->state; - - /* Flip heartbeat bit: */ - bits ^= SCIR_CPU_HEARTBEAT; - - /* Is this CPU idle? */ - if (idle_cpu(raw_smp_processor_id())) - bits &= ~SCIR_CPU_ACTIVITY; - else - bits |= SCIR_CPU_ACTIVITY; - - /* Update system controller interface reg: */ - uv_set_scir_bits(bits); - - /* Enable next timer period: */ - mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); -} - -static int uv_heartbeat_enable(unsigned int cpu) -{ - while (!uv_cpu_scir_info(cpu)->enabled) { - struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer; - - uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); - timer_setup(timer, uv_heartbeat, TIMER_PINNED); - timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; - add_timer_on(timer, cpu); - uv_cpu_scir_info(cpu)->enabled = 1; - - /* Also ensure that boot CPU is enabled: */ - cpu = 0; - } - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU -static int uv_heartbeat_disable(unsigned int cpu) -{ - if (uv_cpu_scir_info(cpu)->enabled) { - uv_cpu_scir_info(cpu)->enabled = 0; - del_timer(&uv_cpu_scir_info(cpu)->timer); - } - uv_set_cpu_scir_bits(cpu, 0xff); - return 0; -} - -static __init void uv_scir_register_cpu_notifier(void) -{ - cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/x2apic-uvx:online", - uv_heartbeat_enable, uv_heartbeat_disable); -} - -#else /* !CONFIG_HOTPLUG_CPU */ - -static __init void uv_scir_register_cpu_notifier(void) -{ -} - -static __init int uv_init_heartbeat(void) -{ - int cpu; - - if (is_uv_system()) { - for_each_online_cpu(cpu) - uv_heartbeat_enable(cpu); - } - - return 0; -} - -late_initcall(uv_init_heartbeat); - -#endif /* !CONFIG_HOTPLUG_CPU */ - /* Direct Legacy VGA I/O traffic to designated IOH */ static int uv_set_vga_state(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags) { @@ -1027,26 +1227,22 @@ struct mn { unsigned char n_lshift; }; +/* Initialize caller's MN struct and fill in values */ static void get_mn(struct mn *mnp) { - union uvh_rh_gam_config_mmr_u m_n_config; - union uv3h_gr0_gam_gr_config_u m_gr_config; - - /* Make sure the whole structure is well initialized: */ memset(mnp, 0, sizeof(*mnp)); - - m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR); - mnp->n_val = m_n_config.s.n_skt; - - if (is_uv4_hub()) { + mnp->n_val = uv_cpuid.n_skt; + if (is_uv(UV4|UVY)) { mnp->m_val = 0; mnp->n_lshift = 0; } else if (is_uv3_hub()) { - mnp->m_val = m_n_config.s3.m_skt; - m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG); + union uvyh_gr0_gam_gr_config_u m_gr_config; + + mnp->m_val = uv_cpuid.m_skt; + m_gr_config.v = uv_read_local_mmr(UVH_GR0_GAM_GR_CONFIG); mnp->n_lshift = m_gr_config.s3.m_skt; } else if (is_uv2_hub()) { - mnp->m_val = m_n_config.s2.m_skt; + mnp->m_val = uv_cpuid.m_skt; mnp->n_lshift = mnp->m_val == 40 ? 40 : 39; } mnp->m_shift = mnp->m_val ? 64 - mnp->m_val : 0; @@ -1054,7 +1250,6 @@ static void get_mn(struct mn *mnp) static void __init uv_init_hub_info(struct uv_hub_info_s *hi) { - union uvh_node_id_u node_id; struct mn mn; get_mn(&mn); @@ -1067,7 +1262,9 @@ static void __init uv_init_hub_info(struct uv_hub_info_s *hi) hi->m_shift = mn.m_shift; hi->n_lshift = mn.n_lshift ? mn.n_lshift : 0; hi->hub_revision = uv_hub_info->hub_revision; + hi->hub_type = uv_hub_info->hub_type; hi->pnode_mask = uv_cpuid.pnode_mask; + hi->nasid_shift = uv_cpuid.nasid_shift; hi->min_pnode = _min_pnode; hi->min_socket = _min_socket; hi->pnode_to_socket = _pnode_to_socket; @@ -1076,9 +1273,8 @@ static void __init uv_init_hub_info(struct uv_hub_info_s *hi) hi->gr_table_len = _gr_table_len; hi->gr_table = _gr_table; - node_id.v = uv_read_local_mmr(UVH_NODE_ID); uv_cpuid.gnode_shift = max_t(unsigned int, uv_cpuid.gnode_shift, mn.n_val); - hi->gnode_extra = (node_id.s.node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1; + hi->gnode_extra = (uv_node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1; if (mn.m_val) hi->gnode_upper = (u64)hi->gnode_extra << mn.m_val; @@ -1090,7 +1286,9 @@ static void __init uv_init_hub_info(struct uv_hub_info_s *hi) hi->gpa_shift = uv_gp_table->gpa_shift; hi->gpa_mask = (1UL << hi->gpa_shift) - 1; } else { - hi->global_mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; + hi->global_mmr_base = + uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG) & + ~UV_MMR_ENABLE; hi->global_mmr_shift = _UV_GLOBAL_MMR64_PNODE_SHIFT; } @@ -1101,7 +1299,11 @@ static void __init uv_init_hub_info(struct uv_hub_info_s *hi) /* Show system specific info: */ pr_info("UV: N:%d M:%d m_shift:%d n_lshift:%d\n", hi->n_val, hi->m_val, hi->m_shift, hi->n_lshift); pr_info("UV: gpa_mask/shift:0x%lx/%d pnode_mask:0x%x apic_pns:%d\n", hi->gpa_mask, hi->gpa_shift, hi->pnode_mask, hi->apic_pnode_shift); - pr_info("UV: mmr_base/shift:0x%lx/%ld gru_base/shift:0x%lx/%ld\n", hi->global_mmr_base, hi->global_mmr_shift, hi->global_gru_base, hi->global_gru_shift); + pr_info("UV: mmr_base/shift:0x%lx/%ld\n", hi->global_mmr_base, hi->global_mmr_shift); + if (hi->global_gru_base) + pr_info("UV: gru_base/shift:0x%lx/%ld\n", + hi->global_gru_base, hi->global_gru_shift); + pr_info("UV: gnode_upper:0x%lx gnode_extra:0x%x\n", hi->gnode_upper, hi->gnode_extra); } @@ -1173,21 +1375,25 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n", index, _min_socket, _max_socket, _min_pnode, _max_pnode); } +/* Walk through UVsystab decoding the fields */ static int __init decode_uv_systab(void) { struct uv_systab *st; int i; - /* If system is uv3 or lower, there is no extended UVsystab */ - if (is_uv_hubbed(0xfffffe) < uv(4) && is_uv_hubless(0xfffffe) < uv(4)) - return 0; /* No extended UVsystab required */ - + /* Get mapped UVsystab pointer */ st = uv_systab; + + /* If UVsystab is version 1, there is no extended UVsystab */ + if (st && st->revision == UV_SYSTAB_VERSION_1) + return 0; + if ((!st) || (st->revision < UV_SYSTAB_VERSION_UV4_LATEST)) { int rev = st ? st->revision : 0; - pr_err("UV: BIOS UVsystab version(%x) mismatch, expecting(%x)\n", rev, UV_SYSTAB_VERSION_UV4_LATEST); - pr_err("UV: Cannot support UV operations, switching to generic PC\n"); + pr_err("UV: BIOS UVsystab mismatch, (%x < %x)\n", + rev, UV_SYSTAB_VERSION_UV4_LATEST); + pr_err("UV: Does not support UV, switch to non-UV x86_64\n"); uv_system_type = UV_NONE; return -EINVAL; @@ -1199,7 +1405,8 @@ static int __init decode_uv_systab(void) if (!ptr) continue; - ptr = ptr + (unsigned long)st; + /* point to payload */ + ptr += (unsigned long)st; switch (st->entry[i].type) { case UV_SYSTAB_TYPE_GAM_PARAMS: @@ -1209,32 +1416,49 @@ static int __init decode_uv_systab(void) case UV_SYSTAB_TYPE_GAM_RNG_TBL: decode_gam_rng_tbl(ptr); break; + + case UV_SYSTAB_TYPE_ARCH_TYPE: + /* already processed in early startup */ + break; + + default: + pr_err("UV:%s:Unrecognized UV_SYSTAB_TYPE:%d, skipped\n", + __func__, st->entry[i].type); + break; } } return 0; } -/* - * Set up physical blade translations from UVH_NODE_PRESENT_TABLE - * .. NB: UVH_NODE_PRESENT_TABLE is going away, - * .. being replaced by GAM Range Table - */ +/* Set up physical blade translations from UVH_NODE_PRESENT_TABLE */ static __init void boot_init_possible_blades(struct uv_hub_info_s *hub_info) { + unsigned long np; int i, uv_pb = 0; - pr_info("UV: NODE_PRESENT_DEPTH = %d\n", UVH_NODE_PRESENT_TABLE_DEPTH); - for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) { - unsigned long np; - - np = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8); - if (np) + if (UVH_NODE_PRESENT_TABLE) { + pr_info("UV: NODE_PRESENT_DEPTH = %d\n", + UVH_NODE_PRESENT_TABLE_DEPTH); + for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) { + np = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8); pr_info("UV: NODE_PRESENT(%d) = 0x%016lx\n", i, np); - + uv_pb += hweight64(np); + } + } + if (UVH_NODE_PRESENT_0) { + np = uv_read_local_mmr(UVH_NODE_PRESENT_0); + pr_info("UV: NODE_PRESENT_0 = 0x%016lx\n", np); + uv_pb += hweight64(np); + } + if (UVH_NODE_PRESENT_1) { + np = uv_read_local_mmr(UVH_NODE_PRESENT_1); + pr_info("UV: NODE_PRESENT_1 = 0x%016lx\n", np); uv_pb += hweight64(np); } if (uv_possible_blades != uv_pb) uv_possible_blades = uv_pb; + + pr_info("UV: number nodes/possible blades %d\n", uv_pb); } static void __init build_socket_tables(void) @@ -1253,7 +1477,7 @@ static void __init build_socket_tables(void) pr_info("UV: No UVsystab socket table, ignoring\n"); return; } - pr_crit("UV: Error: UVsystab address translations not available!\n"); + pr_err("UV: Error: UVsystab address translations not available!\n"); BUG(); } @@ -1379,9 +1603,9 @@ static int __maybe_unused proc_hubless_show(struct seq_file *file, void *data) return 0; } -static int __maybe_unused proc_oemid_show(struct seq_file *file, void *data) +static int __maybe_unused proc_archtype_show(struct seq_file *file, void *data) { - seq_printf(file, "%s/%s\n", oem_id, oem_table_id); + seq_printf(file, "%s/%s\n", uv_archtype, oem_table_id); return 0; } @@ -1390,7 +1614,7 @@ static __init void uv_setup_proc_files(int hubless) struct proc_dir_entry *pde; pde = proc_mkdir(UV_PROC_NODE, NULL); - proc_create_single("oemid", 0, pde, proc_oemid_show); + proc_create_single("archtype", 0, pde, proc_archtype_show); if (hubless) proc_create_single("hubless", 0, pde, proc_hubless_show); else @@ -1429,7 +1653,8 @@ static void __init uv_system_init_hub(void) struct uv_hub_info_s hub_info = {0}; int bytes, cpu, nodeid; unsigned short min_pnode = 9999, max_pnode = 0; - char *hub = is_uv4_hub() ? "UV400" : + char *hub = is_uv5_hub() ? "UV500" : + is_uv4_hub() ? "UV400" : is_uv3_hub() ? "UV300" : is_uv2_hub() ? "UV2000/3000" : NULL; @@ -1441,12 +1666,14 @@ static void __init uv_system_init_hub(void) map_low_mmrs(); - /* Get uv_systab for decoding: */ + /* Get uv_systab for decoding, setup UV BIOS calls */ uv_bios_init(); /* If there's an UVsystab problem then abort UV init: */ - if (decode_uv_systab() < 0) + if (decode_uv_systab() < 0) { + pr_err("UV: Mangled UVsystab format\n"); return; + } build_socket_tables(); build_uv_gr_table(); @@ -1517,8 +1744,6 @@ static void __init uv_system_init_hub(void) uv_hub_info_list(numa_node_id)->pnode = pnode; else if (uv_cpu_hub_info(cpu)->pnode == 0xffff) uv_cpu_hub_info(cpu)->pnode = pnode; - - uv_cpu_scir_info(cpu)->offset = uv_scir_offset(apicid); } for_each_node(nodeid) { @@ -1547,7 +1772,6 @@ static void __init uv_system_init_hub(void) uv_nmi_setup(); uv_cpu_init(); - uv_scir_register_cpu_notifier(); uv_setup_proc_files(0); /* Register Legacy VGA I/O redirection handler: */ diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index c5cf336e5077..345f7d905db6 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -65,6 +65,9 @@ static void init_c3(struct cpuinfo_x86 *c) c->x86_cache_alignment = c->x86_clflush_size * 2; set_cpu_cap(c, X86_FEATURE_REP_GOOD); } + + if (c->x86 >= 7) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); } enum { @@ -90,18 +93,15 @@ enum { static void early_init_centaur(struct cpuinfo_x86 *c) { - switch (c->x86) { #ifdef CONFIG_X86_32 - case 5: - /* Emulate MTRRs using Centaur's MCR. */ + /* Emulate MTRRs using Centaur's MCR. */ + if (c->x86 == 5) set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); - break; #endif - case 6: - if (c->x86_model >= 0xf) - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - break; - } + if ((c->x86 == 6 && c->x86_model >= 0xf) || + (c->x86 >= 7)) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSENTER32); #endif @@ -145,9 +145,8 @@ static void init_centaur(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); } - switch (c->x86) { #ifdef CONFIG_X86_32 - case 5: + if (c->x86 == 5) { switch (c->x86_model) { case 4: name = "C6"; @@ -207,12 +206,10 @@ static void init_centaur(struct cpuinfo_x86 *c) c->x86_cache_size = (cc>>24)+(dd>>24); } sprintf(c->x86_model_id, "WinChip %s", name); - break; + } #endif - case 6: + if (c->x86 == 6 || c->x86 >= 7) init_c3(c); - break; - } #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c5d6f17d9b9d..c51158914ea2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -23,6 +23,7 @@ #include <linux/syscore_ops.h> #include <linux/pgtable.h> +#include <asm/cmdline.h> #include <asm/stackprotector.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> @@ -359,7 +360,7 @@ void native_write_cr0(unsigned long val) unsigned long bits_missing = 0; set_register: - asm volatile("mov %0,%%cr0": "+r" (val), "+m" (__force_order)); + asm volatile("mov %0,%%cr0": "+r" (val) : : "memory"); if (static_branch_likely(&cr_pinning)) { if (unlikely((val & X86_CR0_WP) != X86_CR0_WP)) { @@ -378,7 +379,7 @@ void native_write_cr4(unsigned long val) unsigned long bits_changed = 0; set_register: - asm volatile("mov %0,%%cr4": "+r" (val), "+m" (cr4_pinned_bits)); + asm volatile("mov %0,%%cr4": "+r" (val) : : "memory"); if (static_branch_likely(&cr_pinning)) { if (unlikely((val & cr4_pinned_mask) != cr4_pinned_bits)) { @@ -1221,6 +1222,59 @@ static void detect_nopl(void) } /* + * We parse cpu parameters early because fpu__init_system() is executed + * before parse_early_param(). + */ +static void __init cpu_parse_early_param(void) +{ + char arg[128]; + char *argptr = arg; + int arglen, res, bit; + +#ifdef CONFIG_X86_32 + if (cmdline_find_option_bool(boot_command_line, "no387")) +#ifdef CONFIG_MATH_EMULATION + setup_clear_cpu_cap(X86_FEATURE_FPU); +#else + pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n"); +#endif + + if (cmdline_find_option_bool(boot_command_line, "nofxsr")) + setup_clear_cpu_cap(X86_FEATURE_FXSR); +#endif + + if (cmdline_find_option_bool(boot_command_line, "noxsave")) + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + + if (cmdline_find_option_bool(boot_command_line, "noxsaveopt")) + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + + if (cmdline_find_option_bool(boot_command_line, "noxsaves")) + setup_clear_cpu_cap(X86_FEATURE_XSAVES); + + arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); + if (arglen <= 0) + return; + + pr_info("Clearing CPUID bits:"); + do { + res = get_option(&argptr, &bit); + if (res == 0 || res == 3) + break; + + /* If the argument was too long, the last bit may be cut off */ + if (res == 1 && arglen >= sizeof(arg)) + break; + + if (bit >= 0 && bit < NCAPINTS * 32) { + pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); + setup_clear_cpu_cap(bit); + } + } while (res == 2); + pr_cont("\n"); +} + +/* * Do minimum CPU detection early. * Fields really needed: vendor, cpuid_level, family, model, mask, * cache alignment. @@ -1255,6 +1309,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) get_cpu_cap(c); get_cpu_address_sizes(c); setup_force_cpu_cap(X86_FEATURE_CPUID); + cpu_parse_early_param(); if (this_cpu->c_early_init) this_cpu->c_early_init(c); @@ -1413,15 +1468,7 @@ static void generic_identify(struct cpuinfo_x86 *c) * ESPFIX issue, we can change this. */ #ifdef CONFIG_X86_32 -# ifdef CONFIG_PARAVIRT_XXL - do { - extern void native_iret(void); - if (pv_ops.cpu.iret == native_iret) - set_cpu_bug(c, X86_BUG_ESPFIX); - } while (0); -# else set_cpu_bug(c, X86_BUG_ESPFIX); -# endif #endif } diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 3cbe24ca80ab..d502241995a3 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -69,6 +69,8 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, + { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, + { X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA }, {} }; diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 99be063fcb1b..0c6b02dd744c 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -132,49 +132,49 @@ static enum smca_bank_types smca_get_bank_type(unsigned int bank) } static struct smca_hwid smca_hwid_mcatypes[] = { - /* { bank_type, hwid_mcatype, xec_bitmap } */ + /* { bank_type, hwid_mcatype } */ /* Reserved type */ - { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0), 0x0 }, + { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0) }, /* ZN Core (HWID=0xB0) MCA types */ - { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFFF }, - { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10), 0xFFFFFF }, - { SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF }, - { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF }, - { SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF }, + { SMCA_LS, HWID_MCATYPE(0xB0, 0x0) }, + { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10) }, + { SMCA_IF, HWID_MCATYPE(0xB0, 0x1) }, + { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2) }, + { SMCA_DE, HWID_MCATYPE(0xB0, 0x3) }, /* HWID 0xB0 MCATYPE 0x4 is Reserved */ - { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0xFFF }, - { SMCA_FP, HWID_MCATYPE(0xB0, 0x6), 0x7F }, - { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF }, + { SMCA_EX, HWID_MCATYPE(0xB0, 0x5) }, + { SMCA_FP, HWID_MCATYPE(0xB0, 0x6) }, + { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7) }, /* Data Fabric MCA types */ - { SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF }, - { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0x1F }, - { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2), 0x3FFF }, + { SMCA_CS, HWID_MCATYPE(0x2E, 0x0) }, + { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) }, + { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) }, /* Unified Memory Controller MCA type */ - { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0xFF }, + { SMCA_UMC, HWID_MCATYPE(0x96, 0x0) }, /* Parameter Block MCA type */ - { SMCA_PB, HWID_MCATYPE(0x05, 0x0), 0x1 }, + { SMCA_PB, HWID_MCATYPE(0x05, 0x0) }, /* Platform Security Processor MCA type */ - { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 }, - { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1), 0x3FFFF }, + { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0) }, + { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1) }, /* System Management Unit MCA type */ - { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 }, - { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1), 0x7FF }, + { SMCA_SMU, HWID_MCATYPE(0x01, 0x0) }, + { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1) }, /* Microprocessor 5 Unit MCA type */ - { SMCA_MP5, HWID_MCATYPE(0x01, 0x2), 0x3FF }, + { SMCA_MP5, HWID_MCATYPE(0x01, 0x2) }, /* Northbridge IO Unit MCA type */ - { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0), 0x1F }, + { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0) }, /* PCI Express Unit MCA type */ - { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0), 0x1F }, + { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) }, }; struct smca_bank smca_banks[MAX_NR_BANKS]; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index fc4f8c04bdb5..1c08cb9eb9f6 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -40,7 +40,6 @@ #include <linux/debugfs.h> #include <linux/irq_work.h> #include <linux/export.h> -#include <linux/jump_label.h> #include <linux/set_memory.h> #include <linux/sync_core.h> #include <linux/task_work.h> @@ -373,42 +372,105 @@ static int msr_to_offset(u32 msr) return -1; } +__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) +{ + pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, regs->ip, (void *)regs->ip); + + show_stack_regs(regs); + + panic("MCA architectural violation!\n"); + + while (true) + cpu_relax(); + + return true; +} + /* MSR access wrappers used for error injection */ -static u64 mce_rdmsrl(u32 msr) +static noinstr u64 mce_rdmsrl(u32 msr) { - u64 v; + DECLARE_ARGS(val, low, high); if (__this_cpu_read(injectm.finished)) { - int offset = msr_to_offset(msr); + int offset; + u64 ret; + + instrumentation_begin(); + offset = msr_to_offset(msr); if (offset < 0) - return 0; - return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset); - } + ret = 0; + else + ret = *(u64 *)((char *)this_cpu_ptr(&injectm) + offset); - if (rdmsrl_safe(msr, &v)) { - WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr); - /* - * Return zero in case the access faulted. This should - * not happen normally but can happen if the CPU does - * something weird, or if the code is buggy. - */ - v = 0; + instrumentation_end(); + + return ret; } - return v; + /* + * RDMSR on MCA MSRs should not fault. If they do, this is very much an + * architectural violation and needs to be reported to hw vendor. Panic + * the box to not allow any further progress. + */ + asm volatile("1: rdmsr\n" + "2:\n" + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_fault) + : EAX_EDX_RET(val, low, high) : "c" (msr)); + + + return EAX_EDX_VAL(val, low, high); +} + +__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr) +{ + pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, + regs->ip, (void *)regs->ip); + + show_stack_regs(regs); + + panic("MCA architectural violation!\n"); + + while (true) + cpu_relax(); + + return true; } -static void mce_wrmsrl(u32 msr, u64 v) +static noinstr void mce_wrmsrl(u32 msr, u64 v) { + u32 low, high; + if (__this_cpu_read(injectm.finished)) { - int offset = msr_to_offset(msr); + int offset; + + instrumentation_begin(); + offset = msr_to_offset(msr); if (offset >= 0) *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v; + + instrumentation_end(); + return; } - wrmsrl(msr, v); + + low = (u32)v; + high = (u32)(v >> 32); + + /* See comment in mce_rdmsrl() */ + asm volatile("1: wrmsr\n" + "2:\n" + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_fault) + : : "c" (msr), "a"(low), "d" (high) : "memory"); } /* @@ -745,7 +807,7 @@ log_it: goto clear_it; mce_read_aux(&m, i); - m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); + m.severity = mce_severity(&m, NULL, mca_cfg.tolerant, NULL, false); /* * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. @@ -794,7 +856,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, quirk_no_way_out(i, m, regs); m->bank = i; - if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { + if (mce_severity(m, regs, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { mce_read_aux(m, i); *msg = tmp; return 1; @@ -872,7 +934,6 @@ static void mce_reign(void) struct mce *m = NULL; int global_worst = 0; char *msg = NULL; - char *nmsg = NULL; /* * This CPU is the Monarch and the other CPUs have run @@ -880,12 +941,10 @@ static void mce_reign(void) * Grade the severity of the errors of all the CPUs. */ for_each_possible_cpu(cpu) { - int severity = mce_severity(&per_cpu(mces_seen, cpu), - mca_cfg.tolerant, - &nmsg, true); - if (severity > global_worst) { - msg = nmsg; - global_worst = severity; + struct mce *mtmp = &per_cpu(mces_seen, cpu); + + if (mtmp->severity > global_worst) { + global_worst = mtmp->severity; m = &per_cpu(mces_seen, cpu); } } @@ -895,8 +954,11 @@ static void mce_reign(void) * This dumps all the mces in the log buffer and stops the * other CPUs. */ - if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) + if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) { + /* call mce_severity() to get "msg" for panic */ + mce_severity(m, NULL, mca_cfg.tolerant, &msg, true); mce_panic("Fatal machine check", m, msg); + } /* * For UC somewhere we let the CPU who detects it handle it. @@ -1105,7 +1167,7 @@ static noinstr bool mce_check_crashing_cpu(void) return false; } -static void __mc_scan_banks(struct mce *m, struct mce *final, +static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, unsigned long *toclear, unsigned long *valid_banks, int no_way_out, int *worst) { @@ -1140,7 +1202,7 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, /* Set taint even when machine check was not enabled. */ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - severity = mce_severity(m, cfg->tolerant, NULL, true); + severity = mce_severity(m, regs, cfg->tolerant, NULL, true); /* * When machine check was for corrected/deferred handler don't @@ -1188,13 +1250,34 @@ static void kill_me_maybe(struct callback_head *cb) if (!p->mce_ripv) flags |= MF_MUST_KILL; - if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) { + if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) && + !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); + sync_core(); return; } - pr_err("Memory error not recovered"); - kill_me_now(cb); + if (p->mce_vaddr != (void __user *)-1l) { + force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT); + } else { + pr_err("Memory error not recovered"); + kill_me_now(cb); + } +} + +static void queue_task_work(struct mce *m, int kill_it) +{ + current->mce_addr = m->addr; + current->mce_kflags = m->kflags; + current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); + current->mce_whole_page = whole_page(m); + + if (kill_it) + current->mce_kill_me.func = kill_me_now; + else + current->mce_kill_me.func = kill_me_maybe; + + task_work_add(current, ¤t->mce_kill_me, true); } /* @@ -1291,7 +1374,7 @@ noinstr void do_machine_check(struct pt_regs *regs) order = mce_start(&no_way_out); } - __mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst); + __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst); if (!no_way_out) mce_clear_state(toclear); @@ -1313,7 +1396,7 @@ noinstr void do_machine_check(struct pt_regs *regs) * make sure we have the right "msg". */ if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) { - mce_severity(&m, cfg->tolerant, &msg, true); + mce_severity(&m, regs, cfg->tolerant, &msg, true); mce_panic("Local fatal machine check!", &m, msg); } } @@ -1330,25 +1413,16 @@ noinstr void do_machine_check(struct pt_regs *regs) if (worst > 0) irq_work_queue(&mce_irq_work); - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); - - sync_core(); - if (worst != MCE_AR_SEVERITY && !kill_it) - return; + goto out; /* Fault was in user mode and we need to take some action */ if ((m.cs & 3) == 3) { /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - current->mce_addr = m.addr; - current->mce_ripv = !!(m.mcgstatus & MCG_STATUS_RIPV); - current->mce_whole_page = whole_page(&m); - current->mce_kill_me.func = kill_me_maybe; - if (kill_it) - current->mce_kill_me.func = kill_me_now; - task_work_add(current, ¤t->mce_kill_me, true); + queue_task_work(&m, kill_it); + } else { /* * Handle an MCE which has happened in kernel space but from @@ -1363,7 +1437,12 @@ noinstr void do_machine_check(struct pt_regs *regs) if (!fixup_exception(regs, X86_TRAP_MC, 0, 0)) mce_panic("Failed kernel mode recovery", &m, msg); } + + if (m.kflags & MCE_IN_KERNEL_COPYIN) + queue_task_work(&m, kill_it); } +out: + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); } EXPORT_SYMBOL_GPL(do_machine_check); @@ -2064,7 +2143,7 @@ void mce_disable_bank(int bank) and older. * mce=nobootlog Don't log MCEs from before booting. * mce=bios_cmci_threshold Don't program the CMCI threshold - * mce=recovery force enable memcpy_mcsafe() + * mce=recovery force enable copy_mc_fragile() */ static int __init mcheck_enable(char *str) { @@ -2672,13 +2751,10 @@ static void __init mcheck_debugfs_init(void) static void __init mcheck_debugfs_init(void) { } #endif -DEFINE_STATIC_KEY_FALSE(mcsafe_key); -EXPORT_SYMBOL_GPL(mcsafe_key); - static int __init mcheck_late_init(void) { if (mca_cfg.recovery) - static_branch_inc(&mcsafe_key); + enable_copy_mc_fragile(); mcheck_debugfs_init(); diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c index 03e51053592a..100fbeebdc72 100644 --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -67,7 +67,9 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val, unlock: mutex_unlock(&mce_chrdev_read_mutex); - mce->kflags |= MCE_HANDLED_MCELOG; + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + mce->kflags |= MCE_HANDLED_MCELOG; + return NOTIFY_OK; } diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 6473070b5da4..88dcc79cfb07 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -38,7 +38,8 @@ int mce_gen_pool_add(struct mce *mce); int mce_gen_pool_init(void); struct llist_node *mce_gen_pool_prepare_records(void); -extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp); +extern int (*mce_severity)(struct mce *a, struct pt_regs *regs, + int tolerant, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); extern mce_banks_t mce_banks_ce_disabled; @@ -185,4 +186,14 @@ extern bool amd_filter_mce(struct mce *m); static inline bool amd_filter_mce(struct mce *m) { return false; }; #endif +__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr); + +__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr); + #endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index e1da619add19..83df991314c5 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -9,9 +9,14 @@ #include <linux/seq_file.h> #include <linux/init.h> #include <linux/debugfs.h> -#include <asm/mce.h> #include <linux/uaccess.h> +#include <asm/mce.h> +#include <asm/intel-family.h> +#include <asm/traps.h> +#include <asm/insn.h> +#include <asm/insn-eval.h> + #include "internal.h" /* @@ -40,9 +45,14 @@ static struct severity { unsigned char context; unsigned char excp; unsigned char covered; + unsigned char cpu_model; + unsigned char cpu_minstepping; + unsigned char bank_lo, bank_hi; char *msg; } severities[] = { #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } +#define BANK_RANGE(l, h) .bank_lo = l, .bank_hi = h +#define MODEL_STEPPING(m, s) .cpu_model = m, .cpu_minstepping = s #define KERNEL .context = IN_KERNEL #define USER .context = IN_USER #define KERNEL_RECOV .context = IN_KERNEL_RECOV @@ -90,14 +100,9 @@ static struct severity { EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0) ), MCESEV( - DEFERRED, "Deferred error", - NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED) - ), - MCESEV( KEEP, "Corrected error", NOSER, BITCLR(MCI_STATUS_UC) ), - /* * known AO MCACODs reported via MCE or CMC: * @@ -113,6 +118,18 @@ static struct severity { AO, "Action optional: last level cache writeback error", SER, MASK(MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) ), + /* + * Quirk for Skylake/Cascade Lake. Patrol scrubber may be configured + * to report uncorrected errors using CMCI with a special signature. + * UC=0, MSCOD=0x0010, MCACOD=binary(000X 0000 1100 XXXX) reported + * in one of the memory controller banks. + * Set severity to "AO" for same action as normal patrol scrub error. + */ + MCESEV( + AO, "Uncorrected Patrol Scrub Error", + SER, MASK(MCI_STATUS_UC|MCI_ADDR|0xffffeff0, MCI_ADDR|0x001000c0), + MODEL_STEPPING(INTEL_FAM6_SKYLAKE_X, 4), BANK_RANGE(13, 18) + ), /* ignore OVER for UCNA */ MCESEV( @@ -198,6 +215,47 @@ static struct severity { #define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \ (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) +static bool is_copy_from_user(struct pt_regs *regs) +{ + u8 insn_buf[MAX_INSN_SIZE]; + struct insn insn; + unsigned long addr; + + if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE)) + return false; + + kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE); + insn_get_opcode(&insn); + if (!insn.opcode.got) + return false; + + switch (insn.opcode.value) { + /* MOV mem,reg */ + case 0x8A: case 0x8B: + /* MOVZ mem,reg */ + case 0xB60F: case 0xB70F: + insn_get_modrm(&insn); + insn_get_sib(&insn); + if (!insn.modrm.got || !insn.sib.got) + return false; + addr = (unsigned long)insn_get_addr_ref(&insn, regs); + break; + /* REP MOVS */ + case 0xA4: case 0xA5: + addr = regs->si; + break; + default: + return false; + } + + if (fault_in_kernel_space(addr)) + return false; + + current->mce_vaddr = (void __user *)addr; + + return true; +} + /* * If mcgstatus indicated that ip/cs on the stack were * no good, then "m->cs" will be zero and we will have @@ -209,15 +267,25 @@ static struct severity { * distinguish an exception taken in user from from one * taken in the kernel. */ -static int error_context(struct mce *m) +static int error_context(struct mce *m, struct pt_regs *regs) { + enum handler_type t; + if ((m->cs & 3) == 3) return IN_USER; + if (!mc_recoverable(m->mcgstatus)) + return IN_KERNEL; - if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip)) { + t = ex_get_fault_handler_type(m->ip); + if (t == EX_HANDLER_FAULT) { m->kflags |= MCE_IN_KERNEL_RECOV; return IN_KERNEL_RECOV; } + if (t == EX_HANDLER_UACCESS && regs && is_copy_from_user(regs)) { + m->kflags |= MCE_IN_KERNEL_RECOV; + m->kflags |= MCE_IN_KERNEL_COPYIN; + return IN_KERNEL_RECOV; + } return IN_KERNEL; } @@ -253,9 +321,10 @@ static int mce_severity_amd_smca(struct mce *m, enum context err_ctx) * See AMD Error Scope Hierarchy table in a newer BKDG. For example * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features" */ -static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp) +static int mce_severity_amd(struct mce *m, struct pt_regs *regs, int tolerant, + char **msg, bool is_excp) { - enum context ctx = error_context(m); + enum context ctx = error_context(m, regs); /* Processor Context Corrupt, no need to fumble too much, die! */ if (m->status & MCI_STATUS_PCC) @@ -305,10 +374,11 @@ static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_exc return MCE_KEEP_SEVERITY; } -static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp) +static int mce_severity_intel(struct mce *m, struct pt_regs *regs, + int tolerant, char **msg, bool is_excp) { enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); - enum context ctx = error_context(m); + enum context ctx = error_context(m, regs); struct severity *s; for (s = severities;; s++) { @@ -324,6 +394,12 @@ static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_e continue; if (s->excp && excp != s->excp) continue; + if (s->cpu_model && boot_cpu_data.x86_model != s->cpu_model) + continue; + if (s->cpu_minstepping && boot_cpu_data.x86_stepping < s->cpu_minstepping) + continue; + if (s->bank_lo && (m->bank < s->bank_lo || m->bank > s->bank_hi)) + continue; if (msg) *msg = s->msg; s->covered = 1; @@ -336,7 +412,7 @@ static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_e } /* Default to mce_severity_intel */ -int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) = +int (*mce_severity)(struct mce *m, struct pt_regs *regs, int tolerant, char **msg, bool is_excp) = mce_severity_intel; void __init mcheck_vendor_init_severity(void) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 31125448b174..9834a43cd0fa 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -248,7 +248,7 @@ static void __init ms_hyperv_init_platform(void) hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF); } - if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS && + if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { x86_platform.calibrate_tsc = hv_get_tsc_khz; x86_platform.calibrate_cpu = hv_get_tsc_khz; @@ -270,7 +270,7 @@ static void __init ms_hyperv_init_platform(void) crash_kexec_post_notifiers = true; #ifdef CONFIG_X86_LOCAL_APIC - if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS && + if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { /* * Get the APIC frequency. @@ -296,7 +296,7 @@ static void __init ms_hyperv_init_platform(void) machine_ops.shutdown = hv_machine_shutdown; machine_ops.crash_shutdown = hv_machine_crash_shutdown; #endif - if (ms_hyperv.features & HV_X64_ACCESS_TSC_INVARIANT) { + if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) { wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x1); setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); } else { @@ -330,7 +330,7 @@ static void __init ms_hyperv_init_platform(void) alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_hyperv_callback); /* Setup the IDT for reenlightenment notifications */ - if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT) { + if (ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT) { alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR, asm_sysvec_hyperv_reenlightenment); } diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 6a9df71c1b9e..e5f4ee8f4c3b 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -168,6 +168,7 @@ struct rdt_resource rdt_resources_all[] = { .name = "MB", .domains = domain_init(RDT_RESOURCE_MBA), .cache_level = 3, + .parse_ctrlval = parse_bw, .format_str = "%d=%*u", .fflags = RFTYPE_RES_MB, }, @@ -254,22 +255,30 @@ static bool __get_mem_config_intel(struct rdt_resource *r) { union cpuid_0x10_3_eax eax; union cpuid_0x10_x_edx edx; - u32 ebx, ecx; + u32 ebx, ecx, max_delay; cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full); r->num_closid = edx.split.cos_max + 1; - r->membw.max_delay = eax.split.max_delay + 1; + max_delay = eax.split.max_delay + 1; r->default_ctrl = MAX_MBA_BW; + r->membw.arch_needs_linear = true; if (ecx & MBA_IS_LINEAR) { r->membw.delay_linear = true; - r->membw.min_bw = MAX_MBA_BW - r->membw.max_delay; - r->membw.bw_gran = MAX_MBA_BW - r->membw.max_delay; + r->membw.min_bw = MAX_MBA_BW - max_delay; + r->membw.bw_gran = MAX_MBA_BW - max_delay; } else { if (!rdt_get_mb_table(r)) return false; + r->membw.arch_needs_linear = false; } r->data_width = 3; + if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA)) + r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD; + else + r->membw.throttle_mode = THREAD_THROTTLE_MAX; + thread_throttle_mode_init(); + r->alloc_capable = true; r->alloc_enabled = true; @@ -288,7 +297,13 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r) /* AMD does not use delay */ r->membw.delay_linear = false; + r->membw.arch_needs_linear = false; + /* + * AMD does not use memory delay throttle model to control + * the allocation like Intel does. + */ + r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; r->membw.min_bw = 0; r->membw.bw_gran = 1; /* Max value is 2048, Data width should be 4 in decimal */ @@ -346,19 +361,6 @@ static void rdt_get_cdp_l2_config(void) rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2CODE); } -static int get_cache_id(int cpu, int level) -{ - struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); - int i; - - for (i = 0; i < ci->num_leaves; i++) { - if (ci->info_list[i].level == level) - return ci->info_list[i].id; - } - - return -1; -} - static void mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) { @@ -556,13 +558,13 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) */ static void domain_add_cpu(int cpu, struct rdt_resource *r) { - int id = get_cache_id(cpu, r->cache_level); + int id = get_cpu_cacheinfo_id(cpu, r->cache_level); struct list_head *add_pos = NULL; struct rdt_domain *d; d = rdt_find_domain(r, id, &add_pos); if (IS_ERR(d)) { - pr_warn("Could't find cache id for cpu %d\n", cpu); + pr_warn("Couldn't find cache id for CPU %d\n", cpu); return; } @@ -602,12 +604,12 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) static void domain_remove_cpu(int cpu, struct rdt_resource *r) { - int id = get_cache_id(cpu, r->cache_level); + int id = get_cpu_cacheinfo_id(cpu, r->cache_level); struct rdt_domain *d; d = rdt_find_domain(r, id, NULL); if (IS_ERR_OR_NULL(d)) { - pr_warn("Could't find cache id for cpu %d\n", cpu); + pr_warn("Couldn't find cache id for CPU %d\n", cpu); return; } @@ -918,12 +920,12 @@ static __init void rdt_init_res_defs_intel(void) r->rid == RDT_RESOURCE_L3CODE || r->rid == RDT_RESOURCE_L2 || r->rid == RDT_RESOURCE_L2DATA || - r->rid == RDT_RESOURCE_L2CODE) - r->cbm_validate = cbm_validate_intel; - else if (r->rid == RDT_RESOURCE_MBA) { + r->rid == RDT_RESOURCE_L2CODE) { + r->cache.arch_has_sparse_bitmaps = false; + r->cache.arch_has_empty_bitmaps = false; + } else if (r->rid == RDT_RESOURCE_MBA) { r->msr_base = MSR_IA32_MBA_THRTL_BASE; r->msr_update = mba_wrmsr_intel; - r->parse_ctrlval = parse_bw_intel; } } } @@ -938,12 +940,12 @@ static __init void rdt_init_res_defs_amd(void) r->rid == RDT_RESOURCE_L3CODE || r->rid == RDT_RESOURCE_L2 || r->rid == RDT_RESOURCE_L2DATA || - r->rid == RDT_RESOURCE_L2CODE) - r->cbm_validate = cbm_validate_amd; - else if (r->rid == RDT_RESOURCE_MBA) { + r->rid == RDT_RESOURCE_L2CODE) { + r->cache.arch_has_sparse_bitmaps = true; + r->cache.arch_has_empty_bitmaps = true; + } else if (r->rid == RDT_RESOURCE_MBA) { r->msr_base = MSR_IA32_MBA_BW_BASE; r->msr_update = mba_wrmsr_amd; - r->parse_ctrlval = parse_bw_amd; } } } diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 934c8fb8a64a..c877642e8a14 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -23,53 +23,6 @@ /* * Check whether MBA bandwidth percentage value is correct. The value is - * checked against the minimum and maximum bandwidth values specified by - * the hardware. The allocated bandwidth percentage is rounded to the next - * control step available on the hardware. - */ -static bool bw_validate_amd(char *buf, unsigned long *data, - struct rdt_resource *r) -{ - unsigned long bw; - int ret; - - ret = kstrtoul(buf, 10, &bw); - if (ret) { - rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf); - return false; - } - - if (bw < r->membw.min_bw || bw > r->default_ctrl) { - rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw, - r->membw.min_bw, r->default_ctrl); - return false; - } - - *data = roundup(bw, (unsigned long)r->membw.bw_gran); - return true; -} - -int parse_bw_amd(struct rdt_parse_data *data, struct rdt_resource *r, - struct rdt_domain *d) -{ - unsigned long bw_val; - - if (d->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->id); - return -EINVAL; - } - - if (!bw_validate_amd(data->buf, &bw_val, r)) - return -EINVAL; - - d->new_ctrl = bw_val; - d->have_new_ctrl = true; - - return 0; -} - -/* - * Check whether MBA bandwidth percentage value is correct. The value is * checked against the minimum and max bandwidth values specified by the * hardware. The allocated bandwidth percentage is rounded to the next * control step available on the hardware. @@ -82,7 +35,7 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) /* * Only linear delay values is supported for current Intel SKUs. */ - if (!r->membw.delay_linear) { + if (!r->membw.delay_linear && r->membw.arch_needs_linear) { rdt_last_cmd_puts("No support for non-linear MB domains\n"); return false; } @@ -104,8 +57,8 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) return true; } -int parse_bw_intel(struct rdt_parse_data *data, struct rdt_resource *r, - struct rdt_domain *d) +int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d) { unsigned long bw_val; @@ -123,12 +76,14 @@ int parse_bw_intel(struct rdt_parse_data *data, struct rdt_resource *r, } /* - * Check whether a cache bit mask is valid. The SDM says: + * Check whether a cache bit mask is valid. + * For Intel the SDM says: * Please note that all (and only) contiguous '1' combinations * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). * Additionally Haswell requires at least two bits set. + * AMD allows non-contiguous bitmasks. */ -bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r) +static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) { unsigned long first_bit, zero_bit, val; unsigned int cbm_len = r->cache.cbm_len; @@ -140,7 +95,8 @@ bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r) return false; } - if (val == 0 || val > r->default_ctrl) { + if ((!r->cache.arch_has_empty_bitmaps && val == 0) || + val > r->default_ctrl) { rdt_last_cmd_puts("Mask out of range\n"); return false; } @@ -148,7 +104,9 @@ bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r) first_bit = find_first_bit(&val, cbm_len); zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len) { + /* Are non-contiguous bitmaps allowed? */ + if (!r->cache.arch_has_sparse_bitmaps && + (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); return false; } @@ -164,30 +122,6 @@ bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r) } /* - * Check whether a cache bit mask is valid. AMD allows non-contiguous - * bitmasks - */ -bool cbm_validate_amd(char *buf, u32 *data, struct rdt_resource *r) -{ - unsigned long val; - int ret; - - ret = kstrtoul(buf, 16, &val); - if (ret) { - rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf); - return false; - } - - if (val > r->default_ctrl) { - rdt_last_cmd_puts("Mask out of range\n"); - return false; - } - - *data = val; - return true; -} - -/* * Read one cache bit mask (hex). Check that it is valid for the current * resource type. */ @@ -212,7 +146,7 @@ int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, return -EINVAL; } - if (!r->cbm_validate(data->buf, &cbm_val, r)) + if (!cbm_validate(data->buf, &cbm_val, r)) return -EINVAL; if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 5ffa32256b3b..80fa997fae60 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -283,7 +283,6 @@ struct rftype { * struct mbm_state - status for each MBM counter in each domain * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it - * @chunks_bw Total local data moved. Used for bandwidth calculation * @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting * @prev_bw The most recent bandwidth in MBps * @delta_bw Difference between the current and previous bandwidth @@ -292,7 +291,6 @@ struct rftype { struct mbm_state { u64 chunks; u64 prev_msr; - u64 chunks_bw; u64 prev_bw_msr; u32 prev_bw; u32 delta_bw; @@ -360,6 +358,8 @@ struct msr_param { * in a cache bit mask * @shareable_bits: Bitmask of shareable resource with other * executing entities + * @arch_has_sparse_bitmaps: True if a bitmap like f00f is valid. + * @arch_has_empty_bitmaps: True if the '0' bitmap is valid. */ struct rdt_cache { unsigned int cbm_len; @@ -367,25 +367,43 @@ struct rdt_cache { unsigned int cbm_idx_mult; unsigned int cbm_idx_offset; unsigned int shareable_bits; + bool arch_has_sparse_bitmaps; + bool arch_has_empty_bitmaps; +}; + +/** + * enum membw_throttle_mode - System's memory bandwidth throttling mode + * @THREAD_THROTTLE_UNDEFINED: Not relevant to the system + * @THREAD_THROTTLE_MAX: Memory bandwidth is throttled at the core + * always using smallest bandwidth percentage + * assigned to threads, aka "max throttling" + * @THREAD_THROTTLE_PER_THREAD: Memory bandwidth is throttled at the thread + */ +enum membw_throttle_mode { + THREAD_THROTTLE_UNDEFINED = 0, + THREAD_THROTTLE_MAX, + THREAD_THROTTLE_PER_THREAD, }; /** * struct rdt_membw - Memory bandwidth allocation related data - * @max_delay: Max throttle delay. Delay is the hardware - * representation for memory bandwidth. * @min_bw: Minimum memory bandwidth percentage user can request * @bw_gran: Granularity at which the memory bandwidth is allocated * @delay_linear: True if memory B/W delay is in linear scale + * @arch_needs_linear: True if we can't configure non-linear resources + * @throttle_mode: Bandwidth throttling mode when threads request + * different memory bandwidths * @mba_sc: True if MBA software controller(mba_sc) is enabled * @mb_map: Mapping of memory B/W percentage to memory B/W delay */ struct rdt_membw { - u32 max_delay; - u32 min_bw; - u32 bw_gran; - u32 delay_linear; - bool mba_sc; - u32 *mb_map; + u32 min_bw; + u32 bw_gran; + u32 delay_linear; + bool arch_needs_linear; + enum membw_throttle_mode throttle_mode; + bool mba_sc; + u32 *mb_map; }; static inline bool is_llc_occupancy_enabled(void) @@ -437,7 +455,6 @@ struct rdt_parse_data { * @cache: Cache allocation related data * @format_str: Per resource format string to show domain value * @parse_ctrlval: Per resource function pointer to parse control values - * @cbm_validate Cache bitmask validate function * @evt_list: List of monitoring events * @num_rmid: Number of RMIDs available * @mon_scale: cqm counter * mon_scale = occupancy in bytes @@ -464,7 +481,6 @@ struct rdt_resource { int (*parse_ctrlval)(struct rdt_parse_data *data, struct rdt_resource *r, struct rdt_domain *d); - bool (*cbm_validate)(char *buf, u32 *data, struct rdt_resource *r); struct list_head evt_list; int num_rmid; unsigned int mon_scale; @@ -474,10 +490,8 @@ struct rdt_resource { int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, struct rdt_domain *d); -int parse_bw_intel(struct rdt_parse_data *data, struct rdt_resource *r, - struct rdt_domain *d); -int parse_bw_amd(struct rdt_parse_data *data, struct rdt_resource *r, - struct rdt_domain *d); +int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d); extern struct mutex rdtgroup_mutex; @@ -609,8 +623,7 @@ void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); void cqm_handle_limbo(struct work_struct *work); bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); void __check_limbo(struct rdt_domain *d, bool force_free); -bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r); -bool cbm_validate_amd(char *buf, u32 *data, struct rdt_resource *r); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); +void __init thread_throttle_mode_init(void); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 837d7d012b7b..54dffe574e67 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -279,8 +279,7 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) return; chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width); - m->chunks_bw += chunks; - m->chunks = m->chunks_bw; + m->chunks += chunks; cur_bw = (chunks * r->mon_scale) >> 20; if (m->delta_comp) @@ -478,19 +477,13 @@ void cqm_handle_limbo(struct work_struct *work) mutex_lock(&rdtgroup_mutex); r = &rdt_resources_all[RDT_RESOURCE_L3]; - d = get_domain_from_cpu(cpu, r); - - if (!d) { - pr_warn_once("Failure to get domain for limbo worker\n"); - goto out_unlock; - } + d = container_of(work, struct rdt_domain, cqm_limbo.work); __check_limbo(d, false); if (has_busy_rmid(r, d)) schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); -out_unlock: mutex_unlock(&rdtgroup_mutex); } @@ -520,10 +513,7 @@ void mbm_handle_overflow(struct work_struct *work) goto out_unlock; r = &rdt_resources_all[RDT_RESOURCE_L3]; - - d = get_domain_from_cpu(cpu, r); - if (!d) - goto out_unlock; + d = container_of(work, struct rdt_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { mbm_update(r, d, prgrp->mon.rmid); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 3f844f14fc0a..b494187632b2 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -592,6 +592,18 @@ static int __rdtgroup_move_task(struct task_struct *tsk, return ret; } +static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_alloc_capable && + (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); +} + +static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_mon_capable && + (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); +} + /** * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group * @r: Resource group @@ -607,8 +619,7 @@ int rdtgroup_tasks_assigned(struct rdtgroup *r) rcu_read_lock(); for_each_process_thread(p, t) { - if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || - (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) { + if (is_closid_match(t, r) || is_rmid_match(t, r)) { ret = 1; break; } @@ -706,8 +717,7 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) rcu_read_lock(); for_each_process_thread(p, t) { - if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || - (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) + if (is_closid_match(t, r) || is_rmid_match(t, r)) seq_printf(s, "%d\n", t->pid); } rcu_read_unlock(); @@ -1017,6 +1027,19 @@ static int max_threshold_occ_show(struct kernfs_open_file *of, return 0; } +static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + if (r->membw.throttle_mode == THREAD_THROTTLE_PER_THREAD) + seq_puts(seq, "per-thread\n"); + else + seq_puts(seq, "max\n"); + + return 0; +} + static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -1513,6 +1536,17 @@ static struct rftype res_common_files[] = { .seq_show = rdt_delay_linear_show, .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, }, + /* + * Platform specific which (if any) capabilities are provided by + * thread_throttle_mode. Defer "fflags" initialization to platform + * discovery. + */ + { + .name = "thread_throttle_mode", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_thread_throttle_mode_show, + }, { .name = "max_threshold_occupancy", .mode = 0644, @@ -1583,7 +1617,7 @@ static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) lockdep_assert_held(&rdtgroup_mutex); for (rft = rfts; rft < rfts + len; rft++) { - if ((fflags & rft->fflags) == rft->fflags) { + if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { ret = rdtgroup_add_file(kn, rft); if (ret) goto error; @@ -1600,6 +1634,33 @@ error: return ret; } +static struct rftype *rdtgroup_get_rftype_by_name(const char *name) +{ + struct rftype *rfts, *rft; + int len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + for (rft = rfts; rft < rfts + len; rft++) { + if (!strcmp(rft->name, name)) + return rft; + } + + return NULL; +} + +void __init thread_throttle_mode_init(void) +{ + struct rftype *rft; + + rft = rdtgroup_get_rftype_by_name("thread_throttle_mode"); + if (!rft) + return; + + rft->fflags = RF_CTRL_INFO | RFTYPE_RES_MB; +} + /** * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file * @r: The resource group with which the file is associated. @@ -2245,18 +2306,6 @@ static int reset_all_ctrls(struct rdt_resource *r) return 0; } -static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (rdt_alloc_capable && - (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); -} - -static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (rdt_mon_capable && - (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); -} - /* * Move tasks from one to the other group. If @from is NULL, then all tasks * in the systems are moved unconditionally (used for teardown). @@ -3196,7 +3245,7 @@ int __init rdtgroup_init(void) * It may also be ok since that would enable debugging of RDT before * resctrl is mounted. * The reason why the debugfs directory is created here and not in - * rdt_mount() is because rdt_mount() takes rdtgroup_mutex and + * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and * during the debugfs directory creation also &sb->s_type->i_mutex_key * (the lockdep class of inode->i_rwsem). Other filesystem * interactions (eg. SyS_getdents) have the lock ordering: diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 62b137c3c97a..2eb0a8c44b35 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -35,12 +35,14 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, + { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 }, + { X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index a0e8fc7d85f1..ddffd80f5c52 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -229,8 +229,8 @@ static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, it = &of_ioapic_type[type_index]; ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity); - tmp.ioapic_id = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain)); - tmp.ioapic_pin = fwspec->param[0]; + tmp.devid = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain)); + tmp.ioapic.pin = fwspec->param[0]; return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp); } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 48ce44576947..ea8d51ec251b 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -115,7 +115,8 @@ void show_opcodes(struct pt_regs *regs, const char *loglvl) unsigned long prologue = regs->ip - PROLOGUE_SIZE; if (copy_code(regs, opcodes, prologue, sizeof(opcodes))) { - printk("%sCode: Bad RIP value.\n", loglvl); + printk("%sCode: Unable to access opcode bytes at RIP 0x%lx.\n", + loglvl, prologue); } else { printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %" __stringify(EPILOGUE_SIZE) "ph\n", loglvl, opcodes, diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 61ddc3a5e5c2..701f196d7c68 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -5,7 +5,6 @@ #include <asm/fpu/internal.h> #include <asm/tlbflush.h> #include <asm/setup.h> -#include <asm/cmdline.h> #include <linux/sched.h> #include <linux/sched/task.h> @@ -238,51 +237,11 @@ static void __init fpu__init_system_ctx_switch(void) } /* - * We parse fpu parameters early because fpu__init_system() is executed - * before parse_early_param(). - */ -static void __init fpu__init_parse_early_param(void) -{ - char arg[32]; - char *argptr = arg; - int bit; - -#ifdef CONFIG_X86_32 - if (cmdline_find_option_bool(boot_command_line, "no387")) -#ifdef CONFIG_MATH_EMULATION - setup_clear_cpu_cap(X86_FEATURE_FPU); -#else - pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n"); -#endif - - if (cmdline_find_option_bool(boot_command_line, "nofxsr")) - setup_clear_cpu_cap(X86_FEATURE_FXSR); -#endif - - if (cmdline_find_option_bool(boot_command_line, "noxsave")) - setup_clear_cpu_cap(X86_FEATURE_XSAVE); - - if (cmdline_find_option_bool(boot_command_line, "noxsaveopt")) - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - - if (cmdline_find_option_bool(boot_command_line, "noxsaves")) - setup_clear_cpu_cap(X86_FEATURE_XSAVES); - - if (cmdline_find_option(boot_command_line, "clearcpuid", arg, - sizeof(arg)) && - get_option(&argptr, &bit) && - bit >= 0 && - bit < NCAPINTS * 32) - setup_clear_cpu_cap(bit); -} - -/* * Called on the boot CPU once per system bootup, to set up the initial * FPU state that is later cloned into all processes: */ void __init fpu__init_system(struct cpuinfo_x86 *c) { - fpu__init_parse_early_param(); fpu__init_system_early_generic(c); /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 038e19c0019e..5d8047441a0a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -37,6 +37,7 @@ static const char *xfeature_names[] = "AVX-512 ZMM_Hi256" , "Processor Trace (unused)" , "Protection Keys User registers", + "PASID state", "unknown xstate feature" , }; @@ -51,6 +52,7 @@ static short xsave_cpuid_features[] __initdata = { X86_FEATURE_AVX512F, X86_FEATURE_INTEL_PT, X86_FEATURE_PKU, + X86_FEATURE_ENQCMD, }; /* @@ -318,6 +320,7 @@ static void __init print_xstate_features(void) print_xstate_feature(XFEATURE_MASK_ZMM_Hi256); print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); print_xstate_feature(XFEATURE_MASK_PKRU); + print_xstate_feature(XFEATURE_MASK_PASID); } /* @@ -592,6 +595,7 @@ static void check_xstate_against_struct(int nr) XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state); XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state); XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state); + XCHECK_SZ(sz, nr, XFEATURE_PASID, struct ia32_pasid_state); /* * Make *SURE* to add any feature numbers in below if @@ -601,7 +605,7 @@ static void check_xstate_against_struct(int nr) if ((nr < XFEATURE_YMM) || (nr >= XFEATURE_MAX) || (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || - ((nr >= XFEATURE_RSRVD_COMP_10) && (nr <= XFEATURE_LBR))) { + ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_LBR))) { WARN_ONCE(1, "no structure for xstate: %d\n", nr); XSTATE_WARN_ON(1); } @@ -1398,3 +1402,60 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, return 0; } #endif /* CONFIG_PROC_PID_ARCH_STATUS */ + +#ifdef CONFIG_IOMMU_SUPPORT +void update_pasid(void) +{ + u64 pasid_state; + u32 pasid; + + if (!cpu_feature_enabled(X86_FEATURE_ENQCMD)) + return; + + if (!current->mm) + return; + + pasid = READ_ONCE(current->mm->pasid); + /* Set the valid bit in the PASID MSR/state only for valid pasid. */ + pasid_state = pasid == PASID_DISABLED ? + pasid : pasid | MSR_IA32_PASID_VALID; + + /* + * No need to hold fregs_lock() since the task's fpstate won't + * be changed by others (e.g. ptrace) while the task is being + * switched to or is in IPI. + */ + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { + /* The MSR is active and can be directly updated. */ + wrmsrl(MSR_IA32_PASID, pasid_state); + } else { + struct fpu *fpu = ¤t->thread.fpu; + struct ia32_pasid_state *ppasid_state; + struct xregs_state *xsave; + + /* + * The CPU's xstate registers are not currently active. Just + * update the PASID state in the memory buffer here. The + * PASID MSR will be loaded when returning to user mode. + */ + xsave = &fpu->state.xsave; + xsave->header.xfeatures |= XFEATURE_MASK_PASID; + ppasid_state = get_xsave_addr(xsave, XFEATURE_PASID); + /* + * Since XFEATURE_MASK_PASID is set in xfeatures, ppasid_state + * won't be NULL and no need to check its value. + * + * Only update the task's PASID state when it's different + * from the mm's pasid. + */ + if (ppasid_state->pasid != pasid_state) { + /* + * Invalid fpregs so that state restoring will pick up + * the PASID state. + */ + __fpu_invalidate_fpregs_state(fpu); + ppasid_state->pasid = pasid_state; + } + } +} +#endif /* CONFIG_IOMMU_SUPPORT */ diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index b98ff620ba77..03aa33b58165 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -442,42 +442,6 @@ int hw_breakpoint_arch_parse(struct perf_event *bp, } /* - * Dump the debug register contents to the user. - * We can't dump our per cpu values because it - * may contain cpu wide breakpoint, something that - * doesn't belong to the current task. - * - * TODO: include non-ptrace user breakpoints (perf) - */ -void aout_dump_debugregs(struct user *dump) -{ - int i; - int dr7 = 0; - struct perf_event *bp; - struct arch_hw_breakpoint *info; - struct thread_struct *thread = ¤t->thread; - - for (i = 0; i < HBP_NUM; i++) { - bp = thread->ptrace_bps[i]; - - if (bp && !bp->attr.disabled) { - dump->u_debugreg[i] = bp->attr.bp_addr; - info = counter_arch_bp(bp); - dr7 |= encode_dr7(i, info->len, info->type); - } else { - dump->u_debugreg[i] = 0; - } - } - - dump->u_debugreg[4] = 0; - dump->u_debugreg[5] = 0; - dump->u_debugreg[6] = current->thread.debugreg6; - - dump->u_debugreg[7] = dr7; -} -EXPORT_SYMBOL_GPL(aout_dump_debugregs); - -/* * Release the user breakpoints used by ptrace */ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) @@ -490,7 +454,7 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) t->ptrace_bps[i] = NULL; } - t->debugreg6 = 0; + t->virtual_dr6 = 0; t->ptrace_dr7 = 0; } @@ -500,7 +464,7 @@ void hw_breakpoint_restore(void) set_debugreg(__this_cpu_read(cpu_debugreg[1]), 1); set_debugreg(__this_cpu_read(cpu_debugreg[2]), 2); set_debugreg(__this_cpu_read(cpu_debugreg[3]), 3); - set_debugreg(current->thread.debugreg6, 6); + set_debugreg(DR6_RESERVED, 6); set_debugreg(__this_cpu_read(cpu_dr7), 7); } EXPORT_SYMBOL_GPL(hw_breakpoint_restore); @@ -523,10 +487,10 @@ EXPORT_SYMBOL_GPL(hw_breakpoint_restore); */ static int hw_breakpoint_handler(struct die_args *args) { - int i, cpu, rc = NOTIFY_STOP; + int i, rc = NOTIFY_STOP; struct perf_event *bp; - unsigned long dr6; unsigned long *dr6_p; + unsigned long dr6; /* The DR6 value is pointed by args->err */ dr6_p = (unsigned long *)ERR_PTR(args->err); @@ -540,14 +504,6 @@ static int hw_breakpoint_handler(struct die_args *args) if ((dr6 & DR_TRAP_BITS) == 0) return NOTIFY_DONE; - /* - * Assert that local interrupts are disabled - * Reset the DRn bits in the virtualized register value. - * The ptrace trigger routine will add in whatever is needed. - */ - current->thread.debugreg6 &= ~DR_TRAP_BITS; - cpu = get_cpu(); - /* Handle all the breakpoints that were triggered */ for (i = 0; i < HBP_NUM; ++i) { if (likely(!(dr6 & (DR_TRAP0 << i)))) @@ -561,7 +517,7 @@ static int hw_breakpoint_handler(struct die_args *args) */ rcu_read_lock(); - bp = per_cpu(bp_per_reg[i], cpu); + bp = this_cpu_read(bp_per_reg[i]); /* * Reset the 'i'th TRAP bit in dr6 to denote completion of * exception handling @@ -592,12 +548,10 @@ static int hw_breakpoint_handler(struct die_args *args) * breakpoints (to generate signals) and b) when the system has * taken exception due to multiple causes */ - if ((current->thread.debugreg6 & DR_TRAP_BITS) || + if ((current->thread.virtual_dr6 & DR_TRAP_BITS) || (dr6 & (~DR_TRAP_BITS))) rc = NOTIFY_DONE; - put_cpu(); - return rc; } diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 7ecf9babf0cb..1bffb87dcfdc 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -149,9 +149,6 @@ static const __initconst struct idt_data apic_idts[] = { # ifdef CONFIG_IRQ_WORK INTG(IRQ_WORK_VECTOR, asm_sysvec_irq_work), # endif -# ifdef CONFIG_X86_UV - INTG(UV_BAU_MESSAGE, asm_sysvec_uv_bau_message), -# endif INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt), INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt), #endif diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index c2f02f308ecf..ff7878df96b4 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -629,9 +629,10 @@ static void kgdb_hw_overflow_handler(struct perf_event *event, struct task_struct *tsk = current; int i; - for (i = 0; i < 4; i++) + for (i = 0; i < 4; i++) { if (breakinfo[i].enabled) - tsk->thread.debugreg6 |= (DR_TRAP0 << i); + tsk->thread.virtual_dr6 |= (DR_TRAP0 << i); + } } void kgdb_arch_late(void) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index fdadc37d72af..db8f8693ab8d 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -767,124 +767,21 @@ asm( NOKPROBE_SYMBOL(kretprobe_trampoline); STACK_FRAME_NON_STANDARD(kretprobe_trampoline); + /* * Called from kretprobe_trampoline */ __used __visible void *trampoline_handler(struct pt_regs *regs) { - struct kretprobe_instance *ri = NULL; - struct hlist_head *head, empty_rp; - struct hlist_node *tmp; - unsigned long flags, orig_ret_address = 0; - unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; - kprobe_opcode_t *correct_ret_addr = NULL; - void *frame_pointer; - bool skipped = false; - - /* - * Set a dummy kprobe for avoiding kretprobe recursion. - * Since kretprobe never run in kprobe handler, kprobe must not - * be running at this point. - */ - kprobe_busy_begin(); - - INIT_HLIST_HEAD(&empty_rp); - kretprobe_hash_lock(current, &head, &flags); /* fixup registers */ regs->cs = __KERNEL_CS; #ifdef CONFIG_X86_32 - regs->cs |= get_kernel_rpl(); regs->gs = 0; #endif - /* We use pt_regs->sp for return address holder. */ - frame_pointer = ®s->sp; - regs->ip = trampoline_address; + regs->ip = (unsigned long)&kretprobe_trampoline; regs->orig_ax = ~0UL; - /* - * It is possible to have multiple instances associated with a given - * task either because multiple functions in the call path have - * return probes installed on them, and/or more than one - * return probe was registered for a target function. - * - * We can handle this because: - * - instances are always pushed into the head of the list - * - when multiple return probes are registered for the same - * function, the (chronologically) first instance's ret_addr - * will be the real return address, and all the rest will - * point to kretprobe_trampoline. - */ - hlist_for_each_entry(ri, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - /* - * Return probes must be pushed on this hash list correct - * order (same as return order) so that it can be popped - * correctly. However, if we find it is pushed it incorrect - * order, this means we find a function which should not be - * probed, because the wrong order entry is pushed on the - * path of processing other kretprobe itself. - */ - if (ri->fp != frame_pointer) { - if (!skipped) - pr_warn("kretprobe is stacked incorrectly. Trying to fixup.\n"); - skipped = true; - continue; - } - - orig_ret_address = (unsigned long)ri->ret_addr; - if (skipped) - pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n", - ri->rp->kp.addr); - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address. Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - kretprobe_assert(ri, orig_ret_address, trampoline_address); - - correct_ret_addr = ri->ret_addr; - hlist_for_each_entry_safe(ri, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - if (ri->fp != frame_pointer) - continue; - - orig_ret_address = (unsigned long)ri->ret_addr; - if (ri->rp && ri->rp->handler) { - __this_cpu_write(current_kprobe, &ri->rp->kp); - ri->ret_addr = correct_ret_addr; - ri->rp->handler(ri, regs); - __this_cpu_write(current_kprobe, &kprobe_busy); - } - - recycle_rp_inst(ri, &empty_rp); - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address. Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - kretprobe_hash_unlock(current, &flags); - - kprobe_busy_end(); - - hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { - hlist_del(&ri->hlist); - kfree(ri); - } - return (void *)orig_ret_address; + return (void *)kretprobe_trampoline_handler(regs, &kretprobe_trampoline, ®s->sp); } NOKPROBE_SYMBOL(trampoline_handler); diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 40f380461e6d..15e06408f6ba 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -18,6 +18,7 @@ #include <linux/ftrace.h> #include <linux/frame.h> #include <linux/pgtable.h> +#include <linux/static_call.h> #include <asm/text-patching.h> #include <asm/cacheflush.h> @@ -181,7 +182,6 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) /* Save skipped registers */ regs->cs = __KERNEL_CS; #ifdef CONFIG_X86_32 - regs->cs |= get_kernel_rpl(); regs->gs = 0; #endif regs->ip = (unsigned long)op->kp.addr + INT3_INSN_SIZE; @@ -210,7 +210,8 @@ static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real) /* Check whether the address range is reserved */ if (ftrace_text_reserved(src, src + len - 1) || alternatives_text_reserved(src, src + len - 1) || - jump_label_text_reserved(src, src + len - 1)) + jump_label_text_reserved(src, src + len - 1) || + static_call_text_reserved(src, src + len - 1)) return -EBUSY; return len; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index baa21090c9be..8f06449aab27 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -24,7 +24,6 @@ #include <asm/irqdomain.h> #include <asm/mtrr.h> #include <asm/mpspec.h> -#include <asm/io_apic.h> #include <asm/proto.h> #include <asm/bios_ebda.h> #include <asm/e820/api.h> @@ -46,11 +45,6 @@ static int __init mpf_checksum(unsigned char *mp, int len) return sum & 0xFF; } -int __init default_mpc_apic_id(struct mpc_cpu *m) -{ - return m->apicid; -} - static void __init MP_processor_info(struct mpc_cpu *m) { int apicid; @@ -61,7 +55,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) return; } - apicid = x86_init.mpparse.mpc_apic_id(m); + apicid = m->apicid; if (m->cpuflag & CPU_BOOTPROCESSOR) { bootup_cpu = " (Bootup-CPU)"; @@ -73,7 +67,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) } #ifdef CONFIG_X86_IO_APIC -void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str) +static void __init mpc_oem_bus_info(struct mpc_bus *m, char *str) { memcpy(str, m->bustype, 6); str[6] = 0; @@ -84,7 +78,7 @@ static void __init MP_bus_info(struct mpc_bus *m) { char str[7]; - x86_init.mpparse.mpc_oem_bus_info(m, str); + mpc_oem_bus_info(m, str); #if MAX_MP_BUSSES < 256 if (m->busid >= MAX_MP_BUSSES) { @@ -100,9 +94,6 @@ static void __init MP_bus_info(struct mpc_bus *m) mp_bus_id_to_type[m->busid] = MP_BUS_ISA; #endif } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { - if (x86_init.mpparse.mpc_oem_pci_bus) - x86_init.mpparse.mpc_oem_pci_bus(m); - clear_bit(m->busid, mp_bus_not_pci); #ifdef CONFIG_EISA mp_bus_id_to_type[m->busid] = MP_BUS_PCI; @@ -198,8 +189,6 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) 1, mpc, mpc->length, 1); } -void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } - static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) { char str[16]; @@ -218,14 +207,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) if (early) return 1; - if (mpc->oemptr) - x86_init.mpparse.smp_read_mpc_oem(mpc); - - /* - * Now process the configuration blocks. - */ - x86_init.mpparse.mpc_record(0); - + /* Now process the configuration blocks. */ while (count < mpc->length) { switch (*mpt) { case MP_PROCESSOR: @@ -256,7 +238,6 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) count = mpc->length; break; } - x86_init.mpparse.mpc_record(1); } if (!num_processors) diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 49dcfb85e773..c0d409810658 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -80,18 +80,30 @@ static ssize_t msr_read(struct file *file, char __user *buf, static int filter_write(u32 reg) { + /* + * MSRs writes usually happen all at once, and can easily saturate kmsg. + * Only allow one message every 30 seconds. + * + * It's possible to be smarter here and do it (for example) per-MSR, but + * it would certainly be more complex, and this is enough at least to + * avoid saturating the ring buffer. + */ + static DEFINE_RATELIMIT_STATE(fw_rs, 30 * HZ, 1); + switch (allow_writes) { case MSR_WRITES_ON: return 0; case MSR_WRITES_OFF: return -EPERM; default: break; } + if (!__ratelimit(&fw_rs)) + return 0; + if (reg == MSR_IA32_ENERGY_PERF_BIAS) return 0; - pr_err_ratelimited("Write to unrecognized MSR 0x%x by %s\n" - "Please report to x86@kernel.org\n", - reg, current->comm); + pr_err("Write to unrecognized MSR 0x%x by %s (pid: %d). Please report to x86@kernel.org.\n", + reg, current->comm, current->pid); return 0; } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 4fc9954a9560..47381666d6a5 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -102,7 +102,6 @@ fs_initcall(nmi_warning_debugfs); static void nmi_check_duration(struct nmiaction *action, u64 duration) { - u64 whole_msecs = READ_ONCE(action->max_duration); int remainder_ns, decimal_msecs; if (duration < nmi_longest_ns || duration < action->max_duration) @@ -110,12 +109,12 @@ static void nmi_check_duration(struct nmiaction *action, u64 duration) action->max_duration = duration; - remainder_ns = do_div(whole_msecs, (1000 * 1000)); + remainder_ns = do_div(duration, (1000 * 1000)); decimal_msecs = remainder_ns / 1000; printk_ratelimited(KERN_INFO "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n", - action->handler, whole_msecs, decimal_msecs); + action->handler, duration, decimal_msecs); } static int nmi_handle(unsigned int type, struct pt_regs *regs) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index de2138ba38e5..6c3407ba6ee9 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -263,13 +263,8 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) struct pv_info pv_info = { .name = "bare hardware", #ifdef CONFIG_PARAVIRT_XXL - .kernel_rpl = 0, - .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ - -#ifdef CONFIG_X86_64 .extra_user_64bit_cs = __USER_CS, #endif -#endif }; /* 64-bit pagetable entries */ @@ -305,9 +300,7 @@ struct paravirt_patch_template pv_ops = { .cpu.load_idt = native_load_idt, .cpu.store_tr = native_store_tr, .cpu.load_tls = native_load_tls, -#ifdef CONFIG_X86_64 .cpu.load_gs_index = native_load_gs_index, -#endif .cpu.write_ldt_entry = native_write_ldt_entry, .cpu.write_gdt_entry = native_write_gdt_entry, .cpu.write_idt_entry = native_write_idt_entry, @@ -317,9 +310,7 @@ struct paravirt_patch_template pv_ops = { .cpu.load_sp0 = native_load_sp0, -#ifdef CONFIG_X86_64 .cpu.usergs_sysret64 = native_usergs_sysret64, -#endif .cpu.iret = native_iret, .cpu.swapgs = native_swapgs, @@ -369,24 +360,16 @@ struct paravirt_patch_template pv_ops = { .mmu.release_p4d = paravirt_nop, .mmu.set_pte = native_set_pte, - .mmu.set_pte_at = native_set_pte_at, .mmu.set_pmd = native_set_pmd, .mmu.ptep_modify_prot_start = __ptep_modify_prot_start, .mmu.ptep_modify_prot_commit = __ptep_modify_prot_commit, -#if CONFIG_PGTABLE_LEVELS >= 3 -#ifdef CONFIG_X86_PAE - .mmu.set_pte_atomic = native_set_pte_atomic, - .mmu.pte_clear = native_pte_clear, - .mmu.pmd_clear = native_pmd_clear, -#endif .mmu.set_pud = native_set_pud, .mmu.pmd_val = PTE_IDENT, .mmu.make_pmd = PTE_IDENT, -#if CONFIG_PGTABLE_LEVELS >= 4 .mmu.pud_val = PTE_IDENT, .mmu.make_pud = PTE_IDENT, @@ -398,8 +381,6 @@ struct paravirt_patch_template pv_ops = { .mmu.set_pgd = native_set_pgd, #endif /* CONFIG_PGTABLE_LEVELS >= 5 */ -#endif /* CONFIG_PGTABLE_LEVELS >= 4 */ -#endif /* CONFIG_PGTABLE_LEVELS >= 3 */ .mmu.pte_val = PTE_IDENT, .mmu.pgd_val = PTE_IDENT, diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c index 3eff63c090d2..ace6e334cb39 100644 --- a/arch/x86/kernel/paravirt_patch.c +++ b/arch/x86/kernel/paravirt_patch.c @@ -26,14 +26,10 @@ struct patch_xxl { const unsigned char mmu_read_cr3[3]; const unsigned char mmu_write_cr3[3]; const unsigned char irq_restore_fl[2]; -# ifdef CONFIG_X86_64 const unsigned char cpu_wbinvd[2]; const unsigned char cpu_usergs_sysret64[6]; const unsigned char cpu_swapgs[3]; const unsigned char mov64[3]; -# else - const unsigned char cpu_iret[1]; -# endif }; static const struct patch_xxl patch_data_xxl = { @@ -42,7 +38,6 @@ static const struct patch_xxl patch_data_xxl = { .irq_save_fl = { 0x9c, 0x58 }, // pushf; pop %[re]ax .mmu_read_cr2 = { 0x0f, 0x20, 0xd0 }, // mov %cr2, %[re]ax .mmu_read_cr3 = { 0x0f, 0x20, 0xd8 }, // mov %cr3, %[re]ax -# ifdef CONFIG_X86_64 .mmu_write_cr3 = { 0x0f, 0x22, 0xdf }, // mov %rdi, %cr3 .irq_restore_fl = { 0x57, 0x9d }, // push %rdi; popfq .cpu_wbinvd = { 0x0f, 0x09 }, // wbinvd @@ -50,19 +45,11 @@ static const struct patch_xxl patch_data_xxl = { 0x48, 0x0f, 0x07 }, // swapgs; sysretq .cpu_swapgs = { 0x0f, 0x01, 0xf8 }, // swapgs .mov64 = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax -# else - .mmu_write_cr3 = { 0x0f, 0x22, 0xd8 }, // mov %eax, %cr3 - .irq_restore_fl = { 0x50, 0x9d }, // push %eax; popf - .cpu_iret = { 0xcf }, // iret -# endif }; unsigned int paravirt_patch_ident_64(void *insn_buff, unsigned int len) { -#ifdef CONFIG_X86_64 return PATCH(xxl, mov64, insn_buff, len); -#endif - return 0; } # endif /* CONFIG_PARAVIRT_XXL */ @@ -98,13 +85,9 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr, PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len); PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len); -# ifdef CONFIG_X86_64 PATCH_CASE(cpu, usergs_sysret64, xxl, insn_buff, len); PATCH_CASE(cpu, swapgs, xxl, insn_buff, len); PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len); -# else - PATCH_CASE(cpu, iret, xxl, insn_buff, len); -# endif #endif #ifdef CONFIG_PARAVIRT_SPINLOCKS diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9afefe325acb..df342bedea88 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -407,7 +407,7 @@ unsigned long x86_gsbase_read_cpu_inactive(void) { unsigned long gsbase; - if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + if (boot_cpu_has(X86_FEATURE_FSGSBASE)) { unsigned long flags; local_irq_save(flags); @@ -422,7 +422,7 @@ unsigned long x86_gsbase_read_cpu_inactive(void) void x86_gsbase_write_cpu_inactive(unsigned long gsbase) { - if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + if (boot_cpu_has(X86_FEATURE_FSGSBASE)) { unsigned long flags; local_irq_save(flags); @@ -439,7 +439,7 @@ unsigned long x86_fsbase_read_task(struct task_struct *task) if (task == current) fsbase = x86_fsbase_read_cpu(); - else if (static_cpu_has(X86_FEATURE_FSGSBASE) || + else if (boot_cpu_has(X86_FEATURE_FSGSBASE) || (task->thread.fsindex == 0)) fsbase = task->thread.fsbase; else @@ -454,7 +454,7 @@ unsigned long x86_gsbase_read_task(struct task_struct *task) if (task == current) gsbase = x86_gsbase_read_cpu_inactive(); - else if (static_cpu_has(X86_FEATURE_FSGSBASE) || + else if (boot_cpu_has(X86_FEATURE_FSGSBASE) || (task->thread.gsindex == 0)) gsbase = task->thread.gsbase; else diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index e7537c5440bb..bedca011459c 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -465,7 +465,7 @@ static void ptrace_triggered(struct perf_event *bp, break; } - thread->debugreg6 |= (DR_TRAP0 << i); + thread->virtual_dr6 |= (DR_TRAP0 << i); } /* @@ -601,7 +601,7 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) if (bp) val = bp->hw.info.address; } else if (n == 6) { - val = thread->debugreg6; + val = thread->virtual_dr6 ^ DR6_RESERVED; /* Flip back to arch polarity */ } else if (n == 7) { val = thread->ptrace_dr7; } @@ -657,7 +657,7 @@ static int ptrace_set_debugreg(struct task_struct *tsk, int n, if (n < HBP_NUM) { rc = ptrace_set_breakpoint_addr(tsk, n, val); } else if (n == 6) { - thread->debugreg6 = val; + thread->virtual_dr6 = val ^ DR6_RESERVED; /* Flip to positive polarity */ rc = 0; } else if (n == 7) { rc = ptrace_write_dr7(tsk, val); diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 1b10717c9321..6d0df6a58873 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -8,6 +8,7 @@ #include <asm/hpet.h> #include <asm/setup.h> +#include <asm/mce.h> #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) @@ -624,10 +625,6 @@ static void amd_disable_seq_and_redirect_scrub(struct pci_dev *dev) DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3, amd_disable_seq_and_redirect_scrub); -#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) -#include <linux/jump_label.h> -#include <asm/string_64.h> - /* Ivy Bridge, Haswell, Broadwell */ static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev) { @@ -636,7 +633,7 @@ static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev) pci_read_config_dword(pdev, 0x84, &capid0); if (capid0 & 0x10) - static_branch_inc(&mcsafe_key); + enable_copy_mc_fragile(); } /* Skylake */ @@ -653,7 +650,7 @@ static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev) * enabled, so memory machine check recovery is also enabled. */ if ((capid0 & 0xc0) == 0xc0 || (capid5 & 0x1e0)) - static_branch_inc(&mcsafe_key); + enable_copy_mc_fragile(); } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap); @@ -661,7 +658,6 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ras_cap); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap); #endif -#endif bool x86_apple_machine; EXPORT_SYMBOL(x86_apple_machine); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3511736fbc74..fa16b906ea3f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -19,6 +19,7 @@ #include <linux/hugetlb.h> #include <linux/tboot.h> #include <linux/usb/xhci-dbgp.h> +#include <linux/static_call.h> #include <uapi/linux/mount.h> @@ -849,6 +850,7 @@ void __init setup_arch(char **cmdline_p) early_cpu_init(); arch_init_ideal_nops(); jump_label_init(); + static_call_init(); early_ioremap_init(); setup_olpc_ofw_pgd(); @@ -1077,6 +1079,7 @@ void __init setup_arch(char **cmdline_p) efi_fake_memmap(); efi_find_mirror(); efi_esrt_init(); + efi_mokvar_table_init(); /* * The EFI specification says that boot service code won't be diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index 9ccbf0576cd0..a7f3e12cfbdb 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -27,7 +27,7 @@ static inline void signal_compat_build_tests(void) */ BUILD_BUG_ON(NSIGILL != 11); BUILD_BUG_ON(NSIGFPE != 15); - BUILD_BUG_ON(NSIGSEGV != 7); + BUILD_BUG_ON(NSIGSEGV != 9); BUILD_BUG_ON(NSIGBUS != 5); BUILD_BUG_ON(NSIGTRAP != 5); BUILD_BUG_ON(NSIGCHLD != 6); diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 2fd698e28e4d..8627fda8d993 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -18,13 +18,13 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, struct unwind_state state; unsigned long addr; - if (regs && !consume_entry(cookie, regs->ip, false)) + if (regs && !consume_entry(cookie, regs->ip)) return; for (unwind_start(&state, task, regs, NULL); !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); - if (!addr || !consume_entry(cookie, addr, false)) + if (!addr || !consume_entry(cookie, addr)) break; } } @@ -72,7 +72,7 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, if (!addr) return -EINVAL; - if (!consume_entry(cookie, addr, false)) + if (!consume_entry(cookie, addr)) return -EINVAL; } @@ -114,7 +114,7 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, { const void __user *fp = (const void __user *)regs->bp; - if (!consume_entry(cookie, regs->ip, false)) + if (!consume_entry(cookie, regs->ip)) return; while (1) { @@ -128,7 +128,7 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, break; if (!frame.ret_addr) break; - if (!consume_entry(cookie, frame.ret_addr, false)) + if (!consume_entry(cookie, frame.ret_addr)) break; fp = frame.next_fp; } diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c new file mode 100644 index 000000000000..ca9a380d9c0b --- /dev/null +++ b/arch/x86/kernel/static_call.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/static_call.h> +#include <linux/memory.h> +#include <linux/bug.h> +#include <asm/text-patching.h> + +enum insn_type { + CALL = 0, /* site call */ + NOP = 1, /* site cond-call */ + JMP = 2, /* tramp / site tail-call */ + RET = 3, /* tramp / site cond-tail-call */ +}; + +static void __ref __static_call_transform(void *insn, enum insn_type type, void *func) +{ + int size = CALL_INSN_SIZE; + const void *code; + + switch (type) { + case CALL: + code = text_gen_insn(CALL_INSN_OPCODE, insn, func); + break; + + case NOP: + code = ideal_nops[NOP_ATOMIC5]; + break; + + case JMP: + code = text_gen_insn(JMP32_INSN_OPCODE, insn, func); + break; + + case RET: + code = text_gen_insn(RET_INSN_OPCODE, insn, func); + size = RET_INSN_SIZE; + break; + } + + if (memcmp(insn, code, size) == 0) + return; + + if (unlikely(system_state == SYSTEM_BOOTING)) + return text_poke_early(insn, code, size); + + text_poke_bp(insn, code, size, NULL); +} + +static void __static_call_validate(void *insn, bool tail) +{ + u8 opcode = *(u8 *)insn; + + if (tail) { + if (opcode == JMP32_INSN_OPCODE || + opcode == RET_INSN_OPCODE) + return; + } else { + if (opcode == CALL_INSN_OPCODE || + !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5)) + return; + } + + /* + * If we ever trigger this, our text is corrupt, we'll probably not live long. + */ + WARN_ONCE(1, "unexpected static_call insn opcode 0x%x at %pS\n", opcode, insn); +} + +static inline enum insn_type __sc_insn(bool null, bool tail) +{ + /* + * Encode the following table without branches: + * + * tail null insn + * -----+-------+------ + * 0 | 0 | CALL + * 0 | 1 | NOP + * 1 | 0 | JMP + * 1 | 1 | RET + */ + return 2*tail + null; +} + +void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) +{ + mutex_lock(&text_mutex); + + if (tramp) { + __static_call_validate(tramp, true); + __static_call_transform(tramp, __sc_insn(!func, true), func); + } + + if (IS_ENABLED(CONFIG_HAVE_STATIC_CALL_INLINE) && site) { + __static_call_validate(site, tail); + __static_call_transform(site, __sc_insn(!func, tail), func); + } + + mutex_unlock(&text_mutex); +} +EXPORT_SYMBOL_GPL(arch_static_call_transform); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 81a2fb711091..ec3a2572843f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -195,7 +195,7 @@ static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs) DEFINE_IDTENTRY(exc_divide_error) { - do_error_trap(regs, 0, "divide_error", X86_TRAP_DE, SIGFPE, + do_error_trap(regs, 0, "divide error", X86_TRAP_DE, SIGFPE, FPE_INTDIV, error_get_trap_addr(regs)); } @@ -745,9 +745,21 @@ static __always_inline unsigned long debug_read_clear_dr6(void) * Keep it simple: clear DR6 immediately. */ get_debugreg(dr6, 6); - set_debugreg(0, 6); - /* Filter out all the reserved bits which are preset to 1 */ - dr6 &= ~DR6_RESERVED; + set_debugreg(DR6_RESERVED, 6); + dr6 ^= DR6_RESERVED; /* Flip to positive polarity */ + + /* + * Clear the virtual DR6 value, ptrace routines will set bits here for + * things we want signals for. + */ + current->thread.virtual_dr6 = 0; + + /* + * The SDM says "The processor clears the BTF flag when it + * generates a debug exception." Clear TIF_BLOCKSTEP to keep + * TIF_BLOCKSTEP in sync with the hardware BTF flag. + */ + clear_thread_flag(TIF_BLOCKSTEP); return dr6; } @@ -776,74 +788,20 @@ static __always_inline unsigned long debug_read_clear_dr6(void) * * May run on IST stack. */ -static void handle_debug(struct pt_regs *regs, unsigned long dr6, bool user) -{ - struct task_struct *tsk = current; - bool user_icebp; - int si_code; - - /* - * The SDM says "The processor clears the BTF flag when it - * generates a debug exception." Clear TIF_BLOCKSTEP to keep - * TIF_BLOCKSTEP in sync with the hardware BTF flag. - */ - clear_thread_flag(TIF_BLOCKSTEP); - - /* - * If DR6 is zero, no point in trying to handle it. The kernel is - * not using INT1. - */ - if (!user && !dr6) - return; +static bool notify_debug(struct pt_regs *regs, unsigned long *dr6) +{ /* - * If dr6 has no reason to give us about the origin of this trap, - * then it's very likely the result of an icebp/int01 trap. - * User wants a sigtrap for that. + * Notifiers will clear bits in @dr6 to indicate the event has been + * consumed - hw_breakpoint_handler(), single_stop_cont(). + * + * Notifiers will set bits in @virtual_dr6 to indicate the desire + * for signals - ptrace_triggered(), kgdb_hw_overflow_handler(). */ - user_icebp = user && !dr6; - - /* Store the virtualized DR6 value */ - tsk->thread.debugreg6 = dr6; - -#ifdef CONFIG_KPROBES - if (kprobe_debug_handler(regs)) { - return; - } -#endif - - if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, 0, - SIGTRAP) == NOTIFY_STOP) { - return; - } - - /* It's safe to allow irq's after DR6 has been saved */ - cond_local_irq_enable(regs); - - if (v8086_mode(regs)) { - handle_vm86_trap((struct kernel_vm86_regs *) regs, 0, - X86_TRAP_DB); - goto out; - } - - if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) { - /* - * Historical junk that used to handle SYSENTER single-stepping. - * This should be unreachable now. If we survive for a while - * without anyone hitting this warning, we'll turn this into - * an oops. - */ - tsk->thread.debugreg6 &= ~DR_STEP; - set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->flags &= ~X86_EFLAGS_TF; - } - - si_code = get_si_code(tsk->thread.debugreg6); - if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) - send_sigtrap(regs, 0, si_code); + if (notify_die(DIE_DEBUG, "debug", regs, (long)dr6, 0, SIGTRAP) == NOTIFY_STOP) + return true; -out: - cond_local_irq_disable(regs); + return false; } static __always_inline void exc_debug_kernel(struct pt_regs *regs, @@ -877,8 +835,32 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs)) dr6 &= ~DR_STEP; - handle_debug(regs, dr6, false); + if (kprobe_debug_handler(regs)) + goto out; + + /* + * The kernel doesn't use INT1 + */ + if (!dr6) + goto out; + if (notify_debug(regs, &dr6)) + goto out; + + /* + * The kernel doesn't use TF single-step outside of: + * + * - Kprobes, consumed through kprobe_debug_handler() + * - KGDB, consumed through notify_debug() + * + * So if we get here with DR_STEP set, something is wonky. + * + * A known way to trigger this is through QEMU's GDB stub, + * which leaks #DB into the guest and causes IST recursion. + */ + if (WARN_ON_ONCE(dr6 & DR_STEP)) + regs->flags &= ~X86_EFLAGS_TF; +out: instrumentation_end(); idtentry_exit_nmi(regs, irq_state); @@ -888,6 +870,8 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, static __always_inline void exc_debug_user(struct pt_regs *regs, unsigned long dr6) { + bool icebp; + /* * If something gets miswired and we end up here for a kernel mode * #DB, we will malfunction. @@ -906,8 +890,32 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, irqentry_enter_from_user_mode(regs); instrumentation_begin(); - handle_debug(regs, dr6, true); + /* + * If dr6 has no reason to give us about the origin of this trap, + * then it's very likely the result of an icebp/int01 trap. + * User wants a sigtrap for that. + */ + icebp = !dr6; + if (notify_debug(regs, &dr6)) + goto out; + + /* It's safe to allow irq's after DR6 has been saved */ + local_irq_enable(); + + if (v8086_mode(regs)) { + handle_vm86_trap((struct kernel_vm86_regs *)regs, 0, X86_TRAP_DB); + goto out_irq; + } + + /* Add the virtual_dr6 bits for signals. */ + dr6 |= current->thread.virtual_dr6; + if (dr6 & (DR_STEP | DR_TRAP_BITS) || icebp) + send_sigtrap(regs, 0, get_si_code(dr6)); + +out_irq: + local_irq_disable(); +out: instrumentation_end(); irqentry_exit_to_user_mode(regs); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 49d925043171..f70dffc2771f 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -54,7 +54,7 @@ struct clocksource *art_related_clocksource; struct cyc2ns { struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */ - seqcount_t seq; /* 32 + 4 = 36 */ + seqcount_latch_t seq; /* 32 + 4 = 36 */ }; /* fits one cacheline */ @@ -73,14 +73,14 @@ __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) preempt_disable_notrace(); do { - seq = this_cpu_read(cyc2ns.seq.sequence); + seq = this_cpu_read(cyc2ns.seq.seqcount.sequence); idx = seq & 1; data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset); data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul); data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift); - } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence))); + } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence))); } __always_inline void cyc2ns_read_end(void) @@ -186,7 +186,7 @@ static void __init cyc2ns_init_boot_cpu(void) { struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); - seqcount_init(&c2n->seq); + seqcount_latch_init(&c2n->seq); __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc()); } @@ -203,7 +203,7 @@ static void __init cyc2ns_init_secondary_cpus(void) for_each_possible_cpu(cpu) { if (cpu != this_cpu) { - seqcount_init(&c2n->seq); + seqcount_latch_init(&c2n->seq); c2n = per_cpu_ptr(&cyc2ns, cpu); c2n->data[0] = data[0]; c2n->data[1] = data[1]; diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 8d5cbe1bbb3b..2c304fd0bb1a 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -45,11 +45,12 @@ * value that, lies close to the top of the kernel memory. The limit for the GDT * and the IDT are set to zero. * - * Given that SLDT and STR are not commonly used in programs that run on WineHQ - * or DOSEMU2, they are not emulated. - * - * The instruction smsw is emulated to return the value that the register CR0 + * The instruction SMSW is emulated to return the value that the register CR0 * has at boot time as set in the head_32. + * SLDT and STR are emulated to return the values that the kernel programmatically + * assigns: + * - SLDT returns (GDT_ENTRY_LDT * 8) if an LDT has been set, 0 if not. + * - STR returns (GDT_ENTRY_TSS * 8). * * Emulation is provided for both 32-bit and 64-bit processes. * @@ -244,16 +245,34 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst, *data_size += UMIP_GDT_IDT_LIMIT_SIZE; memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE); - } else if (umip_inst == UMIP_INST_SMSW) { - unsigned long dummy_value = CR0_STATE; + } else if (umip_inst == UMIP_INST_SMSW || umip_inst == UMIP_INST_SLDT || + umip_inst == UMIP_INST_STR) { + unsigned long dummy_value; + + if (umip_inst == UMIP_INST_SMSW) { + dummy_value = CR0_STATE; + } else if (umip_inst == UMIP_INST_STR) { + dummy_value = GDT_ENTRY_TSS * 8; + } else if (umip_inst == UMIP_INST_SLDT) { +#ifdef CONFIG_MODIFY_LDT_SYSCALL + down_read(¤t->mm->context.ldt_usr_sem); + if (current->mm->context.ldt) + dummy_value = GDT_ENTRY_LDT * 8; + else + dummy_value = 0; + up_read(¤t->mm->context.ldt_usr_sem); +#else + dummy_value = 0; +#endif + } /* - * Even though the CR0 register has 4 bytes, the number + * For these 3 instructions, the number * of bytes to be copied in the result buffer is determined * by whether the operand is a register or a memory location. * If operand is a register, return as many bytes as the operand * size. If operand is memory, return only the two least - * siginificant bytes of CR0. + * siginificant bytes. */ if (X86_MODRM_MOD(insn->modrm.value) == 3) *data_size = insn->opnd_bytes; @@ -261,7 +280,6 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst, *data_size = 2; memcpy(data, &dummy_value, *data_size); - /* STR and SLDT are not emulated */ } else { return -EINVAL; } @@ -383,10 +401,6 @@ bool fixup_umip_exception(struct pt_regs *regs) umip_pr_warn(regs, "%s instruction cannot be used by applications.\n", umip_insns[umip_inst]); - /* Do not emulate (spoof) SLDT or STR. */ - if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT) - return false; - umip_pr_warn(regs, "For now, expensive software emulation returns the result.\n"); if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size, diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 9a03e5b23135..bf9e0adb5b7e 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -136,6 +136,7 @@ SECTIONS ENTRY_TEXT ALIGN_ENTRY_TEXT_END SOFTIRQENTRY_TEXT + STATIC_CALL_TEXT *(.fixup) *(.gnu.warning) @@ -411,10 +412,47 @@ SECTIONS STABS_DEBUG DWARF_DEBUG + ELF_DETAILS DISCARDS -} + /* + * Make sure that the .got.plt is either completely empty or it + * contains only the lazy dispatch entries. + */ + .got.plt (INFO) : { *(.got.plt) } + ASSERT(SIZEOF(.got.plt) == 0 || +#ifdef CONFIG_X86_64 + SIZEOF(.got.plt) == 0x18, +#else + SIZEOF(.got.plt) == 0xc, +#endif + "Unexpected GOT/PLT entries detected!") + + /* + * Sections that should stay zero sized, which is safer to + * explicitly check instead of blindly discarding. + */ + .got : { + *(.got) *(.igot.*) + } + ASSERT(SIZEOF(.got) == 0, "Unexpected GOT entries detected!") + + .plt : { + *(.plt) *(.plt.*) *(.iplt) + } + ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!") + + .rel.dyn : { + *(.rel.*) *(.rel_*) + } + ASSERT(SIZEOF(.rel.dyn) == 0, "Unexpected run-time relocations (.rel) detected!") + + .rela.dyn : { + *(.rela.*) *(.rela_*) + } + ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!") +} #ifdef CONFIG_X86_32 /* diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 123f1c1f1788..a3038d8deb6a 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -24,6 +24,7 @@ #include <asm/tsc.h> #include <asm/iommu.h> #include <asm/mach_traps.h> +#include <asm/irqdomain.h> void x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } @@ -67,11 +68,7 @@ struct x86_init_ops x86_init __initdata = { }, .mpparse = { - .mpc_record = x86_init_uint_noop, .setup_ioapic_ids = x86_init_noop, - .mpc_apic_id = default_mpc_apic_id, - .smp_read_mpc_oem = default_smp_read_mpc_oem, - .mpc_oem_bus_info = default_mpc_oem_bus_info, .find_smp_config = default_find_smp_config, .get_smp_config = default_get_smp_config, }, @@ -80,7 +77,8 @@ struct x86_init_ops x86_init __initdata = { .pre_vector_init = init_ISA_irqs, .intr_init = native_init_IRQ, .intr_mode_select = apic_intr_mode_select, - .intr_mode_init = apic_intr_mode_init + .intr_mode_init = apic_intr_mode_init, + .create_pci_msi_domain = native_create_pci_msi_domain, }, .oem = { @@ -148,28 +146,10 @@ EXPORT_SYMBOL_GPL(x86_platform); #if defined(CONFIG_PCI_MSI) struct x86_msi_ops x86_msi __ro_after_init = { - .setup_msi_irqs = native_setup_msi_irqs, - .teardown_msi_irq = native_teardown_msi_irq, - .teardown_msi_irqs = default_teardown_msi_irqs, .restore_msi_irqs = default_restore_msi_irqs, }; /* MSI arch specific hooks */ -int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) -{ - return x86_msi.setup_msi_irqs(dev, nvec, type); -} - -void arch_teardown_msi_irqs(struct pci_dev *dev) -{ - x86_msi.teardown_msi_irqs(dev); -} - -void arch_teardown_msi_irq(unsigned int irq) -{ - x86_msi.teardown_msi_irq(irq); -} - void arch_restore_msi_irqs(struct pci_dev *dev) { x86_msi.restore_msi_irqs(dev); |