From 279f1461432ccdec0b98c0bcbe0a8e2c0f6fdda5 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 22 Oct 2012 14:37:58 -0700 Subject: x86: apic: Use tsc deadline for oneshot when available If the TSC deadline mode is supported, LAPIC timer one-shot mode can be implemented using IA32_TSC_DEADLINE MSR. An interrupt will be generated when the TSC value equals or exceeds the value in the IA32_TSC_DEADLINE MSR. This enables us to skip the APIC calibration during boot. Also, in xapic mode, this enables us to skip the uncached apic access to re-arm the APIC timer. As this timer ticks at the high frequency TSC rate, we use the TSC_DIVISOR (32) to work with the 32-bit restrictions in the clockevent API's to avoid 64-bit divides etc (frequency is u32 and "unsigned long" in the set_next_event(), max_delta limits the next event to 32-bit for 32-bit kernel). Signed-off-by: Suresh Siddha Cc: venki@google.com Cc: len.brown@intel.com Link: http://lkml.kernel.org/r/1350941878.6017.31.camel@sbsiddha-desk.sc.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/apic.c | 73 ++++++++++++++++++++++++++++++++------------- 1 file changed, 53 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel/apic') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b17416e72fbd..b994cc84aa7e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -90,21 +90,6 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); */ DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID); -/* - * Knob to control our willingness to enable the local APIC. - * - * +1=force-enable - */ -static int force_enable_local_apic __initdata; -/* - * APIC command line parameters - */ -static int __init parse_lapic(char *arg) -{ - force_enable_local_apic = 1; - return 0; -} -early_param("lapic", parse_lapic); /* Local APIC was disabled by the BIOS and enabled by the kernel */ static int enabled_via_apicbase; @@ -133,6 +118,25 @@ static inline void imcr_apic_to_pic(void) } #endif +/* + * Knob to control our willingness to enable the local APIC. + * + * +1=force-enable + */ +static int force_enable_local_apic __initdata; +/* + * APIC command line parameters + */ +static int __init parse_lapic(char *arg) +{ + if (config_enabled(CONFIG_X86_32) && !arg) + force_enable_local_apic = 1; + else if (!strncmp(arg, "notscdeadline", 13)) + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); + return 0; +} +early_param("lapic", parse_lapic); + #ifdef CONFIG_X86_64 static int apic_calibrate_pmtmr __initdata; static __init int setup_apicpmtimer(char *s) @@ -315,6 +319,7 @@ int lapic_get_maxlvt(void) /* Clock divisor */ #define APIC_DIVISOR 16 +#define TSC_DIVISOR 32 /* * This function sets up the local APIC timer, with a timeout of @@ -333,6 +338,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) lvtt_value = LOCAL_TIMER_VECTOR; if (!oneshot) lvtt_value |= APIC_LVT_TIMER_PERIODIC; + else if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + lvtt_value |= APIC_LVT_TIMER_TSCDEADLINE; + if (!lapic_is_integrated()) lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); @@ -341,6 +349,11 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) apic_write(APIC_LVTT, lvtt_value); + if (lvtt_value & APIC_LVT_TIMER_TSCDEADLINE) { + printk_once(KERN_DEBUG "TSC deadline timer enabled\n"); + return; + } + /* * Divide PICLK by 16 */ @@ -453,6 +466,16 @@ static int lapic_next_event(unsigned long delta, return 0; } +static int lapic_next_deadline(unsigned long delta, + struct clock_event_device *evt) +{ + u64 tsc; + + rdtscll(tsc); + wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); + return 0; +} + /* * Setup the lapic timer in periodic or oneshot mode */ @@ -533,7 +556,15 @@ static void __cpuinit setup_APIC_timer(void) memcpy(levt, &lapic_clockevent, sizeof(*levt)); levt->cpumask = cpumask_of(smp_processor_id()); - clockevents_register_device(levt); + if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { + levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_DUMMY); + levt->set_next_event = lapic_next_deadline; + clockevents_config_and_register(levt, + (tsc_khz / TSC_DIVISOR) * 1000, + 0xF, ~0UL); + } else + clockevents_register_device(levt); } /* @@ -661,7 +692,9 @@ static int __init calibrate_APIC_clock(void) * in the clockevent structure and return. */ - if (lapic_timer_frequency) { + if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { + return 0; + } else if (lapic_timer_frequency) { apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n", lapic_timer_frequency); lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR, @@ -674,6 +707,9 @@ static int __init calibrate_APIC_clock(void) return 0; } + apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" + "calibrating APIC timer ...\n"); + local_irq_disable(); /* Replace the global interrupt handler */ @@ -811,9 +847,6 @@ void __init setup_boot_APIC_clock(void) return; } - apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" - "calibrating APIC timer ...\n"); - if (calibrate_APIC_clock()) { /* No broadcast on UP ! */ if (num_possible_cpus() > 1) -- cgit v1.2.3 From 5074b85bdd3a464efe7b6de2ec163f4c07696a20 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 2 Nov 2012 14:00:29 +0000 Subject: x86: hpet: Fix inverted return value check in arch_setup_hpet_msi() setup_hpet_msi_remapped() returns a negative error indicator on error - check for this rather than for a boolean false indication, and pass on that error code rather than a meaningless "-1". Signed-off-by: Jan Beulich Cc: David Woodhouse Link: http://lkml.kernel.org/r/5093E00D02000078000A60E2@nat28.tlf.novell.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/apic') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1817fa911024..b134f0b7ed25 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3317,8 +3317,9 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) int ret; if (irq_remapping_enabled) { - if (!setup_hpet_msi_remapped(irq, id)) - return -1; + ret = setup_hpet_msi_remapped(irq, id); + if (ret) + return ret; } ret = msi_compose_msg(NULL, irq, &msg, id); -- cgit v1.2.3 From 8d966a04107e56993a051cd41ead0b4f23ba2414 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 13 Nov 2012 11:32:49 -0800 Subject: x86, hotplug: Handle retrigger irq by the first available CPU The first cpu in irq cfg->domain is likely to be CPU 0 and may not be available when CPU 0 is offline. Instead of using CPU 0 to handle retriggered irq, we use first available CPU which is online and in this irq's domain. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1352835171-3958-13-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel/apic') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1817fa911024..f78fc2b4deb0 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2199,9 +2199,11 @@ static int ioapic_retrigger_irq(struct irq_data *data) { struct irq_cfg *cfg = data->chip_data; unsigned long flags; + int cpu; raw_spin_lock_irqsave(&vector_lock, flags); - apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); + cpu = cpumask_first_and(cfg->domain, cpu_online_mask); + apic->send_IPI_mask(cpumask_of(cpu), cfg->vector); raw_spin_unlock_irqrestore(&vector_lock, flags); return 1; -- cgit v1.2.3 From 29c574c0aba8dc0736e19eb9b24aad28cc5c9098 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 26 Nov 2012 14:49:36 -0800 Subject: x86, apic: Cleanup cfg->domain setup for legacy interrupts Issues that need to be handled: * Handle PIC interrupts on any CPU irrespective of the apic mode * In the apic lowest priority logical flat delivery mode, be prepared to handle the interrupt on any CPU irrespective of what the IO-APIC RTE says. * Because of above, when the IO-APIC starts handling the legacy PIC interrupt, use the same vector that is being used by the PIC while programming the corresponding IO-APIC RTE. Start with all the cpu's in the legacy PIC interrupts cfg->domain. By the time IO-APIC starts taking over the PIC interrupts, apic driver model is finalized. So depend on the assign_irq_vector() to update the cfg->domain and retain the same vector that was used by PIC before. For the logical apic flat mode, cfg->domain is updated (during the first call to assign_irq_vector()) to contain all the possible online cpu's (0xff). Vector used for the legacy PIC interrupt doesn't change when the IO-APIC starts handling the interrupt. Any interrupt migration after that doesn't change the cfg->domain or the vector used. For other apic modes like physical mode, cfg->domain is updated (during the first call to assign_irq_vector()) to the boot cpu (cpu-0), with the same vector that is being used by the PIC. When that interrupt is migrated to a different cpu, cfg->domin and the vector assigned will change accordingly. Tested-by: Borislav Petkov Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1353970176.21070.51.camel@sbsiddha-desk.sc.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel/apic') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c265593ec2cd..0c1f36650568 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -234,11 +234,11 @@ int __init arch_early_irq_init(void) zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node); /* * For legacy IRQ's, start with assigning irq0 to irq15 to - * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0. + * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. */ if (i < legacy_pic->nr_legacy_irqs) { cfg[i].vector = IRQ0_VECTOR + i; - cpumask_set_cpu(0, cfg[i].domain); + cpumask_setall(cfg[i].domain); } } @@ -1141,7 +1141,8 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) * allocation for the members that are not used anymore. */ cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask); - cfg->move_in_progress = 1; + cfg->move_in_progress = + cpumask_intersects(cfg->old_domain, cpu_online_mask); cpumask_and(cfg->domain, cfg->domain, tmp_mask); break; } @@ -1172,8 +1173,9 @@ next: current_vector = vector; current_offset = offset; if (cfg->vector) { - cfg->move_in_progress = 1; cpumask_copy(cfg->old_domain, cfg->domain); + cfg->move_in_progress = + cpumask_intersects(cfg->old_domain, cpu_online_mask); } for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) per_cpu(vector_irq, new_cpu)[vector] = irq; @@ -1241,12 +1243,6 @@ void __setup_vector_irq(int cpu) cfg = irq_get_chip_data(irq); if (!cfg) continue; - /* - * If it is a legacy IRQ handled by the legacy PIC, this cpu - * will be part of the irq_cfg's domain. - */ - if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq)) - cpumask_set_cpu(cpu, cfg->domain); if (!cpumask_test_cpu(cpu, cfg->domain)) continue; @@ -1356,16 +1352,6 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, if (!IO_APIC_IRQ(irq)) return; - /* - * For legacy irqs, cfg->domain starts with cpu 0. Now that IO-APIC - * can handle this irq and the apic driver is finialized at this point, - * update the cfg->domain. - */ - if (irq < legacy_pic->nr_legacy_irqs && - cpumask_equal(cfg->domain, cpumask_of(0))) - apic->vector_allocation_domain(0, cfg->domain, - apic->target_cpus()); - if (assign_irq_vector(irq, cfg, apic->target_cpus())) return; -- cgit v1.2.3 From f9726bfd4b14401d294207a70c7c0c4be8a8c6cc Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Fri, 7 Dec 2012 14:24:32 -0700 Subject: x86/PCI: Add NumaChip remote PCI support Add NumaChip-specific PCI access mechanism via MMCONFIG cycles, but preventing access to AMD Northbridges which shouldn't respond. Signed-off-by: Daniel J Blueman Signed-off-by: Bjorn Helgaas --- arch/x86/Kconfig | 1 + arch/x86/include/asm/numachip/numachip.h | 19 +++++ arch/x86/kernel/apic/apic_numachip.c | 2 + arch/x86/pci/Makefile | 1 + arch/x86/pci/numachip.c | 129 +++++++++++++++++++++++++++++++ 5 files changed, 152 insertions(+) create mode 100644 arch/x86/include/asm/numachip/numachip.h create mode 100644 arch/x86/pci/numachip.c (limited to 'arch/x86/kernel/apic') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 46c3bff3ced2..17fb8eb3fa6b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -374,6 +374,7 @@ config X86_NUMACHIP depends on NUMA depends on SMP depends on X86_X2APIC + depends on PCI_MMCONFIG ---help--- Adds support for Numascale NumaChip large-SMP systems. Needed to enable more than ~168 cores. diff --git a/arch/x86/include/asm/numachip/numachip.h b/arch/x86/include/asm/numachip/numachip.h new file mode 100644 index 000000000000..1c6f7f6212c1 --- /dev/null +++ b/arch/x86/include/asm/numachip/numachip.h @@ -0,0 +1,19 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Numascale NumaConnect-specific header file + * + * Copyright (C) 2012 Numascale AS. All rights reserved. + * + * Send feedback to + * + */ + +#ifndef _ASM_X86_NUMACHIP_NUMACHIP_H +#define _ASM_X86_NUMACHIP_NUMACHIP_H + +extern int __init pci_numachip_init(void); + +#endif /* _ASM_X86_NUMACHIP_NUMACHIP_H */ diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index a65829ac2b9a..9c2aa89a11cb 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -179,6 +180,7 @@ static int __init numachip_system_init(void) return 0; x86_cpuinit.fixup_cpu_id = fixup_cpu_id; + x86_init.pci.arch_init = pci_numachip_init; map_csrs(); diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 3af5a1e79c9c..ee0af58ca5bd 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_STA2X11) += sta2x11-fixup.o obj-$(CONFIG_X86_VISWS) += visws.o obj-$(CONFIG_X86_NUMAQ) += numaq_32.o +obj-$(CONFIG_X86_NUMACHIP) += numachip.o obj-$(CONFIG_X86_INTEL_MID) += mrst.o diff --git a/arch/x86/pci/numachip.c b/arch/x86/pci/numachip.c new file mode 100644 index 000000000000..7307d9d12d15 --- /dev/null +++ b/arch/x86/pci/numachip.c @@ -0,0 +1,129 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Numascale NumaConnect-specific PCI code + * + * Copyright (C) 2012 Numascale AS. All rights reserved. + * + * Send feedback to + * + * PCI accessor functions derived from mmconfig_64.c + * + */ + +#include +#include + +static u8 limit __read_mostly; + +static inline char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) +{ + struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus); + + if (cfg && cfg->virt) + return cfg->virt + (PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12)); + return NULL; +} + +static int pci_mmcfg_read_numachip(unsigned int seg, unsigned int bus, + unsigned int devfn, int reg, int len, u32 *value) +{ + char __iomem *addr; + + /* Why do we have this when nobody checks it. How about a BUG()!? -AK */ + if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) { +err: *value = -1; + return -EINVAL; + } + + /* Ensure AMD Northbridges don't decode reads to other devices */ + if (unlikely(bus == 0 && devfn >= limit)) { + *value = -1; + return 0; + } + + rcu_read_lock(); + addr = pci_dev_base(seg, bus, devfn); + if (!addr) { + rcu_read_unlock(); + goto err; + } + + switch (len) { + case 1: + *value = mmio_config_readb(addr + reg); + break; + case 2: + *value = mmio_config_readw(addr + reg); + break; + case 4: + *value = mmio_config_readl(addr + reg); + break; + } + rcu_read_unlock(); + + return 0; +} + +static int pci_mmcfg_write_numachip(unsigned int seg, unsigned int bus, + unsigned int devfn, int reg, int len, u32 value) +{ + char __iomem *addr; + + /* Why do we have this when nobody checks it. How about a BUG()!? -AK */ + if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) + return -EINVAL; + + /* Ensure AMD Northbridges don't decode writes to other devices */ + if (unlikely(bus == 0 && devfn >= limit)) + return 0; + + rcu_read_lock(); + addr = pci_dev_base(seg, bus, devfn); + if (!addr) { + rcu_read_unlock(); + return -EINVAL; + } + + switch (len) { + case 1: + mmio_config_writeb(addr + reg, value); + break; + case 2: + mmio_config_writew(addr + reg, value); + break; + case 4: + mmio_config_writel(addr + reg, value); + break; + } + rcu_read_unlock(); + + return 0; +} + +const struct pci_raw_ops pci_mmcfg_numachip = { + .read = pci_mmcfg_read_numachip, + .write = pci_mmcfg_write_numachip, +}; + +int __init pci_numachip_init(void) +{ + int ret = 0; + u32 val; + + /* For remote I/O, restrict bus 0 access to the actual number of AMD + Northbridges, which starts at device number 0x18 */ + ret = raw_pci_read(0, 0, PCI_DEVFN(0x18, 0), 0x60, sizeof(val), &val); + if (ret) + goto out; + + /* HyperTransport fabric size in bits 6:4 */ + limit = PCI_DEVFN(0x18 + ((val >> 4) & 7) + 1, 0); + + /* Use NumaChip PCI accessors for non-extended and extended access */ + raw_pci_ops = raw_pci_ext_ops = &pci_mmcfg_numachip; +out: + return ret; +} -- cgit v1.2.3