From 66f5ddf30a59f811818656cb2833c80da0340cfa Mon Sep 17 00:00:00 2001 From: "Luck, Tony" Date: Thu, 3 Nov 2011 11:46:47 -0700 Subject: x86/mce: Make mce_chrdev_ops 'static const' Arjan would like to make struct file_operations const, but mce-inject directly writes to the mce_chrdev_ops to install its write handler. In an ideal world mce-inject would have its own character device, but we have a sizable legacy of test scripts that hardwire "/dev/mcelog", so it would be painful to switch to a separate device now. Instead, this patch switches to a stub function in the mce code, with a registration helper that mce-inject can call when it is loaded. Note that this would also allow for a sane process to allow mce-inject to be unloaded again (with an unregister function, and appropriate module_{get,put}() calls), but that is left for potential future patches. Reported-by: Arjan van de Ven Signed-off-by: Tony Luck Link: http://lkml.kernel.org/r/4eb2e1971326651a3b@agluck-desktop.sc.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 5 ++++- arch/x86/kernel/cpu/mcheck/mce-inject.c | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 25 ++++++++++++++++++++++--- 3 files changed, 27 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index c9321f34e55b..0e8ae57d3656 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -201,7 +201,10 @@ int mce_notify_irq(void); void mce_notify_process(void); DECLARE_PER_CPU(struct mce, injectm); -extern struct file_operations mce_chrdev_ops; + +extern void register_mce_write_callback(ssize_t (*)(struct file *filp, + const char __user *ubuf, + size_t usize, loff_t *off)); /* * Exception handler diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 6199232161cf..319882ef848d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -208,7 +208,7 @@ static 
int inject_init(void) if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) return -ENOMEM; printk(KERN_INFO "Machine check injector initialized\n"); - mce_chrdev_ops.write = mce_write; + register_mce_write_callback(mce_write); register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); return 0; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 362056aefeb4..2af127d4c3d1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1634,16 +1634,35 @@ static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, } } -/* Modified in mce-inject.c, so not static or const */ -struct file_operations mce_chrdev_ops = { +static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off); + +void register_mce_write_callback(ssize_t (*fn)(struct file *filp, + const char __user *ubuf, + size_t usize, loff_t *off)) +{ + mce_write = fn; +} +EXPORT_SYMBOL_GPL(register_mce_write_callback); + +ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off) +{ + if (mce_write) + return mce_write(filp, ubuf, usize, off); + else + return -EINVAL; +} + +static const struct file_operations mce_chrdev_ops = { .open = mce_chrdev_open, .release = mce_chrdev_release, .read = mce_chrdev_read, + .write = mce_chrdev_write, .poll = mce_chrdev_poll, .unlocked_ioctl = mce_chrdev_ioctl, .llseek = no_llseek, }; -EXPORT_SYMBOL_GPL(mce_chrdev_ops); static struct miscdevice mce_chrdev_device = { MISC_MCELOG_MINOR, -- cgit v1.2.3 From cf8ff6b6ab0e99dd3058852f4ec76a6140abadec Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 10 Nov 2011 13:42:02 +0000 Subject: x86/platform: Add a wallclock_init func to x86_platforms ops Some wall clock devices use MMIO-based HW registers; this new function gives them a chance to do some initialization work before their get/set_time services get called.
Signed-off-by: Feng Tang Signed-off-by: Jacob Pan Signed-off-by: Alan Cox Signed-off-by: Dirk Brandewie Signed-off-by: Ingo Molnar --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/setup.c | 2 ++ arch/x86/kernel/x86_init.c | 2 ++ 3 files changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index d3d859035af9..f864fbe474c6 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -152,6 +152,7 @@ struct x86_cpuinit_ops { /** * struct x86_platform_ops - platform specific runtime functions * @calibrate_tsc: calibrate TSC + * @wallclock_init: init the wallclock device * @get_wallclock: get time from HW clock like RTC etc. * @set_wallclock: set time back to HW clock * @is_untracked_pat_range exclude from PAT logic @@ -160,6 +161,7 @@ struct x86_cpuinit_ops { */ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); + void (*wallclock_init)(void); unsigned long (*get_wallclock)(void); int (*set_wallclock)(unsigned long nowtime); void (*iommu_shutdown)(void); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index afaf38447ef5..cf0ef986cb6d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1045,6 +1045,8 @@ void __init setup_arch(char **cmdline_p) x86_init.timers.wallclock_init(); + x86_platform.wallclock_init(); + mcheck_init(); arch_init_ideal_nops(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 6f164bd5e14d..701c7be442f1 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -27,6 +27,7 @@ void __init x86_init_uint_noop(unsigned int unused) { } void __init x86_init_pgd_noop(pgd_t *unused) { } int __init iommu_init_noop(void) { return 0; } void iommu_shutdown_noop(void) { } +void wallclock_init_noop(void) { } /* * The platform setup functions are preset with the default functions @@ -97,6 +98,7 @@ static int default_i8042_detect(void) { return 1; }; struct 
x86_platform_ops x86_platform = { .calibrate_tsc = native_calibrate_tsc, + .wallclock_init = wallclock_init_noop, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, .iommu_shutdown = iommu_shutdown_noop, -- cgit v1.2.3 From bb84ac2d3a603f8f6c7cc553a260e8ceaf871df2 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 10 Nov 2011 13:42:21 +0000 Subject: x86/apic: Do not clear nr_irqs_gsi if no legacy irqs nr_irqs_gsi is set in probe_nr_irqs_gsi(), so we should not clear it after that. Otherwise, MSI irqs will be allocated from the wrong range on systems without a legacy PIC. Signed-off-by: Jacob Pan Signed-off-by: Dirk Brandewie Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3c31fa98af6d..841b8da40525 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -193,10 +193,8 @@ int __init arch_early_irq_init(void) struct irq_cfg *cfg; int count, node, i; - if (!legacy_pic->nr_legacy_irqs) { - nr_irqs_gsi = 0; + if (!legacy_pic->nr_legacy_irqs) io_apic_irqs = ~0UL; - } for (i = 0; i < nr_ioapics; i++) { ioapics[i].saved_registers = -- cgit v1.2.3 From 1ade93efd0a3dda5b0c0afda8ab8f4bd12938c1b Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 10 Nov 2011 13:42:40 +0000 Subject: x86/apic: Allow use of lapic timer early calibration result The lapic timer calibration can be combined with TSC calibration in platform-specific calibration functions. If such a calibration result is obtained early, we can skip the redundant calibration loops.
Signed-off-by: Jacob Pan Signed-off-by: Jacob Pan Signed-off-by: Alan Cox Signed-off-by: Dirk Brandewie Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 1 + arch/x86/kernel/apic/apic.c | 33 ++++++++++++++++++++++++++------- 2 files changed, 27 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 9b7273cb2193..1a6c09af048f 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -49,6 +49,7 @@ extern unsigned int apic_verbosity; extern int local_apic_timer_c2_ok; extern int disable_apic; +extern unsigned int lapic_timer_frequency; #ifdef CONFIG_SMP extern void __inquire_remote_apic(int apicid); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a2fd72e0ab35..f98d84caf94c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -186,7 +186,7 @@ static struct resource lapic_resource = { .flags = IORESOURCE_MEM | IORESOURCE_BUSY, }; -static unsigned int calibration_result; +unsigned int lapic_timer_frequency = 0; static void apic_pm_activate(void); @@ -454,7 +454,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, switch (mode) { case CLOCK_EVT_MODE_PERIODIC: case CLOCK_EVT_MODE_ONESHOT: - __setup_APIC_LVTT(calibration_result, + __setup_APIC_LVTT(lapic_timer_frequency, mode != CLOCK_EVT_MODE_PERIODIC, 1); break; case CLOCK_EVT_MODE_UNUSED: @@ -638,6 +638,25 @@ static int __init calibrate_APIC_clock(void) long delta, deltatsc; int pm_referenced = 0; + /** + * check if lapic timer has already been calibrated by platform + * specific routine, such as tsc calibration code. if so, we just fill + * in the clockevent structure and return. 
+ */ + + if (lapic_timer_frequency) { + apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n", + lapic_timer_frequency); + lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR, + TICK_NSEC, lapic_clockevent.shift); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + return 0; + } + local_irq_disable(); /* Replace the global interrupt handler */ @@ -679,12 +698,12 @@ static int __init calibrate_APIC_clock(void) lapic_clockevent.min_delta_ns = clockevent_delta2ns(0xF, &lapic_clockevent); - calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; + lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult); apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", - calibration_result); + lapic_timer_frequency); if (cpu_has_tsc) { apic_printk(APIC_VERBOSE, "..... CPU clock speed is " @@ -695,13 +714,13 @@ static int __init calibrate_APIC_clock(void) apic_printk(APIC_VERBOSE, "..... 
host bus clock speed is " "%u.%04u MHz.\n", - calibration_result / (1000000 / HZ), - calibration_result % (1000000 / HZ)); + lapic_timer_frequency / (1000000 / HZ), + lapic_timer_frequency % (1000000 / HZ)); /* * Do a sanity check on the APIC calibration result */ - if (calibration_result < (1000000 / HZ)) { + if (lapic_timer_frequency < (1000000 / HZ)) { local_irq_enable(); pr_warning("APIC frequency too slow, disabling apic timer\n"); return -1; -- cgit v1.2.3 From 0a9153261d54c432bc0bdc88607f24c835ac729c Mon Sep 17 00:00:00 2001 From: Dirk Brandewie Date: Thu, 10 Nov 2011 13:42:53 +0000 Subject: x86/mrst: Add support for Penwell clock calibration Signed-off-by: Dirk Brandewie Signed-off-by: Alan Cox Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mrst.h | 7 +++++++ arch/x86/platform/mrst/mrst.c | 33 ++++++++++++++++++++++++++++----- 2 files changed, 35 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 719f00b28ff5..e6283129c821 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -44,6 +44,13 @@ enum mrst_timer_options { extern enum mrst_timer_options mrst_timer_options; +/* + * Penwell uses spread spectrum clock, so the freq number is not exactly + * the same as reported by MSR based on SDM. 
+ */ +#define PENWELL_FSB_FREQ_83SKU 83200 +#define PENWELL_FSB_FREQ_100SKU 99840 + #define SFI_MTMR_MAX_NUM 8 #define SFI_MRTC_MAX 8 diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 6ed7afdaf4af..b7f14e5b2c66 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -187,11 +187,34 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) static unsigned long __init mrst_calibrate_tsc(void) { unsigned long flags, fast_calibrate; - - local_irq_save(flags); - fast_calibrate = apbt_quick_calibrate(); - local_irq_restore(flags); - + if (__mrst_cpu_chip == MRST_CPU_CHIP_PENWELL) { + u32 lo, hi, ratio, fsb; + + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi); + ratio = (hi >> 8) & 0x1f; + pr_debug("ratio is %d\n", ratio); + if (!ratio) { + pr_err("read a zero ratio, should be incorrect!\n"); + pr_err("force tsc ratio to 16 ...\n"); + ratio = 16; + } + rdmsr(MSR_FSB_FREQ, lo, hi); + if ((lo & 0x7) == 0x7) + fsb = PENWELL_FSB_FREQ_83SKU; + else + fsb = PENWELL_FSB_FREQ_100SKU; + fast_calibrate = ratio * fsb; + pr_debug("read penwell tsc %lu khz\n", fast_calibrate); + lapic_timer_frequency = fsb * 1000 / HZ; + /* mark tsc clocksource as reliable */ + set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); + } else { + local_irq_save(flags); + fast_calibrate = apbt_quick_calibrate(); + local_irq_restore(flags); + } + if (fast_calibrate) return fast_calibrate; -- cgit v1.2.3 From 064a59b6dd1f341cc478c212bb436e3da9cb8d04 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 10 Nov 2011 13:43:05 +0000 Subject: x86/mrst: Avoid reporting wrong nmi status Moorestown/Medfield platform does not have port 0x61 to report NMI status, nor does it have external NMI sources. The only NMI sources are from lapic, as results of perf counter overflow or IPI, e.g. NMI watchdog or spin lock debug. 
Reading port 0x61 on Moorestown will return 0xff, which misleads NMI handlers into reporting false critical errors such as a memory parity error. The subsequent ioport access for NMI handling can also cause undefined behavior on Moorestown. This patch allows the kernel to process NMIs due to watchdog or backtrace dump without unnecessary hangs. Signed-off-by: Jacob Pan Signed-off-by: Ingo Molnar [hand applied] Signed-off-by: Alan Cox --- arch/x86/include/asm/mach_traps.h | 2 +- arch/x86/include/asm/x86_init.h | 1 + arch/x86/kernel/nmi.c | 2 +- arch/x86/kernel/x86_init.c | 2 ++ arch/x86/platform/mrst/mrst.c | 13 +++++++++++++ 5 files changed, 18 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mach_traps.h b/arch/x86/include/asm/mach_traps.h index 72a8b52e7dfd..a01e7ec7d237 100644 --- a/arch/x86/include/asm/mach_traps.h +++ b/arch/x86/include/asm/mach_traps.h @@ -17,7 +17,7 @@ #define NMI_REASON_CLEAR_IOCHK 0x08 #define NMI_REASON_CLEAR_MASK 0x0f -static inline unsigned char get_nmi_reason(void) +static inline unsigned char default_get_nmi_reason(void) { return inb(NMI_REASON_PORT); } diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index f864fbe474c6..1971e652d24b 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -167,6 +167,7 @@ struct x86_platform_ops { void (*iommu_shutdown)(void); bool (*is_untracked_pat_range)(u64 start, u64 end); void (*nmi_init)(void); + unsigned char (*get_nmi_reason)(void); int (*i8042_detect)(void); }; diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index b9c8628974af..27d1e7cbdb6c 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -348,7 +348,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */ raw_spin_lock(&nmi_reason_lock); - reason = get_nmi_reason(); + reason = x86_platform.get_nmi_reason(); if (reason & NMI_REASON_MASK) { if (reason & 
NMI_REASON_SERR) diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 701c7be442f1..c1d6cd549397 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -21,6 +21,7 @@ #include #include #include +#include void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } @@ -104,6 +105,7 @@ struct x86_platform_ops x86_platform = { .iommu_shutdown = iommu_shutdown_noop, .is_untracked_pat_range = is_ISA_range, .nmi_init = default_nmi_init, + .get_nmi_reason = default_get_nmi_reason, .i8042_detect = default_i8042_detect }; diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index b7f14e5b2c66..9b9ee292c107 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -276,6 +276,17 @@ static void mrst_reboot(void) intel_scu_ipc_simple_command(0xf1, 0); } +/* + * Moorestown does not have external NMI source nor port 0x61 to report + * NMI status. The possible NMI sources are from pmu as a result of NMI + * watchdog or lock debug. Reading io port 0x61 results in 0xff which + * misled NMI handler. + */ +static unsigned char mrst_get_nmi_reason(void) +{ + return 0; +} + /* * Moorestown specific x86_init function overrides and early setup * calls. @@ -297,6 +308,8 @@ void __init x86_mrst_early_setup(void) x86_platform.calibrate_tsc = mrst_calibrate_tsc; x86_platform.i8042_detect = mrst_i8042_detect; x86_init.timers.wallclock_init = mrst_rtc_init; + x86_platform.get_nmi_reason = mrst_get_nmi_reason; + x86_init.pci.init = pci_mrst_init; x86_init.pci.fixup_irqs = x86_init_noop; -- cgit v1.2.3 From 6fd36ba02132c61f67ebefff77fe710bd38ba95a Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Thu, 10 Nov 2011 13:45:24 +0000 Subject: x86, ioapic: Only print ioapic debug information for IRQs belonging to an ioapic chip with "apic=verbose" the print_IO_APIC() function tries to print IRQ to pin mappings for every active irq. 
It assumes chip_data is of type irq_cfg and may cause an oops if not. As the print_IO_APIC() is called from a late_initcall other chained irq chips may already be registered with custom chip_data information, causing an oops. This is the case with intel MID SoC devices with gpio demuxers registered as irq_chips. Signed-off-by: Mathias Nyman Signed-off-by: Alan Cox [ -v2: fixed build failure ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 5 +++++ arch/x86/kernel/nmi.c | 1 + 2 files changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 841b8da40525..6d939d7847e2 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1694,6 +1694,7 @@ __apicdebuginit(void) print_IO_APICs(void) int ioapic_idx; struct irq_cfg *cfg; unsigned int irq; + struct irq_chip *chip; printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) @@ -1714,6 +1715,10 @@ __apicdebuginit(void) print_IO_APICs(void) for_each_active_irq(irq) { struct irq_pin_list *entry; + chip = irq_get_chip(irq); + if (chip != &ioapic_chip) + continue; + cfg = irq_get_chip_data(irq); if (!cfg) continue; diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 27d1e7cbdb6c..e88f37b58ddd 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -29,6 +29,7 @@ #include #include #include +#include #define NMI_MAX_NAMELEN 16 struct nmiaction { -- cgit v1.2.3 From 78345d2edc25e001558f3b7c85906f645d38d23c Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Thu, 27 Oct 2011 13:24:32 +0530 Subject: x86: Call stop_machine_text_poke() on all CPUs It appears that stop_machine_text_poke() wants to be called on all CPUs, like it's done from text_poke_smp(). Fix text_poke_smp_batch() to do this. 
Signed-off-by: Rabin Vincent Acked-by: Masami Hiramatsu Signed-off-by: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jason Baron Link: http://lkml.kernel.org/r/1319702072-32676-1-git-send-email-rabin@rab.in Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c63822816249..1f84794f0759 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -738,5 +738,5 @@ void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n) atomic_set(&stop_machine_first, 1); wrote_text = 0; - __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); + __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask); } -- cgit v1.2.3 From cd12909cb576d37311fe35868780e82d5007d0c8 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 29 Sep 2011 16:53:32 +0100 Subject: xen: map foreign pages for shared rings by updating the PTEs directly When mapping a foreign page with xenbus_map_ring_valloc() with the GNTTABOP_map_grant_ref hypercall, set the GNTMAP_contains_pte flag and pass a pointer to the PTE (in init_mm). After the page is mapped, the usual fault mechanism can be used to update additional MMs. This allows the vmalloc_sync_all() to be removed from alloc_vm_area(). 
Signed-off-by: David Vrabel Acked-by: Andrew Morton [v1: Squashed fix by Michal for no-mmu case] Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Michal Simek --- arch/x86/xen/grant-table.c | 2 +- drivers/xen/xenbus/xenbus_client.c | 11 ++++++++--- include/linux/vmalloc.h | 2 +- mm/nommu.c | 2 +- mm/vmalloc.c | 27 +++++++++++++-------------- 5 files changed, 24 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index 6bbfd7ac5e81..5a40d24ba331 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -71,7 +71,7 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, if (shared == NULL) { struct vm_struct *area = - alloc_vm_area(PAGE_SIZE * max_nr_gframes); + alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL); BUG_ON(area == NULL); shared = area->addr; *__shared = shared; diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 81c3ce6b8bbe..1906125eab49 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -436,19 +437,20 @@ EXPORT_SYMBOL_GPL(xenbus_free_evtchn); int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void **vaddr) { struct gnttab_map_grant_ref op = { - .flags = GNTMAP_host_map, + .flags = GNTMAP_host_map | GNTMAP_contains_pte, .ref = gnt_ref, .dom = dev->otherend_id, }; struct vm_struct *area; + pte_t *pte; *vaddr = NULL; - area = alloc_vm_area(PAGE_SIZE); + area = alloc_vm_area(PAGE_SIZE, &pte); if (!area) return -ENOMEM; - op.host_addr = (unsigned long)area->addr; + op.host_addr = arbitrary_virt_to_machine(pte).maddr; if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) BUG(); @@ -527,6 +529,7 @@ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr) struct gnttab_unmap_grant_ref op = { .host_addr = (unsigned long)vaddr, }; + unsigned int level; /* 
It'd be nice if linux/vmalloc.h provided a find_vm_area(void *addr) * method so that we don't have to muck with vmalloc internals here. @@ -548,6 +551,8 @@ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr) } op.handle = (grant_handle_t)area->phys_addr; + op.host_addr = arbitrary_virt_to_machine( + lookup_address((unsigned long)vaddr, &level)).maddr; if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) BUG(); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 687fb11e2010..4bde182fcf93 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -119,7 +119,7 @@ unmap_kernel_range(unsigned long addr, unsigned long size) #endif /* Allocate/destroy a 'vmalloc' VM area. */ -extern struct vm_struct *alloc_vm_area(size_t size); +extern struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes); extern void free_vm_area(struct vm_struct *area); /* for /dev/kmem */ diff --git a/mm/nommu.c b/mm/nommu.c index 73419c55eda6..b982290fd962 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -454,7 +454,7 @@ void __attribute__((weak)) vmalloc_sync_all(void) * between processes, it syncs the pagetable across all * processes. */ -struct vm_struct *alloc_vm_area(size_t size) +struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) { BUG(); return NULL; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b669aa6f6caf..3231bf332878 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2141,23 +2141,30 @@ void __attribute__((weak)) vmalloc_sync_all(void) static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data) { - /* apply_to_page_range() does all the hard work. 
*/ + pte_t ***p = data; + + if (p) { + *(*p) = pte; + (*p)++; + } return 0; } /** * alloc_vm_area - allocate a range of kernel address space * @size: size of the area + * @ptes: returns the PTEs for the address space * * Returns: NULL on failure, vm_struct on success * * This function reserves a range of kernel address space, and * allocates pagetables to map that range. No actual mappings - * are created. If the kernel address space is not shared - * between processes, it syncs the pagetable across all - * processes. + * are created. + * + * If @ptes is non-NULL, pointers to the PTEs (in init_mm) + * allocated for the VM area are returned. */ -struct vm_struct *alloc_vm_area(size_t size) +struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) { struct vm_struct *area; @@ -2171,19 +2178,11 @@ struct vm_struct *alloc_vm_area(size_t size) * of kernel virtual address space and mapped into init_mm. */ if (apply_to_page_range(&init_mm, (unsigned long)area->addr, - area->size, f, NULL)) { + size, f, ptes ? &ptes : NULL)) { free_vm_area(area); return NULL; } - /* - * If the allocated address space is passed to a hypercall - * before being used then we cannot rely on a page fault to - * trigger an update of the page tables. So sync all the page - * tables here. - */ - vmalloc_sync_all(); - return area; } EXPORT_SYMBOL_GPL(alloc_vm_area); -- cgit v1.2.3 From 90d4f5534d14815bd94c10e8ceccc57287657ecc Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Thu, 27 Oct 2011 22:28:59 -0700 Subject: xen:pvhvm: enable PVHVM VCPU placement when using more than 32 CPUs. PVHVM running with more than 32 vcpus and pv_irq/pv_time enabled need VCPU placement to work, or else it will softlockup. 
CC: stable@kernel.org Acked-by: Stefano Stabellini Signed-off-by: Zhenzhong Duan Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index da8afd576a6b..1f928659c338 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1356,7 +1356,7 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, int cpu = (long)hcpu; switch (action) { case CPU_UP_PREPARE: - per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + xen_vcpu_setup(cpu); if (xen_have_vector_callback) xen_init_lock_cpu(cpu); break; @@ -1386,7 +1386,6 @@ static void __init xen_hvm_guest_init(void) xen_hvm_smp_init(); register_cpu_notifier(&xen_hvm_cpu_notifier); xen_unplug_emulated_devices(); - have_vcpu_info_placement = 0; x86_init.irqs.intr_init = xen_init_IRQ; xen_hvm_init_time_ops(); xen_hvm_init_mmu_ops(); -- cgit v1.2.3 From 8bf00a529967dafbbb210b377c38a15834d1e979 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 5 Oct 2011 14:01:22 +0200 Subject: KVM: VMX: add support for switching of PERF_GLOBAL_CTRL Some cpus have special support for switching PERF_GLOBAL_CTRL msr. Add logic to detect if such support exists and works properly and extend msr switching code to use it if available. Also extend number of generic msr switching entries to 8. 
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 93 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a0d6bd9ad442..55e849b52d9e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -118,7 +118,7 @@ module_param(ple_gap, int, S_IRUGO); static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; module_param(ple_window, int, S_IRUGO); -#define NR_AUTOLOAD_MSRS 1 +#define NR_AUTOLOAD_MSRS 8 #define VMCS02_POOL_SIZE 1 struct vmcs { @@ -622,6 +622,7 @@ static unsigned long *vmx_msr_bitmap_legacy; static unsigned long *vmx_msr_bitmap_longmode; static bool cpu_has_load_ia32_efer; +static bool cpu_has_load_perf_global_ctrl; static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); static DEFINE_SPINLOCK(vmx_vpid_lock); @@ -1191,15 +1192,34 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) vmcs_write32(EXCEPTION_BITMAP, eb); } +static void clear_atomic_switch_msr_special(unsigned long entry, + unsigned long exit) +{ + vmcs_clear_bits(VM_ENTRY_CONTROLS, entry); + vmcs_clear_bits(VM_EXIT_CONTROLS, exit); +} + static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) { unsigned i; struct msr_autoload *m = &vmx->msr_autoload; - if (msr == MSR_EFER && cpu_has_load_ia32_efer) { - vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER); - vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER); - return; + switch (msr) { + case MSR_EFER: + if (cpu_has_load_ia32_efer) { + clear_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER, + VM_EXIT_LOAD_IA32_EFER); + return; + } + break; + case MSR_CORE_PERF_GLOBAL_CTRL: + if (cpu_has_load_perf_global_ctrl) { + clear_atomic_switch_msr_special( + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); + return; + } + break; } for (i = 0; i < m->nr; ++i) @@ -1215,18 +1235,44 @@ static void clear_atomic_switch_msr(struct vcpu_vmx 
*vmx, unsigned msr) vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); } +static void add_atomic_switch_msr_special(unsigned long entry, + unsigned long exit, unsigned long guest_val_vmcs, + unsigned long host_val_vmcs, u64 guest_val, u64 host_val) +{ + vmcs_write64(guest_val_vmcs, guest_val); + vmcs_write64(host_val_vmcs, host_val); + vmcs_set_bits(VM_ENTRY_CONTROLS, entry); + vmcs_set_bits(VM_EXIT_CONTROLS, exit); +} + static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, u64 guest_val, u64 host_val) { unsigned i; struct msr_autoload *m = &vmx->msr_autoload; - if (msr == MSR_EFER && cpu_has_load_ia32_efer) { - vmcs_write64(GUEST_IA32_EFER, guest_val); - vmcs_write64(HOST_IA32_EFER, host_val); - vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER); - vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER); - return; + switch (msr) { + case MSR_EFER: + if (cpu_has_load_ia32_efer) { + add_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER, + VM_EXIT_LOAD_IA32_EFER, + GUEST_IA32_EFER, + HOST_IA32_EFER, + guest_val, host_val); + return; + } + break; + case MSR_CORE_PERF_GLOBAL_CTRL: + if (cpu_has_load_perf_global_ctrl) { + add_atomic_switch_msr_special( + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, + GUEST_IA32_PERF_GLOBAL_CTRL, + HOST_IA32_PERF_GLOBAL_CTRL, + guest_val, host_val); + return; + } + break; } for (i = 0; i < m->nr; ++i) @@ -2455,6 +2501,42 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, VM_EXIT_LOAD_IA32_EFER); + cpu_has_load_perf_global_ctrl = + allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) + && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); + + /* + * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL + * but due to arrata below it can't be used. Workaround is to use + * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL. 
+ * + * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32] + * + * AAK155 (model 26) + * AAP115 (model 30) + * AAT100 (model 37) + * BC86,AAY89,BD102 (model 44) + * BA97 (model 46) + * + */ + if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) { + switch (boot_cpu_data.x86_model) { + case 26: + case 30: + case 37: + case 44: + case 46: + cpu_has_load_perf_global_ctrl = false; + printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " + "does not work properly. Using workaround\n"); + break; + default: + break; + } + } + return 0; } -- cgit v1.2.3 From d7cd97964ba6d70c558348bd2c87290dce885583 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 5 Oct 2011 14:01:23 +0200 Subject: KVM: VMX: Add support for guest/host-only profiling Support guest/host-only profiling by switch perf msrs on a guest entry if needed. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 55e849b52d9e..98f4b0bcc660 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "trace.h" @@ -6050,6 +6051,24 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu) vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); } +static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) +{ + int i, nr_msrs; + struct perf_guest_switch_msr *msrs; + + msrs = perf_guest_get_msrs(&nr_msrs); + + if (!msrs) + return; + + for (i = 0; i < nr_msrs; i++) + if (msrs[i].host == msrs[i].guest) + clear_atomic_switch_msr(vmx, msrs[i].msr); + else + add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, + msrs[i].host); +} + #ifdef CONFIG_X86_64 #define R "r" #define Q "q" @@ -6099,6 +6118,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); + atomic_switch_perf_msrs(vmx); + 
vmx->__launched = vmx->loaded_vmcs->launched; asm( /* Store host registers */ -- cgit v1.2.3 From e7fc6f93b4242b2b566f0070709e27257d6da8a2 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 5 Oct 2011 14:01:24 +0200 Subject: KVM: VMX: Check for automatic switch msr table overflow Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 98f4b0bcc660..579a0b51696a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1280,7 +1280,11 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, if (m->guest[i].index == msr) break; - if (i == m->nr) { + if (i == NR_AUTOLOAD_MSRS) { + printk_once(KERN_WARNING"Not enough mst switch entries. " + "Can't add msr %x\n", msr); + return; + } else if (i == m->nr) { ++m->nr; vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); -- cgit v1.2.3 From 95ef1e52922cf75b1ea2eae54ef886f2cc47eecb Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 15 Nov 2011 14:59:07 +0200 Subject: KVM guest: prevent tracing recursion with kvmclock Prevent tracing of preempt_disable() in get_cpu_var() in kvm_clock_read(). When CONFIG_DEBUG_PREEMPT is enabled, preempt_disable/enable() are traced and this causes the function_graph tracer to go into an infinite recursion. By open coding the preempt_disable() around the get_cpu_var(), we can use the notrace version which prevents preempt_disable/enable() from being traced and prevents the recursion. Based on a similar patch for Xen from Jeremy Fitzhardinge. 
Tested-by: Gleb Natapov Acked-by: Steven Rostedt Signed-off-by: Avi Kivity --- arch/x86/kernel/kvmclock.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index c1a0188e29ae..44842d756b29 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -74,9 +74,10 @@ static cycle_t kvm_clock_read(void) struct pvclock_vcpu_time_info *src; cycle_t ret; - src = &get_cpu_var(hv_clock); + preempt_disable_notrace(); + src = &__get_cpu_var(hv_clock); ret = pvclock_clocksource_read(src); - put_cpu_var(hv_clock); + preempt_enable_notrace(); return ret; } -- cgit v1.2.3 From cc11f9edd919002d8b3a03601a181a449150defd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 21 Nov 2011 03:52:18 +0000 Subject: fix braino in um patchset (mea culpa) wrong register returned... Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- arch/x86/um/asm/processor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h index 118c143a9cb4..2c32df6fe231 100644 --- a/arch/x86/um/asm/processor.h +++ b/arch/x86/um/asm/processor.h @@ -11,7 +11,7 @@ #endif #define KSTK_EIP(tsk) KSTK_REG(tsk, HOST_IP) -#define KSTK_ESP(tsk) KSTK_REG(tsk, HOST_IP) +#define KSTK_ESP(tsk) KSTK_REG(tsk, HOST_SP) #define KSTK_EBP(tsk) KSTK_REG(tsk, HOST_BP) #define ARCH_IS_STACKGROW(address) \ -- cgit v1.2.3