summaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel/cpu
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel/cpu')
-rw-r--r--arch/x86/kernel/cpu/common.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c6
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-apei.c138
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h23
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c89
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event.c28
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c41
9 files changed, 296 insertions, 39 deletions
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cc83a002786e..68e4a6f2211e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1121,9 +1121,9 @@ void __cpuinit cpu_init(void)
oist = &per_cpu(orig_ist, cpu);
#ifdef CONFIG_NUMA
- if (cpu != 0 && percpu_read(node_number) == 0 &&
- cpu_to_node(cpu) != NUMA_NO_NODE)
- percpu_write(node_number, cpu_to_node(cpu));
+ if (cpu != 0 && percpu_read(numa_node) == 0 &&
+ early_cpu_to_node(cpu) != NUMA_NO_NODE)
+ set_numa_node(early_cpu_to_node(cpu));
#endif
me = current;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 6f3dc8fbbfdc..7ec2123838e6 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1497,8 +1497,8 @@ static struct cpufreq_driver cpufreq_amd64_driver = {
* simply keep the boost-disable flag in sync with the current global
* state.
*/
-static int __cpuinit cpb_notify(struct notifier_block *nb, unsigned long action,
- void *hcpu)
+static int cpb_notify(struct notifier_block *nb, unsigned long action,
+ void *hcpu)
{
unsigned cpu = (long)hcpu;
u32 lo, hi;
@@ -1528,7 +1528,7 @@ static int __cpuinit cpb_notify(struct notifier_block *nb, unsigned long action,
return NOTIFY_OK;
}
-static struct notifier_block __cpuinitdata cpb_nb = {
+static struct notifier_block cpb_nb = {
.notifier_call = cpb_notify,
};
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index 4ac6d48fe11b..bb34b03af252 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -7,3 +7,5 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
+
+obj-$(CONFIG_ACPI_APEI) += mce-apei.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
new file mode 100644
index 000000000000..745b54f9be89
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -0,0 +1,138 @@
+/*
+ * Bridge between MCE and APEI
+ *
+ * On some machine, corrected memory errors are reported via APEI
+ * generic hardware error source (GHES) instead of corrected Machine
+ * Check. These corrected memory errors can be reported to user space
+ * through /dev/mcelog via faking a corrected Machine Check, so that
+ * the error memory page can be offlined by /sbin/mcelog if the error
+ * count for one page is beyond the threshold.
+ *
+ * For fatal MCE, save MCE record into persistent storage via ERST, so
+ * that the MCE record can be logged after reboot via ERST.
+ *
+ * Copyright 2010 Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/acpi.h>
+#include <linux/cper.h>
+#include <acpi/apei.h>
+#include <asm/mce.h>
+
+#include "mce-internal.h"
+
+void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
+{
+ struct mce m;
+
+ /* Only corrected MC is reported */
+ if (!corrected)
+ return;
+
+ mce_setup(&m);
+ m.bank = 1;
+ /* Fake a memory read corrected error with unknown channel */
+ m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
+ m.addr = mem_err->physical_addr;
+ mce_log(&m);
+ mce_notify_irq();
+}
+EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
+
+#define CPER_CREATOR_MCE \
+ UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
+ 0x64, 0x90, 0xb8, 0x9d)
+#define CPER_SECTION_TYPE_MCE \
+ UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \
+ 0x04, 0x4a, 0x38, 0xfc)
+
+/*
+ * CPER specification (in UEFI specification 2.3 appendix N) requires
+ * byte-packed.
+ */
+struct cper_mce_record {
+ struct cper_record_header hdr;
+ struct cper_section_descriptor sec_hdr;
+ struct mce mce;
+} __packed;
+
+int apei_write_mce(struct mce *m)
+{
+ struct cper_mce_record rcd;
+
+ memset(&rcd, 0, sizeof(rcd));
+ memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
+ rcd.hdr.revision = CPER_RECORD_REV;
+ rcd.hdr.signature_end = CPER_SIG_END;
+ rcd.hdr.section_count = 1;
+ rcd.hdr.error_severity = CPER_SER_FATAL;
+ /* timestamp, platform_id, partition_id are all invalid */
+ rcd.hdr.validation_bits = 0;
+ rcd.hdr.record_length = sizeof(rcd);
+ rcd.hdr.creator_id = CPER_CREATOR_MCE;
+ rcd.hdr.notification_type = CPER_NOTIFY_MCE;
+ rcd.hdr.record_id = cper_next_record_id();
+ rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;
+
+ rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd;
+ rcd.sec_hdr.section_length = sizeof(rcd.mce);
+ rcd.sec_hdr.revision = CPER_SEC_REV;
+ /* fru_id and fru_text is invalid */
+ rcd.sec_hdr.validation_bits = 0;
+ rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
+ rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
+ rcd.sec_hdr.section_severity = CPER_SER_FATAL;
+
+ memcpy(&rcd.mce, m, sizeof(*m));
+
+ return erst_write(&rcd.hdr);
+}
+
+ssize_t apei_read_mce(struct mce *m, u64 *record_id)
+{
+ struct cper_mce_record rcd;
+ ssize_t len;
+
+ len = erst_read_next(&rcd.hdr, sizeof(rcd));
+ if (len <= 0)
+ return len;
+ /* Can not skip other records in storage via ERST unless clear them */
+ else if (len != sizeof(rcd) ||
+ uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) {
+ if (printk_ratelimit())
+ pr_warning(
+ "MCE-APEI: Can not skip the unknown record in ERST");
+ return -EIO;
+ }
+
+ memcpy(m, &rcd.mce, sizeof(*m));
+ *record_id = rcd.hdr.record_id;
+
+ return sizeof(*m);
+}
+
+/* Check whether there is record in ERST */
+int apei_check_mce(void)
+{
+ return erst_get_record_count();
+}
+
+int apei_clear_mce(u64 record_id)
+{
+ return erst_clear(record_id);
+}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 32996f9fab67..fefcc69ee8b5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -28,3 +28,26 @@ extern int mce_ser;
extern struct mce_bank *mce_banks;
+#ifdef CONFIG_ACPI_APEI
+int apei_write_mce(struct mce *m);
+ssize_t apei_read_mce(struct mce *m, u64 *record_id);
+int apei_check_mce(void);
+int apei_clear_mce(u64 record_id);
+#else
+static inline int apei_write_mce(struct mce *m)
+{
+ return -EINVAL;
+}
+static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
+{
+ return 0;
+}
+static inline int apei_check_mce(void)
+{
+ return 0;
+}
+static inline int apei_clear_mce(u64 record_id)
+{
+ return -EINVAL;
+}
+#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 7a355ddcc64b..18cc42562250 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -36,6 +36,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>
+#include <linux/edac_mce.h>
#include <asm/processor.h>
#include <asm/hw_irq.h>
@@ -169,6 +170,15 @@ void mce_log(struct mce *mce)
entry = rcu_dereference_check_mce(mcelog.next);
for (;;) {
/*
+ * If edac_mce is enabled, it will check the error type
+ * and will process it, if it is a known error.
+ * Otherwise, the error will be sent through mcelog
+ * interface
+ */
+ if (edac_mce_parse(mce))
+ return;
+
+ /*
* When the buffer fills up discard new entries.
* Assume that the earlier errors are the more
* interesting ones:
@@ -264,7 +274,7 @@ static void wait_for_panic(void)
static void mce_panic(char *msg, struct mce *final, char *exp)
{
- int i;
+ int i, apei_err = 0;
if (!fake_panic) {
/*
@@ -287,8 +297,11 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
struct mce *m = &mcelog.entry[i];
if (!(m->status & MCI_STATUS_VAL))
continue;
- if (!(m->status & MCI_STATUS_UC))
+ if (!(m->status & MCI_STATUS_UC)) {
print_mce(m);
+ if (!apei_err)
+ apei_err = apei_write_mce(m);
+ }
}
/* Now print uncorrected but with the final one last */
for (i = 0; i < MCE_LOG_LEN; i++) {
@@ -297,11 +310,17 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
continue;
if (!(m->status & MCI_STATUS_UC))
continue;
- if (!final || memcmp(m, final, sizeof(struct mce)))
+ if (!final || memcmp(m, final, sizeof(struct mce))) {
print_mce(m);
+ if (!apei_err)
+ apei_err = apei_write_mce(m);
+ }
}
- if (final)
+ if (final) {
print_mce(final);
+ if (!apei_err)
+ apei_err = apei_write_mce(final);
+ }
if (cpu_missing)
printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
print_mce_tail();
@@ -1493,6 +1512,43 @@ static void collect_tscs(void *data)
rdtscll(cpu_tsc[smp_processor_id()]);
}
+static int mce_apei_read_done;
+
+/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
+static int __mce_read_apei(char __user **ubuf, size_t usize)
+{
+ int rc;
+ u64 record_id;
+ struct mce m;
+
+ if (usize < sizeof(struct mce))
+ return -EINVAL;
+
+ rc = apei_read_mce(&m, &record_id);
+ /* Error or no more MCE record */
+ if (rc <= 0) {
+ mce_apei_read_done = 1;
+ return rc;
+ }
+ rc = -EFAULT;
+ if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
+ return rc;
+ /*
+ * In fact, we should have cleared the record after that has
+ * been flushed to the disk or sent to network in
+ * /sbin/mcelog, but we have no interface to support that now,
+ * so just clear it to avoid duplication.
+ */
+ rc = apei_clear_mce(record_id);
+ if (rc) {
+ mce_apei_read_done = 1;
+ return rc;
+ }
+ *ubuf += sizeof(struct mce);
+
+ return 0;
+}
+
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
loff_t *off)
{
@@ -1506,15 +1562,19 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
return -ENOMEM;
mutex_lock(&mce_read_mutex);
+
+ if (!mce_apei_read_done) {
+ err = __mce_read_apei(&buf, usize);
+ if (err || buf != ubuf)
+ goto out;
+ }
+
next = rcu_dereference_check_mce(mcelog.next);
/* Only supports full reads right now */
- if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
- mutex_unlock(&mce_read_mutex);
- kfree(cpu_tsc);
-
- return -EINVAL;
- }
+ err = -EINVAL;
+ if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
+ goto out;
err = 0;
prev = 0;
@@ -1562,10 +1622,15 @@ timeout:
memset(&mcelog.entry[i], 0, sizeof(struct mce));
}
}
+
+ if (err)
+ err = -EFAULT;
+
+out:
mutex_unlock(&mce_read_mutex);
kfree(cpu_tsc);
- return err ? -EFAULT : buf - ubuf;
+ return err ? err : buf - ubuf;
}
static unsigned int mce_poll(struct file *file, poll_table *wait)
@@ -1573,6 +1638,8 @@ static unsigned int mce_poll(struct file *file, poll_table *wait)
poll_wait(file, &mce_wait, wait);
if (rcu_dereference_check_mce(mcelog.next))
return POLLIN | POLLRDNORM;
+ if (!mce_apei_read_done && apei_check_mce())
+ return POLLIN | POLLRDNORM;
return 0;
}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 81c499eceb21..e1a0a3bf9716 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -190,7 +190,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
mutex_unlock(&therm_cpu_lock);
break;
}
- return err ? NOTIFY_BAD : NOTIFY_OK;
+ return notifier_from_errno(err);
}
static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata =
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index fd4db0db3708..5db5b7d65a18 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -106,6 +106,7 @@ struct cpu_hw_events {
int n_events;
int n_added;
+ int n_txn;
int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
u64 tags[X86_PMC_IDX_MAX];
struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
@@ -983,6 +984,7 @@ static int x86_pmu_enable(struct perf_event *event)
out:
cpuc->n_events = n;
cpuc->n_added += n - n0;
+ cpuc->n_txn += n - n0;
return 0;
}
@@ -1089,6 +1091,14 @@ static void x86_pmu_disable(struct perf_event *event)
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
int i;
+ /*
+ * If we're called during a txn, we don't need to do anything.
+ * The events never got scheduled and ->cancel_txn will truncate
+ * the event_list.
+ */
+ if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+ return;
+
x86_pmu_stop(event);
for (i = 0; i < cpuc->n_events; i++) {
@@ -1379,6 +1389,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
+ cpuc->n_txn = 0;
}
/*
@@ -1391,6 +1402,11 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
+ /*
+ * Truncate the collected events.
+ */
+ cpuc->n_added -= cpuc->n_txn;
+ cpuc->n_events -= cpuc->n_txn;
}
/*
@@ -1419,6 +1435,12 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
*/
memcpy(cpuc->assign, assign, n*sizeof(int));
+ /*
+ * Clear out the txn count so that ->cancel_txn() which gets
+ * run after ->commit_txn() doesn't undo things.
+ */
+ cpuc->n_txn = 0;
+
return 0;
}
@@ -1717,7 +1739,11 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
*/
regs->bp = rewind_frame_pointer(skip + 1);
regs->cs = __KERNEL_CS;
- local_save_flags(regs->flags);
+ /*
+ * We abuse bit 3 to pass exact information, see perf_misc_flags
+ * and the comment with PERF_EFLAGS_EXACT.
+ */
+ regs->flags = 0;
}
unsigned long perf_instruction_pointer(struct pt_regs *regs)
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 424fc8de68e4..ae85d69644d1 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -465,15 +465,21 @@ out:
return rc;
}
-static inline void p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
+static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
{
- unsigned long dummy;
+ int overflow = 0;
+ u32 low, high;
- rdmsrl(hwc->config_base + hwc->idx, dummy);
- if (dummy & P4_CCCR_OVF) {
+ rdmsr(hwc->config_base + hwc->idx, low, high);
+
+ /* we need to check high bit for unflagged overflows */
+ if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) {
+ overflow = 1;
(void)checking_wrmsrl(hwc->config_base + hwc->idx,
- ((u64)dummy) & ~P4_CCCR_OVF);
+ ((u64)low) & ~P4_CCCR_OVF);
}
+
+ return overflow;
}
static inline void p4_pmu_disable_event(struct perf_event *event)
@@ -584,21 +590,15 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
WARN_ON_ONCE(hwc->idx != idx);
- /*
- * FIXME: Redundant call, actually not needed
- * but just to check if we're screwed
- */
- p4_pmu_clear_cccr_ovf(hwc);
+ /* it might be unflagged overflow */
+ handled = p4_pmu_clear_cccr_ovf(hwc);
val = x86_perf_event_update(event);
- if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
+ if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
continue;
- /*
- * event overflow
- */
- handled = 1;
- data.period = event->hw.last_period;
+ /* event overflow for sure */
+ data.period = event->hw.last_period;
if (!x86_perf_event_set_period(event))
continue;
@@ -670,7 +670,7 @@ static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
/*
* ESCR address hashing is tricky, ESCRs are not sequential
- * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03e0) and
+ * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03a0) and
* the metric between any ESCRs is laid in range [0xa0,0xe1]
*
* so we make ~70% filled hashtable
@@ -735,8 +735,9 @@ static int p4_get_escr_idx(unsigned int addr)
{
unsigned int idx = P4_ESCR_MSR_IDX(addr);
- if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE ||
- !p4_escr_table[idx])) {
+ if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE ||
+ !p4_escr_table[idx] ||
+ p4_escr_table[idx] != addr)) {
WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
return -1;
}
@@ -762,7 +763,7 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
{
unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
- int cpu = raw_smp_processor_id();
+ int cpu = smp_processor_id();
struct hw_perf_event *hwc;
struct p4_event_bind *bind;
unsigned int i, thread, num;