From 47ca08a40b043815134d489e21870b53276f1a4a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 27 Sep 2010 15:30:39 +0200 Subject: EDAC, MCE: Rename files Drop "edac_" string from the filenames since they're prefixed with edac/ in their pathname anyway. Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.h | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 drivers/edac/mce_amd.h (limited to 'drivers/edac/mce_amd.h') diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h new file mode 100644 index 000000000000..2712a906afdf --- /dev/null +++ b/drivers/edac/mce_amd.h @@ -0,0 +1,72 @@ +#ifndef _EDAC_MCE_AMD_H +#define _EDAC_MCE_AMD_H + +#include <linux/notifier.h> + +#include <asm/mce.h> + +#define ERROR_CODE(x) ((x) & 0xffff) +#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) +#define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)] + +#define LOW_SYNDROME(x) (((x) >> 15) & 0xff) +#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff) + +#define TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010) +#define MEM_ERROR(x) (((x) & 0xFF00) == 0x0100) +#define BUS_ERROR(x) (((x) & 0xF800) == 0x0800) + +#define TT(x) (((x) >> 2) & 0x3) +#define TT_MSG(x) tt_msgs[TT(x)] +#define II(x) (((x) >> 2) & 0x3) +#define II_MSG(x) ii_msgs[II(x)] +#define LL(x) (((x) >> 0) & 0x3) +#define LL_MSG(x) ll_msgs[LL(x)] +#define TO(x) (((x) >> 8) & 0x1) +#define TO_MSG(x) to_msgs[TO(x)] +#define PP(x) (((x) >> 9) & 0x3) +#define PP_MSG(x) pp_msgs[PP(x)] + +#define RRRR(x) (((x) >> 4) & 0xf) +#define RRRR_MSG(x) ((RRRR(x) < 9) ? 
rrrr_msgs[RRRR(x)] : "Wrong R4!") + +#define K8_NBSH 0x4C + +#define K8_NBSH_VALID_BIT BIT(31) +#define K8_NBSH_OVERFLOW BIT(30) +#define K8_NBSH_UC_ERR BIT(29) +#define K8_NBSH_ERR_EN BIT(28) +#define K8_NBSH_MISCV BIT(27) +#define K8_NBSH_VALID_ERROR_ADDR BIT(26) +#define K8_NBSH_PCC BIT(25) +#define K8_NBSH_ERR_CPU_VAL BIT(24) +#define K8_NBSH_CECC BIT(14) +#define K8_NBSH_UECC BIT(13) +#define K8_NBSH_ERR_SCRUBER BIT(8) + +extern const char *tt_msgs[]; +extern const char *ll_msgs[]; +extern const char *rrrr_msgs[]; +extern const char *pp_msgs[]; +extern const char *to_msgs[]; +extern const char *ii_msgs[]; +extern const char *ext_msgs[]; + +/* + * relevant NB regs + */ +struct err_regs { + u32 nbcfg; + u32 nbsh; + u32 nbsl; + u32 nbeah; + u32 nbeal; +}; + +void amd_report_gart_errors(bool); +void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32)); +void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32)); +void amd_decode_nb_mce(int, struct mce *, u32); +int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data); + +#endif /* _EDAC_MCE_AMD_H */ -- cgit v1.2.3 From 888ab8e6eb2e41179cdc8edf5d0abd1cce0f0370 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 18 Aug 2010 15:11:35 +0200 Subject: EDAC, MCE: Adjust DC decoders to F14h Add per-family data cache decoders. Since there is a certain overlap between the different DC MCE signatures, reuse functionality between the families as far as possible. 
Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 158 ++++++++++++++++++++++++++++++++++++++++--------- drivers/edac/mce_amd.h | 40 +++++++++++++ 2 files changed, 171 insertions(+), 27 deletions(-) (limited to 'drivers/edac/mce_amd.h') diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 5eb8042d0c6a..33985aa61356 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -1,6 +1,10 @@ #include +#include + #include "mce_amd.h" +static struct amd_decoder_ops *fam_ops; + static bool report_gart_errors; static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg); @@ -97,41 +101,116 @@ const char *ext_msgs[] = { }; EXPORT_SYMBOL_GPL(ext_msgs); -static void amd_decode_dc_mce(struct mce *m) +static bool f10h_dc_mce(u16 ec) { - u32 ec = m->status & 0xffff; - u32 xec = (m->status >> 16) & 0xf; + u8 r4 = (ec >> 4) & 0xf; + bool ret = false; - pr_emerg(HW_ERR "Data Cache Error: "); + if (r4 == R4_GEN) { + pr_cont("during data scrub.\n"); + return true; + } - if (xec == 1 && TLB_ERROR(ec)) - pr_cont(": %s TLB multimatch.\n", LL_MSG(ec)); - else if (xec == 0) { - if (m->status & (1ULL << 40)) - pr_cont(" during Data Scrub.\n"); - else if (TLB_ERROR(ec)) - pr_cont(": %s TLB parity error.\n", LL_MSG(ec)); - else if (MEM_ERROR(ec)) { - u8 ll = ec & 0x3; - u8 tt = (ec >> 2) & 0x3; - u8 rrrr = (ec >> 4) & 0xf; + if (MEM_ERROR(ec)) { + u8 ll = ec & 0x3; + ret = true; - /* see F10h BKDG (31116), Table 92. 
*/ - if (ll == 0x1) { - if (tt != 0x1) - goto wrong_dc_mce; + if (ll == LL_L2) + pr_cont("during L1 linefill from L2.\n"); + else if (ll == LL_L1) + pr_cont("Data/Tag %s error.\n", RRRR_MSG(ec)); + else + ret = false; + } + return ret; +} - pr_cont(": Data/Tag %s error.\n", RRRR_MSG(ec)); +static bool k8_dc_mce(u16 ec) +{ + if (BUS_ERROR(ec)) { + pr_cont("during system linefill.\n"); + return true; + } - } else if (ll == 0x2 && rrrr == 0x3) - pr_cont(" during L1 linefill from L2.\n"); - else - goto wrong_dc_mce; - } else if (BUS_ERROR(ec) && boot_cpu_data.x86 == 0xf) - pr_cont(" during system linefill.\n"); + return f10h_dc_mce(ec); +} + +static bool f14h_dc_mce(u16 ec) +{ + u8 r4 = (ec >> 4) & 0xf; + u8 ll = ec & 0x3; + u8 tt = (ec >> 2) & 0x3; + u8 ii = tt; + bool ret = true; + + if (MEM_ERROR(ec)) { + + if (tt != TT_DATA || ll != LL_L1) + return false; + + switch (r4) { + case R4_DRD: + case R4_DWR: + pr_cont("Data/Tag parity error due to %s.\n", + (r4 == R4_DRD ? "load/hw prf" : "store")); + break; + case R4_EVICT: + pr_cont("Copyback parity error on a tag miss.\n"); + break; + case R4_SNOOP: + pr_cont("Tag parity error during snoop.\n"); + break; + default: + ret = false; + } + } else if (BUS_ERROR(ec)) { + + if ((ii != II_MEM && ii != II_IO) || ll != LL_LG) + return false; + + pr_cont("System read data error on a "); + + switch (r4) { + case R4_RD: + pr_cont("TLB reload.\n"); + break; + case R4_DWR: + pr_cont("store.\n"); + break; + case R4_DRD: + pr_cont("load.\n"); + break; + default: + ret = false; + } + } else { + ret = false; + } + + return ret; +} + +static void amd_decode_dc_mce(struct mce *m) +{ + u16 ec = m->status & 0xffff; + u8 xec = (m->status >> 16) & 0xf; + + pr_emerg(HW_ERR "Data Cache Error: "); + + /* TLB error signatures are the same across families */ + if (TLB_ERROR(ec)) { + u8 tt = (ec >> 2) & 0x3; + + if (tt == TT_DATA) { + pr_cont("%s TLB %s.\n", LL_MSG(ec), + (xec ? 
"multimatch" : "parity error")); + return; + } else goto wrong_dc_mce; - } else + } + + if (!fam_ops->dc_mce(ec)) goto wrong_dc_mce; return; @@ -395,6 +474,30 @@ static int __init mce_amd_init(void) if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) return 0; + fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL); + if (!fam_ops) + return -ENOMEM; + + switch (boot_cpu_data.x86) { + case 0xf: + fam_ops->dc_mce = k8_dc_mce; + break; + + case 0x10: + fam_ops->dc_mce = f10h_dc_mce; + break; + + case 0x14: + fam_ops->dc_mce = f14h_dc_mce; + break; + + default: + printk(KERN_WARNING "Huh? What family is that: %d?!\n", + boot_cpu_data.x86); + kfree(fam_ops); + return -EINVAL; + } + atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb); return 0; @@ -405,6 +508,7 @@ early_initcall(mce_amd_init); static void __exit mce_amd_exit(void) { atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb); + kfree(fam_ops); } MODULE_DESCRIPTION("AMD MCE decoder"); diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index 2712a906afdf..85985c225442 100644 --- a/drivers/edac/mce_amd.h +++ b/drivers/edac/mce_amd.h @@ -44,6 +44,39 @@ #define K8_NBSH_UECC BIT(13) #define K8_NBSH_ERR_SCRUBER BIT(8) +enum tt_ids { + TT_INSTR = 0, + TT_DATA, + TT_GEN, + TT_RESV, +}; + +enum ll_ids { + LL_RESV = 0, + LL_L1, + LL_L2, + LL_LG, +}; + +enum ii_ids { + II_MEM = 0, + II_RESV, + II_IO, + II_GEN, +}; + +enum rrrr_ids { + R4_GEN = 0, + R4_RD, + R4_WR, + R4_DRD, + R4_DWR, + R4_IRD, + R4_PREF, + R4_EVICT, + R4_SNOOP, +}; + extern const char *tt_msgs[]; extern const char *ll_msgs[]; extern const char *rrrr_msgs[]; @@ -63,6 +96,13 @@ struct err_regs { u32 nbeal; }; +/* + * per-family decoder ops + */ +struct amd_decoder_ops { + bool (*dc_mce)(u16); +}; + void amd_report_gart_errors(bool); void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32)); void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32)); -- cgit v1.2.3 From 
dd53bce4e8987f6848840d42bbeead5221eff308 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 26 Aug 2010 19:05:49 +0200 Subject: EDAC, MCE: Adjust IC decoders to F14h Add support for IC MCEs for F14h CPUs. K8 and F10h are almost identical so use one function for both. Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 118 +++++++++++++++++++++++++++++-------------------- drivers/edac/mce_amd.h | 1 + 2 files changed, 71 insertions(+), 48 deletions(-) (limited to 'drivers/edac/mce_amd.h') diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 33985aa61356..60d5d9f4dfee 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -219,61 +219,80 @@ wrong_dc_mce: pr_emerg(HW_ERR "Corrupted DC MCE info?\n"); } -static void amd_decode_ic_mce(struct mce *m) +static bool k8_ic_mce(u16 ec) { - u32 ec = m->status & 0xffff; - u32 xec = (m->status >> 16) & 0xf; + u8 ll = ec & 0x3; + u8 r4 = (ec >> 4) & 0xf; + bool ret = true; - pr_emerg(HW_ERR "Instruction Cache Error"); + if (!MEM_ERROR(ec)) + return false; - if (xec == 1 && TLB_ERROR(ec)) - pr_cont(": %s TLB multimatch.\n", LL_MSG(ec)); - else if (xec == 0) { - if (TLB_ERROR(ec)) - pr_cont(": %s TLB Parity error.\n", LL_MSG(ec)); - else if (BUS_ERROR(ec)) { - if (boot_cpu_data.x86 == 0xf && - (m->status & BIT(58))) - pr_cont(" during system linefill.\n"); - else - pr_cont(" during attempted NB data read.\n"); - } else if (MEM_ERROR(ec)) { - u8 ll = ec & 0x3; - u8 rrrr = (ec >> 4) & 0xf; + if (ll == 0x2) + pr_cont("during a linefill from L2.\n"); + else if (ll == 0x1) { + switch (r4) { + case R4_IRD: + pr_cont("Parity error during data load.\n"); + break; - if (ll == 0x2) - pr_cont(" during a linefill from L2.\n"); - else if (ll == 0x1) { - - switch (rrrr) { - case 0x5: - pr_cont(": Parity error during " - "data load.\n"); - break; - - case 0x7: - pr_cont(": Copyback Parity/Victim" - " error.\n"); - break; - - case 0x8: - pr_cont(": Tag Snoop error.\n"); - break; - - default: - goto 
wrong_ic_mce; - break; - } - } - } else - goto wrong_ic_mce; + case R4_EVICT: + pr_cont("Copyback Parity/Victim error.\n"); + break; + + case R4_SNOOP: + pr_cont("Tag Snoop error.\n"); + break; + + default: + ret = false; + break; + } } else - goto wrong_ic_mce; + ret = false; - return; + return ret; +} + +static bool f14h_ic_mce(u16 ec) +{ + u8 ll = ec & 0x3; + u8 tt = (ec >> 2) & 0x3; + u8 r4 = (ec >> 4) & 0xf; + bool ret = true; -wrong_ic_mce: - pr_emerg(HW_ERR "Corrupted IC MCE info?\n"); + if (MEM_ERROR(ec)) { + if (tt != 0 || ll != 1) + ret = false; + + if (r4 == R4_IRD) + pr_cont("Data/tag array parity error for a tag hit.\n"); + else if (r4 == R4_SNOOP) + pr_cont("Tag error during snoop/victimization.\n"); + else + ret = false; + } + return ret; +} + +static void amd_decode_ic_mce(struct mce *m) +{ + u16 ec = m->status & 0xffff; + u8 xec = (m->status >> 16) & 0xf; + + pr_emerg(HW_ERR "Instruction Cache Error: "); + + if (TLB_ERROR(ec)) + pr_cont("%s TLB %s.\n", LL_MSG(ec), + (xec ? "multimatch" : "parity error")); + else if (BUS_ERROR(ec)) { + bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT(58))); + + pr_cont("during %s.\n", (k8 ? 
"system linefill" : "NB data read")); + } else if (fam_ops->ic_mce(ec)) + ; + else + pr_emerg(HW_ERR "Corrupted IC MCE info?\n"); } static void amd_decode_bu_mce(struct mce *m) @@ -481,14 +500,17 @@ static int __init mce_amd_init(void) switch (boot_cpu_data.x86) { case 0xf: fam_ops->dc_mce = k8_dc_mce; + fam_ops->ic_mce = k8_ic_mce; break; case 0x10: fam_ops->dc_mce = f10h_dc_mce; + fam_ops->ic_mce = k8_ic_mce; break; case 0x14: fam_ops->dc_mce = f14h_dc_mce; + fam_ops->ic_mce = f14h_ic_mce; break; default: diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index 85985c225442..dc81dba9364b 100644 --- a/drivers/edac/mce_amd.h +++ b/drivers/edac/mce_amd.h @@ -101,6 +101,7 @@ struct err_regs { */ struct amd_decoder_ops { bool (*dc_mce)(u16); + bool (*ic_mce)(u16); }; void amd_report_gart_errors(bool); -- cgit v1.2.3 From 5ce88f6ea6bef929f59f9468413f922c9a486fa4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 31 Aug 2010 18:28:08 +0200 Subject: EDAC, MCE: Complete NB MCE decoders Add support for decoding F14h BU MCEs and improve decoding of the remaining families. 
Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.h | 1 - drivers/edac/mce_amd.c | 210 ++++++++++++++++++++++++++++++++++------------ drivers/edac/mce_amd.h | 3 +- 3 files changed, 158 insertions(+), 56 deletions(-) (limited to 'drivers/edac/mce_amd.h') diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 13e1d6f25bd1..044aee4f944d 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -482,7 +482,6 @@ extern const char *rrrr_msgs[16]; extern const char *to_msgs[2]; extern const char *pp_msgs[4]; extern const char *ii_msgs[4]; -extern const char *ext_msgs[32]; extern const char *htlink_msgs[8]; #ifdef CONFIG_EDAC_DEBUG diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 3c161672a84b..d8d1c9de1ed6 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -5,6 +5,8 @@ static struct amd_decoder_ops *fam_ops; +static u8 nb_err_cpumask = 0xf; + static bool report_gart_errors; static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg); @@ -61,45 +63,16 @@ EXPORT_SYMBOL_GPL(to_msgs); const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; EXPORT_SYMBOL_GPL(ii_msgs); -/* - * Map the 4 or 5 (family-specific) bits of Extended Error code to the - * string table. 
- */ -const char *ext_msgs[] = { - "K8 ECC error", /* 0_0000b */ - "CRC error on link", /* 0_0001b */ - "Sync error packets on link", /* 0_0010b */ - "Master Abort during link operation", /* 0_0011b */ - "Target Abort during link operation", /* 0_0100b */ - "Invalid GART PTE entry during table walk", /* 0_0101b */ - "Unsupported atomic RMW command received", /* 0_0110b */ - "WDT error: NB transaction timeout", /* 0_0111b */ - "ECC/ChipKill ECC error", /* 0_1000b */ - "SVM DEV Error", /* 0_1001b */ - "Link Data error", /* 0_1010b */ - "Link/L3/Probe Filter Protocol error", /* 0_1011b */ - "NB Internal Arrays Parity error", /* 0_1100b */ - "DRAM Address/Control Parity error", /* 0_1101b */ - "Link Transmission error", /* 0_1110b */ - "GART/DEV Table Walk Data error" /* 0_1111b */ - "Res 0x100 error", /* 1_0000b */ - "Res 0x101 error", /* 1_0001b */ - "Res 0x102 error", /* 1_0010b */ - "Res 0x103 error", /* 1_0011b */ - "Res 0x104 error", /* 1_0100b */ - "Res 0x105 error", /* 1_0101b */ - "Res 0x106 error", /* 1_0110b */ - "Res 0x107 error", /* 1_0111b */ - "Res 0x108 error", /* 1_1000b */ - "Res 0x109 error", /* 1_1001b */ - "Res 0x10A error", /* 1_1010b */ - "Res 0x10B error", /* 1_1011b */ - "ECC error in L3 Cache Data", /* 1_1100b */ - "L3 Cache Tag error", /* 1_1101b */ - "L3 Cache LRU Parity error", /* 1_1110b */ - "Probe Filter error" /* 1_1111b */ +static const char *f10h_nb_mce_desc[] = { + "HT link data error", + "Protocol error (link, L3, probe filter, etc.)", + "Parity error in NB-internal arrays", + "Link Retry due to IO link transmission error", + "L3 ECC data cache error", + "ECC error in L3 cache tag", + "L3 LRU parity bits error", + "ECC Error in the Probe Filter directory" }; -EXPORT_SYMBOL_GPL(ext_msgs); static bool f10h_dc_mce(u16 ec) { @@ -366,19 +339,97 @@ wrong_ls_mce: pr_emerg(HW_ERR "Corrupted LS MCE info?\n"); } +static bool k8_nb_mce(u16 ec, u8 xec) +{ + bool ret = true; + + switch (xec) { + case 0x1: + pr_cont("CRC error detected on HT 
link.\n"); + break; + + case 0x5: + pr_cont("Invalid GART PTE entry during GART table walk.\n"); + break; + + case 0x6: + pr_cont("Unsupported atomic RMW received from an IO link.\n"); + break; + + case 0x0: + case 0x8: + pr_cont("DRAM ECC error detected on the NB.\n"); + break; + + case 0xd: + pr_cont("Parity error on the DRAM addr/ctl signals.\n"); + break; + + default: + ret = false; + break; + } + + return ret; +} + +static bool f10h_nb_mce(u16 ec, u8 xec) +{ + bool ret = true; + u8 offset = 0; + + if (k8_nb_mce(ec, xec)) + return true; + + switch(xec) { + case 0xa ... 0xc: + offset = 10; + break; + + case 0xe: + offset = 11; + break; + + case 0xf: + if (TLB_ERROR(ec)) + pr_cont("GART Table Walk data error.\n"); + else if (BUS_ERROR(ec)) + pr_cont("DMA Exclusion Vector Table Walk error.\n"); + else + ret = false; + + goto out; + break; + + case 0x1c ... 0x1f: + offset = 24; + break; + + default: + ret = false; + + goto out; + break; + } + + pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]); + +out: + return ret; +} + +static bool f14h_nb_mce(u16 ec, u8 xec) +{ + return false; +} + void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) { - u32 ec = m->status & 0xffff; + u8 xec = (m->status >> 16) & 0x1f; + u16 ec = m->status & 0xffff; u32 nbsh = (u32)(m->status >> 32); - u32 nbsl = (u32)m->status; - - /* - * GART TLB error reporting is disabled by default. Bail out early. 
- */ - if (TLB_ERROR(ec) && !report_gart_errors) - return; - pr_emerg(HW_ERR "Northbridge Error, node %d", node_id); + pr_emerg(HW_ERR "Northbridge Error, node %d: ", node_id); /* * F10h, revD can disable ErrCpu[3:0] so check that first and also the @@ -387,20 +438,50 @@ void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model > 7)) { if (nbsh & K8_NBSH_ERR_CPU_VAL) - pr_cont(", core: %u\n", (u8)(nbsh & 0xf)); + pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask)); } else { - u8 assoc_cpus = nbsh & 0xf; + u8 assoc_cpus = nbsh & nb_err_cpumask; if (assoc_cpus > 0) pr_cont(", core: %d", fls(assoc_cpus) - 1); + } - pr_cont("\n"); + switch (xec) { + case 0x2: + pr_cont("Sync error (sync packets on HT link detected).\n"); + return; + + case 0x3: + pr_cont("HT Master abort.\n"); + return; + + case 0x4: + pr_cont("HT Target abort.\n"); + return; + + case 0x7: + pr_cont("NB Watchdog timeout.\n"); + return; + + case 0x9: + pr_cont("SVM DMA Exclusion Vector error.\n"); + return; + + default: + break; } - pr_emerg(HW_ERR "%s.\n", EXT_ERR_MSG(nbsl)); + if (!fam_ops->nb_mce(ec, xec)) + goto wrong_nb_mce; + + if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10) + if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder) + nb_bus_decoder(node_id, m, nbcfg); - if (BUS_ERROR(ec) && nb_bus_decoder) - nb_bus_decoder(node_id, m, nbcfg); + return; + +wrong_nb_mce: + pr_emerg(HW_ERR "Corrupted NB MCE info?\n"); } EXPORT_SYMBOL_GPL(amd_decode_nb_mce); @@ -430,11 +511,30 @@ static inline void amd_decode_err_code(u16 ec) pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec); } +/* + * Filter out unwanted MCE signatures here. + */ +static bool amd_filter_mce(struct mce *m) +{ + u8 xec = (m->status >> 16) & 0x1f; + + /* + * NB GART TLB error reporting is disabled by default. 
+ */ + if (m->bank == 4 && xec == 0x5 && !report_gart_errors) + return true; + + return false; +} + int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) { struct mce *m = (struct mce *)data; int node, ecc; + if (amd_filter_mce(m)) + return NOTIFY_STOP; + pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank); pr_cont("%sorrected error, other errors lost: %s, " @@ -509,16 +609,20 @@ static int __init mce_amd_init(void) case 0xf: fam_ops->dc_mce = k8_dc_mce; fam_ops->ic_mce = k8_ic_mce; + fam_ops->nb_mce = k8_nb_mce; break; case 0x10: fam_ops->dc_mce = f10h_dc_mce; fam_ops->ic_mce = k8_ic_mce; + fam_ops->nb_mce = f10h_nb_mce; break; case 0x14: + nb_err_cpumask = 0x3; fam_ops->dc_mce = f14h_dc_mce; fam_ops->ic_mce = f14h_ic_mce; + fam_ops->nb_mce = f14h_nb_mce; break; default: diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index dc81dba9364b..0d0637debbad 100644 --- a/drivers/edac/mce_amd.h +++ b/drivers/edac/mce_amd.h @@ -7,7 +7,6 @@ #define ERROR_CODE(x) ((x) & 0xffff) #define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) -#define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)] #define LOW_SYNDROME(x) (((x) >> 15) & 0xff) #define HIGH_SYNDROME(x) (((x) >> 24) & 0xff) @@ -83,7 +82,6 @@ extern const char *rrrr_msgs[]; extern const char *pp_msgs[]; extern const char *to_msgs[]; extern const char *ii_msgs[]; -extern const char *ext_msgs[]; /* * relevant NB regs @@ -102,6 +100,7 @@ struct err_regs { struct amd_decoder_ops { bool (*dc_mce)(u16); bool (*ic_mce)(u16); + bool (*nb_mce)(u16, u8); }; void amd_report_gart_errors(bool); -- cgit v1.2.3 From cf1d2200dbc214c26a116c4d0c75b7cf27bb19b6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 15 Oct 2010 15:20:18 +0200 Subject: EDAC, MCE: Add a BIT_64() macro Add a macro for 64-bit vectors to use when accessing MSR contents. 
Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/edac/mce_amd.h') diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index 0d0637debbad..35f6e0e3b297 100644 --- a/drivers/edac/mce_amd.h +++ b/drivers/edac/mce_amd.h @@ -5,6 +5,8 @@ #include <asm/mce.h> +#define BIT_64(n) (U64_C(1) << (n)) + #define ERROR_CODE(x) ((x) & 0xffff) #define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) -- cgit v1.2.3