diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-03-08 18:11:39 +0100 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-03-08 18:11:39 +0100 |
commit | e13284da944df29ab08e8a9d2a50fc0ad1d858ab (patch) | |
tree | 8e6e2580d27cf4fe5f712e0857dc495aa52fd27e /drivers/edac | |
parent | Merge tag 'edac_for_5.1' of git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp (diff) | |
parent | x86/mce: Improve error message when kernel cannot recover, p2 (diff) | |
download | linux-e13284da944df29ab08e8a9d2a50fc0ad1d858ab.tar.xz linux-e13284da944df29ab08e8a9d2a50fc0ad1d858ab.zip |
Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull RAS updates from Borislav Petkov:
"This time around we have in store:
- Disable MC4_MISC thresholding banks on all AMD family 0x15 models
(Shirish S)
- AMD MCE error descriptions update and error decode improvements
(Yazen Ghannam)
- The usual smaller conversions and fixes"
* 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/mce: Improve error message when kernel cannot recover, p2
EDAC/mce_amd: Decode MCA_STATUS in bit definition order
EDAC/mce_amd: Decode MCA_STATUS[Scrub] bit
EDAC, mce_amd: Print ExtErrorCode and description on a single line
EDAC, mce_amd: Match error descriptions to latest documentation
x86/MCE/AMD, EDAC/mce_amd: Add new error descriptions for some SMCA bank types
x86/MCE/AMD, EDAC/mce_amd: Add new McaTypes for CS, PSP, and SMU units
x86/MCE/AMD, EDAC/mce_amd: Add new MP5, NBIO, and PCIE SMCA bank types
RAS: Add a MAINTAINERS entry
RAS: Use consistent types for UUIDs
x86/MCE/AMD: Carve out the MC4_MISC thresholding quirk
x86/MCE/AMD: Turn off MC4_MISC thresholding on all family 0x15 models
x86/MCE: Switch to use the new generic UUID API
Diffstat (limited to 'drivers/edac')
-rw-r--r-- | drivers/edac/mce_amd.c | 291 |
1 file changed, 192 insertions, 99 deletions
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index c605089d899f..0a1814dad6cf 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -151,138 +151,223 @@ static const char * const mc6_mce_desc[] = { /* Scalable MCA error strings */ static const char * const smca_ls_mce_desc[] = { - "Load queue parity", - "Store queue parity", - "Miss address buffer payload parity", - "L1 TLB parity", - "Reserved", - "DC tag error type 6", - "DC tag error type 1", + "Load queue parity error", + "Store queue parity error", + "Miss address buffer payload parity error", + "Level 1 TLB parity error", + "DC Tag error type 5", + "DC Tag error type 6", + "DC Tag error type 1", "Internal error type 1", "Internal error type 2", - "Sys Read data error thread 0", - "Sys read data error thread 1", - "DC tag error type 2", - "DC data error type 1 (poison consumption)", - "DC data error type 2", - "DC data error type 3", - "DC tag error type 4", - "L2 TLB parity", + "System Read Data Error Thread 0", + "System Read Data Error Thread 1", + "DC Tag error type 2", + "DC Data error type 1 and poison consumption", + "DC Data error type 2", + "DC Data error type 3", + "DC Tag error type 4", + "Level 2 TLB parity error", "PDC parity error", - "DC tag error type 3", - "DC tag error type 5", - "L2 fill data error", + "DC Tag error type 3", + "DC Tag error type 5", + "L2 Fill Data error", }; static const char * const smca_if_mce_desc[] = { - "microtag probe port parity error", - "IC microtag or full tag multi-hit error", - "IC full tag parity", - "IC data array parity", - "Decoupling queue phys addr parity error", - "L0 ITLB parity error", - "L1 ITLB parity error", - "L2 ITLB parity error", - "BPQ snoop parity on Thread 0", - "BPQ snoop parity on Thread 1", - "L1 BTB multi-match error", - "L2 BTB multi-match error", - "L2 Cache Response Poison error", - "System Read Data error", + "Op Cache Microtag Probe Port Parity Error", + "IC Microtag or Full Tag Multi-hit Error", + "IC 
Full Tag Parity Error", + "IC Data Array Parity Error", + "Decoupling Queue PhysAddr Parity Error", + "L0 ITLB Parity Error", + "L1 ITLB Parity Error", + "L2 ITLB Parity Error", + "BPQ Thread 0 Snoop Parity Error", + "BPQ Thread 1 Snoop Parity Error", + "L1 BTB Multi-Match Error", + "L2 BTB Multi-Match Error", + "L2 Cache Response Poison Error", + "System Read Data Error", }; static const char * const smca_l2_mce_desc[] = { - "L2M tag multi-way-hit error", - "L2M tag ECC error", - "L2M data ECC error", - "HW assert", + "L2M Tag Multiple-Way-Hit error", + "L2M Tag or State Array ECC Error", + "L2M Data Array ECC Error", + "Hardware Assert Error", }; static const char * const smca_de_mce_desc[] = { - "uop cache tag parity error", - "uop cache data parity error", - "Insn buffer parity error", - "uop queue parity error", - "Insn dispatch queue parity error", - "Fetch address FIFO parity", - "Patch RAM data parity", - "Patch RAM sequencer parity", - "uop buffer parity" + "Micro-op cache tag parity error", + "Micro-op cache data parity error", + "Instruction buffer parity error", + "Micro-op queue parity error", + "Instruction dispatch queue parity error", + "Fetch address FIFO parity error", + "Patch RAM data parity error", + "Patch RAM sequencer parity error", + "Micro-op buffer parity error" }; static const char * const smca_ex_mce_desc[] = { - "Watchdog timeout error", - "Phy register file parity", - "Flag register file parity", - "Immediate displacement register file parity", - "Address generator payload parity", - "EX payload parity", - "Checkpoint queue parity", - "Retire dispatch queue parity", + "Watchdog Timeout error", + "Physical register file parity error", + "Flag register file parity error", + "Immediate displacement register file parity error", + "Address generator payload parity error", + "EX payload parity error", + "Checkpoint queue parity error", + "Retire dispatch queue parity error", "Retire status queue parity error", "Scheduling queue parity 
error", "Branch buffer queue parity error", + "Hardware Assertion error", }; static const char * const smca_fp_mce_desc[] = { - "Physical register file parity", - "Freelist parity error", - "Schedule queue parity", + "Physical register file (PRF) parity error", + "Freelist (FL) parity error", + "Schedule queue parity error", "NSQ parity error", - "Retire queue parity", - "Status register file parity", + "Retire queue (RQ) parity error", + "Status register file (SRF) parity error", "Hardware assertion", }; static const char * const smca_l3_mce_desc[] = { - "Shadow tag macro ECC error", - "Shadow tag macro multi-way-hit error", - "L3M tag ECC error", - "L3M tag multi-way-hit error", - "L3M data ECC error", - "XI parity, L3 fill done channel error", - "L3 victim queue parity", - "L3 HW assert", + "Shadow Tag Macro ECC Error", + "Shadow Tag Macro Multi-way-hit Error", + "L3M Tag ECC Error", + "L3M Tag Multi-way-hit Error", + "L3M Data ECC Error", + "SDP Parity Error or SystemReadDataError from XI", + "L3 Victim Queue Parity Error", + "L3 Hardware Assertion", }; static const char * const smca_cs_mce_desc[] = { - "Illegal request from transport layer", - "Address violation", - "Security violation", - "Illegal response from transport layer", - "Unexpected response", - "Parity error on incoming request or probe response data", - "Parity error on incoming read response data", - "Atomic request parity", - "ECC error on probe filter access", + "Illegal Request", + "Address Violation", + "Security Violation", + "Illegal Response", + "Unexpected Response", + "Request or Probe Parity Error", + "Read Response Parity Error", + "Atomic Request Parity Error", + "Probe Filter ECC Error", +}; + +static const char * const smca_cs2_mce_desc[] = { + "Illegal Request", + "Address Violation", + "Security Violation", + "Illegal Response", + "Unexpected Response", + "Request or Probe Parity Error", + "Read Response Parity Error", + "Atomic Request Parity Error", + "SDP read response had no 
match in the CS queue", + "Probe Filter Protocol Error", + "Probe Filter ECC Error", + "SDP read response had an unexpected RETRY error", + "Counter overflow error", + "Counter underflow error", }; static const char * const smca_pie_mce_desc[] = { - "HW assert", - "Internal PIE register security violation", - "Error on GMI link", - "Poison data written to internal PIE register", + "Hardware Assert", + "Register security violation", + "Link Error", + "Poison data consumption", + "A deferred error was detected in the DF" }; static const char * const smca_umc_mce_desc[] = { "DRAM ECC error", - "Data poison error on DRAM", + "Data poison error", "SDP parity error", "Advanced peripheral bus error", - "Command/address parity error", + "Address/Command parity error", "Write data CRC error", + "DCQ SRAM ECC error", + "AES SRAM ECC error", }; static const char * const smca_pb_mce_desc[] = { - "Parameter Block RAM ECC error", + "An ECC error in the Parameter Block RAM array", }; static const char * const smca_psp_mce_desc[] = { - "PSP RAM ECC or parity error", + "An ECC or parity error in a PSP RAM instance", +}; + +static const char * const smca_psp2_mce_desc[] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", + "Instruction Cache Bank 0 ECC or parity error", + "Instruction Cache Bank 1 ECC or parity error", + "Instruction Tag Ram 0 parity error", + "Instruction Tag Ram 1 parity error", + "Data Cache Bank 0 ECC or parity error", + "Data Cache Bank 1 ECC or parity error", + "Data Cache Bank 2 ECC or parity error", + "Data Cache Bank 3 ECC or parity error", + "Data Tag Bank 0 parity error", + "Data Tag Bank 1 parity error", + "Data Tag Bank 2 parity error", + "Data Tag Bank 3 parity error", + "Dirty Data Ram parity error", + "TLB Bank 0 parity error", + "TLB Bank 1 parity error", + "System Hub Read Buffer ECC or parity error", }; static const char * const smca_smu_mce_desc[] = { - "SMU RAM ECC or parity error", + "An ECC or parity error in an SMU RAM 
instance", +}; + +static const char * const smca_smu2_mce_desc[] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", + "Data Cache Bank A ECC or parity error", + "Data Cache Bank B ECC or parity error", + "Data Tag Cache Bank A ECC or parity error", + "Data Tag Cache Bank B ECC or parity error", + "Instruction Cache Bank A ECC or parity error", + "Instruction Cache Bank B ECC or parity error", + "Instruction Tag Cache Bank A ECC or parity error", + "Instruction Tag Cache Bank B ECC or parity error", + "System Hub Read Buffer ECC or parity error", +}; + +static const char * const smca_mp5_mce_desc[] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", + "Data Cache Bank A ECC or parity error", + "Data Cache Bank B ECC or parity error", + "Data Tag Cache Bank A ECC or parity error", + "Data Tag Cache Bank B ECC or parity error", + "Instruction Cache Bank A ECC or parity error", + "Instruction Cache Bank B ECC or parity error", + "Instruction Tag Cache Bank A ECC or parity error", + "Instruction Tag Cache Bank B ECC or parity error", +}; + +static const char * const smca_nbio_mce_desc[] = { + "ECC or Parity error", + "PCIE error", + "SDP ErrEvent error", + "SDP Egress Poison Error", + "IOHC Internal Poison Error", +}; + +static const char * const smca_pcie_mce_desc[] = { + "CCIX PER Message logging", + "CCIX Read Response with Status: Non-Data Error", + "CCIX Write Response with Status: Non-Data Error", + "CCIX Read Response with Status: Data Error", + "CCIX Non-okay write response with data error", }; struct smca_mce_desc { @@ -299,11 +384,17 @@ static struct smca_mce_desc smca_mce_descs[] = { [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) }, [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) }, [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) }, + [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) }, [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, 
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, + [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc) }, [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) }, + [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc) }, + [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) }, + [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc) }, + [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc) }, }; static bool f12h_mc0_mce(u16 ec, u8 xec) @@ -874,13 +965,12 @@ static void decode_smca_error(struct mce *m) ip_name = smca_get_long_name(bank_type); - pr_emerg(HW_ERR "%s Extended Error Code: %d\n", ip_name, xec); + pr_emerg(HW_ERR "%s Ext. Error Code: %d", ip_name, xec); /* Only print the decode of valid error codes */ if (xec < smca_mce_descs[bank_type].num_descs && (hwid->xec_bitmap & BIT_ULL(xec))) { - pr_emerg(HW_ERR "%s Error: ", ip_name); - pr_cont("%s.\n", smca_mce_descs[bank_type].descs[xec]); + pr_cont(", %s.\n", smca_mce_descs[bank_type].descs[xec]); } if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc) @@ -961,26 +1051,18 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) ((m->status & MCI_STATUS_UC) ? "UE" : (m->status & MCI_STATUS_DEFERRED) ? "-" : "CE"), ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"), - ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"), - ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-")); - - if (fam >= 0x15) { - pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-")); - - /* F15h, bank4, bit 43 is part of McaStatSubCache. */ - if (fam != 0x15 || m->bank != 4) - pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-")); - } + ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"), + ((m->status & MCI_STATUS_PCC) ? 
"PCC" : "-")); if (boot_cpu_has(X86_FEATURE_SMCA)) { u32 low, high; u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); - pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-")); - if (!rdmsr_safe(addr, &low, &high) && (low & MCI_CONFIG_MCAX)) pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-")); + + pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-")); } /* do the two bits[14:13] together */ @@ -988,6 +1070,17 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (ecc) pr_cont("|%sECC", ((ecc == 2) ? "C" : "U")); + if (fam >= 0x15) { + pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-")); + + /* F15h, bank4, bit 43 is part of McaStatSubCache. */ + if (fam != 0x15 || m->bank != 4) + pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-")); + } + + if (fam >= 0x17) + pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-")); + pr_cont("]: 0x%016llx\n", m->status); if (m->status & MCI_STATUS_ADDRV) |