summaryrefslogtreecommitdiffstats
path: root/drivers/edac
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-03-08 18:11:39 +0100
committerLinus Torvalds <torvalds@linux-foundation.org>2019-03-08 18:11:39 +0100
commite13284da944df29ab08e8a9d2a50fc0ad1d858ab (patch)
tree8e6e2580d27cf4fe5f712e0857dc495aa52fd27e /drivers/edac
parentMerge tag 'edac_for_5.1' of git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp (diff)
parentx86/mce: Improve error message when kernel cannot recover, p2 (diff)
downloadlinux-e13284da944df29ab08e8a9d2a50fc0ad1d858ab.tar.xz
linux-e13284da944df29ab08e8a9d2a50fc0ad1d858ab.zip
Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull RAS updates from Borislav Petkov: "This time around we have in store: - Disable MC4_MISC thresholding banks on all AMD family 0x15 models (Shirish S) - AMD MCE error descriptions update and error decode improvements (Yazen Ghannam) - The usual smaller conversions and fixes" * 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/mce: Improve error message when kernel cannot recover, p2 EDAC/mce_amd: Decode MCA_STATUS in bit definition order EDAC/mce_amd: Decode MCA_STATUS[Scrub] bit EDAC, mce_amd: Print ExtErrorCode and description on a single line EDAC, mce_amd: Match error descriptions to latest documentation x86/MCE/AMD, EDAC/mce_amd: Add new error descriptions for some SMCA bank types x86/MCE/AMD, EDAC/mce_amd: Add new McaTypes for CS, PSP, and SMU units x86/MCE/AMD, EDAC/mce_amd: Add new MP5, NBIO, and PCIE SMCA bank types RAS: Add a MAINTAINERS entry RAS: Use consistent types for UUIDs x86/MCE/AMD: Carve out the MC4_MISC thresholding quirk x86/MCE/AMD: Turn off MC4_MISC thresholding on all family 0x15 models x86/MCE: Switch to use the new generic UUID API
Diffstat (limited to 'drivers/edac')
-rw-r--r--drivers/edac/mce_amd.c291
1 files changed, 192 insertions, 99 deletions
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index c605089d899f..0a1814dad6cf 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -151,138 +151,223 @@ static const char * const mc6_mce_desc[] = {
/* Scalable MCA error strings */
static const char * const smca_ls_mce_desc[] = {
- "Load queue parity",
- "Store queue parity",
- "Miss address buffer payload parity",
- "L1 TLB parity",
- "Reserved",
- "DC tag error type 6",
- "DC tag error type 1",
+ "Load queue parity error",
+ "Store queue parity error",
+ "Miss address buffer payload parity error",
+ "Level 1 TLB parity error",
+ "DC Tag error type 5",
+ "DC Tag error type 6",
+ "DC Tag error type 1",
"Internal error type 1",
"Internal error type 2",
- "Sys Read data error thread 0",
- "Sys read data error thread 1",
- "DC tag error type 2",
- "DC data error type 1 (poison consumption)",
- "DC data error type 2",
- "DC data error type 3",
- "DC tag error type 4",
- "L2 TLB parity",
+ "System Read Data Error Thread 0",
+ "System Read Data Error Thread 1",
+ "DC Tag error type 2",
+ "DC Data error type 1 and poison consumption",
+ "DC Data error type 2",
+ "DC Data error type 3",
+ "DC Tag error type 4",
+ "Level 2 TLB parity error",
"PDC parity error",
- "DC tag error type 3",
- "DC tag error type 5",
- "L2 fill data error",
+ "DC Tag error type 3",
+ "DC Tag error type 5",
+ "L2 Fill Data error",
};
static const char * const smca_if_mce_desc[] = {
- "microtag probe port parity error",
- "IC microtag or full tag multi-hit error",
- "IC full tag parity",
- "IC data array parity",
- "Decoupling queue phys addr parity error",
- "L0 ITLB parity error",
- "L1 ITLB parity error",
- "L2 ITLB parity error",
- "BPQ snoop parity on Thread 0",
- "BPQ snoop parity on Thread 1",
- "L1 BTB multi-match error",
- "L2 BTB multi-match error",
- "L2 Cache Response Poison error",
- "System Read Data error",
+ "Op Cache Microtag Probe Port Parity Error",
+ "IC Microtag or Full Tag Multi-hit Error",
+ "IC Full Tag Parity Error",
+ "IC Data Array Parity Error",
+ "Decoupling Queue PhysAddr Parity Error",
+ "L0 ITLB Parity Error",
+ "L1 ITLB Parity Error",
+ "L2 ITLB Parity Error",
+ "BPQ Thread 0 Snoop Parity Error",
+ "BPQ Thread 1 Snoop Parity Error",
+ "L1 BTB Multi-Match Error",
+ "L2 BTB Multi-Match Error",
+ "L2 Cache Response Poison Error",
+ "System Read Data Error",
};
static const char * const smca_l2_mce_desc[] = {
- "L2M tag multi-way-hit error",
- "L2M tag ECC error",
- "L2M data ECC error",
- "HW assert",
+ "L2M Tag Multiple-Way-Hit error",
+ "L2M Tag or State Array ECC Error",
+ "L2M Data Array ECC Error",
+ "Hardware Assert Error",
};
static const char * const smca_de_mce_desc[] = {
- "uop cache tag parity error",
- "uop cache data parity error",
- "Insn buffer parity error",
- "uop queue parity error",
- "Insn dispatch queue parity error",
- "Fetch address FIFO parity",
- "Patch RAM data parity",
- "Patch RAM sequencer parity",
- "uop buffer parity"
+ "Micro-op cache tag parity error",
+ "Micro-op cache data parity error",
+ "Instruction buffer parity error",
+ "Micro-op queue parity error",
+ "Instruction dispatch queue parity error",
+ "Fetch address FIFO parity error",
+ "Patch RAM data parity error",
+ "Patch RAM sequencer parity error",
+ "Micro-op buffer parity error"
};
static const char * const smca_ex_mce_desc[] = {
- "Watchdog timeout error",
- "Phy register file parity",
- "Flag register file parity",
- "Immediate displacement register file parity",
- "Address generator payload parity",
- "EX payload parity",
- "Checkpoint queue parity",
- "Retire dispatch queue parity",
+ "Watchdog Timeout error",
+ "Physical register file parity error",
+ "Flag register file parity error",
+ "Immediate displacement register file parity error",
+ "Address generator payload parity error",
+ "EX payload parity error",
+ "Checkpoint queue parity error",
+ "Retire dispatch queue parity error",
"Retire status queue parity error",
"Scheduling queue parity error",
"Branch buffer queue parity error",
+ "Hardware Assertion error",
};
static const char * const smca_fp_mce_desc[] = {
- "Physical register file parity",
- "Freelist parity error",
- "Schedule queue parity",
+ "Physical register file (PRF) parity error",
+ "Freelist (FL) parity error",
+ "Schedule queue parity error",
"NSQ parity error",
- "Retire queue parity",
- "Status register file parity",
+ "Retire queue (RQ) parity error",
+ "Status register file (SRF) parity error",
"Hardware assertion",
};
static const char * const smca_l3_mce_desc[] = {
- "Shadow tag macro ECC error",
- "Shadow tag macro multi-way-hit error",
- "L3M tag ECC error",
- "L3M tag multi-way-hit error",
- "L3M data ECC error",
- "XI parity, L3 fill done channel error",
- "L3 victim queue parity",
- "L3 HW assert",
+ "Shadow Tag Macro ECC Error",
+ "Shadow Tag Macro Multi-way-hit Error",
+ "L3M Tag ECC Error",
+ "L3M Tag Multi-way-hit Error",
+ "L3M Data ECC Error",
+ "SDP Parity Error or SystemReadDataError from XI",
+ "L3 Victim Queue Parity Error",
+ "L3 Hardware Assertion",
};
static const char * const smca_cs_mce_desc[] = {
- "Illegal request from transport layer",
- "Address violation",
- "Security violation",
- "Illegal response from transport layer",
- "Unexpected response",
- "Parity error on incoming request or probe response data",
- "Parity error on incoming read response data",
- "Atomic request parity",
- "ECC error on probe filter access",
+ "Illegal Request",
+ "Address Violation",
+ "Security Violation",
+ "Illegal Response",
+ "Unexpected Response",
+ "Request or Probe Parity Error",
+ "Read Response Parity Error",
+ "Atomic Request Parity Error",
+ "Probe Filter ECC Error",
+};
+
+static const char * const smca_cs2_mce_desc[] = {
+ "Illegal Request",
+ "Address Violation",
+ "Security Violation",
+ "Illegal Response",
+ "Unexpected Response",
+ "Request or Probe Parity Error",
+ "Read Response Parity Error",
+ "Atomic Request Parity Error",
+ "SDP read response had no match in the CS queue",
+ "Probe Filter Protocol Error",
+ "Probe Filter ECC Error",
+ "SDP read response had an unexpected RETRY error",
+ "Counter overflow error",
+ "Counter underflow error",
};
static const char * const smca_pie_mce_desc[] = {
- "HW assert",
- "Internal PIE register security violation",
- "Error on GMI link",
- "Poison data written to internal PIE register",
+ "Hardware Assert",
+ "Register security violation",
+ "Link Error",
+ "Poison data consumption",
+ "A deferred error was detected in the DF"
};
static const char * const smca_umc_mce_desc[] = {
"DRAM ECC error",
- "Data poison error on DRAM",
+ "Data poison error",
"SDP parity error",
"Advanced peripheral bus error",
- "Command/address parity error",
+ "Address/Command parity error",
"Write data CRC error",
+ "DCQ SRAM ECC error",
+ "AES SRAM ECC error",
};
static const char * const smca_pb_mce_desc[] = {
- "Parameter Block RAM ECC error",
+ "An ECC error in the Parameter Block RAM array",
};
static const char * const smca_psp_mce_desc[] = {
- "PSP RAM ECC or parity error",
+ "An ECC or parity error in a PSP RAM instance",
+};
+
+static const char * const smca_psp2_mce_desc[] = {
+ "High SRAM ECC or parity error",
+ "Low SRAM ECC or parity error",
+ "Instruction Cache Bank 0 ECC or parity error",
+ "Instruction Cache Bank 1 ECC or parity error",
+ "Instruction Tag Ram 0 parity error",
+ "Instruction Tag Ram 1 parity error",
+ "Data Cache Bank 0 ECC or parity error",
+ "Data Cache Bank 1 ECC or parity error",
+ "Data Cache Bank 2 ECC or parity error",
+ "Data Cache Bank 3 ECC or parity error",
+ "Data Tag Bank 0 parity error",
+ "Data Tag Bank 1 parity error",
+ "Data Tag Bank 2 parity error",
+ "Data Tag Bank 3 parity error",
+ "Dirty Data Ram parity error",
+ "TLB Bank 0 parity error",
+ "TLB Bank 1 parity error",
+ "System Hub Read Buffer ECC or parity error",
};
static const char * const smca_smu_mce_desc[] = {
- "SMU RAM ECC or parity error",
+ "An ECC or parity error in an SMU RAM instance",
+};
+
+static const char * const smca_smu2_mce_desc[] = {
+ "High SRAM ECC or parity error",
+ "Low SRAM ECC or parity error",
+ "Data Cache Bank A ECC or parity error",
+ "Data Cache Bank B ECC or parity error",
+ "Data Tag Cache Bank A ECC or parity error",
+ "Data Tag Cache Bank B ECC or parity error",
+ "Instruction Cache Bank A ECC or parity error",
+ "Instruction Cache Bank B ECC or parity error",
+ "Instruction Tag Cache Bank A ECC or parity error",
+ "Instruction Tag Cache Bank B ECC or parity error",
+ "System Hub Read Buffer ECC or parity error",
+};
+
+static const char * const smca_mp5_mce_desc[] = {
+ "High SRAM ECC or parity error",
+ "Low SRAM ECC or parity error",
+ "Data Cache Bank A ECC or parity error",
+ "Data Cache Bank B ECC or parity error",
+ "Data Tag Cache Bank A ECC or parity error",
+ "Data Tag Cache Bank B ECC or parity error",
+ "Instruction Cache Bank A ECC or parity error",
+ "Instruction Cache Bank B ECC or parity error",
+ "Instruction Tag Cache Bank A ECC or parity error",
+ "Instruction Tag Cache Bank B ECC or parity error",
+};
+
+static const char * const smca_nbio_mce_desc[] = {
+ "ECC or Parity error",
+ "PCIE error",
+ "SDP ErrEvent error",
+ "SDP Egress Poison Error",
+ "IOHC Internal Poison Error",
+};
+
+static const char * const smca_pcie_mce_desc[] = {
+ "CCIX PER Message logging",
+ "CCIX Read Response with Status: Non-Data Error",
+ "CCIX Write Response with Status: Non-Data Error",
+ "CCIX Read Response with Status: Data Error",
+ "CCIX Non-okay write response with data error",
};
struct smca_mce_desc {
@@ -299,11 +384,17 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) },
[SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) },
[SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) },
+ [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) },
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
[SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
[SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
+ [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc) },
[SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) },
+ [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc) },
+ [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) },
+ [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc) },
+ [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc) },
};
static bool f12h_mc0_mce(u16 ec, u8 xec)
@@ -874,13 +965,12 @@ static void decode_smca_error(struct mce *m)
ip_name = smca_get_long_name(bank_type);
- pr_emerg(HW_ERR "%s Extended Error Code: %d\n", ip_name, xec);
+ pr_emerg(HW_ERR "%s Ext. Error Code: %d", ip_name, xec);
/* Only print the decode of valid error codes */
if (xec < smca_mce_descs[bank_type].num_descs &&
(hwid->xec_bitmap & BIT_ULL(xec))) {
- pr_emerg(HW_ERR "%s Error: ", ip_name);
- pr_cont("%s.\n", smca_mce_descs[bank_type].descs[xec]);
+ pr_cont(", %s.\n", smca_mce_descs[bank_type].descs[xec]);
}
if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc)
@@ -961,26 +1051,18 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
((m->status & MCI_STATUS_UC) ? "UE" :
(m->status & MCI_STATUS_DEFERRED) ? "-" : "CE"),
((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
- ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"),
- ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));
-
- if (fam >= 0x15) {
- pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));
-
- /* F15h, bank4, bit 43 is part of McaStatSubCache. */
- if (fam != 0x15 || m->bank != 4)
- pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
- }
+ ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"),
+ ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"));
if (boot_cpu_has(X86_FEATURE_SMCA)) {
u32 low, high;
u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
- pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
-
if (!rdmsr_safe(addr, &low, &high) &&
(low & MCI_CONFIG_MCAX))
pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
+
+ pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
}
/* do the two bits[14:13] together */
@@ -988,6 +1070,17 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
if (ecc)
pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
+ if (fam >= 0x15) {
+ pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));
+
+ /* F15h, bank4, bit 43 is part of McaStatSubCache. */
+ if (fam != 0x15 || m->bank != 4)
+ pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
+ }
+
+ if (fam >= 0x17)
+ pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));
+
pr_cont("]: 0x%016llx\n", m->status);
if (m->status & MCI_STATUS_ADDRV)