diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-08-04 05:01:00 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-08-04 05:01:00 +0200 |
commit | f8851cb2d0cc2b4f8ac5dadb03148d6b58609b1c (patch) | |
tree | ba8c635c9a02c2ddf496c35689cc13dfcd3d4888 /drivers/edac | |
parent | Merge tag 'arm-newsoc-5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/s... (diff) | |
parent | Merge branch 'edac-ghes' into edac-for-next (diff) | |
download | linux-f8851cb2d0cc2b4f8ac5dadb03148d6b58609b1c.tar.xz linux-f8851cb2d0cc2b4f8ac5dadb03148d6b58609b1c.zip |
Merge tag 'edac_updates_for_5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras
Pull EDAC updates from Tony Luck:
"Boris is on vacation and aske me to send you the EDAC changes"
* tag 'edac_updates_for_5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras:
EDAC: Fix reference count leaks
EDAC: Remove edac_get_dimm_by_index()
EDAC/ghes: Scan the system once on driver init
EDAC/ghes: Remove unused members of struct ghes_edac_pvt, rename it to ghes_pvt
EDAC/ghes: Setup DIMM label from DMI and use it in error reports
EDAC, {skx,i10nm}: Use CPU stepping macro to pass configurations
EDAC/mc: Call edac_inc_ue_error() before panic
EDAC, pnd2: Set MCE_PRIO_EDAC priority for pnd2_mce_dec notifier
Diffstat (limited to 'drivers/edac')
-rw-r--r-- | drivers/edac/edac_device_sysfs.c | 1 | ||||
-rw-r--r-- | drivers/edac/edac_mc.c | 4 | ||||
-rw-r--r-- | drivers/edac/edac_pci_sysfs.c | 2 | ||||
-rw-r--r-- | drivers/edac/ghes_edac.c | 323 | ||||
-rw-r--r-- | drivers/edac/i10nm_base.c | 12 | ||||
-rw-r--r-- | drivers/edac/pnd2_edac.c | 1 | ||||
-rw-r--r-- | drivers/edac/skx_base.c | 2 |
7 files changed, 204 insertions, 141 deletions
diff --git a/drivers/edac/edac_device_sysfs.c b/drivers/edac/edac_device_sysfs.c index 0e7ea3591b78..5e7593753799 100644 --- a/drivers/edac/edac_device_sysfs.c +++ b/drivers/edac/edac_device_sysfs.c @@ -275,6 +275,7 @@ int edac_device_register_sysfs_main_kobj(struct edac_device_ctl_info *edac_dev) /* Error exit stack */ err_kobj_reg: + kobject_put(&edac_dev->kobj); module_put(edac_dev->owner); err_out: diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 5813e931f2f0..01ff71f7b645 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -950,6 +950,8 @@ static void edac_ue_error(struct edac_raw_error_desc *e) e->other_detail); } + edac_inc_ue_error(e); + if (edac_mc_get_panic_on_ue()) { panic("UE %s%son %s (%s page:0x%lx offset:0x%lx grain:%ld%s%s)\n", e->msg, @@ -959,8 +961,6 @@ static void edac_ue_error(struct edac_raw_error_desc *e) *e->other_detail ? " - " : "", e->other_detail); } - - edac_inc_ue_error(e); } static void edac_inc_csrow(struct edac_raw_error_desc *e, int row, int chan) diff --git a/drivers/edac/edac_pci_sysfs.c b/drivers/edac/edac_pci_sysfs.c index 72c9eb9fdffb..53042af7262e 100644 --- a/drivers/edac/edac_pci_sysfs.c +++ b/drivers/edac/edac_pci_sysfs.c @@ -386,7 +386,7 @@ static int edac_pci_main_kobj_setup(void) /* Error unwind statck */ kobject_init_and_add_fail: - kfree(edac_pci_top_main_kobj); + kobject_put(edac_pci_top_main_kobj); kzalloc_fail: module_put(THIS_MODULE); diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index cb3dab56a875..da60c29468a7 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -15,9 +15,7 @@ #include "edac_module.h" #include <ras/ras_event.h> -struct ghes_edac_pvt { - struct list_head list; - struct ghes *ghes; +struct ghes_pvt { struct mem_ctl_info *mci; /* Buffers for the error handling routine */ @@ -32,7 +30,16 @@ static refcount_t ghes_refcount = REFCOUNT_INIT(0); * also provides the necessary (implicit) memory barrier for the SMP * case to make the pointer visible on another CPU. */ -static struct ghes_edac_pvt *ghes_pvt; +static struct ghes_pvt *ghes_pvt; + +/* + * This driver's representation of the system hardware, as collected + * from DMI. + */ +struct ghes_hw_desc { + int num_dimms; + struct dimm_info *dimms; +} ghes_hw; /* GHES registration mutex */ static DEFINE_MUTEX(ghes_reg_mutex); @@ -74,136 +81,165 @@ struct memdev_dmi_entry { u16 conf_mem_clk_speed; } __attribute__((__packed__)); -struct ghes_edac_dimm_fill { - struct mem_ctl_info *mci; - unsigned int count; -}; +static struct dimm_info *find_dimm_by_handle(struct mem_ctl_info *mci, u16 handle) +{ + struct dimm_info *dimm; + + mci_for_each_dimm(mci, dimm) { + if (dimm->smbios_handle == handle) + return dimm; + } + + return NULL; +} -static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg) +static void dimm_setup_label(struct dimm_info *dimm, u16 handle) { - int *num_dimm = arg; + const char *bank = NULL, *device = NULL; + + dmi_memdev_name(handle, &bank, &device); - if (dh->type == DMI_ENTRY_MEM_DEVICE) - (*num_dimm)++; + /* both strings must be non-zero */ + if (bank && *bank && device && *device) + snprintf(dimm->label, sizeof(dimm->label), "%s %s", bank, device); } -static int get_dimm_smbios_index(struct mem_ctl_info *mci, u16 handle) +static void assign_dmi_dimm_info(struct dimm_info *dimm, struct memdev_dmi_entry *entry) { - struct dimm_info *dimm; + u16 rdr_mask = BIT(7) | BIT(13); - mci_for_each_dimm(mci, dimm) { - if (dimm->smbios_handle == handle) - return dimm->idx; + if (entry->size == 0xffff) { + pr_info("Can't get DIMM%i size\n", dimm->idx); + dimm->nr_pages = MiB_TO_PAGES(32);/* Unknown */ + } else if (entry->size == 0x7fff) { + dimm->nr_pages = MiB_TO_PAGES(entry->extended_size); + } else { + if (entry->size & BIT(15)) + dimm->nr_pages = MiB_TO_PAGES((entry->size & 0x7fff) << 10); + else + dimm->nr_pages = MiB_TO_PAGES(entry->size); + } + + switch (entry->memory_type) { + case 0x12: + if (entry->type_detail & BIT(13)) + dimm->mtype = MEM_RDDR; + else + dimm->mtype = MEM_DDR; + break; + case 0x13: + if (entry->type_detail & BIT(13)) + dimm->mtype = MEM_RDDR2; + else + dimm->mtype = MEM_DDR2; + break; + case 0x14: + dimm->mtype = MEM_FB_DDR2; + break; + case 0x18: + if (entry->type_detail & BIT(12)) + dimm->mtype = MEM_NVDIMM; + else if (entry->type_detail & BIT(13)) + dimm->mtype = MEM_RDDR3; + else + dimm->mtype = MEM_DDR3; + break; + case 0x1a: + if (entry->type_detail & BIT(12)) + dimm->mtype = MEM_NVDIMM; + else if (entry->type_detail & BIT(13)) + dimm->mtype = MEM_RDDR4; + else + dimm->mtype = MEM_DDR4; + break; + default: + if (entry->type_detail & BIT(6)) + dimm->mtype = MEM_RMBS; + else if ((entry->type_detail & rdr_mask) == rdr_mask) + dimm->mtype = MEM_RDR; + else if (entry->type_detail & BIT(7)) + dimm->mtype = MEM_SDR; + else if (entry->type_detail & BIT(9)) + dimm->mtype = MEM_EDO; + else + dimm->mtype = MEM_UNKNOWN; } - return -1; + /* + * Actually, we can only detect if the memory has bits for + * checksum or not + */ + if (entry->total_width == entry->data_width) + dimm->edac_mode = EDAC_NONE; + else + dimm->edac_mode = EDAC_SECDED; + + dimm->dtype = DEV_UNKNOWN; + dimm->grain = 128; /* Likely, worse case */ + + dimm_setup_label(dimm, entry->handle); + + if (dimm->nr_pages) { + edac_dbg(1, "DIMM%i: %s size = %d MB%s\n", + dimm->idx, edac_mem_types[dimm->mtype], + PAGES_TO_MiB(dimm->nr_pages), + (dimm->edac_mode != EDAC_NONE) ? "(ECC)" : ""); + edac_dbg(2, "\ttype %d, detail 0x%02x, width %d(total %d)\n", + entry->memory_type, entry->type_detail, + entry->total_width, entry->data_width); + } + + dimm->smbios_handle = entry->handle; } -static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg) +static void enumerate_dimms(const struct dmi_header *dh, void *arg) { - struct ghes_edac_dimm_fill *dimm_fill = arg; - struct mem_ctl_info *mci = dimm_fill->mci; - - if (dh->type == DMI_ENTRY_MEM_DEVICE) { - struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh; - struct dimm_info *dimm = edac_get_dimm(mci, dimm_fill->count, 0, 0); - u16 rdr_mask = BIT(7) | BIT(13); - - if (entry->size == 0xffff) { - pr_info("Can't get DIMM%i size\n", - dimm_fill->count); - dimm->nr_pages = MiB_TO_PAGES(32);/* Unknown */ - } else if (entry->size == 0x7fff) { - dimm->nr_pages = MiB_TO_PAGES(entry->extended_size); - } else { - if (entry->size & BIT(15)) - dimm->nr_pages = MiB_TO_PAGES((entry->size & 0x7fff) << 10); - else - dimm->nr_pages = MiB_TO_PAGES(entry->size); - } + struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh; + struct ghes_hw_desc *hw = (struct ghes_hw_desc *)arg; + struct dimm_info *d; - switch (entry->memory_type) { - case 0x12: - if (entry->type_detail & BIT(13)) - dimm->mtype = MEM_RDDR; - else - dimm->mtype = MEM_DDR; - break; - case 0x13: - if (entry->type_detail & BIT(13)) - dimm->mtype = MEM_RDDR2; - else - dimm->mtype = MEM_DDR2; - break; - case 0x14: - dimm->mtype = MEM_FB_DDR2; - break; - case 0x18: - if (entry->type_detail & BIT(12)) - dimm->mtype = MEM_NVDIMM; - else if (entry->type_detail & BIT(13)) - dimm->mtype = MEM_RDDR3; - else - dimm->mtype = MEM_DDR3; - break; - case 0x1a: - if (entry->type_detail & BIT(12)) - dimm->mtype = MEM_NVDIMM; - else if (entry->type_detail & BIT(13)) - dimm->mtype = MEM_RDDR4; - else - dimm->mtype = MEM_DDR4; - break; - default: - if (entry->type_detail & BIT(6)) - dimm->mtype = MEM_RMBS; - else if ((entry->type_detail & rdr_mask) == rdr_mask) - dimm->mtype = MEM_RDR; - else if (entry->type_detail & BIT(7)) - dimm->mtype = MEM_SDR; - else if (entry->type_detail & BIT(9)) - dimm->mtype = MEM_EDO; - else - dimm->mtype = MEM_UNKNOWN; - } + if (dh->type != DMI_ENTRY_MEM_DEVICE) + return; - /* - * Actually, we can only detect if the memory has bits for - * checksum or not - */ - if (entry->total_width == entry->data_width) - dimm->edac_mode = EDAC_NONE; - else - dimm->edac_mode = EDAC_SECDED; + /* Enlarge the array with additional 16 */ + if (!hw->num_dimms || !(hw->num_dimms % 16)) { + struct dimm_info *new; - dimm->dtype = DEV_UNKNOWN; - dimm->grain = 128; /* Likely, worse case */ - - /* - * FIXME: It shouldn't be hard to also fill the DIMM labels - */ - - if (dimm->nr_pages) { - edac_dbg(1, "DIMM%i: %s size = %d MB%s\n", - dimm_fill->count, edac_mem_types[dimm->mtype], - PAGES_TO_MiB(dimm->nr_pages), - (dimm->edac_mode != EDAC_NONE) ? "(ECC)" : ""); - edac_dbg(2, "\ttype %d, detail 0x%02x, width %d(total %d)\n", - entry->memory_type, entry->type_detail, - entry->total_width, entry->data_width); + new = krealloc(hw->dimms, (hw->num_dimms + 16) * sizeof(struct dimm_info), + GFP_KERNEL); + if (!new) { + WARN_ON_ONCE(1); + return; } - dimm->smbios_handle = entry->handle; - - dimm_fill->count++; + hw->dimms = new; } + + d = &hw->dimms[hw->num_dimms]; + d->idx = hw->num_dimms; + + assign_dmi_dimm_info(d, entry); + + hw->num_dimms++; +} + +static void ghes_scan_system(void) +{ + static bool scanned; + + if (scanned) + return; + + dmi_walk(enumerate_dimms, &ghes_hw); + + scanned = true; } void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) { struct edac_raw_error_desc *e; struct mem_ctl_info *mci; - struct ghes_edac_pvt *pvt; + struct ghes_pvt *pvt; unsigned long flags; char *p; @@ -228,7 +264,6 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) memset(e, 0, sizeof (*e)); e->error_count = 1; e->grain = 1; - strcpy(e->label, "unknown label"); e->msg = pvt->msg; e->other_detail = pvt->other_detail; e->top_layer = -1; @@ -345,7 +380,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos); if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) { const char *bank = NULL, *device = NULL; - int index = -1; + struct dimm_info *dimm; dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device); if (bank != NULL && device != NULL) @@ -354,13 +389,18 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) p += sprintf(p, "DIMM DMI handle: 0x%.4x ", mem_err->mem_dev_handle); - index = get_dimm_smbios_index(mci, mem_err->mem_dev_handle); - if (index >= 0) - e->top_layer = index; + dimm = find_dimm_by_handle(mci, mem_err->mem_dev_handle); + if (dimm) { + e->top_layer = dimm->idx; + strcpy(e->label, dimm->label); + } } if (p > e->location) *(p - 1) = '\0'; + if (!*e->label) + strcpy(e->label, "unknown memory"); + /* All other fields are mapped on e->other_detail */ p = pvt->other_detail; p += snprintf(p, sizeof(pvt->other_detail), @@ -455,13 +495,12 @@ static struct acpi_platform_list plat_list[] = { int ghes_edac_register(struct ghes *ghes, struct device *dev) { bool fake = false; - int rc = 0, num_dimm = 0; struct mem_ctl_info *mci; - struct ghes_edac_pvt *pvt; + struct ghes_pvt *pvt; struct edac_mc_layer layers[1]; - struct ghes_edac_dimm_fill dimm_fill; unsigned long flags; int idx = -1; + int rc = 0; if (IS_ENABLED(CONFIG_X86)) { /* Check if safe to enable on this system */ @@ -481,20 +520,19 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) if (refcount_inc_not_zero(&ghes_refcount)) goto unlock; - /* Get the number of DIMMs */ - dmi_walk(ghes_edac_count_dimms, &num_dimm); + ghes_scan_system(); /* Check if we've got a bogus BIOS */ - if (num_dimm == 0) { + if (!ghes_hw.num_dimms) { fake = true; - num_dimm = 1; + ghes_hw.num_dimms = 1; } layers[0].type = EDAC_MC_LAYER_ALL_MEM; - layers[0].size = num_dimm; + layers[0].size = ghes_hw.num_dimms; layers[0].is_virt_csrow = true; - mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(struct ghes_edac_pvt)); + mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(struct ghes_pvt)); if (!mci) { pr_info("Can't allocate memory for EDAC data\n"); rc = -ENOMEM; @@ -502,7 +540,6 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) } pvt = mci->pvt_info; - pvt->ghes = ghes; pvt->mci = mci; mci->pdev = dev; @@ -523,13 +560,34 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) pr_info("So, the end result of using this driver varies from vendor to vendor.\n"); pr_info("If you find incorrect reports, please contact your hardware vendor\n"); pr_info("to correct its BIOS.\n"); - pr_info("This system has %d DIMM sockets.\n", num_dimm); + pr_info("This system has %d DIMM sockets.\n", ghes_hw.num_dimms); } if (!fake) { - dimm_fill.count = 0; - dimm_fill.mci = mci; - dmi_walk(ghes_edac_dmidecode, &dimm_fill); + struct dimm_info *src, *dst; + int i = 0; + + mci_for_each_dimm(mci, dst) { + src = &ghes_hw.dimms[i]; + + dst->idx = src->idx; + dst->smbios_handle = src->smbios_handle; + dst->nr_pages = src->nr_pages; + dst->mtype = src->mtype; + dst->edac_mode = src->edac_mode; + dst->dtype = src->dtype; + dst->grain = src->grain; + + /* + * If no src->label, preserve default label assigned + * from EDAC core. + */ + if (strlen(src->label)) + memcpy(dst->label, src->label, sizeof(src->label)); + + i++; + } + } else { struct dimm_info *dimm = edac_get_dimm(mci, 0, 0, 0); @@ -542,7 +600,7 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) rc = edac_mc_add_mc(mci); if (rc < 0) { - pr_info("Can't register at EDAC core\n"); + pr_info("Can't register with the EDAC core\n"); edac_mc_free(mci); rc = -ENODEV; goto unlock; @@ -556,6 +614,11 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) refcount_set(&ghes_refcount, 1); unlock: + + /* Not needed anymore */ + kfree(ghes_hw.dimms); + ghes_hw.dimms = NULL; + mutex_unlock(&ghes_reg_mutex); return rc; diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 9b0044cd21cd..c8d11da85bec 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -135,9 +135,11 @@ static struct res_config i10nm_cfg1 = { }; static const struct x86_cpu_id i10nm_cpuids[] = { - X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &i10nm_cfg0), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &i10nm_cfg0), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &i10nm_cfg1), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPINGS(0x0, 0x3), &i10nm_cfg0), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPINGS(0x4, 0xf), &i10nm_cfg1), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ICELAKE_X, X86_STEPPINGS(0x0, 0x3), &i10nm_cfg0), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ICELAKE_X, X86_STEPPINGS(0x4, 0xf), &i10nm_cfg1), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ICELAKE_D, X86_STEPPINGS(0x0, 0xf), &i10nm_cfg1), {} }; MODULE_DEVICE_TABLE(x86cpu, i10nm_cpuids); @@ -264,10 +266,6 @@ static int __init i10nm_init(void) cfg = (struct res_config *)id->driver_data; - /* Newer steppings have different offset for ATOM_TREMONT_D/ICELAKE_X */ - if (boot_cpu_data.x86_stepping >= 4) - cfg->busno_cfg_offset = 0xd0; - rc = skx_get_hi_lo(0x09a2, off, &tolm, &tohm); if (rc) return rc; diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c index c1f2e6deb021..fd363746f5b0 100644 --- a/drivers/edac/pnd2_edac.c +++ b/drivers/edac/pnd2_edac.c @@ -1432,6 +1432,7 @@ static int pnd2_mce_check_error(struct notifier_block *nb, unsigned long val, vo static struct notifier_block pnd2_mce_dec = { .notifier_call = pnd2_mce_check_error, + .priority = MCE_PRIO_EDAC, }; #ifdef CONFIG_EDAC_DEBUG diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index b907a0f4ece6..2c7db95df326 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -164,7 +164,7 @@ static struct res_config skx_cfg = { }; static const struct x86_cpu_id skx_cpuids[] = { - X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &skx_cfg), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x0, 0xf), &skx_cfg), { } }; MODULE_DEVICE_TABLE(x86cpu, skx_cpuids); |