From 7556419180a304b9999c2c33162aec3da2d0deb9 Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Wed, 28 Sep 2022 08:48:15 -0400 Subject: EDAC/i5000: Mark as BROKEN i5000_edac supports very old hardware which isn't available and it's been broken for single/dual channel for many years without anyone noticing. Marking as BROKEN. Signed-off-by: Aristeu Rozanski Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220921181009.oxytvicy6sry6it7@redhat.com --- drivers/edac/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 456602d373b7..70a84aba2b48 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -211,6 +211,7 @@ config EDAC_R82600 config EDAC_I5000 tristate "Intel Greencreek/Blackford chipset" depends on X86 && PCI + depends on BROKEN help Support for error detection and correction the Intel Greekcreek/Blackford chipsets. -- cgit v1.2.3 From 5012524eb051fccbb9f8729f0de1df7b7783333b Mon Sep 17 00:00:00 2001 From: Jia He Date: Mon, 10 Oct 2022 02:35:53 +0000 Subject: efi/cper: Export several helpers for ghes_edac to use Before ghes_edac can be turned back into a proper module again, export several helpers which are going to be used by it. Signed-off-by: Jia He Signed-off-by: Borislav Petkov Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20221010023559.69655-2-justin.he@arm.com --- drivers/firmware/efi/cper.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers') diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index e4e5ea7ce910..053eae13f409 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c @@ -290,6 +290,7 @@ int cper_mem_err_location(struct cper_mem_err_compact *mem, char *msg) return n; } +EXPORT_SYMBOL_GPL(cper_mem_err_location); int cper_dimm_err_location(struct cper_mem_err_compact *mem, char *msg) { @@ -310,6 +311,7 @@ int cper_dimm_err_location(struct cper_mem_err_compact *mem, char *msg) return n; } +EXPORT_SYMBOL_GPL(cper_dimm_err_location); void cper_mem_err_pack(const struct cper_sec_mem_err *mem, struct cper_mem_err_compact *cmem) @@ -331,6 +333,7 @@ void cper_mem_err_pack(const struct cper_sec_mem_err *mem, cmem->mem_array_handle = mem->mem_array_handle; cmem->mem_dev_handle = mem->mem_dev_handle; } +EXPORT_SYMBOL_GPL(cper_mem_err_pack); const char *cper_mem_err_unpack(struct trace_seq *p, struct cper_mem_err_compact *cmem) -- cgit v1.2.3 From 8e40612f6146da1333e9bb5cfd9af7511c063d93 Mon Sep 17 00:00:00 2001 From: Jia He Date: Mon, 10 Oct 2022 02:35:54 +0000 Subject: EDAC/ghes: Add a notifier for reporting memory errors In order to make it a proper module and disentangle it from facilities, add a notifier for reporting memory errors. Use an atomic notifier because calls sites like ghes_proc_in_irq() run in interrupt context. [ bp: Massage commit message. ] Suggested-by: Borislav Petkov Signed-off-by: Jia He Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221010023559.69655-3-justin.he@arm.com --- drivers/acpi/apei/ghes.c | 16 +++++++++++++++- drivers/edac/ghes_edac.c | 19 +++++++++++++++++-- include/acpi/ghes.h | 10 +++------- 3 files changed, 35 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 80ad530583c9..55013e024ba3 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -94,6 +94,8 @@ #define FIX_APEI_GHES_SDEI_CRITICAL __end_of_fixed_addresses #endif +static ATOMIC_NOTIFIER_HEAD(ghes_report_chain); + static inline bool is_hest_type_generic_v2(struct ghes *ghes) { return ghes->generic->header.type == ACPI_HEST_TYPE_GENERIC_ERROR_V2; @@ -645,7 +647,7 @@ static bool ghes_do_proc(struct ghes *ghes, if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) { struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); - ghes_edac_report_mem_error(sev, mem_err); + atomic_notifier_call_chain(&ghes_report_chain, sev, mem_err); arch_apei_report_mem_error(sev, mem_err); queued = ghes_handle_memory_failure(gdata, sev); @@ -1497,3 +1499,15 @@ void __init acpi_ghes_init(void) else pr_info(GHES_PFX "Failed to enable APEI firmware first mode.\n"); } + +void ghes_register_report_chain(struct notifier_block *nb) +{ + atomic_notifier_chain_register(&ghes_report_chain, nb); +} +EXPORT_SYMBOL_GPL(ghes_register_report_chain); + +void ghes_unregister_report_chain(struct notifier_block *nb) +{ + atomic_notifier_chain_unregister(&ghes_report_chain, nb); +} +EXPORT_SYMBOL_GPL(ghes_unregister_report_chain); diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index c8fa7dcfdbd0..7b8d56a769f6 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -14,6 +14,7 @@ #include #include "edac_module.h" #include +#include #define OTHER_DETAIL_LEN 400 @@ -267,11 +268,14 @@ out: return n; } -void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) +static int ghes_edac_report_mem_error(struct notifier_block *nb, + unsigned long val, void *data) { + struct cper_sec_mem_err *mem_err = (struct cper_sec_mem_err *)data; struct cper_mem_err_compact cmem; struct edac_raw_error_desc *e; struct mem_ctl_info *mci; + unsigned long sev = val; struct ghes_pvt *pvt; unsigned long flags; char *p; @@ -282,7 +286,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) * know. */ if (WARN_ON_ONCE(in_nmi())) - return; + return NOTIFY_OK; spin_lock_irqsave(&ghes_lock, flags); @@ -374,8 +378,15 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) unlock: spin_unlock_irqrestore(&ghes_lock, flags); + + return NOTIFY_OK; } +static struct notifier_block ghes_edac_mem_err_nb = { + .notifier_call = ghes_edac_report_mem_error, + .priority = 0, +}; + /* * Known systems that are safe to enable this module. */ @@ -503,6 +514,8 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) ghes_pvt = pvt; spin_unlock_irqrestore(&ghes_lock, flags); + ghes_register_report_chain(&ghes_edac_mem_err_nb); + /* only set on success */ refcount_set(&ghes_refcount, 1); @@ -548,6 +561,8 @@ void ghes_edac_unregister(struct ghes *ghes) if (mci) edac_mc_free(mci); + ghes_unregister_report_chain(&ghes_edac_mem_err_nb); + unlock: mutex_unlock(&ghes_reg_mutex); } diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h index 34fb3431a8f3..5cbd38b6e4e1 100644 --- a/include/acpi/ghes.h +++ b/include/acpi/ghes.h @@ -76,18 +76,11 @@ int ghes_estatus_pool_init(int num_ghes); /* From drivers/edac/ghes_edac.c */ #ifdef CONFIG_EDAC_GHES -void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err); - int ghes_edac_register(struct ghes *ghes, struct device *dev); void ghes_edac_unregister(struct ghes *ghes); #else -static inline void ghes_edac_report_mem_error(int sev, - struct cper_sec_mem_err *mem_err) -{ -} - static inline int ghes_edac_register(struct ghes *ghes, struct device *dev) { return -ENODEV; @@ -145,4 +138,7 @@ int ghes_notify_sea(void); static inline int ghes_notify_sea(void) { return -ENOENT; } #endif +struct notifier_block; +extern void ghes_register_report_chain(struct notifier_block *nb); +extern void ghes_unregister_report_chain(struct notifier_block *nb); #endif /* GHES_H */ -- cgit v1.2.3 From 9057a3f7ac360e068ceb261938e9ae2b1a7e654c Mon Sep 17 00:00:00 2001 From: Jia He Date: Mon, 10 Oct 2022 02:35:55 +0000 Subject: EDAC/ghes: Prepare to make ghes_edac a proper module To make ghes_edac a proper module, prepare to decouple its dependencies from GHES. Move the ghes_edac.force_load parameter to ghes.c in order to properly control whether ghes_edac should be force-loaded: In ghes_edac_register() it is too late to set the module flag. Introduce a helper ghes_get_devices(), which returns the list of GHES devices which got probed when the platform-check passes on the system. The previous force_load check is not needed in ghes_edac_unregister() since it will be checked in the module's init function of ghes_edac later. [ bp: Massage. ] Suggested-by: Toshi Kani Suggested-by: Borislav Petkov Signed-off-by: Jia He Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221010023559.69655-4-justin.he@arm.com --- drivers/acpi/apei/ghes.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++ drivers/edac/ghes_edac.c | 35 ++------------------------------- include/acpi/ghes.h | 6 ++++++ 3 files changed, 58 insertions(+), 33 deletions(-) (limited to 'drivers') diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 55013e024ba3..acab512741f6 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -109,6 +109,13 @@ static inline bool is_hest_type_generic_v2(struct ghes *ghes) bool ghes_disable; module_param_named(disable, ghes_disable, bool, 0); +/* + * "ghes.edac_force_enable" forcibly enables ghes_edac and skips the platform + * check. + */ +static bool ghes_edac_force_enable; +module_param_named(edac_force_enable, ghes_edac_force_enable, bool, 0); + /* * All error sources notified with HED (Hardware Error Device) share a * single notifier callback, so they need to be linked and checked one @@ -120,6 +127,13 @@ module_param_named(disable, ghes_disable, bool, 0); static LIST_HEAD(ghes_hed); static DEFINE_MUTEX(ghes_list_mutex); +/* + * A list of GHES devices which are given to the corresponding EDAC driver + * ghes_edac for further use. + */ +static LIST_HEAD(ghes_devs); +static DEFINE_MUTEX(ghes_devs_mutex); + /* * Because the memory area used to transfer hardware error information * from BIOS to Linux can be determined only in NMI, IRQ or timer @@ -1380,6 +1394,12 @@ static int ghes_probe(struct platform_device *ghes_dev) ghes_edac_register(ghes, &ghes_dev->dev); + ghes->dev = &ghes_dev->dev; + + mutex_lock(&ghes_devs_mutex); + list_add_tail(&ghes->elist, &ghes_devs); + mutex_unlock(&ghes_devs_mutex); + /* Handle any pending errors right away */ spin_lock_irqsave(&ghes_notify_lock_irq, flags); ghes_proc(ghes); @@ -1444,6 +1464,10 @@ static int ghes_remove(struct platform_device *ghes_dev) ghes_edac_unregister(ghes); + mutex_lock(&ghes_devs_mutex); + list_del(&ghes->elist); + mutex_unlock(&ghes_devs_mutex); + kfree(ghes); platform_set_drvdata(ghes_dev, NULL); @@ -1500,6 +1524,32 @@ void __init acpi_ghes_init(void) pr_info(GHES_PFX "Failed to enable APEI firmware first mode.\n"); } +/* + * Known x86 systems that prefer GHES error reporting: + */ +static struct acpi_platform_list plat_list[] = { + {"HPE ", "Server ", 0, ACPI_SIG_FADT, all_versions}, + { } /* End */ +}; + +struct list_head *ghes_get_devices(void) +{ + int idx = -1; + + if (IS_ENABLED(CONFIG_X86)) { + idx = acpi_match_platform_list(plat_list); + if (idx < 0) { + if (!ghes_edac_force_enable) + return NULL; + + pr_warn_once("Force-loading ghes_edac on an unsupported platform. You're on your own!\n"); + } + } + + return &ghes_devs; +} +EXPORT_SYMBOL_GPL(ghes_get_devices); + void ghes_register_report_chain(struct notifier_block *nb) { atomic_notifier_chain_register(&ghes_report_chain, nb); diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index 7b8d56a769f6..b85a545d1cb0 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -54,10 +54,6 @@ static DEFINE_MUTEX(ghes_reg_mutex); */ static DEFINE_SPINLOCK(ghes_lock); -/* "ghes_edac.force_load=1" skips the platform check */ -static bool __read_mostly force_load; -module_param(force_load, bool, 0); - static bool system_scanned; /* Memory Device - Type 17 of SMBIOS spec */ @@ -387,14 +383,6 @@ static struct notifier_block ghes_edac_mem_err_nb = { .priority = 0, }; -/* - * Known systems that are safe to enable this module. - */ -static struct acpi_platform_list plat_list[] = { - {"HPE ", "Server ", 0, ACPI_SIG_FADT, all_versions}, - { } /* End */ -}; - int ghes_edac_register(struct ghes *ghes, struct device *dev) { bool fake = false; @@ -402,19 +390,8 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) struct ghes_pvt *pvt; struct edac_mc_layer layers[1]; unsigned long flags; - int idx = -1; int rc = 0; - if (IS_ENABLED(CONFIG_X86)) { - /* Check if safe to enable on this system */ - idx = acpi_match_platform_list(plat_list); - if (!force_load && idx < 0) - return -ENODEV; - } else { - force_load = true; - idx = 0; - } - /* finish another registration/unregistration instance first */ mutex_lock(&ghes_reg_mutex); @@ -458,15 +435,10 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n"); pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n"); pr_info("work on such system. Use this driver with caution\n"); - } else if (idx < 0) { - pr_info("This EDAC driver relies on BIOS to enumerate memory and get error reports.\n"); - pr_info("Unfortunately, not all BIOSes reflect the memory layout correctly.\n"); - pr_info("So, the end result of using this driver varies from vendor to vendor.\n"); - pr_info("If you find incorrect reports, please contact your hardware vendor\n"); - pr_info("to correct its BIOS.\n"); - pr_info("This system has %d DIMM sockets.\n", ghes_hw.num_dimms); } + pr_info("This system has %d DIMM sockets.\n", ghes_hw.num_dimms); + if (!fake) { struct dimm_info *src, *dst; int i = 0; @@ -535,9 +507,6 @@ void ghes_edac_unregister(struct ghes *ghes) struct mem_ctl_info *mci; unsigned long flags; - if (!force_load) - return; - mutex_lock(&ghes_reg_mutex); system_scanned = false; diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h index 5cbd38b6e4e1..ce693e9f07a0 100644 --- a/include/acpi/ghes.h +++ b/include/acpi/ghes.h @@ -27,6 +27,8 @@ struct ghes { struct timer_list timer; unsigned int irq; }; + struct device *dev; + struct list_head elist; }; struct ghes_estatus_node { @@ -80,6 +82,8 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev); void ghes_edac_unregister(struct ghes *ghes); +struct list_head *ghes_get_devices(void); + #else static inline int ghes_edac_register(struct ghes *ghes, struct device *dev) { @@ -89,6 +93,8 @@ static inline int ghes_edac_register(struct ghes *ghes, struct device *dev) static inline void ghes_edac_unregister(struct ghes *ghes) { } + +static inline struct list_head *ghes_get_devices(void) { return NULL; } #endif static inline int acpi_hest_get_version(struct acpi_hest_generic_data *gdata) -- cgit v1.2.3 From 802e7f1dfed7cc7fb309995e0c4138f08977fdfc Mon Sep 17 00:00:00 2001 From: Jia He Date: Mon, 10 Oct 2022 02:35:56 +0000 Subject: EDAC/ghes: Make ghes_edac a proper module Commit dc4e8c07e9e2 ("ACPI: APEI: explicit init of HEST and GHES in apci_init()") introduced a bug leading to ghes_edac_register() to be invoked before edac_init(). Because at that time the bus "edac" hadn't been even registered, this created sysfs nodes as /devices/mc0 instead of /sys/devices/system/edac/mc/mc0 on an Ampere eMag server. Fix this by turning ghes_edac into a proper module. The list of GHES devices returned is not protected from being modified concurrently but it is pretty static as it gets created only during GHES init and latter is not a module so... [ bp: Massage. ] Fixes: dc4e8c07e9e2 ("ACPI: APEI: explicit init of HEST and GHES in apci_init()") Co-developed-by: Borislav Petkov Signed-off-by: Borislav Petkov Signed-off-by: Jia He Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221010023559.69655-5-justin.he@arm.com --- drivers/acpi/apei/ghes.c | 4 ---- drivers/edac/Kconfig | 4 ++-- drivers/edac/ghes_edac.c | 40 ++++++++++++++++++++++++++++++++++++++-- include/acpi/ghes.h | 22 ++-------------------- 4 files changed, 42 insertions(+), 28 deletions(-) (limited to 'drivers') diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index acab512741f6..249cd01cb920 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -1392,8 +1392,6 @@ static int ghes_probe(struct platform_device *ghes_dev) platform_set_drvdata(ghes_dev, ghes); - ghes_edac_register(ghes, &ghes_dev->dev); - ghes->dev = &ghes_dev->dev; mutex_lock(&ghes_devs_mutex); @@ -1462,8 +1460,6 @@ static int ghes_remove(struct platform_device *ghes_dev) ghes_fini(ghes); - ghes_edac_unregister(ghes); - mutex_lock(&ghes_devs_mutex); list_del(&ghes->elist); mutex_unlock(&ghes_devs_mutex); diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 456602d373b7..cde0849cf861 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -53,8 +53,8 @@ config EDAC_DECODE_MCE has been initialized. config EDAC_GHES - bool "Output ACPI APEI/GHES BIOS detected errors via EDAC" - depends on ACPI_APEI_GHES && (EDAC=y) + tristate "Output ACPI APEI/GHES BIOS detected errors via EDAC" + depends on ACPI_APEI_GHES select UEFI_CPER help Not all machines support hardware-driven error report. Some of those diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index b85a545d1cb0..cf2b618c1ada 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -56,6 +56,8 @@ static DEFINE_SPINLOCK(ghes_lock); static bool system_scanned; +static struct list_head *ghes_devs; + /* Memory Device - Type 17 of SMBIOS spec */ struct memdev_dmi_entry { u8 type; @@ -383,7 +385,7 @@ static struct notifier_block ghes_edac_mem_err_nb = { .priority = 0, }; -int ghes_edac_register(struct ghes *ghes, struct device *dev) +static int ghes_edac_register(struct device *dev) { bool fake = false; struct mem_ctl_info *mci; @@ -502,7 +504,7 @@ unlock: return rc; } -void ghes_edac_unregister(struct ghes *ghes) +static void ghes_edac_unregister(struct ghes *ghes) { struct mem_ctl_info *mci; unsigned long flags; @@ -535,3 +537,37 @@ void ghes_edac_unregister(struct ghes *ghes) unlock: mutex_unlock(&ghes_reg_mutex); } + +static int __init ghes_edac_init(void) +{ + struct ghes *g, *g_tmp; + + ghes_devs = ghes_get_devices(); + if (!ghes_devs) + return -ENODEV; + + if (list_empty(ghes_devs)) { + pr_info("GHES probing device list is empty"); + return -ENODEV; + } + + list_for_each_entry_safe(g, g_tmp, ghes_devs, elist) { + ghes_edac_register(g->dev); + } + + return 0; +} +module_init(ghes_edac_init); + +static void __exit ghes_edac_exit(void) +{ + struct ghes *g, *g_tmp; + + list_for_each_entry_safe(g, g_tmp, ghes_devs, elist) { + ghes_edac_unregister(g); + } +} +module_exit(ghes_edac_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Output ACPI APEI/GHES BIOS detected errors via EDAC"); diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h index ce693e9f07a0..2e785d3554d8 100644 --- a/include/acpi/ghes.h +++ b/include/acpi/ghes.h @@ -71,32 +71,14 @@ int ghes_register_vendor_record_notifier(struct notifier_block *nb); * @nb: pointer to the notifier_block structure of the vendor record handler. */ void ghes_unregister_vendor_record_notifier(struct notifier_block *nb); -#endif - -int ghes_estatus_pool_init(int num_ghes); - -/* From drivers/edac/ghes_edac.c */ - -#ifdef CONFIG_EDAC_GHES -int ghes_edac_register(struct ghes *ghes, struct device *dev); - -void ghes_edac_unregister(struct ghes *ghes); struct list_head *ghes_get_devices(void); - #else -static inline int ghes_edac_register(struct ghes *ghes, struct device *dev) -{ - return -ENODEV; -} - -static inline void ghes_edac_unregister(struct ghes *ghes) -{ -} - static inline struct list_head *ghes_get_devices(void) { return NULL; } #endif +int ghes_estatus_pool_init(int num_ghes); + static inline int acpi_hest_get_version(struct acpi_hest_generic_data *gdata) { return gdata->revision >> 8; -- cgit v1.2.3 From 315bada690e0c4a5c268a1dd3601aefd3355773f Mon Sep 17 00:00:00 2001 From: Jia He Date: Mon, 10 Oct 2022 02:35:57 +0000 Subject: EDAC: Check for GHES preference in the chipset-specific EDAC drivers Call ghes_get_devices() to check whether ghes_edac should be used on the platform where it is preferred over the corresponding chipset-specific EDAC driver. Unlike the existing edac_get_owner() check, the ghes_get_devices() check works independent to the module_init ordering. [ bp: Massage. ] Suggested-by: Toshi Kani Signed-off-by: Jia He Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221010023559.69655-6-justin.he@arm.com --- drivers/edac/amd64_edac.c | 3 +++ drivers/edac/armada_xp_edac.c | 3 +++ drivers/edac/edac_module.h | 1 + drivers/edac/i10nm_base.c | 3 +++ drivers/edac/igen6_edac.c | 3 +++ drivers/edac/layerscape_edac.c | 3 +++ drivers/edac/pnd2_edac.c | 3 +++ drivers/edac/sb_edac.c | 3 +++ drivers/edac/skx_base.c | 3 +++ drivers/edac/thunderx_edac.c | 3 +++ drivers/edac/xgene_edac.c | 3 +++ 11 files changed, 31 insertions(+) (limited to 'drivers') diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 2f854feeeb23..e3318e5575a3 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -4329,6 +4329,9 @@ static int __init amd64_edac_init(void) int err = -ENODEV; int i; + if (ghes_get_devices()) + return -EBUSY; + owner = edac_get_owner(); if (owner && strncmp(owner, EDAC_MOD_STR, sizeof(EDAC_MOD_STR))) return -EBUSY; diff --git a/drivers/edac/armada_xp_edac.c b/drivers/edac/armada_xp_edac.c index 038abbb83f4b..c4bd2fb9c46b 100644 --- a/drivers/edac/armada_xp_edac.c +++ b/drivers/edac/armada_xp_edac.c @@ -599,6 +599,9 @@ static int __init armada_xp_edac_init(void) { int res; + if (ghes_get_devices()) + return -EBUSY; + /* only polling is supported */ edac_op_state = EDAC_OPSTATE_POLL; diff --git a/drivers/edac/edac_module.h b/drivers/edac/edac_module.h index 50ed9f2425bb..763c076d96f2 100644 --- a/drivers/edac/edac_module.h +++ b/drivers/edac/edac_module.h @@ -11,6 +11,7 @@ #ifndef __EDAC_MODULE_H__ #define __EDAC_MODULE_H__ +#include #include "edac_mc.h" #include "edac_pci.h" #include "edac_device.h" diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index a22ea053f8e1..ba77c79d234d 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -756,6 +756,9 @@ static int __init i10nm_init(void) edac_dbg(2, "\n"); + if (ghes_get_devices()) + return -EBUSY; + owner = edac_get_owner(); if (owner && strncmp(owner, EDAC_MOD_STR, sizeof(EDAC_MOD_STR))) return -EBUSY; diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c index a07bbfd075d0..d33c666221f9 100644 --- a/drivers/edac/igen6_edac.c +++ b/drivers/edac/igen6_edac.c @@ -1271,6 +1271,9 @@ static int __init igen6_init(void) edac_dbg(2, "\n"); + if (ghes_get_devices()) + return -EBUSY; + owner = edac_get_owner(); if (owner && strncmp(owner, EDAC_MOD_STR, sizeof(EDAC_MOD_STR))) return -ENODEV; diff --git a/drivers/edac/layerscape_edac.c b/drivers/edac/layerscape_edac.c index 94cac7686a56..35ceaca578e1 100644 --- a/drivers/edac/layerscape_edac.c +++ b/drivers/edac/layerscape_edac.c @@ -38,6 +38,9 @@ static int __init fsl_ddr_mc_init(void) { int res; + if (ghes_get_devices()) + return -EBUSY; + /* make sure error reporting method is sane */ switch (edac_op_state) { case EDAC_OPSTATE_POLL: diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c index a20b299f1202..2b306f2cc605 100644 --- a/drivers/edac/pnd2_edac.c +++ b/drivers/edac/pnd2_edac.c @@ -1528,6 +1528,9 @@ static int __init pnd2_init(void) edac_dbg(2, "\n"); + if (ghes_get_devices()) + return -EBUSY; + owner = edac_get_owner(); if (owner && strncmp(owner, EDAC_MOD_STR, sizeof(EDAC_MOD_STR))) return -EBUSY; diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 8e39370fdb5c..0c779a0326b6 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -3634,6 +3634,9 @@ static int __init sbridge_init(void) edac_dbg(2, "\n"); + if (ghes_get_devices()) + return -EBUSY; + owner = edac_get_owner(); if (owner && strncmp(owner, EDAC_MOD_STR, sizeof(EDAC_MOD_STR))) return -EBUSY; diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index 7e2762f62eec..9397abb42c49 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -653,6 +653,9 @@ static int __init skx_init(void) edac_dbg(2, "\n"); + if (ghes_get_devices()) + return -EBUSY; + owner = edac_get_owner(); if (owner && strncmp(owner, EDAC_MOD_STR, sizeof(EDAC_MOD_STR))) return -EBUSY; diff --git a/drivers/edac/thunderx_edac.c b/drivers/edac/thunderx_edac.c index f13674081cb6..0bcd9f02c84a 100644 --- a/drivers/edac/thunderx_edac.c +++ b/drivers/edac/thunderx_edac.c @@ -2114,6 +2114,9 @@ static int __init thunderx_edac_init(void) { int rc = 0; + if (ghes_get_devices()) + return -EBUSY; + rc = pci_register_driver(&thunderx_lmc_driver); if (rc) return rc; diff --git a/drivers/edac/xgene_edac.c b/drivers/edac/xgene_edac.c index 54081403db4f..c52b9dd9154c 100644 --- a/drivers/edac/xgene_edac.c +++ b/drivers/edac/xgene_edac.c @@ -2004,6 +2004,9 @@ static int __init xgene_edac_init(void) { int rc; + if (ghes_get_devices()) + return -EBUSY; + /* Make sure error reporting method is sane */ switch (edac_op_state) { case EDAC_OPSTATE_POLL: -- cgit v1.2.3 From 0d2aa70b8f58c382059fe847fdaa65f05361fc1c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 18 Oct 2022 08:22:13 +0000 Subject: apei/ghes: Use xchg_release() for updating new cache slot instead of cmpxchg() Some documentation first, about how this machinery works: It seems, the intent of the GHES error records cache is to collect already reported errors - see the ghes_estatus_cached() checks. There's even a sentence trying to say what this does: /* * GHES error status reporting throttle, to report more kinds of * errors, instead of just most frequently occurred errors. */ New elements are added to the cache this way: if (!ghes_estatus_cached(estatus)) { if (ghes_print_estatus(NULL, ghes->generic, estatus)) ghes_estatus_cache_add(ghes->generic, estatus); The intent being, once this new error record is reported, it gets cached so that it doesn't get reported for a while due to too many, same-type error records getting reported in burst-like scenarios. I.e., new, unreported error types can have a higher chance of getting reported. Now, the loop in ghes_estatus_cache_add() is trying to pick out the oldest element in there. Meaning, something which got reported already but a long while ago, i.e., a LRU-type scheme. And the cmpxchg() is there presumably to make sure when that selected element slot_cache is removed, it really *is* that element that gets removed and not one which replaced it in the meantime. Now, ghes_estatus_cache_add() selects a slot, and either succeeds in replacing its contents with a pointer to a newly cached item, or it just gives up and frees the new item again, without attempting to select another slot even if one might be available. Since only inserting new items is being done here, the race can only cause a failure if the selected slot was updated with another new item concurrently, which means that it is arbitrary which of those two items gets dropped. And "dropped" here means, the item doesn't get added to the cache so the next time it is seen, it'll get reported again and an insertion attempt will be done again. Eventually, it'll get inserted and all those times when the insertion fails, the item will get reported although the cache is supposed to prevent that and "ratelimit" those repeated error records. Not a big deal in any case. This means the cmpxchg() and the special case are not necessary. Therefore, just drop the existing item unconditionally. Move the xchg_release() and call_rcu() out of rcu_read_lock/unlock section since there is no actually dereferencing the pointer at all. [ bp: - Flesh out and summarize what was discussed on the thread now that that cache contraption is understood; - Touch up code style. ] Co-developed-by: Jia He Signed-off-by: Jia He Signed-off-by: Ard Biesheuvel Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221010023559.69655-7-justin.he@arm.com --- drivers/acpi/apei/ghes.c | 60 ++++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 27 deletions(-) (limited to 'drivers') diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 249cd01cb920..6164bf737ee6 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -154,7 +154,7 @@ struct ghes_vendor_record_entry { static struct gen_pool *ghes_estatus_pool; static unsigned long ghes_estatus_pool_size_request; -static struct ghes_estatus_cache *ghes_estatus_caches[GHES_ESTATUS_CACHES_SIZE]; +static struct ghes_estatus_cache __rcu *ghes_estatus_caches[GHES_ESTATUS_CACHES_SIZE]; static atomic_t ghes_estatus_cache_alloced; static int ghes_panic_timeout __read_mostly = 30; @@ -789,48 +789,42 @@ static struct ghes_estatus_cache *ghes_estatus_cache_alloc( return cache; } -static void ghes_estatus_cache_free(struct ghes_estatus_cache *cache) +static void ghes_estatus_cache_rcu_free(struct rcu_head *head) { + struct ghes_estatus_cache *cache; u32 len; + cache = container_of(head, struct ghes_estatus_cache, rcu); len = cper_estatus_len(GHES_ESTATUS_FROM_CACHE(cache)); len = GHES_ESTATUS_CACHE_LEN(len); gen_pool_free(ghes_estatus_pool, (unsigned long)cache, len); atomic_dec(&ghes_estatus_cache_alloced); } -static void ghes_estatus_cache_rcu_free(struct rcu_head *head) -{ - struct ghes_estatus_cache *cache; - - cache = container_of(head, struct ghes_estatus_cache, rcu); - ghes_estatus_cache_free(cache); -} - -static void ghes_estatus_cache_add( - struct acpi_hest_generic *generic, - struct acpi_hest_generic_status *estatus) +static void +ghes_estatus_cache_add(struct acpi_hest_generic *generic, + struct acpi_hest_generic_status *estatus) { - int i, slot = -1, count; unsigned long long now, duration, period, max_period = 0; - struct ghes_estatus_cache *cache, *slot_cache = NULL, *new_cache; + struct ghes_estatus_cache *cache, *new_cache; + struct ghes_estatus_cache __rcu *victim; + int i, slot = -1, count; new_cache = ghes_estatus_cache_alloc(generic, estatus); - if (new_cache == NULL) + if (!new_cache) return; + rcu_read_lock(); now = sched_clock(); for (i = 0; i < GHES_ESTATUS_CACHES_SIZE; i++) { cache = rcu_dereference(ghes_estatus_caches[i]); if (cache == NULL) { slot = i; - slot_cache = NULL; break; } duration = now - cache->time_in; if (duration >= GHES_ESTATUS_IN_CACHE_MAX_NSEC) { slot = i; - slot_cache = cache; break; } count = atomic_read(&cache->count); @@ -839,18 +833,30 @@ static void ghes_estatus_cache_add( if (period > max_period) { max_period = period; slot = i; - slot_cache = cache; } } - /* new_cache must be put into array after its contents are written */ - smp_wmb(); - if (slot != -1 && cmpxchg(ghes_estatus_caches + slot, - slot_cache, new_cache) == slot_cache) { - if (slot_cache) - call_rcu(&slot_cache->rcu, ghes_estatus_cache_rcu_free); - } else - ghes_estatus_cache_free(new_cache); rcu_read_unlock(); + + if (slot != -1) { + /* + * Use release semantics to ensure that ghes_estatus_cached() + * running on another CPU will see the updated cache fields if + * it can see the new value of the pointer. + */ + victim = xchg_release(&ghes_estatus_caches[slot], + RCU_INITIALIZER(new_cache)); + + /* + * At this point, victim may point to a cached item different + * from the one based on which we selected the slot. Instead of + * going to the loop again to pick another slot, let's just + * drop the other item anyway: this may cause a false cache + * miss later on, but that won't cause any problems. + */ + if (victim) + call_rcu(&unrcu_pointer(victim)->rcu, + ghes_estatus_cache_rcu_free); + } } static void __ghes_panic(struct ghes *ghes, -- cgit v1.2.3 From f5e32344d472f1021b9d382333f01b356bc8b8b8 Mon Sep 17 00:00:00 2001 From: Jia He Date: Tue, 18 Oct 2022 08:22:14 +0000 Subject: EDAC/igen6: Return the correct error type when not the MC owner Return -EBUSY instead of -ENODEV just like the other EDAC drivers do. [ bp: Rewrite text. ] Signed-off-by: Jia He Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221018082214.569504-8-justin.he@arm.com --- drivers/edac/igen6_edac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c index d33c666221f9..544dd19072ea 100644 --- a/drivers/edac/igen6_edac.c +++ b/drivers/edac/igen6_edac.c @@ -1276,7 +1276,7 @@ static int __init igen6_init(void) owner = edac_get_owner(); if (owner && strncmp(owner, EDAC_MOD_STR, sizeof(EDAC_MOD_STR))) - return -ENODEV; + return -EBUSY; edac_op_state = EDAC_OPSTATE_NMI; -- cgit v1.2.3 From 25836ce1df827cb4830291cb2325067efb46753a Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 18 Oct 2022 10:36:30 -0500 Subject: EDAC/mc_sysfs: Increase legacy channel support to 12 Newer AMD systems, such as Genoa, can support up to 12 channels per EDAC "mc" device. These are detected by the device's EDAC module, and the current EDAC interface is properly enumerated. However, the legacy EDAC sysfs interface provides device attributes only for channels 0 to 7. Therefore, channels 8 to 11 will not be visible in the legacy interface. This was overlooked in the initial support for AMD Genoa. Add additional device attributes so that up to 12 channels are visible in the legacy EDAC sysfs interface. Fixes: e2be5955a886 ("EDAC/amd64: Add support for AMD Family 19h Models 10h-1Fh and A0h-AFh") Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Cc: Link: https://lore.kernel.org/r/20221018153630.14664-1-yazen.ghannam@amd.com --- drivers/edac/edac_mc_sysfs.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'drivers') diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 0a638c97702a..15f63452a9be 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -298,6 +298,14 @@ DEVICE_CHANNEL(ch6_dimm_label, S_IRUGO | S_IWUSR, channel_dimm_label_show, channel_dimm_label_store, 6); DEVICE_CHANNEL(ch7_dimm_label, S_IRUGO | S_IWUSR, channel_dimm_label_show, channel_dimm_label_store, 7); +DEVICE_CHANNEL(ch8_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 8); +DEVICE_CHANNEL(ch9_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 9); +DEVICE_CHANNEL(ch10_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 10); +DEVICE_CHANNEL(ch11_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 11); /* Total possible dynamic DIMM Label attribute file table */ static struct attribute *dynamic_csrow_dimm_attr[] = { @@ -309,6 +317,10 @@ static struct attribute *dynamic_csrow_dimm_attr[] = { &dev_attr_legacy_ch5_dimm_label.attr.attr, &dev_attr_legacy_ch6_dimm_label.attr.attr, &dev_attr_legacy_ch7_dimm_label.attr.attr, + &dev_attr_legacy_ch8_dimm_label.attr.attr, + &dev_attr_legacy_ch9_dimm_label.attr.attr, + &dev_attr_legacy_ch10_dimm_label.attr.attr, + &dev_attr_legacy_ch11_dimm_label.attr.attr, NULL }; @@ -329,6 +341,14 @@ DEVICE_CHANNEL(ch6_ce_count, S_IRUGO, channel_ce_count_show, NULL, 6); DEVICE_CHANNEL(ch7_ce_count, S_IRUGO, channel_ce_count_show, NULL, 7); +DEVICE_CHANNEL(ch8_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 8); +DEVICE_CHANNEL(ch9_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 9); +DEVICE_CHANNEL(ch10_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 10); +DEVICE_CHANNEL(ch11_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 11); /* Total possible dynamic ce_count attribute file table */ static struct attribute *dynamic_csrow_ce_count_attr[] = { @@ -340,6 +360,10 @@ static struct attribute *dynamic_csrow_ce_count_attr[] = { &dev_attr_legacy_ch5_ce_count.attr.attr, &dev_attr_legacy_ch6_ce_count.attr.attr, &dev_attr_legacy_ch7_ce_count.attr.attr, + &dev_attr_legacy_ch8_ce_count.attr.attr, + &dev_attr_legacy_ch9_ce_count.attr.attr, + &dev_attr_legacy_ch10_ce_count.attr.attr, + &dev_attr_legacy_ch11_ce_count.attr.attr, NULL }; -- cgit v1.2.3 From b586a59e14e61a4805e1ed08a8c4f67ed38ea7e4 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 2 Nov 2022 16:12:48 +0800 Subject: EDAC/i5400: Fix typo in comment: vaious -> various Fix spelling typo in comment: vaious -> various. [ bp: Massage. ] Reported-by: k2ci Signed-off-by: Chen Zhang Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20221102081248.45694-1-chenzhang@kylinos.cn --- drivers/edac/i5400_edac.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/edac/i5400_edac.c b/drivers/edac/i5400_edac.c index f76624ee82ef..49b4499269fb 100644 --- a/drivers/edac/i5400_edac.c +++ b/drivers/edac/i5400_edac.c @@ -279,7 +279,8 @@ static inline int from_nf_ferr(unsigned int mask) #define FERR_NF_RECOVERABLE to_nf_mask(ERROR_NF_RECOVERABLE) #define FERR_NF_UNCORRECTABLE to_nf_mask(ERROR_NF_UNCORRECTABLE) -/* Defines to extract the vaious fields from the +/* + * Defines to extract the various fields from the * MTRx - Memory Technology Registers */ #define MTR_DIMMS_PRESENT(mtr) ((mtr) & (1 << 10)) -- cgit v1.2.3 From 9c8921555907f4d723f01ed2d859b66f2d14f08e Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Mon, 28 Nov 2022 14:55:12 +0800 Subject: EDAC/i10nm: fix refcount leak in pci_get_dev_wrapper() As the comment of pci_get_domain_bus_and_slot() says, it returns a PCI device with refcount incremented, so it doesn't need to call an extra pci_dev_get() in pci_get_dev_wrapper(), and the PCI device needs to be put in the error path. Fixes: d4dc89d069aa ("EDAC, i10nm: Add a driver for Intel 10nm server processors") Signed-off-by: Yang Yingliang Reviewed-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20221128065512.3572550-1-yangyingliang@huawei.com --- drivers/edac/i10nm_base.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index a22ea053f8e1..8af4d2523194 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -304,11 +304,10 @@ static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus, if (unlikely(pci_enable_device(pdev) < 0)) { edac_dbg(2, "Failed to enable device %02x:%02x.%x\n", bus, dev, fun); + pci_dev_put(pdev); return NULL; } - pci_dev_get(pdev); - return pdev; } -- cgit v1.2.3