summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--arch/powerpc/include/asm/eeh.h2
-rw-r--r--arch/powerpc/kernel/eeh_driver.c128
2 files changed, 112 insertions, 18 deletions
diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 0c0ac93f422f..a0b11fb3237e 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -53,6 +53,7 @@ struct device_node;
#define EEH_PE_ISOLATED (1 << 0) /* Isolated PE */
#define EEH_PE_RECOVERING (1 << 1) /* Recovering PE */
+#define EEH_PE_PHB_DEAD (1 << 2) /* Dead PHB: its devices are removed instead of recovered */
struct eeh_pe {
int type; /* PE type: PHB/Bus/Device */
@@ -145,6 +146,7 @@ struct eeh_ops {
int (*configure_bridge)(struct eeh_pe *pe);
int (*read_config)(struct device_node *dn, int where, int size, u32 *val);
int (*write_config)(struct device_node *dn, int where, int size, u32 val);
+ int (*next_error)(struct eeh_pe **pe);
};
extern struct eeh_ops *eeh_ops;
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 678bc6cddf82..0974e1326842 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -399,24 +399,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
*/
#define MAX_WAIT_FOR_RECOVERY 150
-/**
- * eeh_handle_event - Reset a PCI device after hard lockup.
- * @pe: EEH PE
- *
- * While PHB detects address or data parity errors on particular PCI
- * slot, the associated PE will be frozen. Besides, DMA's occurring
- * to wild addresses (which usually happen due to bugs in device
- * drivers or in PCI adapter firmware) can cause EEH error. #SERR,
- * #PERR or other misc PCI-related errors also can trigger EEH errors.
- *
- * Recovery process consists of unplugging the device driver (which
- * generated hotplug events to userspace), then issuing a PCI #RST to
- * the device, then reconfiguring the PCI config space for all bridges
- * & devices under this slot, and then finally restarting the device
- * drivers (which cause a second set of hotplug events to go out to
- * userspace).
- */
-void eeh_handle_event(struct eeh_pe *pe)
+static void eeh_handle_normal_event(struct eeh_pe *pe)
{
struct pci_bus *frozen_bus;
int rc = 0;
@@ -554,3 +537,112 @@ perm_error:
if (frozen_bus)
pcibios_remove_pci_devices(frozen_bus);
}
+
+/*
+ * eeh_handle_special_event - Handle an EEH event that carries no PE
+ *
+ * Asks the platform through eeh_ops->next_error() which error is pending,
+ * marks the affected PE(s) under the EEH serialize lock, purges the events
+ * that are superseded by it and then either runs the normal recovery
+ * (frozen PE, fenced PHB) or removes the PCI devices below every PHB that
+ * is flagged EEH_PE_PHB_DEAD (dead PHB, dead IOC).
+ */
+static void eeh_handle_special_event(void)
+{
+	struct eeh_pe *pe = NULL, *phb_pe;
+	struct pci_bus *bus;
+	struct pci_controller *hose, *tmp;
+	unsigned long flags;
+	int rc = 0;
+
+	/*
+	 * The return value from next_error() has been classified as follows.
+	 * It might be good to enumerate them. However, next_error() is only
+	 * supported by PowerNV platform for now. So it would be fine to use
+	 * integer directly:
+	 *
+	 * 4 - Dead IOC           3 - Dead PHB
+	 * 2 - Fenced PHB         1 - Frozen PE
+	 * 0 - No error found
+	 */
+	rc = eeh_ops->next_error(&pe);
+	if (rc <= 0)
+		return;
+
+	switch (rc) {
+	case 4:
+		/* Dead IOC: flag every PHB that has a PE as isolated and dead */
+		eeh_serialize_lock(&flags);
+		list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+			phb_pe = eeh_phb_pe_get(hose);
+			if (!phb_pe)
+				continue;
+
+			eeh_pe_state_mark(phb_pe,
+				EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
+		}
+		eeh_serialize_unlock(flags);
+
+		/* Purge all events */
+		eeh_remove_event(NULL);
+		break;
+	case 3:
+	case 2:
+	case 1:
+		/*
+		 * Isolate the PE reported by next_error(): a dead PHB is
+		 * flagged dead, a fenced PHB or frozen PE as recovering.
+		 */
+		eeh_serialize_lock(&flags);
+		if (rc == 3)
+			eeh_pe_state_mark(pe,
+				EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
+		else
+			eeh_pe_state_mark(pe,
+				EEH_PE_ISOLATED | EEH_PE_RECOVERING);
+		eeh_serialize_unlock(flags);
+
+		/* Purge all events of the PHB */
+		eeh_remove_event(pe);
+		break;
+	default:
+		pr_err("%s: Invalid value %d from next_error()\n",
+			__func__, rc);
+		return;
+	}
+
+	/*
+	 * For fenced PHB and frozen PE, it's handled as normal
+	 * event. We have to remove the affected PHBs for dead
+	 * PHB and IOC
+	 */
+	if (rc == 2 || rc == 1)
+		eeh_handle_normal_event(pe);
+	else {
+		list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+			phb_pe = eeh_phb_pe_get(hose);
+			if (!phb_pe || !(phb_pe->state & EEH_PE_PHB_DEAD))
+				continue;
+
+			/*
+			 * Notify the devices below this dead PHB that they
+			 * are about to go down, then remove them. The PHB's
+			 * own PE is used rather than "pe": this loop visits
+			 * every dead PHB, and nothing visible here guarantees
+			 * that next_error() set "pe" for a dead IOC.
+			 * NOTE(review): confirm against the platform's
+			 * next_error() implementation.
+			 */
+			bus = eeh_pe_bus_get(phb_pe);
+			eeh_pe_dev_traverse(phb_pe, eeh_report_failure, NULL);
+			pcibios_remove_pci_devices(bus);
+		}
+	}
+}
+
+/**
+ * eeh_handle_event - Handle an EEH event
+ * @pe: EEH PE the event refers to, or NULL for a special event
+ *
+ * A non-NULL @pe is passed on to eeh_handle_normal_event(). A NULL @pe
+ * means the pending error has to be looked up first, which is done by
+ * eeh_handle_special_event().
+ *
+ * Background on normal events: when a PHB detects address or data
+ * parity errors on a particular PCI slot, the associated PE is frozen.
+ * DMA to wild addresses (usually caused by bugs in device drivers or in
+ * PCI adapter firmware), #SERR, #PERR and other misc PCI-related errors
+ * can trigger EEH errors as well.
+ *
+ * Recovery consists of unplugging the device driver (which generates
+ * hotplug events to userspace), issuing a PCI #RST to the device,
+ * reconfiguring the PCI config space for all bridges and devices under
+ * the slot, and finally restarting the device drivers (which causes a
+ * second set of hotplug events to go out to userspace).
+ */
+void eeh_handle_event(struct eeh_pe *pe)
+{
+	if (!pe)
+		eeh_handle_special_event();
+	else
+		eeh_handle_normal_event(pe);
+}