diff options
Diffstat (limited to 'arch/powerpc/kernel/eeh_driver.c')
-rw-r--r-- | arch/powerpc/kernel/eeh_driver.c | 237 |
1 files changed, 115 insertions, 122 deletions
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 67619b4b3f96..9446248eb6b8 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -35,8 +35,8 @@ #include <asm/rtas.h> struct eeh_rmv_data { - struct list_head edev_list; - int removed; + struct list_head removed_vf_list; + int removed_dev_count; }; static int eeh_result_priority(enum pci_ers_result result) @@ -281,6 +281,10 @@ static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn, struct pci_driver *driver; enum pci_ers_result new_result; + if (!edev->pdev) { + eeh_edev_info(edev, "no device"); + return; + } device_lock(&edev->pdev->dev); if (eeh_edev_actionable(edev)) { driver = eeh_pcid_get(edev->pdev); @@ -400,7 +404,7 @@ static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) * EEH device is created. */ if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) { - if (list_is_last(&edev->list, &edev->pe->edevs)) + if (list_is_last(&edev->entry, &edev->pe->edevs)) eeh_pe_restore_bars(edev->pe); return NULL; @@ -465,10 +469,9 @@ static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev, return rc; } -static void *eeh_add_virt_device(void *data, void *userdata) +static void *eeh_add_virt_device(struct eeh_dev *edev) { struct pci_driver *driver; - struct eeh_dev *edev = (struct eeh_dev *)data; struct pci_dev *dev = eeh_dev_to_pci_dev(edev); struct pci_dn *pdn = eeh_dev_to_pdn(edev); @@ -499,7 +502,6 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) struct pci_driver *driver; struct pci_dev *dev = eeh_dev_to_pci_dev(edev); struct eeh_rmv_data *rmv_data = (struct eeh_rmv_data *)userdata; - int *removed = rmv_data ? &rmv_data->removed : NULL; /* * Actually, we should remove the PCI bridges as well. @@ -521,7 +523,7 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) if (eeh_dev_removed(edev)) return NULL; - if (removed) { + if (rmv_data) { if (eeh_pe_passed(edev->pe)) return NULL; driver = eeh_pcid_get(dev); @@ -539,10 +541,9 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) /* Remove it from PCI subsystem */ pr_debug("EEH: Removing %s without EEH sensitive driver\n", pci_name(dev)); - edev->bus = dev->bus; edev->mode |= EEH_DEV_DISCONNECTED; - if (removed) - (*removed)++; + if (rmv_data) + rmv_data->removed_dev_count++; if (edev->physfn) { #ifdef CONFIG_PCI_IOV @@ -558,7 +559,7 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) pdn->pe_number = IODA_INVALID_PE; #endif if (rmv_data) - list_add(&edev->rmv_list, &rmv_data->edev_list); + list_add(&edev->rmv_entry, &rmv_data->removed_vf_list); } else { pci_lock_rescan_remove(); pci_stop_and_remove_bus_device(dev); @@ -727,7 +728,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, * the device up before the scripts have taken it down, * potentially weird things happen. */ - if (!driver_eeh_aware || rmv_data->removed) { + if (!driver_eeh_aware || rmv_data->removed_dev_count) { pr_info("EEH: Sleep 5s ahead of %s hotplug\n", (driver_eeh_aware ? "partial" : "complete")); ssleep(5); @@ -737,10 +738,10 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, * PE. We should disconnect it so the binding can be * rebuilt when adding PCI devices. */ - edev = list_first_entry(&pe->edevs, struct eeh_dev, list); + edev = list_first_entry(&pe->edevs, struct eeh_dev, entry); eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL); if (pe->type & EEH_PE_VF) { - eeh_add_virt_device(edev, NULL); + eeh_add_virt_device(edev); } else { if (!driver_eeh_aware) eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); @@ -789,7 +790,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe) struct eeh_pe *tmp_pe; int rc = 0; enum pci_ers_result result = PCI_ERS_RESULT_NONE; - struct eeh_rmv_data rmv_data = {LIST_HEAD_INIT(rmv_data.edev_list), 0}; + struct eeh_rmv_data rmv_data = + {LIST_HEAD_INIT(rmv_data.removed_vf_list), 0}; bus = eeh_pe_bus_get(pe); if (!bus) { @@ -806,10 +808,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe) pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n", pe->phb->global_number, pe->addr, pe->freeze_count); - goto hard_fail; + result = PCI_ERS_RESULT_DISCONNECT; } - pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n", - pe->freeze_count, eeh_max_freezes); /* Walk the various device drivers attached to this slot through * a reset sequence, giving each an opportunity to do what it needs @@ -821,31 +821,39 @@ void eeh_handle_normal_event(struct eeh_pe *pe) * the error. Override the result if necessary to have partially * hotplug for this case. */ - pr_info("EEH: Notify device drivers to shutdown\n"); - eeh_set_channel_state(pe, pci_channel_io_frozen); - eeh_set_irq_state(pe, false); - eeh_pe_report("error_detected(IO frozen)", pe, eeh_report_error, - &result); - if ((pe->type & EEH_PE_PHB) && - result != PCI_ERS_RESULT_NONE && - result != PCI_ERS_RESULT_NEED_RESET) - result = PCI_ERS_RESULT_NEED_RESET; + if (result != PCI_ERS_RESULT_DISCONNECT) { + pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n", + pe->freeze_count, eeh_max_freezes); + pr_info("EEH: Notify device drivers to shutdown\n"); + eeh_set_channel_state(pe, pci_channel_io_frozen); + eeh_set_irq_state(pe, false); + eeh_pe_report("error_detected(IO frozen)", pe, + eeh_report_error, &result); + if ((pe->type & EEH_PE_PHB) && + result != PCI_ERS_RESULT_NONE && + result != PCI_ERS_RESULT_NEED_RESET) + result = PCI_ERS_RESULT_NEED_RESET; + } /* Get the current PCI slot state. This can take a long time, * sometimes over 300 seconds for certain systems. */ - rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); - if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { - pr_warn("EEH: Permanent failure\n"); - goto hard_fail; + if (result != PCI_ERS_RESULT_DISCONNECT) { + rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); + if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { + pr_warn("EEH: Permanent failure\n"); + result = PCI_ERS_RESULT_DISCONNECT; + } } /* Since rtas may enable MMIO when posting the error log, * don't post the error log until after all dev drivers * have been informed. */ - pr_info("EEH: Collect temporary log\n"); - eeh_slot_error_detail(pe, EEH_LOG_TEMP); + if (result != PCI_ERS_RESULT_DISCONNECT) { + pr_info("EEH: Collect temporary log\n"); + eeh_slot_error_detail(pe, EEH_LOG_TEMP); + } /* If all device drivers were EEH-unaware, then shut * down all of the device drivers, and hope they @@ -857,7 +865,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe) if (rc) { pr_warn("%s: Unable to reset, err=%d\n", __func__, rc); - goto hard_fail; + result = PCI_ERS_RESULT_DISCONNECT; } } @@ -866,9 +874,9 @@ void eeh_handle_normal_event(struct eeh_pe *pe) pr_info("EEH: Enable I/O for affected devices\n"); rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); - if (rc < 0) - goto hard_fail; - if (rc) { + if (rc < 0) { + result = PCI_ERS_RESULT_DISCONNECT; + } else if (rc) { result = PCI_ERS_RESULT_NEED_RESET; } else { pr_info("EEH: Notify device drivers to resume I/O\n"); @@ -882,9 +890,9 @@ void eeh_handle_normal_event(struct eeh_pe *pe) pr_info("EEH: Enabled DMA for affected devices\n"); rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); - if (rc < 0) - goto hard_fail; - if (rc) { + if (rc < 0) { + result = PCI_ERS_RESULT_DISCONNECT; + } else if (rc) { result = PCI_ERS_RESULT_NEED_RESET; } else { /* @@ -897,12 +905,6 @@ void eeh_handle_normal_event(struct eeh_pe *pe) } } - /* If any device has a hard failure, then shut off everything. */ - if (result == PCI_ERS_RESULT_DISCONNECT) { - pr_warn("EEH: Device driver gave up\n"); - goto hard_fail; - } - /* If any device called out for a reset, then reset the slot */ if (result == PCI_ERS_RESULT_NEED_RESET) { pr_info("EEH: Reset without hotplug activity\n"); @@ -910,88 +912,81 @@ void eeh_handle_normal_event(struct eeh_pe *pe) if (rc) { pr_warn("%s: Cannot reset, err=%d\n", __func__, rc); - goto hard_fail; + result = PCI_ERS_RESULT_DISCONNECT; + } else { + result = PCI_ERS_RESULT_NONE; + eeh_set_channel_state(pe, pci_channel_io_normal); + eeh_set_irq_state(pe, true); + eeh_pe_report("slot_reset", pe, eeh_report_reset, + &result); } - - pr_info("EEH: Notify device drivers " - "the completion of reset\n"); - result = PCI_ERS_RESULT_NONE; - eeh_set_channel_state(pe, pci_channel_io_normal); - eeh_set_irq_state(pe, true); - eeh_pe_report("slot_reset", pe, eeh_report_reset, &result); - } - - /* All devices should claim they have recovered by now. */ - if ((result != PCI_ERS_RESULT_RECOVERED) && - (result != PCI_ERS_RESULT_NONE)) { - pr_warn("EEH: Not recovered\n"); - goto hard_fail; - } - - /* - * For those hot removed VFs, we should add back them after PF get - * recovered properly. - */ - list_for_each_entry_safe(edev, tmp, &rmv_data.edev_list, rmv_list) { - eeh_add_virt_device(edev, NULL); - list_del(&edev->rmv_list); } - /* Tell all device drivers that they can resume operations */ - pr_info("EEH: Notify device driver to resume\n"); - eeh_set_channel_state(pe, pci_channel_io_normal); - eeh_set_irq_state(pe, true); - eeh_pe_report("resume", pe, eeh_report_resume, NULL); - eeh_for_each_pe(pe, tmp_pe) { - eeh_pe_for_each_dev(tmp_pe, edev, tmp) { - edev->mode &= ~EEH_DEV_NO_HANDLER; - edev->in_error = false; + if ((result == PCI_ERS_RESULT_RECOVERED) || + (result == PCI_ERS_RESULT_NONE)) { + /* + * For those hot removed VFs, we should add back them after PF + * get recovered properly. + */ + list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list, + rmv_entry) { + eeh_add_virt_device(edev); + list_del(&edev->rmv_entry); } - } - pr_info("EEH: Recovery successful.\n"); - goto final; + /* Tell all device drivers that they can resume operations */ + pr_info("EEH: Notify device driver to resume\n"); + eeh_set_channel_state(pe, pci_channel_io_normal); + eeh_set_irq_state(pe, true); + eeh_pe_report("resume", pe, eeh_report_resume, NULL); + eeh_for_each_pe(pe, tmp_pe) { + eeh_pe_for_each_dev(tmp_pe, edev, tmp) { + edev->mode &= ~EEH_DEV_NO_HANDLER; + edev->in_error = false; + } + } -hard_fail: - /* - * About 90% of all real-life EEH failures in the field - * are due to poorly seated PCI cards. Only 10% or so are - * due to actual, failed cards. - */ - pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" - "Please try reseating or replacing it\n", - pe->phb->global_number, pe->addr); + pr_info("EEH: Recovery successful.\n"); + } else { + /* + * About 90% of all real-life EEH failures in the field + * are due to poorly seated PCI cards. Only 10% or so are + * due to actual, failed cards. + */ + pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" + "Please try reseating or replacing it\n", + pe->phb->global_number, pe->addr); - eeh_slot_error_detail(pe, EEH_LOG_PERM); + eeh_slot_error_detail(pe, EEH_LOG_PERM); - /* Notify all devices that they're about to go down. */ - eeh_set_channel_state(pe, pci_channel_io_perm_failure); - eeh_set_irq_state(pe, false); - eeh_pe_report("error_detected(permanent failure)", pe, - eeh_report_failure, NULL); + /* Notify all devices that they're about to go down. */ + eeh_set_channel_state(pe, pci_channel_io_perm_failure); + eeh_set_irq_state(pe, false); + eeh_pe_report("error_detected(permanent failure)", pe, + eeh_report_failure, NULL); - /* Mark the PE to be removed permanently */ - eeh_pe_state_mark(pe, EEH_PE_REMOVED); + /* Mark the PE to be removed permanently */ + eeh_pe_state_mark(pe, EEH_PE_REMOVED); - /* - * Shut down the device drivers for good. We mark - * all removed devices correctly to avoid access - * the their PCI config any more. - */ - if (pe->type & EEH_PE_VF) { - eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); - eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); - } else { - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); - eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); + /* + * Shut down the device drivers for good. We mark + * all removed devices correctly to avoid access + * the their PCI config any more. + */ + if (pe->type & EEH_PE_VF) { + eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); + eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); + } else { + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); + eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); - pci_lock_rescan_remove(); - pci_hp_remove_devices(bus); - pci_unlock_rescan_remove(); - /* The passed PE should no longer be used */ - return; + pci_lock_rescan_remove(); + pci_hp_remove_devices(bus); + pci_unlock_rescan_remove(); + /* The passed PE should no longer be used */ + return; + } } -final: eeh_pe_state_clear(pe, EEH_PE_RECOVERING); } @@ -1026,7 +1021,7 @@ void eeh_handle_special_event(void) phb_pe = eeh_phb_pe_get(hose); if (!phb_pe) continue; - eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED); + eeh_pe_mark_isolated(phb_pe); } eeh_serialize_unlock(flags); @@ -1041,11 +1036,9 @@ void eeh_handle_special_event(void) /* Purge all events of the PHB */ eeh_remove_event(pe, true); - if (rc == EEH_NEXT_ERR_DEAD_PHB) - eeh_pe_state_mark(pe, EEH_PE_ISOLATED); - else - eeh_pe_state_mark(pe, - EEH_PE_ISOLATED | EEH_PE_RECOVERING); + if (rc != EEH_NEXT_ERR_DEAD_PHB) + eeh_pe_state_mark(pe, EEH_PE_RECOVERING); + eeh_pe_mark_isolated(pe); eeh_serialize_unlock(flags); |