diff options
author | Sam Bobroff <sbobroff@linux.ibm.com> | 2018-11-29 04:16:41 +0100 |
---|---|---|
committer | Michael Ellerman <mpe@ellerman.id.au> | 2019-02-05 01:55:44 +0100 |
commit | 1ef52073fd25ea97090eaff2c8b528ebf401a12a (patch) | |
tree | 85d99a82f5dbd6386015ee73b704c744113b41f7 /arch/powerpc/kernel/eeh_driver.c | |
parent | powerpc/eeh: Add include_passed to eeh_clear_pe_frozen_state() (diff) | |
download | linux-1ef52073fd25ea97090eaff2c8b528ebf401a12a.tar.xz linux-1ef52073fd25ea97090eaff2c8b528ebf401a12a.zip |
powerpc/eeh: Improve recovery of passed-through devices
Currently, the EEH recovery process considers passed-through devices
as if they were not EEH-aware, which can cause them to be removed as
part of recovery. Because device removal requires cooperation from
the guest, this may lead to the process stalling or deadlocking.
Also, if devices are removed on the host side, they will be removed
from their IOMMU group, making recovery in the guest impossible.
Therefore, alter the recovery process so that passed-through devices
are not removed but are instead left frozen (and marked isolated)
until the guest performs its own recovery. If firmware thaws a
passed-through PE because its parent PE has been thawed (because it
was not passed through), re-freeze it.
Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Diffstat (limited to 'arch/powerpc/kernel/eeh_driver.c')
-rw-r--r-- | arch/powerpc/kernel/eeh_driver.c | 32 |
1 files changed, 13 insertions, 19 deletions
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 91629b3f3b74..89623962c727 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -510,22 +510,11 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) * support EEH. So we just care about PCI devices for * simplicity here. */ - if (!dev || (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) - return NULL; - - /* - * We rely on count-based pcibios_release_device() to - * detach permanently offlined PEs. Unfortunately, that's - * not reliable enough. We might have the permanently - * offlined PEs attached, but we needn't take care of - * them and their child devices. - */ - if (eeh_dev_removed(edev)) + if (!eeh_edev_actionable(edev) || + (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) return NULL; if (rmv_data) { - if (eeh_pe_passed(edev->pe)) - return NULL; driver = eeh_pcid_get(dev); if (driver) { if (driver->err_handler && @@ -539,8 +528,8 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) } /* Remove it from PCI subsystem */ - pr_debug("EEH: Removing %s without EEH sensitive driver\n", - pci_name(dev)); + pr_info("EEH: Removing %s without EEH sensitive driver\n", + pci_name(dev)); edev->mode |= EEH_DEV_DISCONNECTED; if (rmv_data) rmv_data->removed_dev_count++; @@ -624,7 +613,7 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe) eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL); /* Issue reset */ - ret = eeh_pe_reset_full(pe); + ret = eeh_pe_reset_full(pe, true); if (ret) { eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); return ret; @@ -664,6 +653,11 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, time64_t tstamp; int cnt, rc; struct eeh_dev *edev; + struct eeh_pe *tmp_pe; + bool any_passed = false; + + eeh_for_each_pe(pe, tmp_pe) + any_passed |= eeh_pe_passed(tmp_pe); /* pcibios will clear the counter; save the value */ cnt = pe->freeze_count; @@ -676,7 +670,7 @@ static int eeh_reset_device(struct 
eeh_pe *pe, struct pci_bus *bus, * into pci_hp_add_devices(). */ eeh_pe_state_mark(pe, EEH_PE_KEEP); - if (driver_eeh_aware || (pe->type & EEH_PE_VF)) { + if (any_passed || driver_eeh_aware || (pe->type & EEH_PE_VF)) { eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data); } else { pci_lock_rescan_remove(); @@ -693,7 +687,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, * config accesses. So we prefer to block them. However, controlled * PCI config accesses initiated from EEH itself are allowed. */ - rc = eeh_pe_reset_full(pe); + rc = eeh_pe_reset_full(pe, false); if (rc) return rc; @@ -704,7 +698,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, eeh_pe_restore_bars(pe); /* Clear frozen state */ - rc = eeh_clear_pe_frozen_state(pe, true); + rc = eeh_clear_pe_frozen_state(pe, false); if (rc) { pci_unlock_rescan_remove(); return rc; |