summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorOliver O'Halloran <oohall@gmail.com>2019-09-03 12:15:55 +0200
committerMichael Ellerman <mpe@ellerman.id.au>2019-09-05 06:22:38 +0200
commitb104af5a7687060792ca398bb86b033057afce75 (patch)
tree10b677425d96097a9ea4e089024f3145964d53fb
parentpowerpc/eeh: Make permanently failed devices non-actionable (diff)
downloadlinux-b104af5a7687060792ca398bb86b033057afce75.tar.xz
linux-b104af5a7687060792ca398bb86b033057afce75.zip
powerpc/eeh: Check slot presence state in eeh_handle_normal_event()
When a device is surprise removed while undergoing IO we will probably get an EEH PE freeze due to MMIO timeouts and other errors. When a freeze is detected we send a recovery event to the EEH worker thread which will notify drivers, and perform recovery as needed. In the event of a hot-remove we don't want recovery to occur since there isn't a device to recover. The recovery process is fairly long due to the number of wait states (required by PCIe) which causes problems when devices are removed and replaced (e.g. hot swapping of U.2 NVMe drives). To determine if we need to skip the recovery process we can use the get_adapter_state() operation of the hotplug_slot to determine if the slot contains a device or not, and if the slot is empty we can skip recovery entirely. One thing to note is that the slot being EEH frozen does not prevent the hotplug driver from working. We don't have the EEH recovery thread remove any of the devices since it's assumed that the hotplug driver will handle tearing down the slot state. Signed-off-by: Oliver O'Halloran <oohall@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20190903101605.2890-5-oohall@gmail.com
-rw-r--r--arch/powerpc/kernel/eeh_driver.c60
1 files changed, 60 insertions, 0 deletions
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 18a69fac4d80..52ce7584af43 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -27,6 +27,7 @@
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/pci.h>
+#include <linux/pci_hotplug.h>
#include <asm/eeh.h>
#include <asm/eeh_event.h>
#include <asm/ppc-pci.h>
@@ -770,6 +771,46 @@ static void eeh_pe_cleanup(struct eeh_pe *pe)
}
/**
+ * eeh_check_slot_presence - Check if a device is still present in a slot
+ * @pdev: pci_dev to check
+ *
+ * This function may return a false positive if we can't determine the slot's
+ * presence state. This might happen for for PCIe slots if the PE containing
+ * the upstream bridge is also frozen, or the bridge is part of the same PE
+ * as the device.
+ *
+ * This shouldn't happen often, but you might see it if you hotplug a PCIe
+ * switch.
+ */
+static bool eeh_slot_presence_check(struct pci_dev *pdev)
+{
+ const struct hotplug_slot_ops *ops;
+ struct pci_slot *slot;
+ u8 state;
+ int rc;
+
+ if (!pdev)
+ return false;
+
+ if (pdev->error_state == pci_channel_io_perm_failure)
+ return false;
+
+ slot = pdev->slot;
+ if (!slot || !slot->hotplug)
+ return true;
+
+ ops = slot->hotplug->ops;
+ if (!ops || !ops->get_adapter_status)
+ return true;
+
+ rc = ops->get_adapter_status(slot->hotplug, &state);
+ if (rc)
+ return true;
+
+ return !!state;
+}
+
+/**
* eeh_handle_normal_event - Handle EEH events on a specific PE
* @pe: EEH PE - which should not be used after we return, as it may
* have been invalidated.
@@ -799,6 +840,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
enum pci_ers_result result = PCI_ERS_RESULT_NONE;
struct eeh_rmv_data rmv_data =
{LIST_HEAD_INIT(rmv_data.removed_vf_list), 0};
+ int devices = 0;
bus = eeh_pe_bus_get(pe);
if (!bus) {
@@ -807,6 +849,23 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
return;
}
+ /*
+ * When devices are hot-removed we might get an EEH due to
+ * a driver attempting to touch the MMIO space of a removed
+ * device. In this case we don't have a device to recover
+ * so suppress the event if we can't find any present devices.
+ *
+ * The hotplug driver should take care of tearing down the
+ * device itself.
+ */
+ eeh_for_each_pe(pe, tmp_pe)
+ eeh_pe_for_each_dev(tmp_pe, edev, tmp)
+ if (eeh_slot_presence_check(edev->pdev))
+ devices++;
+
+ if (!devices)
+ goto out; /* nothing to recover */
+
eeh_pe_update_time_stamp(pe);
pe->freeze_count++;
if (pe->freeze_count > eeh_max_freezes) {
@@ -997,6 +1056,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
}
}
+out:
/*
* Clean up any PEs without devices. While marked as EEH_PE_RECOVERYING
* we don't want to modify the PE tree structure so we do it here.