nvme: Avoid reset work on watchdog timer function during error recovery

This patch adds a check on nvme_watchdog_timer() function to avoid the call to reset_work() when an error recovery process is ongoing on controller. The check is made by looking at pci_channel_offline() result. If we don't check for this on nvme_watchdog_timer(), error recovery mechanism can't recover well, because reset_work() won't be able to do its job (since we're in the middle of an error) and so the controller is removed from the system before error recovery mechanism can perform slot reset (which would allow the adapter to recover). In this patch we also have split the huge condition expression on nvme_watchdog_timer() by introducing an auxiliary function to help make the code more readable. Reviewed-by: Keith Busch <keith.busch@intel.com> Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de> Signed-off-by: Guilherme G. Piccoli <gpiccoli@linux.vnet.ibm.com> Signed-off-by: Jens Axboe <axboe@fb.com>
author: Guilherme G. Piccoli <gpiccoli@linux.vnet.ibm.com> 2016-04-13 16:08:20 +0200
committer: Jens Axboe <axboe@fb.com> 2016-04-13 16:15:39 +0200
commit: c875a7093f0479215cf9bf51356d7638f2ec5746 (patch)
tree: 84228b9202eec5565ff444006f895004224b0a00
parent: NVMe: silence warning about unused 'dev' (diff)
download: linux-c875a7093f0479215cf9bf51356d7638f2ec5746.tar.xz
linux-c875a7093f0479215cf9bf51356d7638f2ec5746.zip
1 files changed, 30 insertions, 8 deletions
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 36b6cdf22de3..ff3c8d7ca882 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1303,22 +1303,44 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 	return result;
 }
 
+static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
+{
+
+	/* If true, indicates loss of adapter communication, possibly by a
+	 * NVMe Subsystem reset.
+	 */
+	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
+
+	/* If there is a reset ongoing, we shouldn't reset again. */
+	if (work_busy(&dev->reset_work))
+		return false;
+
+	/* We shouldn't reset unless the controller is on fatal error state
+	 * _or_ if we lost the communication with it.
+	 */
+	if (!(csts & NVME_CSTS_CFS) && !nssro)
+		return false;
+
+	/* If PCI error recovery process is happening, we cannot reset or
+	 * the recovery mechanism will surely fail.
+	 */
+	if (pci_channel_offline(to_pci_dev(dev->dev)))
+		return false;
+
+	return true;
+}
+
 static void nvme_watchdog_timer(unsigned long data)
 {
 	struct nvme_dev *dev = (struct nvme_dev *)data;
 	u32 csts = readl(dev->bar + NVME_REG_CSTS);
 
-	/*
-	 * Skip controllers currently under reset.
-	 */
-	if (!work_pending(&dev->reset_work) && !work_busy(&dev->reset_work) &&
-	    ((csts & NVME_CSTS_CFS) ||
-	     (dev->subsystem && (csts & NVME_CSTS_NSSRO)))) {
-		if (queue_work(nvme_workq, &dev->reset_work)) {
+	/* Skip controllers under certain specific conditions. */
+	if (nvme_should_reset(dev, csts)) {
+		if (queue_work(nvme_workq, &dev->reset_work))
 			dev_warn(dev->dev,
 				"Failed status: 0x%x, reset controller.\n",
 				csts);
-		}
 		return;
 	}
author	Guilherme G. Piccoli <gpiccoli@linux.vnet.ibm.com>	2016-04-13 16:08:20 +0200
committer	Jens Axboe <axboe@fb.com>	2016-04-13 16:15:39 +0200
commit	c875a7093f0479215cf9bf51356d7638f2ec5746 (patch)
tree	84228b9202eec5565ff444006f895004224b0a00
parent	NVMe: silence warning about unused 'dev' (diff)
download	linux-c875a7093f0479215cf9bf51356d7638f2ec5746.tar.xz linux-c875a7093f0479215cf9bf51356d7638f2ec5746.zip