nvme-pci: Remove watchdog timer

The controller status polling was added to preemptively reset a failed controller. This early detection would allow commands that would normally timeout a chance for a retry, or find broken links when the platform didn't support hotplug. This once-per-second MMIO read, however, created more problems than it solves. This often races with PCIe Hotplug events that required complicated syncing between work queues, frequently triggered PCIe Completion Timeout errors that also lead to fatal machine checks, and unnecessarily disrupts low power modes by running on idle controllers. This patch removes the watchdog timer, and instead checks controller health only on an IO timeout when we have a reason to believe something is wrong. If the controller is failed, the driver will disable immediately and request scheduling a reset. Suggested-by: Andy Lutomirski <luto@amacapital.net> Signed-off-by: Keith Busch <keith.busch@intel.com> Signed-off-by: Christoph Hellwig <hch@lst.de>
author: Keith Busch <keith.busch@intel.com> 2017-06-07 20:32:50 +0200
committer: Christoph Hellwig <hch@lst.de> 2017-06-15 14:30:08 +0200
commit: b2a0eb1a0ac72869c910a79d935a0b049ec78ad9 (patch)
tree: 5950dcdb125ab720107e27eea4bb83391df04ab8
parent: nvme-pci: remap BAR0 to cover admin CQ doorbell for large stride (diff)
download: linux-b2a0eb1a0ac72869c910a79d935a0b049ec78ad9.tar.xz
linux-b2a0eb1a0ac72869c910a79d935a0b049ec78ad9.zip
1 files changed, 56 insertions, 67 deletions
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 5278ed9811a6..ef2b1537afe2 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -98,7 +98,6 @@ struct nvme_dev {
 	unsigned long bar_mapped_size;
 	struct work_struct reset_work;
 	struct work_struct remove_work;
-	struct timer_list watchdog_timer;
 	struct mutex shutdown_lock;
 	bool subsystem;
 	void __iomem *cmb;
@@ -960,6 +959,51 @@ static void abort_endio(struct request *req, blk_status_t error)
 	blk_mq_free_request(req);
 }
 
+static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
+{
+
+	/* If true, indicates loss of adapter communication, possibly by a
+	 * NVMe Subsystem reset.
+	 */
+	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
+
+	/* If there is a reset ongoing, we shouldn't reset again. */
+	if (dev->ctrl.state == NVME_CTRL_RESETTING)
+		return false;
+
+	/* We shouldn't reset unless the controller is on fatal error state
+	 * _or_ if we lost the communication with it.
+	 */
+	if (!(csts & NVME_CSTS_CFS) && !nssro)
+		return false;
+
+	/* If PCI error recovery process is happening, we cannot reset or
+	 * the recovery mechanism will surely fail.
+	 */
+	if (pci_channel_offline(to_pci_dev(dev->dev)))
+		return false;
+
+	return true;
+}
+
+static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
+{
+	/* Read a config register to help see what died. */
+	u16 pci_status;
+	int result;
+
+	result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
+				      &pci_status);
+	if (result == PCIBIOS_SUCCESSFUL)
+		dev_warn(dev->ctrl.device,
+			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
+			 csts, pci_status);
+	else
+		dev_warn(dev->ctrl.device,
+			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
+			 csts, result);
+}
+
 static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -967,6 +1011,17 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	struct nvme_dev *dev = nvmeq->dev;
 	struct request *abort_req;
 	struct nvme_command cmd;
+	u32 csts = readl(dev->bar + NVME_REG_CSTS);
+
+	/*
+	 * Reset immediately if the controller is failed
+	 */
+	if (nvme_should_reset(dev, csts)) {
+		nvme_warn_reset(dev, csts);
+		nvme_dev_disable(dev, false);
+		nvme_reset(dev);
+		return BLK_EH_HANDLED;
+	}
 
 	/*
 	 * Did we miss an interrupt?
@@ -1398,66 +1453,6 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 	return result;
 }
 
-static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
-{
-
-	/* If true, indicates loss of adapter communication, possibly by a
-	 * NVMe Subsystem reset.
-	 */
-	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
-
-	/* If there is a reset ongoing, we shouldn't reset again. */
-	if (dev->ctrl.state == NVME_CTRL_RESETTING)
-		return false;
-
-	/* We shouldn't reset unless the controller is on fatal error state
-	 * _or_ if we lost the communication with it.
-	 */
-	if (!(csts & NVME_CSTS_CFS) && !nssro)
-		return false;
-
-	/* If PCI error recovery process is happening, we cannot reset or
-	 * the recovery mechanism will surely fail.
-	 */
-	if (pci_channel_offline(to_pci_dev(dev->dev)))
-		return false;
-
-	return true;
-}
-
-static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
-{
-	/* Read a config register to help see what died. */
-	u16 pci_status;
-	int result;
-
-	result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
-				      &pci_status);
-	if (result == PCIBIOS_SUCCESSFUL)
-		dev_warn(dev->ctrl.device,
-			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
-			 csts, pci_status);
-	else
-		dev_warn(dev->ctrl.device,
-			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
-			 csts, result);
-}
-
-static void nvme_watchdog_timer(unsigned long data)
-{
-	struct nvme_dev *dev = (struct nvme_dev *)data;
-	u32 csts = readl(dev->bar + NVME_REG_CSTS);
-
-	/* Skip controllers under certain specific conditions. */
-	if (nvme_should_reset(dev, csts)) {
-		if (!nvme_reset(dev))
-			nvme_warn_reset(dev, csts);
-		return;
-	}
-
-	mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ));
-}
-
 static int nvme_create_io_queues(struct nvme_dev *dev)
 {
 	unsigned i, max;
@@ -1986,8 +1981,6 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	bool dead = true;
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
-	del_timer_sync(&dev->watchdog_timer);
-
 	mutex_lock(&dev->shutdown_lock);
 	if (pci_is_enabled(pdev)) {
 		u32 csts = readl(dev->bar + NVME_REG_CSTS);
@@ -2163,8 +2156,6 @@ static void nvme_reset_work(struct work_struct *work)
 	if (dev->online_queues > 1)
 		nvme_queue_async_events(&dev->ctrl);
 
-	mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ));
-
 	/*
 	 * Keep the controller around but remove all namespaces if we don't have
 	 * any working I/O queue.
@@ -2318,8 +2309,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	INIT_WORK(&dev->reset_work, nvme_reset_work);
 	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
-	setup_timer(&dev->watchdog_timer, nvme_watchdog_timer,
-		(unsigned long)dev);
 	mutex_init(&dev->shutdown_lock);
 	init_completion(&dev->ioq_wait);
author	Keith Busch <keith.busch@intel.com>	2017-06-07 20:32:50 +0200
committer	Christoph Hellwig <hch@lst.de>	2017-06-15 14:30:08 +0200
commit	b2a0eb1a0ac72869c910a79d935a0b049ec78ad9 (patch)
tree	5950dcdb125ab720107e27eea4bb83391df04ab8
parent	nvme-pci: remap BAR0 to cover admin CQ doorbell for large stride (diff)
download	linux-b2a0eb1a0ac72869c910a79d935a0b049ec78ad9.tar.xz linux-b2a0eb1a0ac72869c910a79d935a0b049ec78ad9.zip