From b00a726a9fd82ddd4c10344e46f0d371e1674303 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 24 Feb 2016 09:15:52 -0700 Subject: NVMe: Don't unmap controller registers on reset Unmapping the registers on reset or shutdown is not necessary. Keeping the mapping simplifies reset handling. Signed-off-by: Keith Busch Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 71 +++++++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 29 deletions(-) (limited to 'drivers/nvme/host/pci.c') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index a128672472ec..2ea3e398536c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1694,10 +1694,10 @@ static int nvme_dev_add(struct nvme_dev *dev) return 0; } -static int nvme_dev_map(struct nvme_dev *dev) +static int nvme_pci_enable(struct nvme_dev *dev) { u64 cap; - int bars, result = -ENOMEM; + int result = -ENOMEM; struct pci_dev *pdev = to_pci_dev(dev->dev); if (pci_enable_device_mem(pdev)) @@ -1705,24 +1705,14 @@ static int nvme_dev_map(struct nvme_dev *dev) dev->entry[0].vector = pdev->irq; pci_set_master(pdev); - bars = pci_select_bars(pdev, IORESOURCE_MEM); - if (!bars) - goto disable_pci; - - if (pci_request_selected_regions(pdev, bars, "nvme")) - goto disable_pci; if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) && dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32))) goto disable; - dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); - if (!dev->bar) - goto disable; - if (readl(dev->bar + NVME_REG_CSTS) == -1) { result = -ENODEV; - goto unmap; + goto disable; } /* @@ -1732,7 +1722,7 @@ static int nvme_dev_map(struct nvme_dev *dev) if (!pdev->irq) { result = pci_enable_msix(pdev, dev->entry, 1); if (result < 0) - goto unmap; + goto disable; } cap = lo_hi_readq(dev->bar + NVME_REG_CAP); @@ -1759,17 +1749,19 @@ static int nvme_dev_map(struct nvme_dev *dev) pci_save_state(pdev); return 0; 
- unmap: - iounmap(dev->bar); - dev->bar = NULL; disable: - pci_release_regions(pdev); - disable_pci: pci_disable_device(pdev); return result; } static void nvme_dev_unmap(struct nvme_dev *dev) +{ + if (dev->bar) + iounmap(dev->bar); + pci_release_regions(to_pci_dev(dev->dev)); +} + +static void nvme_pci_disable(struct nvme_dev *dev) { struct pci_dev *pdev = to_pci_dev(dev->dev); @@ -1778,12 +1770,6 @@ static void nvme_dev_unmap(struct nvme_dev *dev) else if (pdev->msix_enabled) pci_disable_msix(pdev); - if (dev->bar) { - iounmap(dev->bar); - dev->bar = NULL; - pci_release_regions(pdev); - } - if (pci_is_enabled(pdev)) { pci_disable_pcie_error_reporting(pdev); pci_disable_device(pdev); @@ -1842,7 +1828,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) nvme_dev_list_remove(dev); mutex_lock(&dev->shutdown_lock); - if (dev->bar) { + if (pci_is_enabled(to_pci_dev(dev->dev))) { nvme_stop_queues(&dev->ctrl); csts = readl(dev->bar + NVME_REG_CSTS); } @@ -1855,7 +1841,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) nvme_disable_io_queues(dev); nvme_disable_admin_queue(dev, shutdown); } - nvme_dev_unmap(dev); + nvme_pci_disable(dev); for (i = dev->queue_count - 1; i >= 0; i--) nvme_clear_queue(dev->queues[i]); @@ -1911,12 +1897,12 @@ static void nvme_reset_work(struct work_struct *work) * If we're called to reset a live controller first shut it down before * moving on. 
*/ - if (dev->bar) + if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) nvme_dev_disable(dev, false); set_bit(NVME_CTRL_RESETTING, &dev->flags); - result = nvme_dev_map(dev); + result = nvme_pci_enable(dev); if (result) goto out; @@ -2042,6 +2028,27 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .free_ctrl = nvme_pci_free_ctrl, }; +static int nvme_dev_map(struct nvme_dev *dev) +{ + int bars; + struct pci_dev *pdev = to_pci_dev(dev->dev); + + bars = pci_select_bars(pdev, IORESOURCE_MEM); + if (!bars) + return -ENODEV; + if (pci_request_selected_regions(pdev, bars, "nvme")) + return -ENODEV; + + dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); + if (!dev->bar) + goto release; + + return 0; + release: + pci_release_regions(pdev); + return -ENODEV; +} + static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { int node, result = -ENOMEM; @@ -2066,6 +2073,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev->dev = get_device(&pdev->dev); pci_set_drvdata(pdev, dev); + result = nvme_dev_map(dev); + if (result) + goto free; + INIT_LIST_HEAD(&dev->node); INIT_WORK(&dev->scan_work, nvme_dev_scan); INIT_WORK(&dev->reset_work, nvme_reset_work); @@ -2089,6 +2100,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) nvme_release_prp_pools(dev); put_pci: put_device(dev->dev); + nvme_dev_unmap(dev); free: kfree(dev->queues); kfree(dev->entry); @@ -2126,6 +2138,7 @@ static void nvme_remove(struct pci_dev *pdev) nvme_free_queues(dev, 0); nvme_release_cmb(dev); nvme_release_prp_pools(dev); + nvme_dev_unmap(dev); nvme_put_ctrl(&dev->ctrl); } -- cgit v1.2.3 From 646017a612e72f19bd9f991fe25287a149c5f627 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 24 Feb 2016 09:15:54 -0700 Subject: NVMe: Fix namespace removal deadlock This patch makes nvme namespace removal lockless. It is up to the caller to ensure no active namespace scanning is occurring.
To ensure no scan work occurs, the nvme pci driver adds a removing state to the controller device to avoid queueing scan work during removal. The work is flushed after setting the state, so no new scan work can be queued. The lockless removal allows the driver to cleanup a namespace request_queue if the controller fails during removal. Previously this could deadlock trying to acquire the namespace mutex in order to handle such events. Signed-off-by: Keith Busch Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 12 +++++++----- drivers/nvme/host/nvme.h | 4 ++++ drivers/nvme/host/pci.c | 17 +++++++++++++++-- 3 files changed, 26 insertions(+), 7 deletions(-) (limited to 'drivers/nvme/host/pci.c') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6c39dbf0290f..8c2ddd5025ab 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1186,11 +1186,13 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) static void nvme_ns_remove(struct nvme_ns *ns) { - bool kill = nvme_io_incapable(ns->ctrl) && - !blk_queue_dying(ns->queue); + bool kill; - lockdep_assert_held(&ns->ctrl->namespaces_mutex); + if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) + return; + kill = nvme_io_incapable(ns->ctrl) && + !blk_queue_dying(ns->queue); if (kill) { blk_set_queue_dying(ns->queue); @@ -1213,7 +1215,9 @@ static void nvme_ns_remove(struct nvme_ns *ns) blk_mq_abort_requeue_list(ns->queue); blk_cleanup_queue(ns->queue); } + mutex_lock(&ns->ctrl->namespaces_mutex); list_del_init(&ns->list); + mutex_unlock(&ns->ctrl->namespaces_mutex); nvme_put_ns(ns); } @@ -1307,10 +1311,8 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) { struct nvme_ns *ns, *next; - mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) nvme_ns_remove(ns); - mutex_unlock(&ctrl->namespaces_mutex); } static DEFINE_IDA(nvme_instance_ida); diff --git 
a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9407f2fa4487..4075fa9e0c34 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -114,6 +114,10 @@ struct nvme_ns { bool ext; u8 pi_type; int type; + unsigned long flags; + +#define NVME_NS_REMOVING 0 + u64 mode_select_num_blocks; u32 mode_select_block_len; }; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 2ea3e398536c..122f803f0efb 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -120,6 +120,7 @@ struct nvme_dev { unsigned long flags; #define NVME_CTRL_RESETTING 0 +#define NVME_CTRL_REMOVING 1 struct nvme_ctrl ctrl; struct completion ioq_wait; @@ -286,6 +287,17 @@ static int nvme_init_request(void *data, struct request *req, return 0; } +static void nvme_queue_scan(struct nvme_dev *dev) +{ + /* + * Do not queue new scan work when a controller is reset during + * removal. + */ + if (test_bit(NVME_CTRL_REMOVING, &dev->flags)) + return; + queue_work(nvme_workq, &dev->scan_work); +} + static void nvme_complete_async_event(struct nvme_dev *dev, struct nvme_completion *cqe) { @@ -300,7 +312,7 @@ static void nvme_complete_async_event(struct nvme_dev *dev, switch (result & 0xff07) { case NVME_AER_NOTICE_NS_CHANGED: dev_info(dev->dev, "rescanning\n"); - queue_work(nvme_workq, &dev->scan_work); + nvme_queue_scan(dev); default: dev_warn(dev->dev, "async event result %08x\n", result); } @@ -1690,7 +1702,7 @@ static int nvme_dev_add(struct nvme_dev *dev) return 0; dev->ctrl.tagset = &dev->tagset; } - queue_work(nvme_workq, &dev->scan_work); + nvme_queue_scan(dev); return 0; } @@ -2128,6 +2140,7 @@ static void nvme_remove(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); + set_bit(NVME_CTRL_REMOVING, &dev->flags); pci_set_drvdata(pdev, NULL); flush_work(&dev->scan_work); nvme_remove_namespaces(&dev->ctrl); -- cgit v1.2.3 From f58944e265d4ebe47216a5d7488aee3928823d30 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 24 Feb 2016 09:15:55 
-0700 Subject: NVMe: Simplify device reset failure A reset failure schedules the device to unbind from the driver through the pci driver's remove. This cleans up all initialization, so there is no need to duplicate the potentially racy cleanup. To help understand why a reset failed, the status is logged with the existing warning message. Signed-off-by: Keith Busch Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 48 +++++++++++++++++++++--------------------------- 1 file changed, 21 insertions(+), 27 deletions(-) (limited to 'drivers/nvme/host/pci.c') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 122f803f0efb..6d2e4257308b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -86,7 +86,6 @@ struct nvme_queue; static int nvme_reset(struct nvme_dev *dev); static void nvme_process_cq(struct nvme_queue *nvmeq); -static void nvme_remove_dead_ctrl(struct nvme_dev *dev); static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); /* @@ -1897,10 +1896,19 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) kfree(dev); } +static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) +{ + dev_warn(dev->dev, "Removing after probe failure status: %d\n", status); + + kref_get(&dev->ctrl.kref); + if (!schedule_work(&dev->remove_work)) + nvme_put_ctrl(&dev->ctrl); +} + static void nvme_reset_work(struct work_struct *work) { struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); - int result; + int result = -ENODEV; if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags))) goto out; @@ -1920,26 +1928,26 @@ static void nvme_reset_work(struct work_struct *work) result = nvme_configure_admin_queue(dev); if (result) - goto unmap; + goto out; nvme_init_queue(dev->queues[0], 0); result = nvme_alloc_admin_tags(dev); if (result) - goto disable; + goto out; result = nvme_init_identify(&dev->ctrl); if (result) - goto free_tags; + goto out; result = 
nvme_setup_io_queues(dev); if (result) - goto free_tags; + goto out; dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS; result = nvme_dev_list_add(dev); if (result) - goto remove; + goto out; /* * Keep the controller around but remove all namespaces if we don't have @@ -1956,19 +1964,8 @@ static void nvme_reset_work(struct work_struct *work) clear_bit(NVME_CTRL_RESETTING, &dev->flags); return; - remove: - nvme_dev_list_remove(dev); - free_tags: - nvme_dev_remove_admin(dev); - blk_put_queue(dev->ctrl.admin_q); - dev->ctrl.admin_q = NULL; - dev->queues[0]->tags = NULL; - disable: - nvme_disable_admin_queue(dev, false); - unmap: - nvme_dev_unmap(dev); out: - nvme_remove_dead_ctrl(dev); + nvme_remove_dead_ctrl(dev, result); } static void nvme_remove_dead_ctrl_work(struct work_struct *work) @@ -1981,14 +1978,6 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work) nvme_put_ctrl(&dev->ctrl); } -static void nvme_remove_dead_ctrl(struct nvme_dev *dev) -{ - dev_warn(dev->dev, "Removing after probe failure\n"); - kref_get(&dev->ctrl.kref); - if (!schedule_work(&dev->remove_work)) - nvme_put_ctrl(&dev->ctrl); -} - static int nvme_reset(struct nvme_dev *dev) { if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q)) @@ -2136,6 +2125,11 @@ static void nvme_shutdown(struct pci_dev *pdev) nvme_dev_disable(dev, true); } +/* + * The driver's remove may be called on a device in a partially initialized + * state. This function must not have any dependencies on the device state in + * order to proceed. 
+ */ static void nvme_remove(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); -- cgit v1.2.3 From 69d9a99c258eb1d6478fd9608a2070890797eed7 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 24 Feb 2016 09:15:56 -0700 Subject: NVMe: Move error handling to failed reset handler This moves failed queue handling out of the namespace removal path and into the reset failure path, fixing a hanging condition if the controller fails or the link goes down during del_gendisk. Previously the driver had to see the controller as degraded prior to calling del_gendisk to set up the queues to fail. But, if the controller happened to fail after this, there was no task to end outstanding requests. On failure, all namespace states are set to dead. This causes the capacity to revalidate to 0 and ends all new requests with an error status. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 53 ++++++++++++++++++++++++++++++++---------------- drivers/nvme/host/nvme.h | 2 ++ drivers/nvme/host/pci.c | 13 +++++++++++- 3 files changed, 50 insertions(+), 18 deletions(-) (limited to 'drivers/nvme/host/pci.c') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8c2ddd5025ab..7fd5a7ac8375 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -557,6 +557,10 @@ static int nvme_revalidate_disk(struct gendisk *disk) u16 old_ms; unsigned short bs; + if (test_bit(NVME_NS_DEAD, &ns->flags)) { + set_capacity(disk, 0); + return -ENODEV; + } if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) { dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n", __func__, ns->ctrl->instance, ns->ns_id); @@ -1186,32 +1190,15 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) static void nvme_ns_remove(struct nvme_ns *ns) { - bool kill; - if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) return; - kill = nvme_io_incapable(ns->ctrl) && - !blk_queue_dying(ns->queue); - if 
(kill) { - blk_set_queue_dying(ns->queue); - - /* - * The controller was shutdown first if we got here through - * device removal. The shutdown may requeue outstanding - * requests. These need to be aborted immediately so - * del_gendisk doesn't block indefinitely for their completion. - */ - blk_mq_abort_requeue_list(ns->queue); - } if (ns->disk->flags & GENHD_FL_UP) { if (blk_get_integrity(ns->disk)) blk_integrity_unregister(ns->disk); sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, &nvme_ns_attr_group); del_gendisk(ns->disk); - } - if (kill || !blk_queue_dying(ns->queue)) { blk_mq_abort_requeue_list(ns->queue); blk_cleanup_queue(ns->queue); } @@ -1413,6 +1400,38 @@ out: return ret; } +/** + * nvme_kill_queues(): Ends all namespace queues + * @ctrl: the dead controller that needs to end + * + * Call this function when the driver determines it is unable to get the + * controller in a state capable of servicing IO. + */ +void nvme_kill_queues(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) { + if (!kref_get_unless_zero(&ns->kref)) + continue; + + /* + * Revalidating a dead namespace sets capacity to 0. This will + * end buffered writers dirtying pages that can't be synced. 
+ */ + if (!test_and_set_bit(NVME_NS_DEAD, &ns->flags)) + revalidate_disk(ns->disk); + + blk_set_queue_dying(ns->queue); + blk_mq_abort_requeue_list(ns->queue); + blk_mq_start_stopped_hw_queues(ns->queue, true); + + nvme_put_ns(ns); + } + mutex_unlock(&ctrl->namespaces_mutex); +} + void nvme_stop_queues(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 4075fa9e0c34..fb15ba5f5d19 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -117,6 +117,7 @@ struct nvme_ns { unsigned long flags; #define NVME_NS_REMOVING 0 +#define NVME_NS_DEAD 1 u64 mode_select_num_blocks; u32 mode_select_block_len; @@ -246,6 +247,7 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl); void nvme_stop_queues(struct nvme_ctrl *ctrl); void nvme_start_queues(struct nvme_ctrl *ctrl); +void nvme_kill_queues(struct nvme_ctrl *ctrl); struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, unsigned int flags); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6d2e4257308b..680f5780750c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -690,7 +690,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, spin_lock_irq(&nvmeq->q_lock); if (unlikely(nvmeq->cq_vector < 0)) { - ret = BLK_MQ_RQ_QUEUE_BUSY; + if (ns && !test_bit(NVME_NS_DEAD, &ns->flags)) + ret = BLK_MQ_RQ_QUEUE_BUSY; + else + ret = BLK_MQ_RQ_QUEUE_ERROR; spin_unlock_irq(&nvmeq->q_lock); goto out; } @@ -1261,6 +1264,12 @@ static struct blk_mq_ops nvme_mq_ops = { static void nvme_dev_remove_admin(struct nvme_dev *dev) { if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) { + /* + * If the controller was reset during removal, it's possible + * user requests may be waiting on a stopped queue. Start the + * queue to flush these to completion. 
+ */ + blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true); blk_cleanup_queue(dev->ctrl.admin_q); blk_mq_free_tag_set(&dev->admin_tagset); } @@ -1901,6 +1910,7 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) dev_warn(dev->dev, "Removing after probe failure status: %d\n", status); kref_get(&dev->ctrl.kref); + nvme_dev_disable(dev, false); if (!schedule_work(&dev->remove_work)) nvme_put_ctrl(&dev->ctrl); } @@ -1973,6 +1983,7 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work) struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work); struct pci_dev *pdev = to_pci_dev(dev->dev); + nvme_kill_queues(&dev->ctrl); if (pci_get_drvdata(pdev)) pci_stop_and_remove_bus_device_locked(pdev); nvme_put_ctrl(&dev->ctrl); -- cgit v1.2.3