From 4cff280a5fccf6513ed9e895bb3a4e7ad8b0cedc Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 14 Nov 2018 16:35:10 -0800 Subject: nvme-fc: resolve io failures during connect If an io error occurs on an io issued while connecting, recovery of the io falls flat as the state checking ends up nooping the error handler. Create an err_work work item that is scheduled upon an io error while connecting. The work thread terminates all io on all queues and marks the queues as not connected. The termination of the io will return back to the callee, which will then back out of the connection attempt and will reschedule, if possible, the connection attempt. The changes: - in case there are several commands hitting the error handler, a state flag is kept so that the error work is only scheduled once, on the first error. The subsequent errors can be ignored. - The calling sequence to stop keep alive and terminate the queues and their io is lifted from the reset routine. Made a small service routine used by both reset and err_work. - During debugging, found that the teardown path can reference an uninitialized pointer, resulting in a NULL pointer oops. The aen_ops weren't initialized yet. Add validation on their initialization before calling the teardown routine. 
Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 73 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 63 insertions(+), 10 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 0b70c8bab045..54032c466636 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -152,6 +152,7 @@ struct nvme_fc_ctrl { bool ioq_live; bool assoc_active; + atomic_t err_work_active; u64 association_id; struct list_head ctrl_list; /* rport->ctrl_list */ @@ -160,6 +161,7 @@ struct nvme_fc_ctrl { struct blk_mq_tag_set tag_set; struct delayed_work connect_work; + struct work_struct err_work; struct kref ref; u32 flags; @@ -1531,6 +1533,10 @@ nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl) struct nvme_fc_fcp_op *aen_op = ctrl->aen_ops; int i; + /* ensure we've initialized the ops once */ + if (!(aen_op->flags & FCOP_FLAGS_AEN)) + return; + for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) __nvme_fc_abort_op(ctrl, aen_op); } @@ -2049,7 +2055,25 @@ nvme_fc_nvme_ctrl_freed(struct nvme_ctrl *nctrl) static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) { - /* only proceed if in LIVE state - e.g. on first error */ + int active; + + /* + * if an error (io timeout, etc) while (re)connecting, + * it's an error on creating the new association. + * Start the error recovery thread if it hasn't already + * been started. It is expected there could be multiple + * ios hitting this path before things are cleaned up. + */ + if (ctrl->ctrl.state == NVME_CTRL_CONNECTING) { + active = atomic_xchg(&ctrl->err_work_active, 1); + if (!active && !schedule_work(&ctrl->err_work)) { + atomic_set(&ctrl->err_work_active, 0); + WARN_ON(1); + } + return; + } + + /* Otherwise, only proceed if in LIVE state - e.g. 
on first error */ if (ctrl->ctrl.state != NVME_CTRL_LIVE) return; @@ -2814,6 +2838,7 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) { struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); + cancel_work_sync(&ctrl->err_work); cancel_delayed_work_sync(&ctrl->connect_work); /* * kill the association on the link side. this will block @@ -2866,23 +2891,30 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) } static void -nvme_fc_reset_ctrl_work(struct work_struct *work) +__nvme_fc_terminate_io(struct nvme_fc_ctrl *ctrl) { - struct nvme_fc_ctrl *ctrl = - container_of(work, struct nvme_fc_ctrl, ctrl.reset_work); - int ret; - - nvme_stop_ctrl(&ctrl->ctrl); + nvme_stop_keep_alive(&ctrl->ctrl); /* will block will waiting for io to terminate */ nvme_fc_delete_association(ctrl); - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { + if (ctrl->ctrl.state != NVME_CTRL_CONNECTING && + !nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) dev_err(ctrl->ctrl.device, "NVME-FC{%d}: error_recovery: Couldn't change state " "to CONNECTING\n", ctrl->cnum); - return; - } +} + +static void +nvme_fc_reset_ctrl_work(struct work_struct *work) +{ + struct nvme_fc_ctrl *ctrl = + container_of(work, struct nvme_fc_ctrl, ctrl.reset_work); + int ret; + + __nvme_fc_terminate_io(ctrl); + + nvme_stop_ctrl(&ctrl->ctrl); if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) ret = nvme_fc_create_association(ctrl); @@ -2897,6 +2929,24 @@ nvme_fc_reset_ctrl_work(struct work_struct *work) ctrl->cnum); } +static void +nvme_fc_connect_err_work(struct work_struct *work) +{ + struct nvme_fc_ctrl *ctrl = + container_of(work, struct nvme_fc_ctrl, err_work); + + __nvme_fc_terminate_io(ctrl); + + atomic_set(&ctrl->err_work_active, 0); + + /* + * Rescheduling the connection after recovering + * from the io error is left to the reconnect work + * item, which is what should have stalled waiting on + * the io that had the error that scheduled this work. 
+ */ +} + static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { .name = "fc", .module = THIS_MODULE, @@ -3007,6 +3057,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->cnum = idx; ctrl->ioq_live = false; ctrl->assoc_active = false; + atomic_set(&ctrl->err_work_active, 0); init_waitqueue_head(&ctrl->ioabort_wait); get_device(ctrl->dev); @@ -3014,6 +3065,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work); INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work); + INIT_WORK(&ctrl->err_work, nvme_fc_connect_err_work); spin_lock_init(&ctrl->lock); /* io queue count */ @@ -3103,6 +3155,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, fail_ctrl: nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING); cancel_work_sync(&ctrl->ctrl.reset_work); + cancel_work_sync(&ctrl->err_work); cancel_delayed_work_sync(&ctrl->connect_work); ctrl->ctrl.opts = NULL; -- cgit v1.2.3 From d6a2b9535d1e52bea269c138614c4801469d10e1 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 26 Nov 2018 16:39:47 -0700 Subject: nvme: Free ctrl device name on init failure Free the kobject name that was allocated for the controller device on failure rather than its parent. 
Fixes: d22524a4782a9 ("nvme: switch controller refcounting to use struct device") Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 559d567693b8..5afda6fe5ae9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3585,7 +3585,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, return 0; out_free_name: - kfree_const(dev->kobj.name); + kfree_const(ctrl->device->kobj.name); out_release_instance: ida_simple_remove(&nvme_instance_ida, ctrl->instance); out: -- cgit v1.2.3 From dfa74422d604abc2e16763db12646583219806e4 Mon Sep 17 00:00:00 2001 From: "Ewan D. Milne" Date: Mon, 26 Nov 2018 12:01:30 -0500 Subject: nvme-fc: initialize nvme_req(rq)->ctrl after calling __nvme_fc_init_request() __nvme_fc_init_request() invokes memset() on the nvme_fcp_op_w_sgl structure, which NULLed-out the nvme_req(req)->ctrl field previously set by nvme_fc_init_request(). 
This apparently was not referenced until commit faf4a44fff ("nvme: support traffic based keep-alive") which now results in a crash in nvme_complete_rq(): [ 8386.897130] RIP: 0010:panic+0x220/0x26c [ 8386.901406] Code: 83 3d 6f ee 72 01 00 74 05 e8 e8 54 02 00 48 c7 c6 40 fd 5b b4 48 c7 c7 d8 8d c6 b3 31e [ 8386.922359] RSP: 0018:ffff99650019fc40 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13 [ 8386.930804] RAX: 0000000000000046 RBX: 0000000000000000 RCX: 0000000000000006 [ 8386.938764] RDX: 0000000000000000 RSI: 0000000000000082 RDI: ffff8e325f8168b0 [ 8386.946725] RBP: ffff99650019fcb0 R08: 0000000000000000 R09: 00000000000004f8 [ 8386.954687] R10: 0000000000000000 R11: ffff99650019f9b8 R12: ffffffffb3c55f3c [ 8386.962648] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000001 [ 8386.970613] oops_end+0xd1/0xe0 [ 8386.974116] no_context+0x1b2/0x3c0 [ 8386.978006] do_page_fault+0x32/0x140 [ 8386.982090] page_fault+0x1e/0x30 [ 8386.985786] RIP: 0010:nvme_complete_rq+0x65/0x1d0 [nvme_core] [ 8386.992195] Code: 41 bc 03 00 00 00 74 16 0f 86 c3 00 00 00 66 3d 83 00 41 bc 06 00 00 00 0f 85 e7 00 000 [ 8387.013147] RSP: 0018:ffff99650019fe18 EFLAGS: 00010246 [ 8387.018973] RAX: 0000000000000000 RBX: ffff8e322ae51280 RCX: 0000000000000001 [ 8387.026935] RDX: 0000000000000400 RSI: 0000000000000001 RDI: ffff8e322ae51280 [ 8387.034897] RBP: ffff8e322ae51280 R08: 0000000000000000 R09: ffffffffb2f0b890 [ 8387.042859] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000 [ 8387.050821] R13: 0000000000000100 R14: 0000000000000004 R15: ffff8e2b0446d990 [ 8387.058782] ? swiotlb_unmap_page+0x40/0x40 [ 8387.063448] nvme_fc_complete_rq+0x2d/0x70 [nvme_fc] [ 8387.068986] blk_done_softirq+0xa1/0xd0 [ 8387.073264] __do_softirq+0xd6/0x2a9 [ 8387.077251] run_ksoftirqd+0x26/0x40 [ 8387.081238] smpboot_thread_fn+0x10e/0x160 [ 8387.085807] kthread+0xf8/0x130 [ 8387.089309] ? sort_range+0x20/0x20 [ 8387.093198] ? 
kthread_stop+0x110/0x110 [ 8387.097475] ret_from_fork+0x35/0x40 [ 8387.101462] ---[ end trace 7106b0adf5e422f8 ]--- Fixes: faf4a44fff ("nvme: support traffic based keep-alive") Signed-off-by: Ewan D. Milne Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 54032c466636..feb86b59170e 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1752,12 +1752,12 @@ nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq, struct nvme_fc_queue *queue = &ctrl->queues[queue_idx]; int res; - nvme_req(rq)->ctrl = &ctrl->ctrl; res = __nvme_fc_init_request(ctrl, queue, &op->op, rq, queue->rqcnt++); if (res) return res; op->op.fcp_req.first_sgl = &op->sgl[0]; op->op.fcp_req.private = &op->priv[0]; + nvme_req(rq)->ctrl = &ctrl->ctrl; return res; } -- cgit v1.2.3 From 751a0cc0cd3a0d51e6aaf6fd3b8bd31f4ecfaf3e Mon Sep 17 00:00:00 2001 From: Igor Konopko Date: Fri, 23 Nov 2018 16:58:10 +0100 Subject: nvme-pci: fix surprise removal When a PCIe NVMe device is not present, nvme_dev_remove_admin() calls blk_cleanup_queue() on the admin queue, which frees the hctx for that queue. 
Moments later, on the same path nvme_kill_queues() calls blk_mq_unquiesce_queue() on admin queue and tries to access hctx of it, which leads to the following OOPS: Oops: 0000 [#1] SMP PTI RIP: 0010:sbitmap_any_bit_set+0xb/0x40 Call Trace: blk_mq_run_hw_queue+0xd5/0x150 blk_mq_run_hw_queues+0x3a/0x50 nvme_kill_queues+0x26/0x50 nvme_remove_namespaces+0xb2/0xc0 nvme_remove+0x60/0x140 pci_device_remove+0x3b/0xb0 Fixes: cb4bfda62afa2 ("nvme-pci: fix hot removal during error handling") Signed-off-by: Igor Konopko Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5afda6fe5ae9..bb39b91253c2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3607,7 +3607,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) down_read(&ctrl->namespaces_rwsem); /* Forcibly unquiesce queues to avoid blocking dispatch */ - if (ctrl->admin_q) + if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q)) blk_mq_unquiesce_queue(ctrl->admin_q); list_for_each_entry(ns, &ctrl->namespaces, list) -- cgit v1.2.3 From 14a1336e6fff47dd1028b484d6c802105c58e2ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Nov 2018 16:57:54 +0100 Subject: nvme: warn when finding multi-port subsystems without multipathing enabled Without CONFIG_NVME_MULTIPATH enabled a multi-port subsystem might show up as individual devices and cause problems, warn about it.
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg --- drivers/nvme/host/nvme.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index cee79cb388af..081cbdcce880 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -531,6 +531,9 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) static inline int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) { + if (ctrl->subsys->cmic & (1 << 3)) + dev_warn(ctrl->device, +"Please enable CONFIG_NVME_MULTIPATH for full support of multi-port devices.\n"); return 0; } static inline void nvme_mpath_uninit(struct nvme_ctrl *ctrl) -- cgit v1.2.3 From f6c8e432cb0479255322c5d0335b9f1699a0270c Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 21 Nov 2018 15:17:37 -0800 Subject: nvme: flush namespace scanning work just before removing namespaces nvme_stop_ctrl can be called also for reset flow and there is no need to flush the scan_work as namespaces are not being removed. This can cause deadlock in rdma, fc and loop drivers since nvme_stop_ctrl barriers before controller teardown (and specifically I/O cancellation of the scan_work itself) takes place, but the scan_work will be blocked anyways so there is no need to flush it. Instead, move scan_work flush to nvme_remove_namespaces() where it really needs to flush. Reported-by: Ming Lei Signed-off-by: Sagi Grimberg Reviewed-by: Keith Busch Reviewed-by: James Smart Tested-by: Ewan D.
Milne Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index bb39b91253c2..3cf1b773158e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3314,6 +3314,9 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) struct nvme_ns *ns, *next; LIST_HEAD(ns_list); + /* prevent racing with ns scanning */ + flush_work(&ctrl->scan_work); + /* * The dead states indicates the controller was not gracefully * disconnected. In that case, we won't be able to flush any data while @@ -3476,7 +3479,6 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl) nvme_mpath_stop(ctrl); nvme_stop_keep_alive(ctrl); flush_work(&ctrl->async_event_work); - flush_work(&ctrl->scan_work); cancel_work_sync(&ctrl->fw_act_work); if (ctrl->ops->stop_ctrl) ctrl->ops->stop_ctrl(ctrl); -- cgit v1.2.3 From 6344d02dc8f886b6bbcd922ae1a17e4a41500f2d Mon Sep 17 00:00:00 2001 From: Prabhath Sajeepa Date: Wed, 28 Nov 2018 11:11:29 -0700 Subject: nvme-rdma: fix double freeing of async event data Some error paths in configuration of admin queue free data buffer associated with async request SQE without resetting the data buffer pointer to NULL. This buffer is also freed up again if the controller is shut down or reset.
Signed-off-by: Prabhath Sajeepa Reviewed-by: Roland Dreier Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index d181cafedc58..ab6ec7295bf9 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -184,6 +184,7 @@ static int nvme_rdma_alloc_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe, qe->dma = ib_dma_map_single(ibdev, qe->data, capsule_size, dir); if (ib_dma_mapping_error(ibdev, qe->dma)) { kfree(qe->data); + qe->data = NULL; return -ENOMEM; } @@ -823,6 +824,7 @@ out_free_tagset: out_free_async_qe: nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe, sizeof(struct nvme_command), DMA_TO_DEVICE); + ctrl->async_event_sqe.data = NULL; out_free_queue: nvme_rdma_free_queue(&ctrl->queues[0]); return error; -- cgit v1.2.3