diff options
Diffstat (limited to 'drivers/nvme')
-rw-r--r-- | drivers/nvme/host/core.c | 152 | ||||
-rw-r--r-- | drivers/nvme/host/fabrics.c | 1 | ||||
-rw-r--r-- | drivers/nvme/host/fc.c | 6 | ||||
-rw-r--r-- | drivers/nvme/host/multipath.c | 69 | ||||
-rw-r--r-- | drivers/nvme/host/nvme.h | 33 | ||||
-rw-r--r-- | drivers/nvme/host/pci.c | 23 | ||||
-rw-r--r-- | drivers/nvme/host/rdma.c | 72 | ||||
-rw-r--r-- | drivers/nvme/host/tcp.c | 85 | ||||
-rw-r--r-- | drivers/nvme/target/configfs.c | 1 | ||||
-rw-r--r-- | drivers/nvme/target/core.c | 8 | ||||
-rw-r--r-- | drivers/nvme/target/fc.c | 4 | ||||
-rw-r--r-- | drivers/nvme/target/fcloop.c | 2 | ||||
-rw-r--r-- | drivers/nvme/target/io-cmd-bdev.c | 1 | ||||
-rw-r--r-- | drivers/nvme/target/loop.c | 2 | ||||
-rw-r--r-- | drivers/nvme/target/passthru.c | 25 | ||||
-rw-r--r-- | drivers/nvme/target/rdma.c | 4 | ||||
-rw-r--r-- | drivers/nvme/target/tcp.c | 10 |
17 files changed, 325 insertions, 173 deletions
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 88cff309d8e4..d543bc1747fd 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -241,17 +241,6 @@ static blk_status_t nvme_error_status(u16 status) } } -static inline bool nvme_req_needs_retry(struct request *req) -{ - if (blk_noretry_request(req)) - return false; - if (nvme_req(req)->status & NVME_SC_DNR) - return false; - if (nvme_req(req)->retries >= nvme_max_retries) - return false; - return true; -} - static void nvme_retry_req(struct request *req) { struct nvme_ns *ns = req->q->queuedata; @@ -268,34 +257,67 @@ static void nvme_retry_req(struct request *req) blk_mq_delay_kick_requeue_list(req->q, delay); } -void nvme_complete_rq(struct request *req) +enum nvme_disposition { + COMPLETE, + RETRY, + FAILOVER, +}; + +static inline enum nvme_disposition nvme_decide_disposition(struct request *req) { - blk_status_t status = nvme_error_status(nvme_req(req)->status); + if (likely(nvme_req(req)->status == 0)) + return COMPLETE; - trace_nvme_complete_rq(req); + if (blk_noretry_request(req) || + (nvme_req(req)->status & NVME_SC_DNR) || + nvme_req(req)->retries >= nvme_max_retries) + return COMPLETE; - nvme_cleanup_cmd(req); + if (req->cmd_flags & REQ_NVME_MPATH) { + if (nvme_is_path_error(nvme_req(req)->status) || + blk_queue_dying(req->q)) + return FAILOVER; + } else { + if (blk_queue_dying(req->q)) + return COMPLETE; + } - if (nvme_req(req)->ctrl->kas) - nvme_req(req)->ctrl->comp_seen = true; + return RETRY; +} - if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { - if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req)) - return; +static inline void nvme_end_req(struct request *req) +{ + blk_status_t status = nvme_error_status(nvme_req(req)->status); - if (!blk_queue_dying(req->q)) { - nvme_retry_req(req); - return; - } - } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && - req_op(req) == REQ_OP_ZONE_APPEND) { + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + req_op(req) == REQ_OP_ZONE_APPEND) req->__sector = nvme_lba_to_sect(req->q->queuedata, le64_to_cpu(nvme_req(req)->result.u64)); - } nvme_trace_bio_complete(req, status); blk_mq_end_request(req, status); } + +void nvme_complete_rq(struct request *req) +{ + trace_nvme_complete_rq(req); + nvme_cleanup_cmd(req); + + if (nvme_req(req)->ctrl->kas) + nvme_req(req)->ctrl->comp_seen = true; + + switch (nvme_decide_disposition(req)) { + case COMPLETE: + nvme_end_req(req); + return; + case RETRY: + nvme_retry_req(req); + return; + case FAILOVER: + nvme_failover_req(req); + return; + } +} EXPORT_SYMBOL_GPL(nvme_complete_rq); bool nvme_cancel_request(struct request *req, void *data, bool reserved) @@ -330,7 +352,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, case NVME_CTRL_RESETTING: case NVME_CTRL_CONNECTING: changed = true; - /* FALLTHRU */ + fallthrough; default: break; } @@ -340,7 +362,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, case NVME_CTRL_NEW: case NVME_CTRL_LIVE: changed = true; - /* FALLTHRU */ + fallthrough; default: break; } @@ -350,7 +372,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, case NVME_CTRL_NEW: case NVME_CTRL_RESETTING: changed = true; - /* FALLTHRU */ + fallthrough; default: break; } @@ -361,7 +383,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, case NVME_CTRL_RESETTING: case NVME_CTRL_CONNECTING: changed = true; - /* FALLTHRU */ + fallthrough; default: break; } @@ -371,7 +393,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, case NVME_CTRL_DELETING: case NVME_CTRL_DEAD: changed = true; - /* FALLTHRU */ + fallthrough; default: break; } @@ -380,7 +402,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, switch (old_state) { case NVME_CTRL_DELETING: changed = true; - /* FALLTHRU */ + fallthrough; default: break; } @@ -2004,13 +2026,49 @@ static void nvme_update_disk_info(struct gendisk *disk, blk_mq_unfreeze_queue(disk->queue); } +static inline bool nvme_first_scan(struct gendisk *disk) +{ + /* nvme_alloc_ns() scans the disk prior to adding it */ + return !(disk->flags & GENHD_FL_UP); +} + +static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + u32 iob; + + if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && + is_power_of_2(ctrl->max_hw_sectors)) + iob = ctrl->max_hw_sectors; + else + iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob)); + + if (!iob) + return; + + if (!is_power_of_2(iob)) { + if (nvme_first_scan(ns->disk)) + pr_warn("%s: ignoring unaligned IO boundary:%u\n", + ns->disk->disk_name, iob); + return; + } + + if (blk_queue_is_zoned(ns->disk->queue)) { + if (nvme_first_scan(ns->disk)) + pr_warn("%s: ignoring zoned namespace IO boundary\n", + ns->disk->disk_name); + return; + } + + blk_queue_chunk_sectors(ns->queue, iob); +} + static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) { unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; struct nvme_ns *ns = disk->private_data; struct nvme_ctrl *ctrl = ns->ctrl; int ret; - u32 iob; /* * If identify namespace failed, use default 512 byte block size so @@ -2038,12 +2096,6 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) return -ENODEV; } - if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && - is_power_of_2(ctrl->max_hw_sectors)) - iob = ctrl->max_hw_sectors; - else - iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob)); - ns->features = 0; ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); /* the PI implementation requires metadata equal t10 pi tuple size */ @@ -2075,8 +2127,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) } } - if (iob) - blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(iob)); + nvme_set_chunk_sectors(ns, id); nvme_update_disk_info(disk, ns, id); #ifdef CONFIG_NVME_MULTIPATH if (ns->head->disk) { @@ -2965,14 +3016,14 @@ static struct nvme_cel *nvme_find_cel(struct nvme_ctrl *ctrl, u8 csi) { struct nvme_cel *cel, *ret = NULL; - spin_lock(&ctrl->lock); + spin_lock_irq(&ctrl->lock); list_for_each_entry(cel, &ctrl->cels, entry) { if (cel->csi == csi) { ret = cel; break; } } - spin_unlock(&ctrl->lock); + spin_unlock_irq(&ctrl->lock); return ret; } @@ -2999,9 +3050,9 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, cel->csi = csi; - spin_lock(&ctrl->lock); + spin_lock_irq(&ctrl->lock); list_add_tail(&cel->entry, &ctrl->cels); - spin_unlock(&ctrl->lock); + spin_unlock_irq(&ctrl->lock); out: *log = &cel->log; return 0; @@ -3654,6 +3705,10 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, return 0; if (a == &dev_attr_hostid.attr && !ctrl->opts) return 0; + if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts) + return 0; + if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts) + return 0; return a->mode; } @@ -4368,7 +4423,7 @@ static void nvme_free_ctrl(struct device *dev) struct nvme_subsystem *subsys = ctrl->subsys; struct nvme_cel *cel, *next; - if (subsys && ctrl->instance != subsys->instance) + if (!subsys || ctrl->instance != subsys->instance) ida_simple_remove(&nvme_instance_ida, ctrl->instance); list_for_each_entry_safe(cel, next, &ctrl->cels, entry) { @@ -4512,7 +4567,7 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_unfreeze); -void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) +int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) { struct nvme_ns *ns; @@ -4523,6 +4578,7 @@ void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) break; } up_read(&ctrl->namespaces_rwsem); + return timeout; } EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 4ec4829d6233..32f61fc5f4c5 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -576,7 +576,6 @@ bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, * which is require to set the queue live in the appropinquate states. */ switch (ctrl->state) { - case NVME_CTRL_NEW: case NVME_CTRL_CONNECTING: if (nvme_is_fabrics(req->cmd) && req->cmd->fabrics.fctype == nvme_fabrics_type_connect) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index eae43bb444e0..a7f474ddfff7 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2035,7 +2035,7 @@ done: } __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate); - if (!nvme_end_request(rq, status, result)) + if (!nvme_try_complete_req(rq, status, result)) nvme_fc_complete_rq(rq); check_error: @@ -2078,7 +2078,7 @@ __nvme_fc_init_request(struct nvme_fc_ctrl *ctrl, if (fc_dma_mapping_error(ctrl->lport->dev, op->fcp_req.cmddma)) { dev_err(ctrl->dev, "FCP Op failed - cmdiu dma mapping failed.\n"); - ret = EFAULT; + ret = -EFAULT; goto out_on_error; } @@ -2088,7 +2088,7 @@ __nvme_fc_init_request(struct nvme_fc_ctrl *ctrl, if (fc_dma_mapping_error(ctrl->lport->dev, op->fcp_req.rspdma)) { dev_err(ctrl->dev, "FCP Op failed - rspiu dma mapping failed.\n"); - ret = EFAULT; + ret = -EFAULT; } atomic_set(&op->state, FCPOP_STATE_IDLE); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 3ded54d2c9c6..d4ba736c6c89 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -65,51 +65,30 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, } } -bool nvme_failover_req(struct request *req) +void nvme_failover_req(struct request *req) { struct nvme_ns *ns = req->q->queuedata; - u16 status = nvme_req(req)->status; + u16 status = nvme_req(req)->status & 0x7ff; unsigned long flags; - switch (status & 0x7ff) { - case NVME_SC_ANA_TRANSITION: - case NVME_SC_ANA_INACCESSIBLE: - case NVME_SC_ANA_PERSISTENT_LOSS: - /* - * If we got back an ANA error we know the controller is alive, - * but not ready to serve this namespaces. The spec suggests - * we should update our general state here, but due to the fact - * that the admin and I/O queues are not serialized that is - * fundamentally racy. So instead just clear the current path, - * mark the the path as pending and kick of a re-read of the ANA - * log page ASAP. - */ - nvme_mpath_clear_current_path(ns); - if (ns->ctrl->ana_log_buf) { - set_bit(NVME_NS_ANA_PENDING, &ns->flags); - queue_work(nvme_wq, &ns->ctrl->ana_work); - } - break; - case NVME_SC_HOST_PATH_ERROR: - case NVME_SC_HOST_ABORTED_CMD: - /* - * Temporary transport disruption in talking to the controller. - * Try to send on a new path. - */ - nvme_mpath_clear_current_path(ns); - break; - default: - /* This was a non-ANA error so follow the normal error path. */ - return false; + nvme_mpath_clear_current_path(ns); + + /* + * If we got back an ANA error, we know the controller is alive but not + * ready to serve this namespace. Kick of a re-read of the ANA + * information page, and just try any other available path for now. + */ + if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) { + set_bit(NVME_NS_ANA_PENDING, &ns->flags); + queue_work(nvme_wq, &ns->ctrl->ana_work); } spin_lock_irqsave(&ns->head->requeue_lock, flags); blk_steal_bios(&ns->head->requeue_list, req); spin_unlock_irqrestore(&ns->head->requeue_lock, flags); - blk_mq_end_request(req, 0); + blk_mq_end_request(req, 0); kblockd_schedule_work(&ns->head->requeue_work); - return true; } void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) @@ -233,7 +212,7 @@ static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head, static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, int node, struct nvme_ns *old) { - struct nvme_ns *ns, *found, *fallback = NULL; + struct nvme_ns *ns, *found = NULL; if (list_is_singular(&head->list)) { if (nvme_path_is_disabled(old)) @@ -252,18 +231,22 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, goto out; } if (ns->ana_state == NVME_ANA_NONOPTIMIZED) - fallback = ns; + found = ns; } - /* No optimized path found, re-check the current path */ + /* + * The loop above skips the current path for round-robin semantics. + * Fall back to the current path if either: + * - no other optimized path found and current is optimized, + * - no other usable path found and current is usable. + */ if (!nvme_path_is_disabled(old) && - old->ana_state == NVME_ANA_OPTIMIZED) { - found = old; - goto out; - } - if (!fallback) + (old->ana_state == NVME_ANA_OPTIMIZED || + (!found && old->ana_state == NVME_ANA_NONOPTIMIZED))) + return old; + + if (!found) return NULL; - found = fallback; out: rcu_assign_pointer(head->current_path[node], found); return found; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ebb8c3ed3885..2910f6caab7d 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -523,7 +523,31 @@ static inline u32 nvme_bytes_to_numd(size_t len) return (len >> 2) - 1; } -static inline bool nvme_end_request(struct request *req, __le16 status, +static inline bool nvme_is_ana_error(u16 status) +{ + switch (status & 0x7ff) { + case NVME_SC_ANA_TRANSITION: + case NVME_SC_ANA_INACCESSIBLE: + case NVME_SC_ANA_PERSISTENT_LOSS: + return true; + default: + return false; + } +} + +static inline bool nvme_is_path_error(u16 status) +{ + /* check for a status code type of 'path related status' */ + return (status & 0x700) == 0x300; +} + +/* + * Fill in the status and result information from the CQE, and then figure out + * if blk-mq will need to use IPI magic to complete the request, and if yes do + * so. If not let the caller complete the request without an indirect function + * call. + */ +static inline bool nvme_try_complete_req(struct request *req, __le16 status, union nvme_result result) { struct nvme_request *rq = nvme_req(req); @@ -581,7 +605,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl); void nvme_sync_queues(struct nvme_ctrl *ctrl); void nvme_unfreeze(struct nvme_ctrl *ctrl); void nvme_wait_freeze(struct nvme_ctrl *ctrl); -void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); +int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); void nvme_start_freeze(struct nvme_ctrl *ctrl); #define NVME_QID_ANY -1 @@ -629,7 +653,7 @@ void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys); void nvme_mpath_start_freeze(struct nvme_subsystem *subsys); void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, struct nvme_ctrl *ctrl, int *flags); -bool nvme_failover_req(struct request *req); +void nvme_failover_req(struct request *req); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id); @@ -688,9 +712,8 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); } -static inline bool nvme_failover_req(struct request *req) +static inline void nvme_failover_req(struct request *req) { - return false; } static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) { diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index ba725ae47305..899d2f4d7ab6 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -120,7 +120,7 @@ struct nvme_dev { unsigned max_qid; unsigned io_queues[HCTX_MAX_TYPES]; unsigned int num_vecs; - u16 q_depth; + u32 q_depth; int io_sqes; u32 db_stride; void __iomem *bar; @@ -157,13 +157,13 @@ struct nvme_dev { static int io_queue_depth_set(const char *val, const struct kernel_param *kp) { int ret; - u16 n; + u32 n; - ret = kstrtou16(val, 10, &n); + ret = kstrtou32(val, 10, &n); if (ret != 0 || n < 2) return -EINVAL; - return param_set_ushort(val, kp); + return param_set_uint(val, kp); } static inline unsigned int sq_idx(unsigned int qid, u32 stride) @@ -195,7 +195,7 @@ struct nvme_queue { dma_addr_t sq_dma_addr; dma_addr_t cq_dma_addr; u32 __iomem *q_db; - u16 q_depth; + u32 q_depth; u16 cq_vector; u16 sq_tail; u16 cq_head; @@ -961,7 +961,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id); trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail); - if (!nvme_end_request(req, cqe->status, cqe->result)) + if (!nvme_try_complete_req(req, cqe->status, cqe->result)) nvme_pci_complete_rq(req); } @@ -1244,13 +1244,13 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) switch (dev->ctrl.state) { case NVME_CTRL_CONNECTING: nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); - /* fall through */ + fallthrough; case NVME_CTRL_DELETING: dev_warn_ratelimited(dev->ctrl.device, "I/O %d QID %d timeout, disable controller\n", req->tag, nvmeq->qid); - nvme_dev_disable(dev, true); nvme_req(req)->flags |= NVME_REQ_CANCELLED; + nvme_dev_disable(dev, true); return BLK_EH_DONE; case NVME_CTRL_RESETTING: return BLK_EH_RESET_TIMER; @@ -1267,10 +1267,10 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) dev_warn(dev->ctrl.device, "I/O %d QID %d timeout, reset controller\n", req->tag, nvmeq->qid); + nvme_req(req)->flags |= NVME_REQ_CANCELLED; nvme_dev_disable(dev, false); nvme_reset_ctrl(&dev->ctrl); - nvme_req(req)->flags |= NVME_REQ_CANCELLED; return BLK_EH_DONE; } @@ -2320,7 +2320,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP); - dev->q_depth = min_t(u16, NVME_CAP_MQES(dev->ctrl.cap) + 1, + dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1, io_queue_depth); dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); @@ -2460,7 +2460,8 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown) static int nvme_setup_prp_pools(struct nvme_dev *dev) { dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, - PAGE_SIZE, PAGE_SIZE, 0); + NVME_CTRL_PAGE_SIZE, + NVME_CTRL_PAGE_SIZE, 0); if (!dev->prp_page_pool) return -ENOMEM; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 44c76ffbb264..8e5ffe2f117d 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -122,6 +122,7 @@ struct nvme_rdma_ctrl { struct sockaddr_storage src_addr; struct nvme_ctrl ctrl; + struct mutex teardown_lock; bool use_inline_data; u32 io_queues[HCTX_MAX_TYPES]; }; @@ -975,7 +976,15 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) if (!new) { nvme_start_queues(&ctrl->ctrl); - nvme_wait_freeze(&ctrl->ctrl); + if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) { + /* + * If we timed out waiting for freeze we are likely to + * be stuck. Fail the controller initialization just + * to be safe. + */ + ret = -ENODEV; + goto out_wait_freeze_timed_out; + } blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset, ctrl->ctrl.queue_count - 1); nvme_unfreeze(&ctrl->ctrl); @@ -983,6 +992,9 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) return 0; +out_wait_freeze_timed_out: + nvme_stop_queues(&ctrl->ctrl); + nvme_rdma_stop_io_queues(ctrl); out_cleanup_connect_q: if (new) blk_cleanup_queue(ctrl->ctrl.connect_q); @@ -997,6 +1009,7 @@ out_free_io_queues: static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, bool remove) { + mutex_lock(&ctrl->teardown_lock); blk_mq_quiesce_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); if (ctrl->ctrl.admin_tagset) { @@ -1007,11 +1020,13 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, if (remove) blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_rdma_destroy_admin_queue(ctrl, remove); + mutex_unlock(&ctrl->teardown_lock); } static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, bool remove) { + mutex_lock(&ctrl->teardown_lock); if (ctrl->ctrl.queue_count > 1) { nvme_start_freeze(&ctrl->ctrl); nvme_stop_queues(&ctrl->ctrl); @@ -1025,6 +1040,7 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, nvme_start_queues(&ctrl->ctrl); nvme_rdma_destroy_io_queues(ctrl, remove); } + mutex_unlock(&ctrl->teardown_lock); } static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl) @@ -1180,6 +1196,7 @@ static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl) if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) return; + dev_warn(ctrl->ctrl.device, "starting error recovery\n"); queue_work(nvme_reset_wq, &ctrl->err_work); } @@ -1189,7 +1206,7 @@ static void nvme_rdma_end_request(struct nvme_rdma_request *req) if (!refcount_dec_and_test(&req->ref)) return; - if (!nvme_end_request(rq, req->status, req->result)) + if (!nvme_try_complete_req(rq, req->status, req->result)) nvme_rdma_complete_rq(rq); } @@ -1915,7 +1932,7 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: nvme_rdma_destroy_queue_ib(queue); - /* fall through */ + fallthrough; case RDMA_CM_EVENT_ADDR_ERROR: dev_dbg(queue->ctrl->ctrl.device, "CM error event %d\n", ev->event); @@ -1946,6 +1963,22 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, return 0; } +static void nvme_rdma_complete_timed_out(struct request *rq) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_queue *queue = req->queue; + struct nvme_rdma_ctrl *ctrl = queue->ctrl; + + /* fence other contexts that may complete the command */ + mutex_lock(&ctrl->teardown_lock); + nvme_rdma_stop_queue(queue); + if (!blk_mq_request_completed(rq)) { + nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD; + blk_mq_complete_request(rq); + } + mutex_unlock(&ctrl->teardown_lock); +} + static enum blk_eh_timer_return nvme_rdma_timeout(struct request *rq, bool reserved) { @@ -1956,29 +1989,29 @@ nvme_rdma_timeout(struct request *rq, bool reserved) dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n", rq->tag, nvme_rdma_queue_idx(queue)); - /* - * Restart the timer if a controller reset is already scheduled. Any - * timed out commands would be handled before entering the connecting - * state. - */ - if (ctrl->ctrl.state == NVME_CTRL_RESETTING) - return BLK_EH_RESET_TIMER; - if (ctrl->ctrl.state != NVME_CTRL_LIVE) { /* - * Teardown immediately if controller times out while starting - * or we are already started error recovery. all outstanding - * requests are completed on shutdown, so we return BLK_EH_DONE. + * If we are resetting, connecting or deleting we should + * complete immediately because we may block controller + * teardown or setup sequence + * - ctrl disable/shutdown fabrics requests + * - connect requests + * - initialization admin requests + * - I/O requests that entered after unquiescing and + * the controller stopped responding + * + * All other requests should be cancelled by the error + * recovery work, so it's fine that we fail it here. */ - flush_work(&ctrl->err_work); - nvme_rdma_teardown_io_queues(ctrl, false); - nvme_rdma_teardown_admin_queue(ctrl, false); + nvme_rdma_complete_timed_out(rq); return BLK_EH_DONE; } - dev_warn(ctrl->ctrl.device, "starting error recovery\n"); + /* + * LIVE state should trigger the normal error recovery which will + * handle completing this request. + */ nvme_rdma_error_recovery(ctrl); - return BLK_EH_RESET_TIMER; } @@ -2278,6 +2311,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, return ERR_PTR(-ENOMEM); ctrl->ctrl.opts = opts; INIT_LIST_HEAD(&ctrl->list); + mutex_init(&ctrl->teardown_lock); if (!(opts->mask & NVMF_OPT_TRSVCID)) { opts->trsvcid = diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 62fbaecdc960..16851ae3bddf 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -124,6 +124,7 @@ struct nvme_tcp_ctrl { struct sockaddr_storage src_addr; struct nvme_ctrl ctrl; + struct mutex teardown_lock; struct work_struct err_work; struct delayed_work connect_work; struct nvme_tcp_request async_req; @@ -464,6 +465,7 @@ static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl) if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) return; + dev_warn(ctrl->device, "starting error recovery\n"); queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work); } @@ -481,7 +483,7 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue, return -EINVAL; } - if (!nvme_end_request(rq, cqe->status, cqe->result)) + if (!nvme_try_complete_req(rq, cqe->status, cqe->result)) nvme_complete_rq(rq); queue->nr_cqe++; @@ -672,7 +674,7 @@ static inline void nvme_tcp_end_request(struct request *rq, u16 status) { union nvme_result res = {}; - if (!nvme_end_request(rq, cpu_to_le16(status << 1), res)) + if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res)) nvme_complete_rq(rq); } @@ -866,7 +868,6 @@ static void nvme_tcp_state_change(struct sock *sk) case TCP_LAST_ACK: case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: - /* fallthrough */ nvme_tcp_error_recovery(&queue->ctrl->ctrl); break; default: @@ -1527,7 +1528,6 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid) if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags)) return; - __nvme_tcp_stop_queue(queue); } @@ -1782,7 +1782,15 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) if (!new) { nvme_start_queues(ctrl); - nvme_wait_freeze(ctrl); + if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) { + /* + * If we timed out waiting for freeze we are likely to + * be stuck. Fail the controller initialization just + * to be safe. + */ + ret = -ENODEV; + goto out_wait_freeze_timed_out; + } blk_mq_update_nr_hw_queues(ctrl->tagset, ctrl->queue_count - 1); nvme_unfreeze(ctrl); @@ -1790,6 +1798,9 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) return 0; +out_wait_freeze_timed_out: + nvme_stop_queues(ctrl); + nvme_tcp_stop_io_queues(ctrl); out_cleanup_connect_q: if (new) blk_cleanup_queue(ctrl->connect_q); @@ -1875,6 +1886,7 @@ out_free_queue: static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, bool remove) { + mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock); blk_mq_quiesce_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); if (ctrl->admin_tagset) { @@ -1885,13 +1897,16 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, if (remove) blk_mq_unquiesce_queue(ctrl->admin_q); nvme_tcp_destroy_admin_queue(ctrl, remove); + mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock); } static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, bool remove) { + mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock); if (ctrl->queue_count <= 1) - return; + goto out; + blk_mq_quiesce_queue(ctrl->admin_q); nvme_start_freeze(ctrl); nvme_stop_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); @@ -1903,6 +1918,8 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, if (remove) nvme_start_queues(ctrl); nvme_tcp_destroy_io_queues(ctrl, remove); +out: + mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock); } static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl) @@ -2149,40 +2166,55 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg) nvme_tcp_queue_request(&ctrl->async_req, true, true); } +static void nvme_tcp_complete_timed_out(struct request *rq) +{ + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; + + /* fence other contexts that may complete the command */ + mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock); + nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue)); + if (!blk_mq_request_completed(rq)) { + nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD; + blk_mq_complete_request(rq); + } + mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock); +} + static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq, bool reserved) { struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); - struct nvme_tcp_ctrl *ctrl = req->queue->ctrl; + struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; struct nvme_tcp_cmd_pdu *pdu = req->pdu; - /* - * Restart the timer if a controller reset is already scheduled. Any - * timed out commands would be handled before entering the connecting - * state. - */ - if (ctrl->ctrl.state == NVME_CTRL_RESETTING) - return BLK_EH_RESET_TIMER; - - dev_warn(ctrl->ctrl.device, + dev_warn(ctrl->device, "queue %d: timeout request %#x type %d\n", nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type); - if (ctrl->ctrl.state != NVME_CTRL_LIVE) { + if (ctrl->state != NVME_CTRL_LIVE) { /* - * Teardown immediately if controller times out while starting - * or we are already started error recovery. all outstanding - * requests are completed on shutdown, so we return BLK_EH_DONE. + * If we are resetting, connecting or deleting we should + * complete immediately because we may block controller + * teardown or setup sequence + * - ctrl disable/shutdown fabrics requests + * - connect requests + * - initialization admin requests + * - I/O requests that entered after unquiescing and + * the controller stopped responding + * + * All other requests should be cancelled by the error + * recovery work, so it's fine that we fail it here. */ - flush_work(&ctrl->err_work); - nvme_tcp_teardown_io_queues(&ctrl->ctrl, false); - nvme_tcp_teardown_admin_queue(&ctrl->ctrl, false); + nvme_tcp_complete_timed_out(rq); return BLK_EH_DONE; } - dev_warn(ctrl->ctrl.device, "starting error recovery\n"); - nvme_tcp_error_recovery(&ctrl->ctrl); - + /* + * LIVE state should trigger the normal error recovery which will + * handle completing this request. + */ + nvme_tcp_error_recovery(ctrl); return BLK_EH_RESET_TIMER; } @@ -2423,6 +2455,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, nvme_tcp_reconnect_ctrl_work); INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work); INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work); + mutex_init(&ctrl->teardown_lock); if (!(opts->mask & NVMF_OPT_TRSVCID)) { opts->trsvcid = diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 74b2b61c773b..37e1d7784e17 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1136,6 +1136,7 @@ static ssize_t nvmet_subsys_attr_model_store(struct config_item *item, up_write(&nvmet_config_sem); kfree_rcu(new_model, rcuhead); + kfree(new_model_number); return count; } diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index b92f45f5cd5b..b7b63330b5ef 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -73,7 +73,7 @@ inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno) status = NVME_SC_ACCESS_DENIED; break; case -EIO: - /* FALLTHRU */ + fallthrough; default: req->error_loc = offsetof(struct nvme_common_command, opcode); status = NVME_SC_INTERNAL | NVME_SC_DNR; @@ -397,6 +397,9 @@ static void nvmet_keep_alive_timer(struct work_struct *work) static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl) { + if (unlikely(ctrl->kato == 0)) + return; + pr_debug("ctrl %d start keep-alive timer for %d secs\n", ctrl->cntlid, ctrl->kato); @@ -406,6 +409,9 @@ static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl) static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl) { + if (unlikely(ctrl->kato == 0)) + return; + pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid); cancel_delayed_work_sync(&ctrl->ka_work); diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 55bafd56166a..e6861cc10e7d 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -2342,9 +2342,9 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) return; if (fcpreq->fcp_error || fcpreq->transferred_length != fcpreq->transfer_length) { - spin_lock(&fod->flock); + spin_lock_irqsave(&fod->flock, flags); fod->abort = true; - spin_unlock(&fod->flock); + spin_unlock_irqrestore(&fod->flock, flags); nvmet_req_complete(&fod->req, NVME_SC_INTERNAL); return; diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index c97e60b71bbc..3da067a8311e 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -812,7 +812,7 @@ fcloop_fcp_op(struct nvmet_fc_target_port *tgtport, break; /* Fall-Thru to RSP handling */ - /* FALLTHRU */ + fallthrough; case NVMET_FCOP_RSP: if (fcpreq) { diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 3dd6f566a240..125dde3f410e 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -139,7 +139,6 @@ static u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts) req->error_loc = offsetof(struct nvme_rw_command, nsid); break; case BLK_STS_IOERR: - /* fallthru */ default: status = NVME_SC_INTERNAL | NVME_SC_DNR; req->error_loc = offsetof(struct nvme_common_command, opcode); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 4884ef1e46a2..0d6008cf66a2 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -115,7 +115,7 @@ static void nvme_loop_queue_response(struct nvmet_req *req) return; } - if (!nvme_end_request(rq, cqe->status, cqe->result)) + if (!nvme_try_complete_req(rq, cqe->status, cqe->result)) nvme_loop_complete_rq(rq); } } diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 89d91dc999a6..8bd7f656e240 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -165,7 +165,7 @@ static void nvmet_passthru_execute_cmd_work(struct work_struct *w) req->cqe->result = nvme_req(rq)->result; nvmet_req_complete(req, status); - blk_put_request(rq); + blk_mq_free_request(rq); } static void nvmet_passthru_req_done(struct request *rq, @@ -175,7 +175,7 @@ static void nvmet_passthru_req_done(struct request *rq, req->cqe->result = nvme_req(rq)->result; nvmet_req_complete(req, nvme_req(rq)->status); - blk_put_request(rq); + blk_mq_free_request(rq); } static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) @@ -230,7 +230,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) if (unlikely(!ns)) { pr_err("failed to get passthru ns nsid:%u\n", nsid); status = NVME_SC_INVALID_NS | NVME_SC_DNR; - goto fail_out; + goto out; } q = ns->queue; @@ -238,16 +238,15 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) rq = nvme_alloc_request(q, req->cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); if (IS_ERR(rq)) { - rq = NULL; status = NVME_SC_INTERNAL; - goto fail_out; + goto out_put_ns; } if (req->sg_cnt) { ret = nvmet_passthru_map_sg(req, rq); if (unlikely(ret)) { status = NVME_SC_INTERNAL; - goto fail_out; + goto out_put_req; } } @@ -274,11 +273,13 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) return; -fail_out: +out_put_req: + blk_mq_free_request(rq); +out_put_ns: if (ns) nvme_put_ns(ns); +out: nvmet_req_complete(req, status); - blk_put_request(rq); } /* @@ -326,6 +327,10 @@ static u16 nvmet_setup_passthru_command(struct nvmet_req *req) u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req) { + /* Reject any commands with non-sgl flags set (ie. fused commands) */ + if (req->cmd->common.flags & ~NVME_CMD_SGL_ALL) + return NVME_SC_INVALID_FIELD; + switch (req->cmd->common.opcode) { case nvme_cmd_resv_register: case nvme_cmd_resv_report: @@ -396,6 +401,10 @@ static u16 nvmet_passthru_get_set_features(struct nvmet_req *req) u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req) { + /* Reject any commands with non-sgl flags set (ie. fused commands) */ + if (req->cmd->common.flags & ~NVME_CMD_SGL_ALL) + return NVME_SC_INVALID_FIELD; + /* * Passthru all vendor specific commands */ diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 3ccb59260b4a..ae6620489457 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1758,7 +1758,7 @@ static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, schedule_delayed_work(&port->repair_work, 0); break; } - /* FALLTHROUGH */ + fallthrough; case RDMA_CM_EVENT_DISCONNECTED: case RDMA_CM_EVENT_TIMEWAIT_EXIT: nvmet_rdma_queue_disconnect(queue); @@ -1769,7 +1769,7 @@ static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, case RDMA_CM_EVENT_REJECTED: pr_debug("Connection rejected: %s\n", rdma_reject_msg(cm_id, event->status)); - /* FALLTHROUGH */ + fallthrough; case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_CONNECT_ERROR: nvmet_rdma_queue_connect_fail(cm_id, queue); diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 9eda91162fe4..8e0d766d2722 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -160,6 +160,11 @@ static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd); static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue, struct nvmet_tcp_cmd *cmd) { + if (unlikely(!queue->nr_cmds)) { + /* We didn't allocate cmds yet, send 0xffff */ + return USHRT_MAX; + } + return cmd - queue->cmds; } @@ -866,7 +871,10 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) struct nvme_tcp_data_pdu *data = &queue->pdu.data; struct nvmet_tcp_cmd *cmd; - cmd = &queue->cmds[data->ttag]; + if (likely(queue->nr_cmds)) + cmd = &queue->cmds[data->ttag]; + else + cmd = &queue->connect; if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) { pr_err("ttag %u unexpected data offset %u (expected %u)\n", |