Diffstat (limited to 'drivers/nvme/host')
-rw-r--r--   drivers/nvme/host/Kconfig     |   15
-rw-r--r--   drivers/nvme/host/Makefile    |    3
-rw-r--r--   drivers/nvme/host/core.c      |  191
-rw-r--r--   drivers/nvme/host/fabrics.c   |   61
-rw-r--r--   drivers/nvme/host/fabrics.h   |   17
-rw-r--r--   drivers/nvme/host/fc.c        |   43
-rw-r--r--   drivers/nvme/host/lightnvm.c  |   33
-rw-r--r--   drivers/nvme/host/multipath.c |   20
-rw-r--r--   drivers/nvme/host/nvme.h      |   24
-rw-r--r--   drivers/nvme/host/pci.c       |  518
-rw-r--r--   drivers/nvme/host/rdma.c      |  119
-rw-r--r--   drivers/nvme/host/tcp.c       | 2278
-rw-r--r--   drivers/nvme/host/trace.c     |    3
-rw-r--r--   drivers/nvme/host/trace.h     |   27
14 files changed, 3036 insertions, 316 deletions
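
Before the patch body, a stand-alone sketch (not part of the patch) of the Command Retry Delay handling that core.c gains below in nvme_retry_req(): the CRD field of the completion status (bits 12:11, masked by NVME_SC_CRD) selects one of the three CRDT values read from Identify Controller, each expressed in units of 100 milliseconds. The helper name and the sample CRDT values here are invented for illustration.

/* Illustration only -- mirrors the CRD handling added to nvme_retry_req().
 * retry_delay_ms_example() and the sample CRDT values are hypothetical. */
#include <stdio.h>

#define NVME_SC_CRD	0x1800	/* CRD field, bits 12:11 of the status word */

/* crdt[] mirrors ctrl->crdt[], i.e. Identify Controller CRDT1..CRDT3,
 * each in units of 100 ms; CRD == 0 means no delay was requested. */
static unsigned long retry_delay_ms_example(unsigned short status,
					    const unsigned short crdt[3])
{
	unsigned short crd = (status & NVME_SC_CRD) >> 11;	/* 0..3 */

	return crd ? crdt[crd - 1] * 100UL : 0;
}

int main(void)
{
	const unsigned short crdt[3] = { 5, 50, 500 };

	/* CRD = 1 selects CRDT1 = 5, so the request is requeued after 500 ms */
	printf("%lu ms\n", retry_delay_ms_example(0x0800, crdt));
	return 0;
}

In the patch the computed delay is handed to blk_mq_delay_kick_requeue_list(), so a requeued request is simply not re-dispatched before the controller-advertised delay has elapsed.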
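Likewise not part of the patch: a minimal user-space model of the deferred submission-queue doorbell write that pci.c introduces below (nvme_write_sq_db() plus the new .commit_rqs hook). The doorbell is rung only at the end of a batch, or when deferring any further would let sq_tail wrap around onto the tail value last written; the _example names and the printf() standing in for writel() are invented for this sketch.

/* Sketch of the deferred SQ doorbell write added to pci.c; ring and
 * doorbell are simulated, names ending in _example are hypothetical. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct sq_example {
	uint16_t q_depth;
	uint16_t sq_tail;	/* next free slot in the submission queue */
	uint16_t last_sq_tail;	/* tail value last written to the doorbell */
};

/* Ring the doorbell when asked (end of a batch) or when one more deferred
 * command would bring the tail back onto the last doorbell value. */
static void write_sq_db_example(struct sq_example *q, bool write_sq)
{
	if (!write_sq) {
		uint16_t next_tail = q->sq_tail + 1;

		if (next_tail == q->q_depth)
			next_tail = 0;
		if (next_tail != q->last_sq_tail)
			return;
	}
	printf("doorbell <- %u\n", q->sq_tail);	/* stands in for writel() */
	q->last_sq_tail = q->sq_tail;
}

static void submit_example(struct sq_example *q, bool last_in_batch)
{
	if (++q->sq_tail == q->q_depth)
		q->sq_tail = 0;
	write_sq_db_example(q, last_in_batch);
}

int main(void)
{
	struct sq_example q = { .q_depth = 8 };

	/* Three queued commands: only the final one rings the doorbell. */
	submit_example(&q, false);
	submit_example(&q, false);
	submit_example(&q, true);
	return 0;
}

With the .commit_rqs callback wired up, blk-mq can hand the driver several requests without marking any of them as the last one and still get a closing doorbell write for the batch.
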
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 88a8b5916624..0f345e207675 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -57,3 +57,18 @@ config NVME_FC from https://github.com/linux-nvme/nvme-cli. If unsure, say N. + +config NVME_TCP + tristate "NVM Express over Fabrics TCP host driver" + depends on INET + depends on BLK_DEV_NVME + select NVME_FABRICS + help + This provides support for the NVMe over Fabrics protocol using + the TCP transport. This allows you to use remote block devices + exported using the NVMe protocol set. + + To configure a NVMe over Fabrics controller use the nvme-cli tool + from https://github.com/linux-nvme/nvme-cli. + + If unsure, say N. diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index aea459c65ae1..8a4b671c5f0c 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_BLK_DEV_NVME) += nvme.o obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o obj-$(CONFIG_NVME_FC) += nvme-fc.o +obj-$(CONFIG_NVME_TCP) += nvme-tcp.o nvme-core-y := core.o nvme-core-$(CONFIG_TRACING) += trace.o @@ -21,3 +22,5 @@ nvme-fabrics-y += fabrics.o nvme-rdma-y += rdma.o nvme-fc-y += fc.o + +nvme-tcp-y += tcp.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 962012135b62..08f2c92602f4 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -97,7 +97,6 @@ static dev_t nvme_chr_devt; static struct class *nvme_class; static struct class *nvme_subsys_class; -static void nvme_ns_remove(struct nvme_ns *ns); static int nvme_revalidate_disk(struct gendisk *disk); static void nvme_put_subsystem(struct nvme_subsystem *subsys); static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, @@ -245,12 +244,31 @@ static inline bool nvme_req_needs_retry(struct request *req) return true; } +static void nvme_retry_req(struct request *req) +{ + struct nvme_ns *ns = req->q->queuedata; + unsigned long delay = 0; + u16 crd; + + /* The mask and shift result must be <= 3 */ + crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11; + if (ns && crd) + delay = ns->ctrl->crdt[crd - 1] * 100; + + nvme_req(req)->retries++; + blk_mq_requeue_request(req, false); + blk_mq_delay_kick_requeue_list(req->q, delay); +} + void nvme_complete_rq(struct request *req) { blk_status_t status = nvme_error_status(req); trace_nvme_complete_rq(req); + if (nvme_req(req)->ctrl->kas) + nvme_req(req)->ctrl->comp_seen = true; + if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { if ((req->cmd_flags & REQ_NVME_MPATH) && blk_path_error(status)) { @@ -259,8 +277,7 @@ void nvme_complete_rq(struct request *req) } if (!blk_queue_dying(req->q)) { - nvme_req(req)->retries++; - blk_mq_requeue_request(req, true); + nvme_retry_req(req); return; } } @@ -268,14 +285,14 @@ void nvme_complete_rq(struct request *req) } EXPORT_SYMBOL_GPL(nvme_complete_rq); -void nvme_cancel_request(struct request *req, void *data, bool reserved) +bool nvme_cancel_request(struct request *req, void *data, bool reserved) { dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, "Cancelling I/O %d", req->tag); nvme_req(req)->status = NVME_SC_ABORT_REQ; blk_mq_complete_request(req); - + return true; } EXPORT_SYMBOL_GPL(nvme_cancel_request); @@ -536,7 +553,6 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl, static inline void nvme_setup_flush(struct nvme_ns *ns, struct nvme_command *cmnd) { - memset(cmnd, 0, sizeof(*cmnd)); cmnd->common.opcode = nvme_cmd_flush; 
cmnd->common.nsid = cpu_to_le32(ns->head->ns_id); } @@ -548,9 +564,19 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, struct nvme_dsm_range *range; struct bio *bio; - range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC); - if (!range) - return BLK_STS_RESOURCE; + range = kmalloc_array(segments, sizeof(*range), + GFP_ATOMIC | __GFP_NOWARN); + if (!range) { + /* + * If we fail allocation our range, fallback to the controller + * discard page. If that's also busy, it's safe to return + * busy, as we know we can make progress once that's freed. + */ + if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy)) + return BLK_STS_RESOURCE; + + range = page_address(ns->ctrl->discard_page); + } __rq_for_each_bio(bio, req) { u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector); @@ -565,11 +591,13 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, } if (WARN_ON_ONCE(n != segments)) { - kfree(range); + if (virt_to_page(range) == ns->ctrl->discard_page) + clear_bit_unlock(0, &ns->ctrl->discard_page_busy); + else + kfree(range); return BLK_STS_IOERR; } - memset(cmnd, 0, sizeof(*cmnd)); cmnd->dsm.opcode = nvme_cmd_dsm; cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id); cmnd->dsm.nr = cpu_to_le32(segments - 1); @@ -598,7 +626,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; - memset(cmnd, 0, sizeof(*cmnd)); cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); @@ -650,8 +677,13 @@ void nvme_cleanup_cmd(struct request *req) blk_rq_bytes(req) >> ns->lba_shift); } if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { - kfree(page_address(req->special_vec.bv_page) + - req->special_vec.bv_offset); + struct nvme_ns *ns = req->rq_disk->private_data; + struct page *page = req->special_vec.bv_page; + + if (page == ns->ctrl->discard_page) + clear_bit_unlock(0, &ns->ctrl->discard_page_busy); + else + kfree(page_address(page) + req->special_vec.bv_offset); } } EXPORT_SYMBOL_GPL(nvme_cleanup_cmd); @@ -663,6 +695,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, nvme_clear_nvme_request(req); + memset(cmd, 0, sizeof(*cmd)); switch (req_op(req)) { case REQ_OP_DRV_IN: case REQ_OP_DRV_OUT: @@ -691,6 +724,31 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, } EXPORT_SYMBOL_GPL(nvme_setup_cmd); +static void nvme_end_sync_rq(struct request *rq, blk_status_t error) +{ + struct completion *waiting = rq->end_io_data; + + rq->end_io_data = NULL; + complete(waiting); +} + +static void nvme_execute_rq_polled(struct request_queue *q, + struct gendisk *bd_disk, struct request *rq, int at_head) +{ + DECLARE_COMPLETION_ONSTACK(wait); + + WARN_ON_ONCE(!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)); + + rq->cmd_flags |= REQ_HIPRI; + rq->end_io_data = &wait; + blk_execute_rq_nowait(q, bd_disk, rq, at_head, nvme_end_sync_rq); + + while (!completion_done(&wait)) { + blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true); + cond_resched(); + } +} + /* * Returns 0 on success. 
If the result is negative, it's a Linux error code; * if the result is positive, it's an NVM Express status code @@ -698,7 +756,7 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, unsigned timeout, int qid, int at_head, - blk_mq_req_flags_t flags) + blk_mq_req_flags_t flags, bool poll) { struct request *req; int ret; @@ -715,7 +773,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, goto out; } - blk_execute_rq(req->q, NULL, req, at_head); + if (poll) + nvme_execute_rq_polled(req->q, NULL, req, at_head); + else + blk_execute_rq(req->q, NULL, req, at_head); if (result) *result = nvme_req(req)->result; if (nvme_req(req)->flags & NVME_REQ_CANCELLED) @@ -732,7 +793,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buffer, unsigned bufflen) { return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0, - NVME_QID_ANY, 0, 0); + NVME_QID_ANY, 0, 0, false); } EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); @@ -843,6 +904,7 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) return; } + ctrl->comp_seen = false; spin_lock_irqsave(&ctrl->lock, flags); if (ctrl->state == NVME_CTRL_LIVE || ctrl->state == NVME_CTRL_CONNECTING) @@ -873,6 +935,15 @@ static void nvme_keep_alive_work(struct work_struct *work) { struct nvme_ctrl *ctrl = container_of(to_delayed_work(work), struct nvme_ctrl, ka_work); + bool comp_seen = ctrl->comp_seen; + + if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) { + dev_dbg(ctrl->device, + "reschedule traffic based keep-alive timer\n"); + ctrl->comp_seen = false; + schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); + return; + } if (nvme_keep_alive(ctrl)) { /* allocation failure, reset the controller */ @@ -1041,7 +1112,7 @@ static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword c.features.dword11 = cpu_to_le32(dword11); ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res, - buffer, buflen, 0, NVME_QID_ANY, 0, 0); + buffer, buflen, 0, NVME_QID_ANY, 0, 0, false); if (ret >= 0 && result) *result = le32_to_cpu(res.u32); return ret; @@ -1240,12 +1311,12 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, c.common.nsid = cpu_to_le32(cmd.nsid); c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); - c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); - c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); - c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); - c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); - c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); - c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); + c.common.cdw10 = cpu_to_le32(cmd.cdw10); + c.common.cdw11 = cpu_to_le32(cmd.cdw11); + c.common.cdw12 = cpu_to_le32(cmd.cdw12); + c.common.cdw13 = cpu_to_le32(cmd.cdw13); + c.common.cdw14 = cpu_to_le32(cmd.cdw14); + c.common.cdw15 = cpu_to_le32(cmd.cdw15); if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); @@ -1524,8 +1595,6 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) if (ns->noiob) nvme_set_chunk_size(ns); nvme_update_disk_info(disk, ns, id); - if (ns->ndev) - nvme_nvm_update_nvm_info(ns); #ifdef CONFIG_NVME_MULTIPATH if (ns->head->disk) { nvme_update_disk_info(ns->head->disk, ns, id); @@ -1608,7 +1677,7 @@ static int nvme_pr_command(struct block_device *bdev, u32 cdw10, memset(&c, 0, sizeof(c)); c.common.opcode = op; c.common.nsid = cpu_to_le32(ns->head->ns_id); - 
c.common.cdw10[0] = cpu_to_le32(cdw10); + c.common.cdw10 = cpu_to_le32(cdw10); ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16); nvme_put_ns_from_disk(head, srcu_idx); @@ -1682,11 +1751,11 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, else cmd.common.opcode = nvme_admin_security_recv; cmd.common.nsid = 0; - cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8); - cmd.common.cdw10[1] = cpu_to_le32(len); + cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8); + cmd.common.cdw11 = cpu_to_le32(len); return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, - ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0); + ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0, false); } EXPORT_SYMBOL_GPL(nvme_sec_submit); #endif /* CONFIG_BLK_SED_OPAL */ @@ -1881,6 +1950,26 @@ static int nvme_configure_timestamp(struct nvme_ctrl *ctrl) return ret; } +static int nvme_configure_acre(struct nvme_ctrl *ctrl) +{ + struct nvme_feat_host_behavior *host; + int ret; + + /* Don't bother enabling the feature if retry delay is not reported */ + if (!ctrl->crdt[0]) + return 0; + + host = kzalloc(sizeof(*host), GFP_KERNEL); + if (!host) + return 0; + + host->acre = NVME_ENABLE_ACRE; + ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0, + host, sizeof(*host), NULL); + kfree(host); + return ret; +} + static int nvme_configure_apst(struct nvme_ctrl *ctrl) { /* @@ -2402,6 +2491,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS; } + ctrl->crdt[0] = le16_to_cpu(id->crdt1); + ctrl->crdt[1] = le16_to_cpu(id->crdt2); + ctrl->crdt[2] = le16_to_cpu(id->crdt3); + ctrl->oacs = le16_to_cpu(id->oacs); ctrl->oncs = le16_to_cpup(&id->oncs); ctrl->oaes = le32_to_cpu(id->oaes); @@ -2419,6 +2512,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->sgls = le32_to_cpu(id->sgls); ctrl->kas = le16_to_cpu(id->kas); ctrl->max_namespaces = le32_to_cpu(id->mnan); + ctrl->ctratt = le32_to_cpu(id->ctratt); if (id->rtd3e) { /* us -> s */ @@ -2501,6 +2595,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) if (ret < 0) return ret; + ret = nvme_configure_acre(ctrl); + if (ret < 0) + return ret; + ctrl->identified = true; return 0; @@ -2776,6 +2874,7 @@ static ssize_t field##_show(struct device *dev, \ static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); nvme_show_int_function(cntlid); +nvme_show_int_function(numa_node); static ssize_t nvme_sysfs_delete(struct device *dev, struct device_attribute *attr, const char *buf, @@ -2855,6 +2954,7 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_subsysnqn.attr, &dev_attr_address.attr, &dev_attr_state.attr, + &dev_attr_numa_node.attr, NULL }; @@ -3065,7 +3165,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) struct gendisk *disk; struct nvme_id_ns *id; char disk_name[DISK_NAME_LEN]; - int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT; + int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT; ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) @@ -3100,13 +3200,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) nvme_setup_streams_ns(ctrl, ns); nvme_set_disk_name(disk_name, ns, ctrl, &flags); - if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { - if (nvme_nvm_register(ns, disk_name, node)) { - dev_warn(ctrl->device, "LightNVM init failure\n"); - goto out_unlink_ns; - } - } - disk = alloc_disk_node(0, node); if (!disk) goto out_unlink_ns; @@ -3120,6 +3213,13 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) 
__nvme_revalidate_disk(disk, id); + if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { + if (nvme_nvm_register(ns, disk_name, node)) { + dev_warn(ctrl->device, "LightNVM init failure\n"); + goto out_put_disk; + } + } + down_write(&ctrl->namespaces_rwsem); list_add_tail(&ns->list, &ctrl->namespaces); up_write(&ctrl->namespaces_rwsem); @@ -3133,6 +3233,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) kfree(id); return; + out_put_disk: + put_disk(ns->disk); out_unlink_ns: mutex_lock(&ctrl->subsys->lock); list_del_rcu(&ns->siblings); @@ -3522,6 +3624,7 @@ static void nvme_free_ctrl(struct device *dev) ida_simple_remove(&nvme_instance_ida, ctrl->instance); kfree(ctrl->effects); nvme_mpath_uninit(ctrl); + __free_page(ctrl->discard_page); if (subsys) { mutex_lock(&subsys->lock); @@ -3562,6 +3665,14 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; + BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) > + PAGE_SIZE); + ctrl->discard_page = alloc_page(GFP_KERNEL); + if (!ctrl->discard_page) { + ret = -ENOMEM; + goto out; + } + ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); if (ret < 0) goto out; @@ -3599,6 +3710,8 @@ out_free_name: out_release_instance: ida_simple_remove(&nvme_instance_ida, ctrl->instance); out: + if (ctrl->discard_page) + __free_page(ctrl->discard_page); return ret; } EXPORT_SYMBOL_GPL(nvme_init_ctrl); @@ -3746,7 +3859,7 @@ out: return result; } -void nvme_core_exit(void) +void __exit nvme_core_exit(void) { ida_destroy(&nvme_subsystems_ida); class_destroy(nvme_subsys_class); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index bd0969db6225..b2ab213f43de 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -159,7 +159,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) cmd.prop_get.offset = cpu_to_le32(off); ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0, - NVME_QID_ANY, 0, 0); + NVME_QID_ANY, 0, 0, false); if (ret >= 0) *val = le64_to_cpu(res.u64); @@ -206,7 +206,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) cmd.prop_get.offset = cpu_to_le32(off); ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0, - NVME_QID_ANY, 0, 0); + NVME_QID_ANY, 0, 0, false); if (ret >= 0) *val = le64_to_cpu(res.u64); @@ -252,7 +252,7 @@ int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) cmd.prop_set.value = cpu_to_le64(val); ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, NULL, 0, 0, - NVME_QID_ANY, 0, 0); + NVME_QID_ANY, 0, 0, false); if (unlikely(ret)) dev_err(ctrl->device, "Property Set error: %d, offset %#x\n", @@ -392,6 +392,9 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) cmd.connect.kato = ctrl->opts->discovery_nqn ? 
0 : cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000); + if (ctrl->opts->disable_sqflow) + cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW; + data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; @@ -403,7 +406,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, data, sizeof(*data), 0, NVME_QID_ANY, 1, - BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); + BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT, false); if (ret) { nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32), &cmd, data); @@ -438,7 +441,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_admin_queue); * > 0: NVMe error status code * < 0: Linux errno error code */ -int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) +int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid, bool poll) { struct nvme_command cmd; struct nvmf_connect_data *data; @@ -451,6 +454,9 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) cmd.connect.qid = cpu_to_le16(qid); cmd.connect.sqsize = cpu_to_le16(ctrl->sqsize); + if (ctrl->opts->disable_sqflow) + cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW; + data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; @@ -462,7 +468,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &res, data, sizeof(*data), 0, qid, 1, - BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); + BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT, poll); if (ret) { nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32), &cmd, data); @@ -607,6 +613,11 @@ static const match_table_t opt_tokens = { { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" }, { NVMF_OPT_HOST_ID, "hostid=%s" }, { NVMF_OPT_DUP_CONNECT, "duplicate_connect" }, + { NVMF_OPT_DISABLE_SQFLOW, "disable_sqflow" }, + { NVMF_OPT_HDR_DIGEST, "hdr_digest" }, + { NVMF_OPT_DATA_DIGEST, "data_digest" }, + { NVMF_OPT_NR_WRITE_QUEUES, "nr_write_queues=%d" }, + { NVMF_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" }, { NVMF_OPT_ERR, NULL } }; @@ -626,6 +637,8 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY; opts->kato = NVME_DEFAULT_KATO; opts->duplicate_connect = false; + opts->hdr_digest = false; + opts->data_digest = false; options = o = kstrdup(buf, GFP_KERNEL); if (!options) @@ -817,6 +830,39 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, case NVMF_OPT_DUP_CONNECT: opts->duplicate_connect = true; break; + case NVMF_OPT_DISABLE_SQFLOW: + opts->disable_sqflow = true; + break; + case NVMF_OPT_HDR_DIGEST: + opts->hdr_digest = true; + break; + case NVMF_OPT_DATA_DIGEST: + opts->data_digest = true; + break; + case NVMF_OPT_NR_WRITE_QUEUES: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token <= 0) { + pr_err("Invalid nr_write_queues %d\n", token); + ret = -EINVAL; + goto out; + } + opts->nr_write_queues = token; + break; + case NVMF_OPT_NR_POLL_QUEUES: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token <= 0) { + pr_err("Invalid nr_poll_queues %d\n", token); + ret = -EINVAL; + goto out; + } + opts->nr_poll_queues = token; + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); @@ -933,7 +979,8 @@ EXPORT_SYMBOL_GPL(nvmf_free_options); #define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN) #define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \ NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \ - NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT) + NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\ + 
NVMF_OPT_DISABLE_SQFLOW) static struct nvme_ctrl * nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 6ea6275f332a..478343b73e38 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -58,6 +58,11 @@ enum { NVMF_OPT_CTRL_LOSS_TMO = 1 << 11, NVMF_OPT_HOST_ID = 1 << 12, NVMF_OPT_DUP_CONNECT = 1 << 13, + NVMF_OPT_DISABLE_SQFLOW = 1 << 14, + NVMF_OPT_HDR_DIGEST = 1 << 15, + NVMF_OPT_DATA_DIGEST = 1 << 16, + NVMF_OPT_NR_WRITE_QUEUES = 1 << 17, + NVMF_OPT_NR_POLL_QUEUES = 1 << 18, }; /** @@ -85,6 +90,11 @@ enum { * @max_reconnects: maximum number of allowed reconnect attempts before removing * the controller, (-1) means reconnect forever, zero means remove * immediately; + * @disable_sqflow: disable controller sq flow control + * @hdr_digest: generate/verify header digest (TCP) + * @data_digest: generate/verify data digest (TCP) + * @nr_write_queues: number of queues for write I/O + * @nr_poll_queues: number of queues for polling I/O */ struct nvmf_ctrl_options { unsigned mask; @@ -101,6 +111,11 @@ struct nvmf_ctrl_options { unsigned int kato; struct nvmf_host *host; int max_reconnects; + bool disable_sqflow; + bool hdr_digest; + bool data_digest; + unsigned int nr_write_queues; + unsigned int nr_poll_queues; }; /* @@ -156,7 +171,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val); int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val); int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val); int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl); -int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid); +int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid, bool poll); int nvmf_register_transport(struct nvmf_transport_ops *ops); void nvmf_unregister_transport(struct nvmf_transport_ops *ops); void nvmf_free_options(struct nvmf_ctrl_options *opts); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index feb86b59170e..89accc76d71c 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1975,7 +1975,7 @@ nvme_fc_connect_io_queues(struct nvme_fc_ctrl *ctrl, u16 qsize) (qsize / 5)); if (ret) break; - ret = nvmf_connect_io_queue(&ctrl->ctrl, i); + ret = nvmf_connect_io_queue(&ctrl->ctrl, i, false); if (ret) break; @@ -2326,38 +2326,6 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, return nvme_fc_start_fcp_op(ctrl, queue, op, data_len, io_dir); } -static struct blk_mq_tags * -nvme_fc_tagset(struct nvme_fc_queue *queue) -{ - if (queue->qnum == 0) - return queue->ctrl->admin_tag_set.tags[queue->qnum]; - - return queue->ctrl->tag_set.tags[queue->qnum - 1]; -} - -static int -nvme_fc_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) - -{ - struct nvme_fc_queue *queue = hctx->driver_data; - struct nvme_fc_ctrl *ctrl = queue->ctrl; - struct request *req; - struct nvme_fc_fcp_op *op; - - req = blk_mq_tag_to_rq(nvme_fc_tagset(queue), tag); - if (!req) - return 0; - - op = blk_mq_rq_to_pdu(req); - - if ((atomic_read(&op->state) == FCPOP_STATE_ACTIVE) && - (ctrl->lport->ops->poll_queue)) - ctrl->lport->ops->poll_queue(&ctrl->lport->localport, - queue->lldd_handle); - - return ((atomic_read(&op->state) != FCPOP_STATE_ACTIVE)); -} - static void nvme_fc_submit_async_event(struct nvme_ctrl *arg) { @@ -2410,7 +2378,7 @@ nvme_fc_complete_rq(struct request *rq) * status. The done path will return the io request back to the block * layer with an error status. 
*/ -static void +static bool nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved) { struct nvme_ctrl *nctrl = data; @@ -2418,6 +2386,7 @@ nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved) struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req); __nvme_fc_abort_op(ctrl, op); + return true; } @@ -2427,7 +2396,6 @@ static const struct blk_mq_ops nvme_fc_mq_ops = { .init_request = nvme_fc_init_request, .exit_request = nvme_fc_exit_request, .init_hctx = nvme_fc_init_hctx, - .poll = nvme_fc_poll, .timeout = nvme_fc_timeout, }; @@ -2457,7 +2425,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) ctrl->tag_set.ops = &nvme_fc_mq_ops; ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size; ctrl->tag_set.reserved_tags = 1; /* fabric connect */ - ctrl->tag_set.numa_node = NUMA_NO_NODE; + ctrl->tag_set.numa_node = ctrl->ctrl.numa_node; ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; ctrl->tag_set.cmd_size = struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, @@ -3050,6 +3018,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->ctrl.opts = opts; ctrl->ctrl.nr_reconnects = 0; + ctrl->ctrl.numa_node = dev_to_node(lport->dev); INIT_LIST_HEAD(&ctrl->ctrl_list); ctrl->lport = lport; ctrl->rport = rport; @@ -3090,7 +3059,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops; ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */ - ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; + ctrl->admin_tag_set.numa_node = ctrl->ctrl.numa_node; ctrl->admin_tag_set.cmd_size = struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, ctrl->lport->ops->fcprqst_priv_sz); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index a4f3b263cd6c..b759c25c89c8 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -577,7 +577,8 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev, struct ppa_addr ppa; size_t left = nchks * sizeof(struct nvme_nvm_chk_meta); size_t log_pos, offset, len; - int ret, i, max_len; + int i, max_len; + int ret = 0; /* * limit requests to maximum 256K to avoid issuing arbitrary large @@ -731,11 +732,12 @@ static int nvme_nvm_submit_io_sync(struct nvm_dev *dev, struct nvm_rq *rqd) return ret; } -static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name) +static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name, + int size) { struct nvme_ns *ns = nvmdev->q->queuedata; - return dma_pool_create(name, ns->ctrl->dev, PAGE_SIZE, PAGE_SIZE, 0); + return dma_pool_create(name, ns->ctrl->dev, size, PAGE_SIZE, 0); } static void nvme_nvm_destroy_dma_pool(void *pool) @@ -935,9 +937,9 @@ static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin, /* cdw11-12 */ c.ph_rw.length = cpu_to_le16(vcmd.nppas); c.ph_rw.control = cpu_to_le16(vcmd.control); - c.common.cdw10[3] = cpu_to_le32(vcmd.cdw13); - c.common.cdw10[4] = cpu_to_le32(vcmd.cdw14); - c.common.cdw10[5] = cpu_to_le32(vcmd.cdw15); + c.common.cdw13 = cpu_to_le32(vcmd.cdw13); + c.common.cdw14 = cpu_to_le32(vcmd.cdw14); + c.common.cdw15 = cpu_to_le32(vcmd.cdw15); if (vcmd.timeout_ms) timeout = msecs_to_jiffies(vcmd.timeout_ms); @@ -972,22 +974,11 @@ int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg) } } -void nvme_nvm_update_nvm_info(struct nvme_ns *ns) -{ - struct nvm_dev *ndev = ns->ndev; - struct nvm_geo *geo = &ndev->geo; - - if (geo->version == 
NVM_OCSSD_SPEC_12) - return; - - geo->csecs = 1 << ns->lba_shift; - geo->sos = ns->ms; -} - int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) { struct request_queue *q = ns->queue; struct nvm_dev *dev; + struct nvm_geo *geo; _nvme_nvm_check_size(); @@ -995,6 +986,12 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) if (!dev) return -ENOMEM; + /* Note that csecs and sos will be overridden if it is a 1.2 drive. */ + geo = &dev->geo; + geo->csecs = 1 << ns->lba_shift; + geo->sos = ns->ms; + geo->ext = ns->ext; + dev->q = q; memcpy(dev->name, disk_name, DISK_NAME_LEN); dev->ops = &nvme_nvm_dev_ops; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 9901afd804ce..183ec17ba067 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -141,7 +141,7 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) test_bit(NVME_NS_ANA_PENDING, &ns->flags)) continue; - distance = node_distance(node, dev_to_node(ns->ctrl->dev)); + distance = node_distance(node, ns->ctrl->numa_node); switch (ns->ana_state) { case NVME_ANA_OPTIMIZED: @@ -220,21 +220,6 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q, return ret; } -static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc) -{ - struct nvme_ns_head *head = q->queuedata; - struct nvme_ns *ns; - bool found = false; - int srcu_idx; - - srcu_idx = srcu_read_lock(&head->srcu); - ns = srcu_dereference(head->current_path[numa_node_id()], &head->srcu); - if (likely(ns && nvme_path_is_optimized(ns))) - found = ns->queue->poll_fn(q, qc); - srcu_read_unlock(&head->srcu, srcu_idx); - return found; -} - static void nvme_requeue_work(struct work_struct *work) { struct nvme_ns_head *head = @@ -276,12 +261,11 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath) return 0; - q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL); + q = blk_alloc_queue_node(GFP_KERNEL, ctrl->numa_node); if (!q) goto out; q->queuedata = head; blk_queue_make_request(q, nvme_ns_head_make_request); - q->poll_fn = nvme_ns_head_poll; blk_queue_flag_set(QUEUE_FLAG_NONROT, q); /* set to a default value for 512 until disk is validated */ blk_queue_logical_block_size(q, 512); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 081cbdcce880..2b36ac922596 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -145,6 +145,7 @@ enum nvme_ctrl_state { }; struct nvme_ctrl { + bool comp_seen; enum nvme_ctrl_state state; bool identified; spinlock_t lock; @@ -153,6 +154,7 @@ struct nvme_ctrl { struct request_queue *connect_q; struct device *dev; int instance; + int numa_node; struct blk_mq_tag_set *tagset; struct blk_mq_tag_set *admin_tagset; struct list_head namespaces; @@ -179,6 +181,7 @@ struct nvme_ctrl { u32 page_size; u32 max_hw_sectors; u32 max_segments; + u16 crdt[3]; u16 oncs; u16 oacs; u16 nssa; @@ -193,6 +196,7 @@ struct nvme_ctrl { u8 apsta; u32 oaes; u32 aen_result; + u32 ctratt; unsigned int shutdown_timeout; unsigned int kato; bool subsystem; @@ -237,6 +241,9 @@ struct nvme_ctrl { u16 maxcmd; int nr_reconnects; struct nvmf_ctrl_options *opts; + + struct page *discard_page; + unsigned long discard_page_busy; }; struct nvme_subsystem { @@ -364,15 +371,6 @@ static inline void nvme_fault_inject_fini(struct nvme_ns *ns) {} static inline void nvme_should_fail(struct request *req) {} #endif -static inline bool nvme_ctrl_ready(struct nvme_ctrl 
*ctrl) -{ - u32 val = 0; - - if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val)) - return false; - return val & NVME_CSTS_RDY; -} - static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl) { if (!ctrl->subsystem) @@ -408,7 +406,7 @@ static inline void nvme_put_ctrl(struct nvme_ctrl *ctrl) } void nvme_complete_rq(struct request *req); -void nvme_cancel_request(struct request *req, void *data, bool reserved); +bool nvme_cancel_request(struct request *req, void *data, bool reserved); bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, enum nvme_ctrl_state new_state); int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap); @@ -449,7 +447,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, unsigned timeout, int qid, int at_head, - blk_mq_req_flags_t flags); + blk_mq_req_flags_t flags, bool poll); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl); @@ -545,13 +543,11 @@ static inline void nvme_mpath_stop(struct nvme_ctrl *ctrl) #endif /* CONFIG_NVME_MULTIPATH */ #ifdef CONFIG_NVM -void nvme_nvm_update_nvm_info(struct nvme_ns *ns); int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); void nvme_nvm_unregister(struct nvme_ns *ns); extern const struct attribute_group nvme_nvm_attr_group; int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg); #else -static inline void nvme_nvm_update_nvm_info(struct nvme_ns *ns) {}; static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) { @@ -572,6 +568,6 @@ static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) } int __init nvme_core_init(void); -void nvme_core_exit(void); +void __exit nvme_core_exit(void); #endif /* _NVME_H */ diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c33bb201b884..5a0bf6a24d50 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -32,6 +32,7 @@ #include <linux/sed-opal.h> #include <linux/pci-p2pdma.h> +#include "trace.h" #include "nvme.h" #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) @@ -74,6 +75,22 @@ static int io_queue_depth = 1024; module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); +static int queue_count_set(const char *val, const struct kernel_param *kp); +static const struct kernel_param_ops queue_count_ops = { + .set = queue_count_set, + .get = param_get_int, +}; + +static int write_queues; +module_param_cb(write_queues, &queue_count_ops, &write_queues, 0644); +MODULE_PARM_DESC(write_queues, + "Number of queues to use for writes. 
If not set, reads and writes " + "will share a queue set."); + +static int poll_queues = 0; +module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644); +MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO."); + struct nvme_dev; struct nvme_queue; @@ -92,6 +109,7 @@ struct nvme_dev { struct dma_pool *prp_small_pool; unsigned online_queues; unsigned max_qid; + unsigned io_queues[HCTX_MAX_TYPES]; unsigned int num_vecs; int q_depth; u32 db_stride; @@ -105,7 +123,6 @@ struct nvme_dev { u32 cmbsz; u32 cmbloc; struct nvme_ctrl ctrl; - struct completion ioq_wait; mempool_t *iod_mempool; @@ -134,6 +151,17 @@ static int io_queue_depth_set(const char *val, const struct kernel_param *kp) return param_set_int(val, kp); } +static int queue_count_set(const char *val, const struct kernel_param *kp) +{ + int n = 0, ret; + + ret = kstrtoint(val, 10, &n); + if (n > num_possible_cpus()) + n = num_possible_cpus(); + + return param_set_int(val, kp); +} + static inline unsigned int sq_idx(unsigned int qid, u32 stride) { return qid * 2 * stride; @@ -158,8 +186,8 @@ struct nvme_queue { struct nvme_dev *dev; spinlock_t sq_lock; struct nvme_command *sq_cmds; - bool sq_cmds_is_io; - spinlock_t cq_lock ____cacheline_aligned_in_smp; + /* only used for poll queues: */ + spinlock_t cq_poll_lock ____cacheline_aligned_in_smp; volatile struct nvme_completion *cqes; struct blk_mq_tags **tags; dma_addr_t sq_dma_addr; @@ -168,14 +196,20 @@ struct nvme_queue { u16 q_depth; s16 cq_vector; u16 sq_tail; + u16 last_sq_tail; u16 cq_head; u16 last_cq_head; u16 qid; u8 cq_phase; + unsigned long flags; +#define NVMEQ_ENABLED 0 +#define NVMEQ_SQ_CMB 1 +#define NVMEQ_DELETE_ERROR 2 u32 *dbbuf_sq_db; u32 *dbbuf_cq_db; u32 *dbbuf_sq_ei; u32 *dbbuf_cq_ei; + struct completion delete_done; }; /* @@ -218,9 +252,20 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); } +static unsigned int max_io_queues(void) +{ + return num_possible_cpus() + write_queues + poll_queues; +} + +static unsigned int max_queue_count(void) +{ + /* IO queues + admin queue */ + return 1 + max_io_queues(); +} + static inline unsigned int nvme_dbbuf_size(u32 stride) { - return ((num_possible_cpus() + 1) * 8 * stride); + return (max_queue_count() * 8 * stride); } static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev) @@ -431,30 +476,90 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, return 0; } +static int queue_irq_offset(struct nvme_dev *dev) +{ + /* if we have more than 1 vec, admin queue offsets us by 1 */ + if (dev->num_vecs > 1) + return 1; + + return 0; +} + static int nvme_pci_map_queues(struct blk_mq_tag_set *set) { struct nvme_dev *dev = set->driver_data; + int i, qoff, offset; + + offset = queue_irq_offset(dev); + for (i = 0, qoff = 0; i < set->nr_maps; i++) { + struct blk_mq_queue_map *map = &set->map[i]; + + map->nr_queues = dev->io_queues[i]; + if (!map->nr_queues) { + BUG_ON(i == HCTX_TYPE_DEFAULT); + continue; + } - return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev), - dev->num_vecs > 1 ? 1 /* admin queue */ : 0); + /* + * The poll queue(s) doesn't have an IRQ (and hence IRQ + * affinity), so use the regular blk-mq cpu mapping + */ + map->queue_offset = qoff; + if (i != HCTX_TYPE_POLL) + blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset); + else + blk_mq_map_queues(map); + qoff += map->nr_queues; + offset += map->nr_queues; + } + + return 0; +} + +/* + * Write sq tail if we are asked to, or if the next command would wrap. 
+ */ +static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq) +{ + if (!write_sq) { + u16 next_tail = nvmeq->sq_tail + 1; + + if (next_tail == nvmeq->q_depth) + next_tail = 0; + if (next_tail != nvmeq->last_sq_tail) + return; + } + + if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail, + nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei)) + writel(nvmeq->sq_tail, nvmeq->q_db); + nvmeq->last_sq_tail = nvmeq->sq_tail; } /** * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell * @nvmeq: The queue to use * @cmd: The command to send + * @write_sq: whether to write to the SQ doorbell */ -static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) +static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, + bool write_sq) { spin_lock(&nvmeq->sq_lock); - memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd)); - if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; - if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail, - nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei)) - writel(nvmeq->sq_tail, nvmeq->q_db); + nvme_write_sq_db(nvmeq, write_sq); + spin_unlock(&nvmeq->sq_lock); +} + +static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + struct nvme_queue *nvmeq = hctx->driver_data; + + spin_lock(&nvmeq->sq_lock); + if (nvmeq->sq_tail != nvmeq->last_sq_tail) + nvme_write_sq_db(nvmeq, true); spin_unlock(&nvmeq->sq_lock); } @@ -822,7 +927,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, * We should not need to do this, but we're still using this to * ensure we can drain requests on a dying queue. */ - if (unlikely(nvmeq->cq_vector < 0)) + if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags))) return BLK_STS_IOERR; ret = nvme_setup_cmd(ns, req, &cmnd); @@ -840,7 +945,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, } blk_mq_start_request(req); - nvme_submit_cmd(nvmeq, &cmnd); + nvme_submit_cmd(nvmeq, &cmnd, bd->last); return BLK_STS_OK; out_cleanup_iod: nvme_free_iod(dev, req); @@ -899,6 +1004,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) } req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id); + trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail); nvme_end_request(req, cqe->status, cqe->result); } @@ -919,15 +1025,15 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) } } -static inline bool nvme_process_cq(struct nvme_queue *nvmeq, u16 *start, - u16 *end, int tag) +static inline int nvme_process_cq(struct nvme_queue *nvmeq, u16 *start, + u16 *end, unsigned int tag) { - bool found = false; + int found = 0; *start = nvmeq->cq_head; - while (!found && nvme_cqe_pending(nvmeq)) { - if (nvmeq->cqes[nvmeq->cq_head].command_id == tag) - found = true; + while (nvme_cqe_pending(nvmeq)) { + if (tag == -1U || nvmeq->cqes[nvmeq->cq_head].command_id == tag) + found++; nvme_update_cq_head(nvmeq); } *end = nvmeq->cq_head; @@ -943,12 +1049,16 @@ static irqreturn_t nvme_irq(int irq, void *data) irqreturn_t ret = IRQ_NONE; u16 start, end; - spin_lock(&nvmeq->cq_lock); + /* + * The rmb/wmb pair ensures we see all updates from a previous run of + * the irq handler, even if that was on another CPU. 
+ */ + rmb(); if (nvmeq->cq_head != nvmeq->last_cq_head) ret = IRQ_HANDLED; nvme_process_cq(nvmeq, &start, &end, -1); nvmeq->last_cq_head = nvmeq->cq_head; - spin_unlock(&nvmeq->cq_lock); + wmb(); if (start != end) { nvme_complete_cqes(nvmeq, start, end); @@ -966,27 +1076,50 @@ static irqreturn_t nvme_irq_check(int irq, void *data) return IRQ_NONE; } -static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag) +/* + * Poll for completions any queue, including those not dedicated to polling. + * Can be called from any context. + */ +static int nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag) { + struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev); u16 start, end; - bool found; + int found; - if (!nvme_cqe_pending(nvmeq)) - return 0; - - spin_lock_irq(&nvmeq->cq_lock); - found = nvme_process_cq(nvmeq, &start, &end, tag); - spin_unlock_irq(&nvmeq->cq_lock); + /* + * For a poll queue we need to protect against the polling thread + * using the CQ lock. For normal interrupt driven threads we have + * to disable the interrupt to avoid racing with it. + */ + if (nvmeq->cq_vector == -1) { + spin_lock(&nvmeq->cq_poll_lock); + found = nvme_process_cq(nvmeq, &start, &end, tag); + spin_unlock(&nvmeq->cq_poll_lock); + } else { + disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); + found = nvme_process_cq(nvmeq, &start, &end, tag); + enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); + } nvme_complete_cqes(nvmeq, start, end); return found; } -static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) +static int nvme_poll(struct blk_mq_hw_ctx *hctx) { struct nvme_queue *nvmeq = hctx->driver_data; + u16 start, end; + bool found; + + if (!nvme_cqe_pending(nvmeq)) + return 0; + + spin_lock(&nvmeq->cq_poll_lock); + found = nvme_process_cq(nvmeq, &start, &end, -1); + spin_unlock(&nvmeq->cq_poll_lock); - return __nvme_poll(nvmeq, tag); + nvme_complete_cqes(nvmeq, start, end); + return found; } static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) @@ -998,7 +1131,7 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) memset(&c, 0, sizeof(c)); c.common.opcode = nvme_admin_async_event; c.common.command_id = NVME_AQ_BLK_MQ_DEPTH; - nvme_submit_cmd(nvmeq, &c); + nvme_submit_cmd(nvmeq, &c, true); } static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) @@ -1016,7 +1149,10 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, struct nvme_queue *nvmeq, s16 vector) { struct nvme_command c; - int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; + int flags = NVME_QUEUE_PHYS_CONTIG; + + if (vector != -1) + flags |= NVME_CQ_IRQ_ENABLED; /* * Note: we (ab)use the fact that the prp fields survive if no data @@ -1028,7 +1164,10 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, c.create_cq.cqid = cpu_to_le16(qid); c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); c.create_cq.cq_flags = cpu_to_le16(flags); - c.create_cq.irq_vector = cpu_to_le16(vector); + if (vector != -1) + c.create_cq.irq_vector = cpu_to_le16(vector); + else + c.create_cq.irq_vector = 0; return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); } @@ -1157,7 +1296,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) /* * Did we miss an interrupt? 
*/ - if (__nvme_poll(nvmeq, req->tag)) { + if (nvme_poll_irqdisable(nvmeq, req->tag)) { dev_warn(dev->ctrl.device, "I/O %d QID %d timeout, completion polled\n", req->tag, nvmeq->qid); @@ -1237,17 +1376,15 @@ static void nvme_free_queue(struct nvme_queue *nvmeq) { dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); + if (!nvmeq->sq_cmds) + return; - if (nvmeq->sq_cmds) { - if (nvmeq->sq_cmds_is_io) - pci_free_p2pmem(to_pci_dev(nvmeq->q_dmadev), - nvmeq->sq_cmds, - SQ_SIZE(nvmeq->q_depth)); - else - dma_free_coherent(nvmeq->q_dmadev, - SQ_SIZE(nvmeq->q_depth), - nvmeq->sq_cmds, - nvmeq->sq_dma_addr); + if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) { + pci_free_p2pmem(to_pci_dev(nvmeq->q_dmadev), + nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth)); + } else { + dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), + nvmeq->sq_cmds, nvmeq->sq_dma_addr); } } @@ -1267,47 +1404,32 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) */ static int nvme_suspend_queue(struct nvme_queue *nvmeq) { - int vector; - - spin_lock_irq(&nvmeq->cq_lock); - if (nvmeq->cq_vector == -1) { - spin_unlock_irq(&nvmeq->cq_lock); + if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags)) return 1; - } - vector = nvmeq->cq_vector; - nvmeq->dev->online_queues--; - nvmeq->cq_vector = -1; - spin_unlock_irq(&nvmeq->cq_lock); - /* - * Ensure that nvme_queue_rq() sees it ->cq_vector == -1 without - * having to grab the lock. - */ + /* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */ mb(); + nvmeq->dev->online_queues--; if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q); - - pci_free_irq(to_pci_dev(nvmeq->dev->dev), vector, nvmeq); - + if (nvmeq->cq_vector == -1) + return 0; + pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq); + nvmeq->cq_vector = -1; return 0; } static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) { struct nvme_queue *nvmeq = &dev->queues[0]; - u16 start, end; if (shutdown) nvme_shutdown_ctrl(&dev->ctrl); else nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); - spin_lock_irq(&nvmeq->cq_lock); - nvme_process_cq(nvmeq, &start, &end, -1); - spin_unlock_irq(&nvmeq->cq_lock); - - nvme_complete_cqes(nvmeq, start, end); + nvme_poll_irqdisable(nvmeq, -1); } static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, @@ -1343,15 +1465,14 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth)); nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev, nvmeq->sq_cmds); - nvmeq->sq_cmds_is_io = true; - } - - if (!nvmeq->sq_cmds) { - nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), - &nvmeq->sq_dma_addr, GFP_KERNEL); - nvmeq->sq_cmds_is_io = false; + if (nvmeq->sq_dma_addr) { + set_bit(NVMEQ_SQ_CMB, &nvmeq->flags); + return 0; + } } + nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), + &nvmeq->sq_dma_addr, GFP_KERNEL); if (!nvmeq->sq_cmds) return -ENOMEM; return 0; @@ -1375,7 +1496,7 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth) nvmeq->q_dmadev = dev->dev; nvmeq->dev = dev; spin_lock_init(&nvmeq->sq_lock); - spin_lock_init(&nvmeq->cq_lock); + spin_lock_init(&nvmeq->cq_poll_lock); nvmeq->cq_head = 0; nvmeq->cq_phase = 1; nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; @@ -1411,28 +1532,34 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) { struct nvme_dev *dev = nvmeq->dev; - spin_lock_irq(&nvmeq->cq_lock); nvmeq->sq_tail = 0; 
+ nvmeq->last_sq_tail = 0; nvmeq->cq_head = 0; nvmeq->cq_phase = 1; nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); nvme_dbbuf_init(dev, nvmeq, qid); dev->online_queues++; - spin_unlock_irq(&nvmeq->cq_lock); + wmb(); /* ensure the first interrupt sees the initialization */ } -static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) +static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled) { struct nvme_dev *dev = nvmeq->dev; int result; s16 vector; + clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags); + /* * A queue's vector matches the queue identifier unless the controller * has only one vector available. */ - vector = dev->num_vecs == 1 ? 0 : qid; + if (!polled) + vector = dev->num_vecs == 1 ? 0 : qid; + else + vector = -1; + result = adapter_alloc_cq(dev, qid, nvmeq, vector); if (result) return result; @@ -1443,17 +1570,16 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) else if (result) goto release_cq; - /* - * Set cq_vector after alloc cq/sq, otherwise nvme_suspend_queue will - * invoke free_irq for it and cause a 'Trying to free already-free IRQ - * xxx' warning if the create CQ/SQ command times out. - */ nvmeq->cq_vector = vector; nvme_init_queue(nvmeq, qid); - result = queue_request_irq(nvmeq); - if (result < 0) - goto release_sq; + if (vector != -1) { + result = queue_request_irq(nvmeq); + if (result < 0) + goto release_sq; + } + + set_bit(NVMEQ_ENABLED, &nvmeq->flags); return result; release_sq: @@ -1477,6 +1603,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = { static const struct blk_mq_ops nvme_mq_ops = { .queue_rq = nvme_queue_rq, .complete = nvme_pci_complete_rq, + .commit_rqs = nvme_commit_rqs, .init_hctx = nvme_init_hctx, .init_request = nvme_init_request, .map_queues = nvme_pci_map_queues, @@ -1602,12 +1729,13 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) return result; } + set_bit(NVMEQ_ENABLED, &nvmeq->flags); return result; } static int nvme_create_io_queues(struct nvme_dev *dev) { - unsigned i, max; + unsigned i, max, rw_queues; int ret = 0; for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) { @@ -1618,8 +1746,17 @@ static int nvme_create_io_queues(struct nvme_dev *dev) } max = min(dev->max_qid, dev->ctrl.queue_count - 1); + if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) { + rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] + + dev->io_queues[HCTX_TYPE_READ]; + } else { + rw_queues = max; + } + for (i = dev->online_queues; i <= max; i++) { - ret = nvme_create_queue(&dev->queues[i], i); + bool polled = i > rw_queues; + + ret = nvme_create_queue(&dev->queues[i], i, polled); if (ret) break; } @@ -1891,6 +2028,110 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) return ret; } +static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int irq_queues) +{ + unsigned int this_w_queues = write_queues; + + /* + * Setup read/write queue split + */ + if (irq_queues == 1) { + dev->io_queues[HCTX_TYPE_DEFAULT] = 1; + dev->io_queues[HCTX_TYPE_READ] = 0; + return; + } + + /* + * If 'write_queues' is set, ensure it leaves room for at least + * one read queue + */ + if (this_w_queues >= irq_queues) + this_w_queues = irq_queues - 1; + + /* + * If 'write_queues' is set to zero, reads and writes will share + * a queue set. 
+ */ + if (!this_w_queues) { + dev->io_queues[HCTX_TYPE_DEFAULT] = irq_queues; + dev->io_queues[HCTX_TYPE_READ] = 0; + } else { + dev->io_queues[HCTX_TYPE_DEFAULT] = this_w_queues; + dev->io_queues[HCTX_TYPE_READ] = irq_queues - this_w_queues; + } +} + +static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) +{ + struct pci_dev *pdev = to_pci_dev(dev->dev); + int irq_sets[2]; + struct irq_affinity affd = { + .pre_vectors = 1, + .nr_sets = ARRAY_SIZE(irq_sets), + .sets = irq_sets, + }; + int result = 0; + unsigned int irq_queues, this_p_queues; + + /* + * Poll queues don't need interrupts, but we need at least one IO + * queue left over for non-polled IO. + */ + this_p_queues = poll_queues; + if (this_p_queues >= nr_io_queues) { + this_p_queues = nr_io_queues - 1; + irq_queues = 1; + } else { + irq_queues = nr_io_queues - this_p_queues; + } + dev->io_queues[HCTX_TYPE_POLL] = this_p_queues; + + /* + * For irq sets, we have to ask for minvec == maxvec. This passes + * any reduction back to us, so we can adjust our queue counts and + * IRQ vector needs. + */ + do { + nvme_calc_io_queues(dev, irq_queues); + irq_sets[0] = dev->io_queues[HCTX_TYPE_DEFAULT]; + irq_sets[1] = dev->io_queues[HCTX_TYPE_READ]; + if (!irq_sets[1]) + affd.nr_sets = 1; + + /* + * If we got a failure and we're down to asking for just + * 1 + 1 queues, just ask for a single vector. We'll share + * that between the single IO queue and the admin queue. + */ + if (result >= 0 && irq_queues > 1) + irq_queues = irq_sets[0] + irq_sets[1] + 1; + + result = pci_alloc_irq_vectors_affinity(pdev, irq_queues, + irq_queues, + PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); + + /* + * Need to reduce our vec counts. If we get ENOSPC, the + * platform should support mulitple vecs, we just need + * to decrease our ask. If we get EINVAL, the platform + * likely does not. Back down to ask for just one vector. + */ + if (result == -ENOSPC) { + irq_queues--; + if (!irq_queues) + return result; + continue; + } else if (result == -EINVAL) { + irq_queues = 1; + continue; + } else if (result <= 0) + return -EIO; + break; + } while (1); + + return result; +} + static int nvme_setup_io_queues(struct nvme_dev *dev) { struct nvme_queue *adminq = &dev->queues[0]; @@ -1898,17 +2139,15 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) int result, nr_io_queues; unsigned long size; - struct irq_affinity affd = { - .pre_vectors = 1 - }; - - nr_io_queues = num_possible_cpus(); + nr_io_queues = max_io_queues(); result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); if (result < 0) return result; if (nr_io_queues == 0) return 0; + + clear_bit(NVMEQ_ENABLED, &adminq->flags); if (dev->cmb_use_sqes) { result = nvme_cmb_qdepth(dev, nr_io_queues, @@ -1937,12 +2176,19 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) * setting up the full range we need. 
*/ pci_free_irq_vectors(pdev); - result = pci_alloc_irq_vectors_affinity(pdev, 1, nr_io_queues + 1, - PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); + + result = nvme_setup_irqs(dev, nr_io_queues); if (result <= 0) return -EIO; + dev->num_vecs = result; - dev->max_qid = max(result - 1, 1); + result = max(result - 1, 1); + dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL]; + + dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n", + dev->io_queues[HCTX_TYPE_DEFAULT], + dev->io_queues[HCTX_TYPE_READ], + dev->io_queues[HCTX_TYPE_POLL]); /* * Should investigate if there's a performance win from allocating @@ -1956,6 +2202,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) adminq->cq_vector = -1; return result; } + set_bit(NVMEQ_ENABLED, &adminq->flags); return nvme_create_io_queues(dev); } @@ -1964,23 +2211,15 @@ static void nvme_del_queue_end(struct request *req, blk_status_t error) struct nvme_queue *nvmeq = req->end_io_data; blk_mq_free_request(req); - complete(&nvmeq->dev->ioq_wait); + complete(&nvmeq->delete_done); } static void nvme_del_cq_end(struct request *req, blk_status_t error) { struct nvme_queue *nvmeq = req->end_io_data; - u16 start, end; - - if (!error) { - unsigned long flags; - spin_lock_irqsave(&nvmeq->cq_lock, flags); - nvme_process_cq(nvmeq, &start, &end, -1); - spin_unlock_irqrestore(&nvmeq->cq_lock, flags); - - nvme_complete_cqes(nvmeq, start, end); - } + if (error) + set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags); nvme_del_queue_end(req, error); } @@ -2002,37 +2241,44 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) req->timeout = ADMIN_TIMEOUT; req->end_io_data = nvmeq; + init_completion(&nvmeq->delete_done); blk_execute_rq_nowait(q, NULL, req, false, opcode == nvme_admin_delete_cq ? nvme_del_cq_end : nvme_del_queue_end); return 0; } -static void nvme_disable_io_queues(struct nvme_dev *dev) +static bool nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode) { - int pass, queues = dev->online_queues - 1; + int nr_queues = dev->online_queues - 1, sent = 0; unsigned long timeout; - u8 opcode = nvme_admin_delete_sq; - - for (pass = 0; pass < 2; pass++) { - int sent = 0, i = queues; - reinit_completion(&dev->ioq_wait); retry: - timeout = ADMIN_TIMEOUT; - for (; i > 0; i--, sent++) - if (nvme_delete_queue(&dev->queues[i], opcode)) - break; - - while (sent--) { - timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout); - if (timeout == 0) - return; - if (i) - goto retry; - } - opcode = nvme_admin_delete_cq; + timeout = ADMIN_TIMEOUT; + while (nr_queues > 0) { + if (nvme_delete_queue(&dev->queues[nr_queues], opcode)) + break; + nr_queues--; + sent++; } + while (sent) { + struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent]; + + timeout = wait_for_completion_io_timeout(&nvmeq->delete_done, + timeout); + if (timeout == 0) + return false; + + /* handle any remaining CQEs */ + if (opcode == nvme_admin_delete_cq && + !test_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags)) + nvme_poll_irqdisable(nvmeq, -1); + + sent--; + if (nr_queues) + goto retry; + } + return true; } /* @@ -2045,6 +2291,10 @@ static int nvme_dev_add(struct nvme_dev *dev) if (!dev->ctrl.tagset) { dev->tagset.ops = &nvme_mq_ops; dev->tagset.nr_hw_queues = dev->online_queues - 1; + dev->tagset.nr_maps = 2; /* default + read */ + if (dev->io_queues[HCTX_TYPE_POLL]) + dev->tagset.nr_maps++; + dev->tagset.nr_maps = HCTX_MAX_TYPES; dev->tagset.timeout = NVME_IO_TIMEOUT; dev->tagset.numa_node = dev_to_node(dev->dev); dev->tagset.queue_depth = @@ -2187,7 +2437,8 @@ static void 
nvme_dev_disable(struct nvme_dev *dev, bool shutdown) nvme_stop_queues(&dev->ctrl); if (!dead && dev->ctrl.queue_count > 0) { - nvme_disable_io_queues(dev); + if (nvme_disable_io_queues(dev, nvme_admin_delete_sq)) + nvme_disable_io_queues(dev, nvme_admin_delete_cq); nvme_disable_admin_queue(dev, shutdown); } for (i = dev->ctrl.queue_count - 1; i >= 0; i--) @@ -2491,8 +2742,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (!dev) return -ENOMEM; - dev->queues = kcalloc_node(num_possible_cpus() + 1, - sizeof(struct nvme_queue), GFP_KERNEL, node); + dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue), + GFP_KERNEL, node); if (!dev->queues) goto free; @@ -2506,7 +2757,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); mutex_init(&dev->shutdown_lock); - init_completion(&dev->ioq_wait); result = nvme_setup_prp_pools(dev); if (result) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index ab6ec7295bf9..0a2fd2949ad7 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -162,6 +162,13 @@ static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue) return queue - queue->ctrl->queues; } +static bool nvme_rdma_poll_queue(struct nvme_rdma_queue *queue) +{ + return nvme_rdma_queue_idx(queue) > + queue->ctrl->ctrl.opts->nr_io_queues + + queue->ctrl->ctrl.opts->nr_write_queues; +} + static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue) { return queue->cmnd_capsule_len - sizeof(struct nvme_command); @@ -440,6 +447,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) const int send_wr_factor = 3; /* MR, SEND, INV */ const int cq_factor = send_wr_factor + 1; /* + RECV */ int comp_vector, idx = nvme_rdma_queue_idx(queue); + enum ib_poll_context poll_ctx; int ret; queue->device = nvme_rdma_find_get_device(queue->cm_id); @@ -456,10 +464,16 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) */ comp_vector = idx == 0 ? 
idx : idx - 1; + /* Polling queues need direct cq polling context */ + if (nvme_rdma_poll_queue(queue)) + poll_ctx = IB_POLL_DIRECT; + else + poll_ctx = IB_POLL_SOFTIRQ; + /* +1 for ib_stop_cq */ queue->ib_cq = ib_alloc_cq(ibdev, queue, cq_factor * queue->queue_size + 1, - comp_vector, IB_POLL_SOFTIRQ); + comp_vector, poll_ctx); if (IS_ERR(queue->ib_cq)) { ret = PTR_ERR(queue->ib_cq); goto out_put_dev; @@ -595,15 +609,17 @@ static void nvme_rdma_stop_io_queues(struct nvme_rdma_ctrl *ctrl) static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx) { + struct nvme_rdma_queue *queue = &ctrl->queues[idx]; + bool poll = nvme_rdma_poll_queue(queue); int ret; if (idx) - ret = nvmf_connect_io_queue(&ctrl->ctrl, idx); + ret = nvmf_connect_io_queue(&ctrl->ctrl, idx, poll); else ret = nvmf_connect_admin_queue(&ctrl->ctrl); if (!ret) - set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[idx].flags); + set_bit(NVME_RDMA_Q_LIVE, &queue->flags); else dev_info(ctrl->ctrl.device, "failed to connect queue: %d ret=%d\n", idx, ret); @@ -645,6 +661,9 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl) nr_io_queues = min_t(unsigned int, nr_io_queues, ibdev->num_comp_vectors); + nr_io_queues += min(opts->nr_write_queues, num_online_cpus()); + nr_io_queues += min(opts->nr_poll_queues, num_online_cpus()); + ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); if (ret) return ret; @@ -694,7 +713,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, set->ops = &nvme_rdma_admin_mq_ops; set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; set->reserved_tags = 2; /* connect + keep-alive */ - set->numa_node = NUMA_NO_NODE; + set->numa_node = nctrl->numa_node; set->cmd_size = sizeof(struct nvme_rdma_request) + SG_CHUNK_SIZE * sizeof(struct scatterlist); set->driver_data = ctrl; @@ -707,13 +726,14 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, set->ops = &nvme_rdma_mq_ops; set->queue_depth = nctrl->sqsize + 1; set->reserved_tags = 1; /* fabric connect */ - set->numa_node = NUMA_NO_NODE; + set->numa_node = nctrl->numa_node; set->flags = BLK_MQ_F_SHOULD_MERGE; set->cmd_size = sizeof(struct nvme_rdma_request) + SG_CHUNK_SIZE * sizeof(struct scatterlist); set->driver_data = ctrl; set->nr_hw_queues = nctrl->queue_count - 1; set->timeout = NVME_IO_TIMEOUT; + set->nr_maps = nctrl->opts->nr_poll_queues ? 
HCTX_MAX_TYPES : 2; } ret = blk_mq_alloc_tag_set(set); @@ -763,6 +783,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, return error; ctrl->device = ctrl->queues[0].device; + ctrl->ctrl.numa_node = dev_to_node(ctrl->device->dev->dma_device); ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev); @@ -1411,12 +1432,11 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg) WARN_ON_ONCE(ret); } -static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, - struct nvme_completion *cqe, struct ib_wc *wc, int tag) +static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, + struct nvme_completion *cqe, struct ib_wc *wc) { struct request *rq; struct nvme_rdma_request *req; - int ret = 0; rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id); if (!rq) { @@ -1424,7 +1444,7 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, "tag 0x%x on QP %#x not found\n", cqe->command_id, queue->qp->qp_num); nvme_rdma_error_recovery(queue->ctrl); - return ret; + return; } req = blk_mq_rq_to_pdu(rq); @@ -1439,6 +1459,8 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, nvme_rdma_error_recovery(queue->ctrl); } } else if (req->mr) { + int ret; + ret = nvme_rdma_inv_rkey(queue, req); if (unlikely(ret < 0)) { dev_err(queue->ctrl->ctrl.device, @@ -1447,19 +1469,14 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, nvme_rdma_error_recovery(queue->ctrl); } /* the local invalidation completion will end the request */ - return 0; + return; } - if (refcount_dec_and_test(&req->ref)) { - if (rq->tag == tag) - ret = 1; + if (refcount_dec_and_test(&req->ref)) nvme_end_request(rq, req->status, req->result); - } - - return ret; } -static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag) +static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvme_rdma_qe *qe = container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe); @@ -1467,11 +1484,10 @@ static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag) struct ib_device *ibdev = queue->device->dev; struct nvme_completion *cqe = qe->data; const size_t len = sizeof(struct nvme_completion); - int ret = 0; if (unlikely(wc->status != IB_WC_SUCCESS)) { nvme_rdma_wr_error(cq, wc, "RECV"); - return 0; + return; } ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE); @@ -1486,16 +1502,10 @@ static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag) nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, &cqe->result); else - ret = nvme_rdma_process_nvme_rsp(queue, cqe, wc, tag); + nvme_rdma_process_nvme_rsp(queue, cqe, wc); ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE); nvme_rdma_post_recv(queue, qe); - return ret; -} - -static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) -{ - __nvme_rdma_recv_done(cq, wc, -1); } static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue) @@ -1749,25 +1759,11 @@ err: return BLK_STS_IOERR; } -static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) +static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx) { struct nvme_rdma_queue *queue = hctx->driver_data; - struct ib_cq *cq = queue->ib_cq; - struct ib_wc wc; - int found = 0; - - while (ib_poll_cq(cq, 1, &wc) > 0) { - struct ib_cqe *cqe = wc.wr_cqe; - - if (cqe) { - if (cqe->done == nvme_rdma_recv_done) - found |= __nvme_rdma_recv_done(cq, &wc, tag); - else - cqe->done(cq, &wc); - } - } - return found; + return 
ib_process_cq_direct(queue->ib_cq, -1); } static void nvme_rdma_complete_rq(struct request *rq) @@ -1782,7 +1778,36 @@ static int nvme_rdma_map_queues(struct blk_mq_tag_set *set) { struct nvme_rdma_ctrl *ctrl = set->driver_data; - return blk_mq_rdma_map_queues(set, ctrl->device->dev, 0); + set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; + set->map[HCTX_TYPE_READ].nr_queues = ctrl->ctrl.opts->nr_io_queues; + if (ctrl->ctrl.opts->nr_write_queues) { + /* separate read/write queues */ + set->map[HCTX_TYPE_DEFAULT].nr_queues = + ctrl->ctrl.opts->nr_write_queues; + set->map[HCTX_TYPE_READ].queue_offset = + ctrl->ctrl.opts->nr_write_queues; + } else { + /* mixed read/write queues */ + set->map[HCTX_TYPE_DEFAULT].nr_queues = + ctrl->ctrl.opts->nr_io_queues; + set->map[HCTX_TYPE_READ].queue_offset = 0; + } + blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT], + ctrl->device->dev, 0); + blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ], + ctrl->device->dev, 0); + + if (ctrl->ctrl.opts->nr_poll_queues) { + set->map[HCTX_TYPE_POLL].nr_queues = + ctrl->ctrl.opts->nr_poll_queues; + set->map[HCTX_TYPE_POLL].queue_offset = + ctrl->ctrl.opts->nr_io_queues; + if (ctrl->ctrl.opts->nr_write_queues) + set->map[HCTX_TYPE_POLL].queue_offset += + ctrl->ctrl.opts->nr_write_queues; + blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]); + } + return 0; } static const struct blk_mq_ops nvme_rdma_mq_ops = { @@ -1791,9 +1816,9 @@ static const struct blk_mq_ops nvme_rdma_mq_ops = { .init_request = nvme_rdma_init_request, .exit_request = nvme_rdma_exit_request, .init_hctx = nvme_rdma_init_hctx, - .poll = nvme_rdma_poll, .timeout = nvme_rdma_timeout, .map_queues = nvme_rdma_map_queues, + .poll = nvme_rdma_poll, }; static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { @@ -1938,7 +1963,8 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work); - ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ + ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + + opts->nr_poll_queues + 1; ctrl->ctrl.sqsize = opts->queue_size - 1; ctrl->ctrl.kato = opts->kato; @@ -1989,7 +2015,8 @@ static struct nvmf_transport_ops nvme_rdma_transport = { .module = THIS_MODULE, .required_opts = NVMF_OPT_TRADDR, .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | - NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO, + NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | + NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES, .create_ctrl = nvme_rdma_create_ctrl, }; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c new file mode 100644 index 000000000000..de174912445e --- /dev/null +++ b/drivers/nvme/host/tcp.c @@ -0,0 +1,2278 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe over Fabrics TCP host. + * Copyright (c) 2018 Lightbits Labs. All rights reserved. 
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/nvme-tcp.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <linux/blk-mq.h> +#include <crypto/hash.h> + +#include "nvme.h" +#include "fabrics.h" + +struct nvme_tcp_queue; + +enum nvme_tcp_send_state { + NVME_TCP_SEND_CMD_PDU = 0, + NVME_TCP_SEND_H2C_PDU, + NVME_TCP_SEND_DATA, + NVME_TCP_SEND_DDGST, +}; + +struct nvme_tcp_request { + struct nvme_request req; + void *pdu; + struct nvme_tcp_queue *queue; + u32 data_len; + u32 pdu_len; + u32 pdu_sent; + u16 ttag; + struct list_head entry; + __le32 ddgst; + + struct bio *curr_bio; + struct iov_iter iter; + + /* send state */ + size_t offset; + size_t data_sent; + enum nvme_tcp_send_state state; +}; + +enum nvme_tcp_queue_flags { + NVME_TCP_Q_ALLOCATED = 0, + NVME_TCP_Q_LIVE = 1, +}; + +enum nvme_tcp_recv_state { + NVME_TCP_RECV_PDU = 0, + NVME_TCP_RECV_DATA, + NVME_TCP_RECV_DDGST, +}; + +struct nvme_tcp_ctrl; +struct nvme_tcp_queue { + struct socket *sock; + struct work_struct io_work; + int io_cpu; + + spinlock_t lock; + struct list_head send_list; + + /* recv state */ + void *pdu; + int pdu_remaining; + int pdu_offset; + size_t data_remaining; + size_t ddgst_remaining; + + /* send state */ + struct nvme_tcp_request *request; + + int queue_size; + size_t cmnd_capsule_len; + struct nvme_tcp_ctrl *ctrl; + unsigned long flags; + bool rd_enabled; + + bool hdr_digest; + bool data_digest; + struct ahash_request *rcv_hash; + struct ahash_request *snd_hash; + __le32 exp_ddgst; + __le32 recv_ddgst; + + struct page_frag_cache pf_cache; + + void (*state_change)(struct sock *); + void (*data_ready)(struct sock *); + void (*write_space)(struct sock *); +}; + +struct nvme_tcp_ctrl { + /* read only in the hot path */ + struct nvme_tcp_queue *queues; + struct blk_mq_tag_set tag_set; + + /* other member variables */ + struct list_head list; + struct blk_mq_tag_set admin_tag_set; + struct sockaddr_storage addr; + struct sockaddr_storage src_addr; + struct nvme_ctrl ctrl; + + struct work_struct err_work; + struct delayed_work connect_work; + struct nvme_tcp_request async_req; +}; + +static LIST_HEAD(nvme_tcp_ctrl_list); +static DEFINE_MUTEX(nvme_tcp_ctrl_mutex); +static struct workqueue_struct *nvme_tcp_wq; +static struct blk_mq_ops nvme_tcp_mq_ops; +static struct blk_mq_ops nvme_tcp_admin_mq_ops; + +static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl) +{ + return container_of(ctrl, struct nvme_tcp_ctrl, ctrl); +} + +static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue) +{ + return queue - queue->ctrl->queues; +} + +static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue) +{ + u32 queue_idx = nvme_tcp_queue_id(queue); + + if (queue_idx == 0) + return queue->ctrl->admin_tag_set.tags[queue_idx]; + return queue->ctrl->tag_set.tags[queue_idx - 1]; +} + +static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue) +{ + return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0; +} + +static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue) +{ + return queue->data_digest ? 
NVME_TCP_DIGEST_LENGTH : 0; +} + +static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue) +{ + return queue->cmnd_capsule_len - sizeof(struct nvme_command); +} + +static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req) +{ + return req == &req->queue->ctrl->async_req; +} + +static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req) +{ + struct request *rq; + unsigned int bytes; + + if (unlikely(nvme_tcp_async_req(req))) + return false; /* async events don't have a request */ + + rq = blk_mq_rq_from_pdu(req); + bytes = blk_rq_payload_bytes(rq); + + return rq_data_dir(rq) == WRITE && bytes && + bytes <= nvme_tcp_inline_data_size(req->queue); +} + +static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req) +{ + return req->iter.bvec->bv_page; +} + +static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req) +{ + return req->iter.bvec->bv_offset + req->iter.iov_offset; +} + +static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req) +{ + return min_t(size_t, req->iter.bvec->bv_len - req->iter.iov_offset, + req->pdu_len - req->pdu_sent); +} + +static inline size_t nvme_tcp_req_offset(struct nvme_tcp_request *req) +{ + return req->iter.iov_offset; +} + +static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req) +{ + return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ? + req->pdu_len - req->pdu_sent : 0; +} + +static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req, + int len) +{ + return nvme_tcp_pdu_data_left(req) <= len; +} + +static void nvme_tcp_init_iter(struct nvme_tcp_request *req, + unsigned int dir) +{ + struct request *rq = blk_mq_rq_from_pdu(req); + struct bio_vec *vec; + unsigned int size; + int nsegs; + size_t offset; + + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) { + vec = &rq->special_vec; + nsegs = 1; + size = blk_rq_payload_bytes(rq); + offset = 0; + } else { + struct bio *bio = req->curr_bio; + + vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); + nsegs = bio_segments(bio); + size = bio->bi_iter.bi_size; + offset = bio->bi_iter.bi_bvec_done; + } + + iov_iter_bvec(&req->iter, dir, vec, nsegs, size); + req->iter.iov_offset = offset; +} + +static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req, + int len) +{ + req->data_sent += len; + req->pdu_sent += len; + iov_iter_advance(&req->iter, len); + if (!iov_iter_count(&req->iter) && + req->data_sent < req->data_len) { + req->curr_bio = req->curr_bio->bi_next; + nvme_tcp_init_iter(req, WRITE); + } +} + +static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req) +{ + struct nvme_tcp_queue *queue = req->queue; + + spin_lock(&queue->lock); + list_add_tail(&req->entry, &queue->send_list); + spin_unlock(&queue->lock); + + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); +} + +static inline struct nvme_tcp_request * +nvme_tcp_fetch_request(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_request *req; + + spin_lock(&queue->lock); + req = list_first_entry_or_null(&queue->send_list, + struct nvme_tcp_request, entry); + if (req) + list_del(&req->entry); + spin_unlock(&queue->lock); + + return req; +} + +static inline void nvme_tcp_ddgst_final(struct ahash_request *hash, + __le32 *dgst) +{ + ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0); + crypto_ahash_final(hash); +} + +static inline void nvme_tcp_ddgst_update(struct ahash_request *hash, + struct page *page, off_t off, size_t len) +{ + struct scatterlist sg; + + sg_init_marker(&sg, 1); + sg_set_page(&sg, page, len, 
off); + ahash_request_set_crypt(hash, &sg, NULL, len); + crypto_ahash_update(hash); +} + +static inline void nvme_tcp_hdgst(struct ahash_request *hash, + void *pdu, size_t len) +{ + struct scatterlist sg; + + sg_init_one(&sg, pdu, len); + ahash_request_set_crypt(hash, &sg, pdu + len, len); + crypto_ahash_digest(hash); +} + +static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue, + void *pdu, size_t pdu_len) +{ + struct nvme_tcp_hdr *hdr = pdu; + __le32 recv_digest; + __le32 exp_digest; + + if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) { + dev_err(queue->ctrl->ctrl.device, + "queue %d: header digest flag is cleared\n", + nvme_tcp_queue_id(queue)); + return -EPROTO; + } + + recv_digest = *(__le32 *)(pdu + hdr->hlen); + nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len); + exp_digest = *(__le32 *)(pdu + hdr->hlen); + if (recv_digest != exp_digest) { + dev_err(queue->ctrl->ctrl.device, + "header digest error: recv %#x expected %#x\n", + le32_to_cpu(recv_digest), le32_to_cpu(exp_digest)); + return -EIO; + } + + return 0; +} + +static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu) +{ + struct nvme_tcp_hdr *hdr = pdu; + u8 digest_len = nvme_tcp_hdgst_len(queue); + u32 len; + + len = le32_to_cpu(hdr->plen) - hdr->hlen - + ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0); + + if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) { + dev_err(queue->ctrl->ctrl.device, + "queue %d: data digest flag is cleared\n", + nvme_tcp_queue_id(queue)); + return -EPROTO; + } + crypto_ahash_init(queue->rcv_hash); + + return 0; +} + +static void nvme_tcp_exit_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx) +{ + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + + page_frag_free(req->pdu); +} + +static int nvme_tcp_init_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx, + unsigned int numa_node) +{ + struct nvme_tcp_ctrl *ctrl = set->driver_data; + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; + struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx]; + u8 hdgst = nvme_tcp_hdgst_len(queue); + + req->pdu = page_frag_alloc(&queue->pf_cache, + sizeof(struct nvme_tcp_cmd_pdu) + hdgst, + GFP_KERNEL | __GFP_ZERO); + if (!req->pdu) + return -ENOMEM; + + req->queue = queue; + nvme_req(rq)->ctrl = &ctrl->ctrl; + + return 0; +} + +static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_tcp_ctrl *ctrl = data; + struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1]; + + hctx->driver_data = queue; + return 0; +} + +static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_tcp_ctrl *ctrl = data; + struct nvme_tcp_queue *queue = &ctrl->queues[0]; + + hctx->driver_data = queue; + return 0; +} + +static enum nvme_tcp_recv_state +nvme_tcp_recv_state(struct nvme_tcp_queue *queue) +{ + return (queue->pdu_remaining) ? NVME_TCP_RECV_PDU : + (queue->ddgst_remaining) ? 
NVME_TCP_RECV_DDGST : + NVME_TCP_RECV_DATA; +} + +static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue) +{ + queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) + + nvme_tcp_hdgst_len(queue); + queue->pdu_offset = 0; + queue->data_remaining = -1; + queue->ddgst_remaining = 0; +} + +static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) + return; + + queue_work(nvme_wq, &to_tcp_ctrl(ctrl)->err_work); +} + +static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue, + struct nvme_completion *cqe) +{ + struct request *rq; + + rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id); + if (!rq) { + dev_err(queue->ctrl->ctrl.device, + "queue %d tag 0x%x not found\n", + nvme_tcp_queue_id(queue), cqe->command_id); + nvme_tcp_error_recovery(&queue->ctrl->ctrl); + return -EINVAL; + } + + nvme_end_request(rq, cqe->status, cqe->result); + + return 0; +} + +static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue, + struct nvme_tcp_data_pdu *pdu) +{ + struct request *rq; + + rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + if (!rq) { + dev_err(queue->ctrl->ctrl.device, + "queue %d tag %#x not found\n", + nvme_tcp_queue_id(queue), pdu->command_id); + return -ENOENT; + } + + if (!blk_rq_payload_bytes(rq)) { + dev_err(queue->ctrl->ctrl.device, + "queue %d tag %#x unexpected data\n", + nvme_tcp_queue_id(queue), rq->tag); + return -EIO; + } + + queue->data_remaining = le32_to_cpu(pdu->data_length); + + return 0; + +} + +static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue, + struct nvme_tcp_rsp_pdu *pdu) +{ + struct nvme_completion *cqe = &pdu->cqe; + int ret = 0; + + /* + * AEN requests are special as they don't time out and can + * survive any kind of queue freeze and often don't respond to + * aborts. We don't even bother to allocate a struct request + * for them but rather special case them here. 
+ */ + if (unlikely(nvme_tcp_queue_id(queue) == 0 && + cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) + nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, + &cqe->result); + else + ret = nvme_tcp_process_nvme_cqe(queue, cqe); + + return ret; +} + +static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req, + struct nvme_tcp_r2t_pdu *pdu) +{ + struct nvme_tcp_data_pdu *data = req->pdu; + struct nvme_tcp_queue *queue = req->queue; + struct request *rq = blk_mq_rq_from_pdu(req); + u8 hdgst = nvme_tcp_hdgst_len(queue); + u8 ddgst = nvme_tcp_ddgst_len(queue); + + req->pdu_len = le32_to_cpu(pdu->r2t_length); + req->pdu_sent = 0; + + if (unlikely(req->data_sent + req->pdu_len > req->data_len)) { + dev_err(queue->ctrl->ctrl.device, + "req %d r2t len %u exceeded data len %u (%zu sent)\n", + rq->tag, req->pdu_len, req->data_len, + req->data_sent); + return -EPROTO; + } + + if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) { + dev_err(queue->ctrl->ctrl.device, + "req %d unexpected r2t offset %u (expected %zu)\n", + rq->tag, le32_to_cpu(pdu->r2t_offset), + req->data_sent); + return -EPROTO; + } + + memset(data, 0, sizeof(*data)); + data->hdr.type = nvme_tcp_h2c_data; + data->hdr.flags = NVME_TCP_F_DATA_LAST; + if (queue->hdr_digest) + data->hdr.flags |= NVME_TCP_F_HDGST; + if (queue->data_digest) + data->hdr.flags |= NVME_TCP_F_DDGST; + data->hdr.hlen = sizeof(*data); + data->hdr.pdo = data->hdr.hlen + hdgst; + data->hdr.plen = + cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst); + data->ttag = pdu->ttag; + data->command_id = rq->tag; + data->data_offset = cpu_to_le32(req->data_sent); + data->data_length = cpu_to_le32(req->pdu_len); + return 0; +} + +static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, + struct nvme_tcp_r2t_pdu *pdu) +{ + struct nvme_tcp_request *req; + struct request *rq; + int ret; + + rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + if (!rq) { + dev_err(queue->ctrl->ctrl.device, + "queue %d tag %#x not found\n", + nvme_tcp_queue_id(queue), pdu->command_id); + return -ENOENT; + } + req = blk_mq_rq_to_pdu(rq); + + ret = nvme_tcp_setup_h2c_data_pdu(req, pdu); + if (unlikely(ret)) + return ret; + + req->state = NVME_TCP_SEND_H2C_PDU; + req->offset = 0; + + nvme_tcp_queue_request(req); + + return 0; +} + +static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, + unsigned int *offset, size_t *len) +{ + struct nvme_tcp_hdr *hdr; + char *pdu = queue->pdu; + size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining); + int ret; + + ret = skb_copy_bits(skb, *offset, + &pdu[queue->pdu_offset], rcv_len); + if (unlikely(ret)) + return ret; + + queue->pdu_remaining -= rcv_len; + queue->pdu_offset += rcv_len; + *offset += rcv_len; + *len -= rcv_len; + if (queue->pdu_remaining) + return 0; + + hdr = queue->pdu; + if (queue->hdr_digest) { + ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen); + if (unlikely(ret)) + return ret; + } + + + if (queue->data_digest) { + ret = nvme_tcp_check_ddgst(queue, queue->pdu); + if (unlikely(ret)) + return ret; + } + + switch (hdr->type) { + case nvme_tcp_c2h_data: + ret = nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu); + break; + case nvme_tcp_rsp: + nvme_tcp_init_recv_ctx(queue); + ret = nvme_tcp_handle_comp(queue, (void *)queue->pdu); + break; + case nvme_tcp_r2t: + nvme_tcp_init_recv_ctx(queue); + ret = nvme_tcp_handle_r2t(queue, (void *)queue->pdu); + break; + default: + dev_err(queue->ctrl->ctrl.device, + "unsupported pdu type (%d)\n", hdr->type); + return -EINVAL; + } 
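+	/*
+	 * At this point the full PDU header (plus optional header digest)
+	 * has been reassembled and dispatched: C2H data arms data_remaining
+	 * for the data receive phase, while response and R2T PDUs re-arm the
+	 * receive context for the next PDU (R2T additionally queues the H2C
+	 * data transfer).
+	 */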
+ + return ret; +} + +static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, + unsigned int *offset, size_t *len) +{ + struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu; + struct nvme_tcp_request *req; + struct request *rq; + + rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + if (!rq) { + dev_err(queue->ctrl->ctrl.device, + "queue %d tag %#x not found\n", + nvme_tcp_queue_id(queue), pdu->command_id); + return -ENOENT; + } + req = blk_mq_rq_to_pdu(rq); + + while (true) { + int recv_len, ret; + + recv_len = min_t(size_t, *len, queue->data_remaining); + if (!recv_len) + break; + + if (!iov_iter_count(&req->iter)) { + req->curr_bio = req->curr_bio->bi_next; + + /* + * If we don`t have any bios it means that controller + * sent more data than we requested, hence error + */ + if (!req->curr_bio) { + dev_err(queue->ctrl->ctrl.device, + "queue %d no space in request %#x", + nvme_tcp_queue_id(queue), rq->tag); + nvme_tcp_init_recv_ctx(queue); + return -EIO; + } + nvme_tcp_init_iter(req, READ); + } + + /* we can read only from what is left in this bio */ + recv_len = min_t(size_t, recv_len, + iov_iter_count(&req->iter)); + + if (queue->data_digest) + ret = skb_copy_and_hash_datagram_iter(skb, *offset, + &req->iter, recv_len, queue->rcv_hash); + else + ret = skb_copy_datagram_iter(skb, *offset, + &req->iter, recv_len); + if (ret) { + dev_err(queue->ctrl->ctrl.device, + "queue %d failed to copy request %#x data", + nvme_tcp_queue_id(queue), rq->tag); + return ret; + } + + *len -= recv_len; + *offset += recv_len; + queue->data_remaining -= recv_len; + } + + if (!queue->data_remaining) { + if (queue->data_digest) { + nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst); + queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH; + } else { + nvme_tcp_init_recv_ctx(queue); + } + } + + return 0; +} + +static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue, + struct sk_buff *skb, unsigned int *offset, size_t *len) +{ + char *ddgst = (char *)&queue->recv_ddgst; + size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining); + off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining; + int ret; + + ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len); + if (unlikely(ret)) + return ret; + + queue->ddgst_remaining -= recv_len; + *offset += recv_len; + *len -= recv_len; + if (queue->ddgst_remaining) + return 0; + + if (queue->recv_ddgst != queue->exp_ddgst) { + dev_err(queue->ctrl->ctrl.device, + "data digest error: recv %#x expected %#x\n", + le32_to_cpu(queue->recv_ddgst), + le32_to_cpu(queue->exp_ddgst)); + return -EIO; + } + + nvme_tcp_init_recv_ctx(queue); + return 0; +} + +static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct nvme_tcp_queue *queue = desc->arg.data; + size_t consumed = len; + int result; + + while (len) { + switch (nvme_tcp_recv_state(queue)) { + case NVME_TCP_RECV_PDU: + result = nvme_tcp_recv_pdu(queue, skb, &offset, &len); + break; + case NVME_TCP_RECV_DATA: + result = nvme_tcp_recv_data(queue, skb, &offset, &len); + break; + case NVME_TCP_RECV_DDGST: + result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len); + break; + default: + result = -EFAULT; + } + if (result) { + dev_err(queue->ctrl->ctrl.device, + "receive failed: %d\n", result); + queue->rd_enabled = false; + nvme_tcp_error_recovery(&queue->ctrl->ctrl); + return result; + } + } + + return consumed; +} + +static void nvme_tcp_data_ready(struct sock *sk) +{ + struct nvme_tcp_queue *queue; + + 
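+	/*
+	 * Socket data_ready callback: take sk_callback_lock for reading,
+	 * look up the owning queue from sk_user_data and, if receives are
+	 * enabled, schedule io_work on the queue's assigned io_cpu.
+	 */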
read_lock(&sk->sk_callback_lock); + queue = sk->sk_user_data; + if (likely(queue && queue->rd_enabled)) + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); + read_unlock(&sk->sk_callback_lock); +} + +static void nvme_tcp_write_space(struct sock *sk) +{ + struct nvme_tcp_queue *queue; + + read_lock_bh(&sk->sk_callback_lock); + queue = sk->sk_user_data; + if (likely(queue && sk_stream_is_writeable(sk))) { + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); + } + read_unlock_bh(&sk->sk_callback_lock); +} + +static void nvme_tcp_state_change(struct sock *sk) +{ + struct nvme_tcp_queue *queue; + + read_lock(&sk->sk_callback_lock); + queue = sk->sk_user_data; + if (!queue) + goto done; + + switch (sk->sk_state) { + case TCP_CLOSE: + case TCP_CLOSE_WAIT: + case TCP_LAST_ACK: + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: + /* fallthrough */ + nvme_tcp_error_recovery(&queue->ctrl->ctrl); + break; + default: + dev_info(queue->ctrl->ctrl.device, + "queue %d socket state %d\n", + nvme_tcp_queue_id(queue), sk->sk_state); + } + + queue->state_change(sk); +done: + read_unlock(&sk->sk_callback_lock); +} + +static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue) +{ + queue->request = NULL; +} + +static void nvme_tcp_fail_request(struct nvme_tcp_request *req) +{ + union nvme_result res = {}; + + nvme_end_request(blk_mq_rq_from_pdu(req), + cpu_to_le16(NVME_SC_DATA_XFER_ERROR), res); +} + +static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) +{ + struct nvme_tcp_queue *queue = req->queue; + + while (true) { + struct page *page = nvme_tcp_req_cur_page(req); + size_t offset = nvme_tcp_req_cur_offset(req); + size_t len = nvme_tcp_req_cur_length(req); + bool last = nvme_tcp_pdu_last_send(req, len); + int ret, flags = MSG_DONTWAIT; + + if (last && !queue->data_digest) + flags |= MSG_EOR; + else + flags |= MSG_MORE; + + ret = kernel_sendpage(queue->sock, page, offset, len, flags); + if (ret <= 0) + return ret; + + nvme_tcp_advance_req(req, ret); + if (queue->data_digest) + nvme_tcp_ddgst_update(queue->snd_hash, page, + offset, ret); + + /* fully successful last write*/ + if (last && ret == len) { + if (queue->data_digest) { + nvme_tcp_ddgst_final(queue->snd_hash, + &req->ddgst); + req->state = NVME_TCP_SEND_DDGST; + req->offset = 0; + } else { + nvme_tcp_done_send_req(queue); + } + return 1; + } + } + return -EAGAIN; +} + +static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) +{ + struct nvme_tcp_queue *queue = req->queue; + struct nvme_tcp_cmd_pdu *pdu = req->pdu; + bool inline_data = nvme_tcp_has_inline_data(req); + int flags = MSG_DONTWAIT | (inline_data ? 
MSG_MORE : MSG_EOR); + u8 hdgst = nvme_tcp_hdgst_len(queue); + int len = sizeof(*pdu) + hdgst - req->offset; + int ret; + + if (queue->hdr_digest && !req->offset) + nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + + ret = kernel_sendpage(queue->sock, virt_to_page(pdu), + offset_in_page(pdu) + req->offset, len, flags); + if (unlikely(ret <= 0)) + return ret; + + len -= ret; + if (!len) { + if (inline_data) { + req->state = NVME_TCP_SEND_DATA; + if (queue->data_digest) + crypto_ahash_init(queue->snd_hash); + nvme_tcp_init_iter(req, WRITE); + } else { + nvme_tcp_done_send_req(queue); + } + return 1; + } + req->offset += ret; + + return -EAGAIN; +} + +static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) +{ + struct nvme_tcp_queue *queue = req->queue; + struct nvme_tcp_data_pdu *pdu = req->pdu; + u8 hdgst = nvme_tcp_hdgst_len(queue); + int len = sizeof(*pdu) - req->offset + hdgst; + int ret; + + if (queue->hdr_digest && !req->offset) + nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + + ret = kernel_sendpage(queue->sock, virt_to_page(pdu), + offset_in_page(pdu) + req->offset, len, + MSG_DONTWAIT | MSG_MORE); + if (unlikely(ret <= 0)) + return ret; + + len -= ret; + if (!len) { + req->state = NVME_TCP_SEND_DATA; + if (queue->data_digest) + crypto_ahash_init(queue->snd_hash); + if (!req->data_sent) + nvme_tcp_init_iter(req, WRITE); + return 1; + } + req->offset += ret; + + return -EAGAIN; +} + +static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req) +{ + struct nvme_tcp_queue *queue = req->queue; + int ret; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR }; + struct kvec iov = { + .iov_base = &req->ddgst + req->offset, + .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset + }; + + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); + if (unlikely(ret <= 0)) + return ret; + + if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) { + nvme_tcp_done_send_req(queue); + return 1; + } + + req->offset += ret; + return -EAGAIN; +} + +static int nvme_tcp_try_send(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_request *req; + int ret = 1; + + if (!queue->request) { + queue->request = nvme_tcp_fetch_request(queue); + if (!queue->request) + return 0; + } + req = queue->request; + + if (req->state == NVME_TCP_SEND_CMD_PDU) { + ret = nvme_tcp_try_send_cmd_pdu(req); + if (ret <= 0) + goto done; + if (!nvme_tcp_has_inline_data(req)) + return ret; + } + + if (req->state == NVME_TCP_SEND_H2C_PDU) { + ret = nvme_tcp_try_send_data_pdu(req); + if (ret <= 0) + goto done; + } + + if (req->state == NVME_TCP_SEND_DATA) { + ret = nvme_tcp_try_send_data(req); + if (ret <= 0) + goto done; + } + + if (req->state == NVME_TCP_SEND_DDGST) + ret = nvme_tcp_try_send_ddgst(req); +done: + if (ret == -EAGAIN) + ret = 0; + return ret; +} + +static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue) +{ + struct sock *sk = queue->sock->sk; + read_descriptor_t rd_desc; + int consumed; + + rd_desc.arg.data = queue; + rd_desc.count = 1; + lock_sock(sk); + consumed = tcp_read_sock(sk, &rd_desc, nvme_tcp_recv_skb); + release_sock(sk); + return consumed; +} + +static void nvme_tcp_io_work(struct work_struct *w) +{ + struct nvme_tcp_queue *queue = + container_of(w, struct nvme_tcp_queue, io_work); + unsigned long start = jiffies + msecs_to_jiffies(1); + + do { + bool pending = false; + int result; + + result = nvme_tcp_try_send(queue); + if (result > 0) { + pending = true; + } else if (unlikely(result < 0)) { + dev_err(queue->ctrl->ctrl.device, + "failed to send request %d\n", result); + if 
(result != -EPIPE) + nvme_tcp_fail_request(queue->request); + nvme_tcp_done_send_req(queue); + return; + } + + result = nvme_tcp_try_recv(queue); + if (result > 0) + pending = true; + + if (!pending) + return; + + } while (time_after(jiffies, start)); /* quota is exhausted */ + + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); +} + +static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash); + + ahash_request_free(queue->rcv_hash); + ahash_request_free(queue->snd_hash); + crypto_free_ahash(tfm); +} + +static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue) +{ + struct crypto_ahash *tfm; + + tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL); + if (!queue->snd_hash) + goto free_tfm; + ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL); + + queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL); + if (!queue->rcv_hash) + goto free_snd_hash; + ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL); + + return 0; +free_snd_hash: + ahash_request_free(queue->snd_hash); +free_tfm: + crypto_free_ahash(tfm); + return -ENOMEM; +} + +static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl) +{ + struct nvme_tcp_request *async = &ctrl->async_req; + + page_frag_free(async->pdu); +} + +static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl) +{ + struct nvme_tcp_queue *queue = &ctrl->queues[0]; + struct nvme_tcp_request *async = &ctrl->async_req; + u8 hdgst = nvme_tcp_hdgst_len(queue); + + async->pdu = page_frag_alloc(&queue->pf_cache, + sizeof(struct nvme_tcp_cmd_pdu) + hdgst, + GFP_KERNEL | __GFP_ZERO); + if (!async->pdu) + return -ENOMEM; + + async->queue = &ctrl->queues[0]; + return 0; +} + +static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct nvme_tcp_queue *queue = &ctrl->queues[qid]; + + if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) + return; + + if (queue->hdr_digest || queue->data_digest) + nvme_tcp_free_crypto(queue); + + sock_release(queue->sock); + kfree(queue->pdu); +} + +static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_icreq_pdu *icreq; + struct nvme_tcp_icresp_pdu *icresp; + struct msghdr msg = {}; + struct kvec iov; + bool ctrl_hdgst, ctrl_ddgst; + int ret; + + icreq = kzalloc(sizeof(*icreq), GFP_KERNEL); + if (!icreq) + return -ENOMEM; + + icresp = kzalloc(sizeof(*icresp), GFP_KERNEL); + if (!icresp) { + ret = -ENOMEM; + goto free_icreq; + } + + icreq->hdr.type = nvme_tcp_icreq; + icreq->hdr.hlen = sizeof(*icreq); + icreq->hdr.pdo = 0; + icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen); + icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0); + icreq->maxr2t = 0; /* single inflight r2t supported */ + icreq->hpda = 0; /* no alignment constraint */ + if (queue->hdr_digest) + icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE; + if (queue->data_digest) + icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE; + + iov.iov_base = icreq; + iov.iov_len = sizeof(*icreq); + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); + if (ret < 0) + goto free_icresp; + + memset(&msg, 0, sizeof(msg)); + iov.iov_base = icresp; + iov.iov_len = sizeof(*icresp); + ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, + iov.iov_len, msg.msg_flags); + if (ret < 0) + goto free_icresp; + + ret = -EINVAL; + if (icresp->hdr.type != nvme_tcp_icresp) { + pr_err("queue %d: bad type returned 
%d\n", + nvme_tcp_queue_id(queue), icresp->hdr.type); + goto free_icresp; + } + + if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) { + pr_err("queue %d: bad pdu length returned %d\n", + nvme_tcp_queue_id(queue), icresp->hdr.plen); + goto free_icresp; + } + + if (icresp->pfv != NVME_TCP_PFV_1_0) { + pr_err("queue %d: bad pfv returned %d\n", + nvme_tcp_queue_id(queue), icresp->pfv); + goto free_icresp; + } + + ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE); + if ((queue->data_digest && !ctrl_ddgst) || + (!queue->data_digest && ctrl_ddgst)) { + pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n", + nvme_tcp_queue_id(queue), + queue->data_digest ? "enabled" : "disabled", + ctrl_ddgst ? "enabled" : "disabled"); + goto free_icresp; + } + + ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE); + if ((queue->hdr_digest && !ctrl_hdgst) || + (!queue->hdr_digest && ctrl_hdgst)) { + pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n", + nvme_tcp_queue_id(queue), + queue->hdr_digest ? "enabled" : "disabled", + ctrl_hdgst ? "enabled" : "disabled"); + goto free_icresp; + } + + if (icresp->cpda != 0) { + pr_err("queue %d: unsupported cpda returned %d\n", + nvme_tcp_queue_id(queue), icresp->cpda); + goto free_icresp; + } + + ret = 0; +free_icresp: + kfree(icresp); +free_icreq: + kfree(icreq); + return ret; +} + +static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, + int qid, size_t queue_size) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct nvme_tcp_queue *queue = &ctrl->queues[qid]; + struct linger sol = { .l_onoff = 1, .l_linger = 0 }; + int ret, opt, rcv_pdu_size, n; + + queue->ctrl = ctrl; + INIT_LIST_HEAD(&queue->send_list); + spin_lock_init(&queue->lock); + INIT_WORK(&queue->io_work, nvme_tcp_io_work); + queue->queue_size = queue_size; + + if (qid > 0) + queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16; + else + queue->cmnd_capsule_len = sizeof(struct nvme_command) + + NVME_TCP_ADMIN_CCSZ; + + ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM, + IPPROTO_TCP, &queue->sock); + if (ret) { + dev_err(ctrl->ctrl.device, + "failed to create socket: %d\n", ret); + return ret; + } + + /* Single syn retry */ + opt = 1; + ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT, + (char *)&opt, sizeof(opt)); + if (ret) { + dev_err(ctrl->ctrl.device, + "failed to set TCP_SYNCNT sock opt %d\n", ret); + goto err_sock; + } + + /* Set TCP no delay */ + opt = 1; + ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, + TCP_NODELAY, (char *)&opt, sizeof(opt)); + if (ret) { + dev_err(ctrl->ctrl.device, + "failed to set TCP_NODELAY sock opt %d\n", ret); + goto err_sock; + } + + /* + * Cleanup whatever is sitting in the TCP transmit queue on socket + * close. This is done to prevent stale data from being sent should + * the network connection be restored before TCP times out. 
+ */ + ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER, + (char *)&sol, sizeof(sol)); + if (ret) { + dev_err(ctrl->ctrl.device, + "failed to set SO_LINGER sock opt %d\n", ret); + goto err_sock; + } + + queue->sock->sk->sk_allocation = GFP_ATOMIC; + if (!qid) + n = 0; + else + n = (qid - 1) % num_online_cpus(); + queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); + queue->request = NULL; + queue->data_remaining = 0; + queue->ddgst_remaining = 0; + queue->pdu_remaining = 0; + queue->pdu_offset = 0; + sk_set_memalloc(queue->sock->sk); + + if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) { + ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr, + sizeof(ctrl->src_addr)); + if (ret) { + dev_err(ctrl->ctrl.device, + "failed to bind queue %d socket %d\n", + qid, ret); + goto err_sock; + } + } + + queue->hdr_digest = nctrl->opts->hdr_digest; + queue->data_digest = nctrl->opts->data_digest; + if (queue->hdr_digest || queue->data_digest) { + ret = nvme_tcp_alloc_crypto(queue); + if (ret) { + dev_err(ctrl->ctrl.device, + "failed to allocate queue %d crypto\n", qid); + goto err_sock; + } + } + + rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) + + nvme_tcp_hdgst_len(queue); + queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL); + if (!queue->pdu) { + ret = -ENOMEM; + goto err_crypto; + } + + dev_dbg(ctrl->ctrl.device, "connecting queue %d\n", + nvme_tcp_queue_id(queue)); + + ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr, + sizeof(ctrl->addr), 0); + if (ret) { + dev_err(ctrl->ctrl.device, + "failed to connect socket: %d\n", ret); + goto err_rcv_pdu; + } + + ret = nvme_tcp_init_connection(queue); + if (ret) + goto err_init_connect; + + queue->rd_enabled = true; + set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags); + nvme_tcp_init_recv_ctx(queue); + + write_lock_bh(&queue->sock->sk->sk_callback_lock); + queue->sock->sk->sk_user_data = queue; + queue->state_change = queue->sock->sk->sk_state_change; + queue->data_ready = queue->sock->sk->sk_data_ready; + queue->write_space = queue->sock->sk->sk_write_space; + queue->sock->sk->sk_data_ready = nvme_tcp_data_ready; + queue->sock->sk->sk_state_change = nvme_tcp_state_change; + queue->sock->sk->sk_write_space = nvme_tcp_write_space; + write_unlock_bh(&queue->sock->sk->sk_callback_lock); + + return 0; + +err_init_connect: + kernel_sock_shutdown(queue->sock, SHUT_RDWR); +err_rcv_pdu: + kfree(queue->pdu); +err_crypto: + if (queue->hdr_digest || queue->data_digest) + nvme_tcp_free_crypto(queue); +err_sock: + sock_release(queue->sock); + queue->sock = NULL; + return ret; +} + +static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue) +{ + struct socket *sock = queue->sock; + + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_user_data = NULL; + sock->sk->sk_data_ready = queue->data_ready; + sock->sk->sk_state_change = queue->state_change; + sock->sk->sk_write_space = queue->write_space; + write_unlock_bh(&sock->sk->sk_callback_lock); +} + +static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue) +{ + kernel_sock_shutdown(queue->sock, SHUT_RDWR); + nvme_tcp_restore_sock_calls(queue); + cancel_work_sync(&queue->io_work); +} + +static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct nvme_tcp_queue *queue = &ctrl->queues[qid]; + + if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags)) + return; + + __nvme_tcp_stop_queue(queue); +} + +static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx) +{ + struct 
nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + int ret; + + if (idx) + ret = nvmf_connect_io_queue(nctrl, idx, false); + else + ret = nvmf_connect_admin_queue(nctrl); + + if (!ret) { + set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags); + } else { + __nvme_tcp_stop_queue(&ctrl->queues[idx]); + dev_err(nctrl->device, + "failed to connect queue: %d ret=%d\n", idx, ret); + } + return ret; +} + +static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl, + bool admin) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct blk_mq_tag_set *set; + int ret; + + if (admin) { + set = &ctrl->admin_tag_set; + memset(set, 0, sizeof(*set)); + set->ops = &nvme_tcp_admin_mq_ops; + set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; + set->reserved_tags = 2; /* connect + keep-alive */ + set->numa_node = NUMA_NO_NODE; + set->cmd_size = sizeof(struct nvme_tcp_request); + set->driver_data = ctrl; + set->nr_hw_queues = 1; + set->timeout = ADMIN_TIMEOUT; + } else { + set = &ctrl->tag_set; + memset(set, 0, sizeof(*set)); + set->ops = &nvme_tcp_mq_ops; + set->queue_depth = nctrl->sqsize + 1; + set->reserved_tags = 1; /* fabric connect */ + set->numa_node = NUMA_NO_NODE; + set->flags = BLK_MQ_F_SHOULD_MERGE; + set->cmd_size = sizeof(struct nvme_tcp_request); + set->driver_data = ctrl; + set->nr_hw_queues = nctrl->queue_count - 1; + set->timeout = NVME_IO_TIMEOUT; + set->nr_maps = 2 /* default + read */; + } + + ret = blk_mq_alloc_tag_set(set); + if (ret) + return ERR_PTR(ret); + + return set; +} + +static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl) +{ + if (to_tcp_ctrl(ctrl)->async_req.pdu) { + nvme_tcp_free_async_req(to_tcp_ctrl(ctrl)); + to_tcp_ctrl(ctrl)->async_req.pdu = NULL; + } + + nvme_tcp_free_queue(ctrl, 0); +} + +static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl) +{ + int i; + + for (i = 1; i < ctrl->queue_count; i++) + nvme_tcp_free_queue(ctrl, i); +} + +static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl) +{ + int i; + + for (i = 1; i < ctrl->queue_count; i++) + nvme_tcp_stop_queue(ctrl, i); +} + +static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl) +{ + int i, ret = 0; + + for (i = 1; i < ctrl->queue_count; i++) { + ret = nvme_tcp_start_queue(ctrl, i); + if (ret) + goto out_stop_queues; + } + + return 0; + +out_stop_queues: + for (i--; i >= 1; i--) + nvme_tcp_stop_queue(ctrl, i); + return ret; +} + +static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl) +{ + int ret; + + ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH); + if (ret) + return ret; + + ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl)); + if (ret) + goto out_free_queue; + + return 0; + +out_free_queue: + nvme_tcp_free_queue(ctrl, 0); + return ret; +} + +static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) +{ + int i, ret; + + for (i = 1; i < ctrl->queue_count; i++) { + ret = nvme_tcp_alloc_queue(ctrl, i, + ctrl->sqsize + 1); + if (ret) + goto out_free_queues; + } + + return 0; + +out_free_queues: + for (i--; i >= 1; i--) + nvme_tcp_free_queue(ctrl, i); + + return ret; +} + +static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl) +{ + unsigned int nr_io_queues; + + nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus()); + nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus()); + + return nr_io_queues; +} + +static int nvme_alloc_io_queues(struct nvme_ctrl *ctrl) +{ + unsigned int nr_io_queues; + int ret; + + nr_io_queues = nvme_tcp_nr_io_queues(ctrl); + ret = nvme_set_queue_count(ctrl, &nr_io_queues); + if (ret) + return ret; + + ctrl->queue_count = 
nr_io_queues + 1; + if (ctrl->queue_count < 2) + return 0; + + dev_info(ctrl->device, + "creating %d I/O queues.\n", nr_io_queues); + + return nvme_tcp_alloc_io_queues(ctrl); +} + +static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove) +{ + nvme_tcp_stop_io_queues(ctrl); + if (remove) { + if (ctrl->ops->flags & NVME_F_FABRICS) + blk_cleanup_queue(ctrl->connect_q); + blk_mq_free_tag_set(ctrl->tagset); + } + nvme_tcp_free_io_queues(ctrl); +} + +static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) +{ + int ret; + + ret = nvme_alloc_io_queues(ctrl); + if (ret) + return ret; + + if (new) { + ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false); + if (IS_ERR(ctrl->tagset)) { + ret = PTR_ERR(ctrl->tagset); + goto out_free_io_queues; + } + + if (ctrl->ops->flags & NVME_F_FABRICS) { + ctrl->connect_q = blk_mq_init_queue(ctrl->tagset); + if (IS_ERR(ctrl->connect_q)) { + ret = PTR_ERR(ctrl->connect_q); + goto out_free_tag_set; + } + } + } else { + blk_mq_update_nr_hw_queues(ctrl->tagset, + ctrl->queue_count - 1); + } + + ret = nvme_tcp_start_io_queues(ctrl); + if (ret) + goto out_cleanup_connect_q; + + return 0; + +out_cleanup_connect_q: + if (new && (ctrl->ops->flags & NVME_F_FABRICS)) + blk_cleanup_queue(ctrl->connect_q); +out_free_tag_set: + if (new) + blk_mq_free_tag_set(ctrl->tagset); +out_free_io_queues: + nvme_tcp_free_io_queues(ctrl); + return ret; +} + +static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove) +{ + nvme_tcp_stop_queue(ctrl, 0); + if (remove) { + free_opal_dev(ctrl->opal_dev); + blk_cleanup_queue(ctrl->admin_q); + blk_mq_free_tag_set(ctrl->admin_tagset); + } + nvme_tcp_free_admin_queue(ctrl); +} + +static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) +{ + int error; + + error = nvme_tcp_alloc_admin_queue(ctrl); + if (error) + return error; + + if (new) { + ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true); + if (IS_ERR(ctrl->admin_tagset)) { + error = PTR_ERR(ctrl->admin_tagset); + goto out_free_queue; + } + + ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset); + if (IS_ERR(ctrl->admin_q)) { + error = PTR_ERR(ctrl->admin_q); + goto out_free_tagset; + } + } + + error = nvme_tcp_start_queue(ctrl, 0); + if (error) + goto out_cleanup_queue; + + error = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); + if (error) { + dev_err(ctrl->device, + "prop_get NVME_REG_CAP failed\n"); + goto out_stop_queue; + } + + ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); + + error = nvme_enable_ctrl(ctrl, ctrl->cap); + if (error) + goto out_stop_queue; + + error = nvme_init_identify(ctrl); + if (error) + goto out_stop_queue; + + return 0; + +out_stop_queue: + nvme_tcp_stop_queue(ctrl, 0); +out_cleanup_queue: + if (new) + blk_cleanup_queue(ctrl->admin_q); +out_free_tagset: + if (new) + blk_mq_free_tag_set(ctrl->admin_tagset); +out_free_queue: + nvme_tcp_free_admin_queue(ctrl); + return error; +} + +static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, + bool remove) +{ + blk_mq_quiesce_queue(ctrl->admin_q); + nvme_tcp_stop_queue(ctrl, 0); + blk_mq_tagset_busy_iter(ctrl->admin_tagset, nvme_cancel_request, ctrl); + blk_mq_unquiesce_queue(ctrl->admin_q); + nvme_tcp_destroy_admin_queue(ctrl, remove); +} + +static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, + bool remove) +{ + if (ctrl->queue_count <= 1) + return; + nvme_stop_queues(ctrl); + nvme_tcp_stop_io_queues(ctrl); + blk_mq_tagset_busy_iter(ctrl->tagset, nvme_cancel_request, ctrl); + if (remove) + 
nvme_start_queues(ctrl); + nvme_tcp_destroy_io_queues(ctrl, remove); +} + +static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl) +{ + /* If we are resetting/deleting then do nothing */ + if (ctrl->state != NVME_CTRL_CONNECTING) { + WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW || + ctrl->state == NVME_CTRL_LIVE); + return; + } + + if (nvmf_should_reconnect(ctrl)) { + dev_info(ctrl->device, "Reconnecting in %d seconds...\n", + ctrl->opts->reconnect_delay); + queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work, + ctrl->opts->reconnect_delay * HZ); + } else { + dev_info(ctrl->device, "Removing controller...\n"); + nvme_delete_ctrl(ctrl); + } +} + +static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) +{ + struct nvmf_ctrl_options *opts = ctrl->opts; + int ret = -EINVAL; + + ret = nvme_tcp_configure_admin_queue(ctrl, new); + if (ret) + return ret; + + if (ctrl->icdoff) { + dev_err(ctrl->device, "icdoff is not supported!\n"); + goto destroy_admin; + } + + if (opts->queue_size > ctrl->sqsize + 1) + dev_warn(ctrl->device, + "queue_size %zu > ctrl sqsize %u, clamping down\n", + opts->queue_size, ctrl->sqsize + 1); + + if (ctrl->sqsize + 1 > ctrl->maxcmd) { + dev_warn(ctrl->device, + "sqsize %u > ctrl maxcmd %u, clamping down\n", + ctrl->sqsize + 1, ctrl->maxcmd); + ctrl->sqsize = ctrl->maxcmd - 1; + } + + if (ctrl->queue_count > 1) { + ret = nvme_tcp_configure_io_queues(ctrl, new); + if (ret) + goto destroy_admin; + } + + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) { + /* state change failure is ok if we're in DELETING state */ + WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + ret = -EINVAL; + goto destroy_io; + } + + nvme_start_ctrl(ctrl); + return 0; + +destroy_io: + if (ctrl->queue_count > 1) + nvme_tcp_destroy_io_queues(ctrl, new); +destroy_admin: + nvme_tcp_stop_queue(ctrl, 0); + nvme_tcp_destroy_admin_queue(ctrl, new); + return ret; +} + +static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work) +{ + struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work), + struct nvme_tcp_ctrl, connect_work); + struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl; + + ++ctrl->nr_reconnects; + + if (nvme_tcp_setup_ctrl(ctrl, false)) + goto requeue; + + dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n", + ctrl->nr_reconnects); + + ctrl->nr_reconnects = 0; + + return; + +requeue: + dev_info(ctrl->device, "Failed reconnect attempt %d\n", + ctrl->nr_reconnects); + nvme_tcp_reconnect_or_remove(ctrl); +} + +static void nvme_tcp_error_recovery_work(struct work_struct *work) +{ + struct nvme_tcp_ctrl *tcp_ctrl = container_of(work, + struct nvme_tcp_ctrl, err_work); + struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl; + + nvme_stop_keep_alive(ctrl); + nvme_tcp_teardown_io_queues(ctrl, false); + /* unquiesce to fail fast pending requests */ + nvme_start_queues(ctrl); + nvme_tcp_teardown_admin_queue(ctrl, false); + + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { + /* state change failure is ok if we're in DELETING state */ + WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + return; + } + + nvme_tcp_reconnect_or_remove(ctrl); +} + +static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown) +{ + nvme_tcp_teardown_io_queues(ctrl, shutdown); + if (shutdown) + nvme_shutdown_ctrl(ctrl); + else + nvme_disable_ctrl(ctrl, ctrl->cap); + nvme_tcp_teardown_admin_queue(ctrl, shutdown); +} + +static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl) +{ + nvme_tcp_teardown_ctrl(ctrl, true); +} + +static void nvme_reset_ctrl_work(struct work_struct *work) 
+{ + struct nvme_ctrl *ctrl = + container_of(work, struct nvme_ctrl, reset_work); + + nvme_stop_ctrl(ctrl); + nvme_tcp_teardown_ctrl(ctrl, false); + + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { + /* state change failure is ok if we're in DELETING state */ + WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + return; + } + + if (nvme_tcp_setup_ctrl(ctrl, false)) + goto out_fail; + + return; + +out_fail: + ++ctrl->nr_reconnects; + nvme_tcp_reconnect_or_remove(ctrl); +} + +static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl) +{ + cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work); + cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work); +} + +static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + + if (list_empty(&ctrl->list)) + goto free_ctrl; + + mutex_lock(&nvme_tcp_ctrl_mutex); + list_del(&ctrl->list); + mutex_unlock(&nvme_tcp_ctrl_mutex); + + nvmf_free_options(nctrl->opts); +free_ctrl: + kfree(ctrl->queues); + kfree(ctrl); +} + +static void nvme_tcp_set_sg_null(struct nvme_command *c) +{ + struct nvme_sgl_desc *sg = &c->common.dptr.sgl; + + sg->addr = 0; + sg->length = 0; + sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) | + NVME_SGL_FMT_TRANSPORT_A; +} + +static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue, + struct nvme_command *c, u32 data_len) +{ + struct nvme_sgl_desc *sg = &c->common.dptr.sgl; + + sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff); + sg->length = cpu_to_le32(data_len); + sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; +} + +static void nvme_tcp_set_sg_host_data(struct nvme_command *c, + u32 data_len) +{ + struct nvme_sgl_desc *sg = &c->common.dptr.sgl; + + sg->addr = 0; + sg->length = cpu_to_le32(data_len); + sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) | + NVME_SGL_FMT_TRANSPORT_A; +} + +static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg); + struct nvme_tcp_queue *queue = &ctrl->queues[0]; + struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu; + struct nvme_command *cmd = &pdu->cmd; + u8 hdgst = nvme_tcp_hdgst_len(queue); + + memset(pdu, 0, sizeof(*pdu)); + pdu->hdr.type = nvme_tcp_cmd; + if (queue->hdr_digest) + pdu->hdr.flags |= NVME_TCP_F_HDGST; + pdu->hdr.hlen = sizeof(*pdu); + pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst); + + cmd->common.opcode = nvme_admin_async_event; + cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH; + cmd->common.flags |= NVME_CMD_SGL_METABUF; + nvme_tcp_set_sg_null(cmd); + + ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU; + ctrl->async_req.offset = 0; + ctrl->async_req.curr_bio = NULL; + ctrl->async_req.data_len = 0; + + nvme_tcp_queue_request(&ctrl->async_req); +} + +static enum blk_eh_timer_return +nvme_tcp_timeout(struct request *rq, bool reserved) +{ + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_tcp_ctrl *ctrl = req->queue->ctrl; + struct nvme_tcp_cmd_pdu *pdu = req->pdu; + + dev_dbg(ctrl->ctrl.device, + "queue %d: timeout request %#x type %d\n", + nvme_tcp_queue_id(req->queue), rq->tag, + pdu->hdr.type); + + if (ctrl->ctrl.state != NVME_CTRL_LIVE) { + union nvme_result res = {}; + + nvme_req(rq)->flags |= NVME_REQ_CANCELLED; + nvme_end_request(rq, cpu_to_le16(NVME_SC_ABORT_REQ), res); + return BLK_EH_DONE; + } + + /* queue error recovery */ + nvme_tcp_error_recovery(&ctrl->ctrl); + + return BLK_EH_RESET_TIMER; +} + +static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue, + struct request *rq) +{ + struct nvme_tcp_request *req = 
+static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
+			struct request *rq)
+{
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+	struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+	struct nvme_command *c = &pdu->cmd;
+
+	c->common.flags |= NVME_CMD_SGL_METABUF;
+
+	if (rq_data_dir(rq) == WRITE && req->data_len &&
+	    req->data_len <= nvme_tcp_inline_data_size(queue))
+		nvme_tcp_set_sg_inline(queue, c, req->data_len);
+	else
+		nvme_tcp_set_sg_host_data(c, req->data_len);
+
+	return 0;
+}
+
+static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
+		struct request *rq)
+{
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+	struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+	struct nvme_tcp_queue *queue = req->queue;
+	u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
+	blk_status_t ret;
+
+	ret = nvme_setup_cmd(ns, rq, &pdu->cmd);
+	if (ret)
+		return ret;
+
+	req->state = NVME_TCP_SEND_CMD_PDU;
+	req->offset = 0;
+	req->data_sent = 0;
+	req->pdu_len = 0;
+	req->pdu_sent = 0;
+	req->data_len = blk_rq_payload_bytes(rq);
+	req->curr_bio = rq->bio;
+
+	if (rq_data_dir(rq) == WRITE &&
+	    req->data_len <= nvme_tcp_inline_data_size(queue))
+		req->pdu_len = req->data_len;
+	else if (req->curr_bio)
+		nvme_tcp_init_iter(req, READ);
+
+	pdu->hdr.type = nvme_tcp_cmd;
+	pdu->hdr.flags = 0;
+	if (queue->hdr_digest)
+		pdu->hdr.flags |= NVME_TCP_F_HDGST;
+	if (queue->data_digest && req->pdu_len) {
+		pdu->hdr.flags |= NVME_TCP_F_DDGST;
+		ddgst = nvme_tcp_ddgst_len(queue);
+	}
+	pdu->hdr.hlen = sizeof(*pdu);
+	pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
+	pdu->hdr.plen =
+		cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);
+
+	ret = nvme_tcp_map_data(queue, rq);
+	if (unlikely(ret)) {
+		dev_err(queue->ctrl->ctrl.device,
+			"Failed to map data (%d)\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
+		const struct blk_mq_queue_data *bd)
+{
+	struct nvme_ns *ns = hctx->queue->queuedata;
+	struct nvme_tcp_queue *queue = hctx->driver_data;
+	struct request *rq = bd->rq;
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+	bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
+	blk_status_t ret;
+
+	if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
+		return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
+
+	ret = nvme_tcp_setup_cmd_pdu(ns, rq);
+	if (unlikely(ret))
+		return ret;
+
+	blk_mq_start_request(rq);
+
+	nvme_tcp_queue_request(req);
+
+	return BLK_STS_OK;
+}
+
+static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
+{
+	struct nvme_tcp_ctrl *ctrl = set->driver_data;
+
+	set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+	set->map[HCTX_TYPE_READ].nr_queues = ctrl->ctrl.opts->nr_io_queues;
+	if (ctrl->ctrl.opts->nr_write_queues) {
+		/* separate read/write queues */
+		set->map[HCTX_TYPE_DEFAULT].nr_queues =
+			ctrl->ctrl.opts->nr_write_queues;
+		set->map[HCTX_TYPE_READ].queue_offset =
+			ctrl->ctrl.opts->nr_write_queues;
+	} else {
+		/* mixed read/write queues */
+		set->map[HCTX_TYPE_DEFAULT].nr_queues =
+			ctrl->ctrl.opts->nr_io_queues;
+		set->map[HCTX_TYPE_READ].queue_offset = 0;
+	}
+	blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
+	blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
+	return 0;
+}
+
+static struct blk_mq_ops nvme_tcp_mq_ops = {
+	.queue_rq	= nvme_tcp_queue_rq,
+	.complete	= nvme_complete_rq,
+	.init_request	= nvme_tcp_init_request,
+	.exit_request	= nvme_tcp_exit_request,
+	.init_hctx	= nvme_tcp_init_hctx,
+	.timeout	= nvme_tcp_timeout,
+	.map_queues	= nvme_tcp_map_queues,
+};
+
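nvme_tcp_map_queues() above only splits the tag set into a write-oriented HCTX_TYPE_DEFAULT map and a separate HCTX_TYPE_READ map when nr_write_queues was requested; otherwise both maps share the same I/O queues. A small standalone sketch of that arithmetic follows; the structure and field names are simplified stand-ins for the blk-mq queue maps, not the kernel types.

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>

struct queue_map {
	unsigned int nr_queues;
	unsigned int queue_offset;
};

/* Mirrors the read/write split in nvme_tcp_map_queues(). */
static void map_queues(struct queue_map *def, struct queue_map *rmap,
		       unsigned int nr_io_queues, unsigned int nr_write_queues)
{
	def->queue_offset = 0;
	rmap->nr_queues = nr_io_queues;
	if (nr_write_queues) {
		/* separate read/write queues */
		def->nr_queues = nr_write_queues;
		rmap->queue_offset = nr_write_queues;
	} else {
		/* mixed read/write queues */
		def->nr_queues = nr_io_queues;
		rmap->queue_offset = 0;
	}
}

int main(void)
{
	struct queue_map def, rmap;

	/* hypothetical option values: nr_io_queues=4, nr_write_queues=2 */
	map_queues(&def, &rmap, 4, 2);
	printf("default: %u queues at offset %u\n", def.nr_queues, def.queue_offset);
	printf("read:    %u queues at offset %u\n", rmap.nr_queues, rmap.queue_offset);
	return 0;
}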
+static struct blk_mq_ops nvme_tcp_admin_mq_ops = {
+	.queue_rq	= nvme_tcp_queue_rq,
+	.complete	= nvme_complete_rq,
+	.init_request	= nvme_tcp_init_request,
+	.exit_request	= nvme_tcp_exit_request,
+	.init_hctx	= nvme_tcp_init_admin_hctx,
+	.timeout	= nvme_tcp_timeout,
+};
+
+static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
+	.name			= "tcp",
+	.module			= THIS_MODULE,
+	.flags			= NVME_F_FABRICS,
+	.reg_read32		= nvmf_reg_read32,
+	.reg_read64		= nvmf_reg_read64,
+	.reg_write32		= nvmf_reg_write32,
+	.free_ctrl		= nvme_tcp_free_ctrl,
+	.submit_async_event	= nvme_tcp_submit_async_event,
+	.delete_ctrl		= nvme_tcp_delete_ctrl,
+	.get_address		= nvmf_get_address,
+	.stop_ctrl		= nvme_tcp_stop_ctrl,
+};
+
+static bool
+nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
+{
+	struct nvme_tcp_ctrl *ctrl;
+	bool found = false;
+
+	mutex_lock(&nvme_tcp_ctrl_mutex);
+	list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
+		found = nvmf_ip_options_match(&ctrl->ctrl, opts);
+		if (found)
+			break;
+	}
+	mutex_unlock(&nvme_tcp_ctrl_mutex);
+
+	return found;
+}
+
+static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
+		struct nvmf_ctrl_options *opts)
+{
+	struct nvme_tcp_ctrl *ctrl;
+	int ret;
+
+	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
+	if (!ctrl)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&ctrl->list);
+	ctrl->ctrl.opts = opts;
+	ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1;
+	ctrl->ctrl.sqsize = opts->queue_size - 1;
+	ctrl->ctrl.kato = opts->kato;
+
+	INIT_DELAYED_WORK(&ctrl->connect_work,
+			nvme_tcp_reconnect_ctrl_work);
+	INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
+	INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
+
+	if (!(opts->mask & NVMF_OPT_TRSVCID)) {
+		opts->trsvcid =
+			kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
+		if (!opts->trsvcid) {
+			ret = -ENOMEM;
+			goto out_free_ctrl;
+		}
+		opts->mask |= NVMF_OPT_TRSVCID;
+	}
+
+	ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
+			opts->traddr, opts->trsvcid, &ctrl->addr);
+	if (ret) {
+		pr_err("malformed address passed: %s:%s\n",
+			opts->traddr, opts->trsvcid);
+		goto out_free_ctrl;
+	}
+
+	if (opts->mask & NVMF_OPT_HOST_TRADDR) {
+		ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
+			opts->host_traddr, NULL, &ctrl->src_addr);
+		if (ret) {
+			pr_err("malformed src address passed: %s\n",
+			       opts->host_traddr);
+			goto out_free_ctrl;
+		}
+	}
+
+	if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
+		ret = -EALREADY;
+		goto out_free_ctrl;
+	}
+
+	ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
+				GFP_KERNEL);
+	if (!ctrl->queues) {
+		ret = -ENOMEM;
+		goto out_free_ctrl;
+	}
+
+	ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
+	if (ret)
+		goto out_kfree_queues;
+
+	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
+		WARN_ON_ONCE(1);
+		ret = -EINTR;
+		goto out_uninit_ctrl;
+	}
+
+	ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
+	if (ret)
+		goto out_uninit_ctrl;
+
+	dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
+		ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
+
+	nvme_get_ctrl(&ctrl->ctrl);
+
+	mutex_lock(&nvme_tcp_ctrl_mutex);
+	list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
+	mutex_unlock(&nvme_tcp_ctrl_mutex);
+
+	return &ctrl->ctrl;
+
+out_uninit_ctrl:
+	nvme_uninit_ctrl(&ctrl->ctrl);
+	nvme_put_ctrl(&ctrl->ctrl);
+	if (ret > 0)
+		ret = -EIO;
+	return ERR_PTR(ret);
+out_kfree_queues:
+	kfree(ctrl->queues);
+out_free_ctrl:
+	kfree(ctrl);
+	return ERR_PTR(ret);
+}
+
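nvme_tcp_create_ctrl() above sizes the controller purely from the connect options: one admin queue plus nr_io_queues plus nr_write_queues, and a zero's based sqsize of queue_size - 1. A tiny standalone sketch of that bookkeeping with hypothetical option values (not defaults taken from this patch):

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>

struct opts {
	unsigned int nr_io_queues;
	unsigned int nr_write_queues;
	unsigned int queue_size;
};

int main(void)
{
	/* hypothetical connect options */
	struct opts o = { .nr_io_queues = 4, .nr_write_queues = 2, .queue_size = 128 };
	unsigned int queue_count = o.nr_io_queues + o.nr_write_queues + 1; /* +1 admin queue */
	unsigned int sqsize = o.queue_size - 1;	/* sqsize is zero's based */

	printf("queue_count=%u sqsize=%u\n", queue_count, sqsize);
	return 0;
}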
+static struct nvmf_transport_ops nvme_tcp_transport = {
+	.name		= "tcp",
+	.module		= THIS_MODULE,
+	.required_opts	= NVMF_OPT_TRADDR,
+	.allowed_opts	= NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
+			  NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
+			  NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
+			  NVMF_OPT_NR_WRITE_QUEUES,
+	.create_ctrl	= nvme_tcp_create_ctrl,
+};
+
+static int __init nvme_tcp_init_module(void)
+{
+	nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
+			WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+	if (!nvme_tcp_wq)
+		return -ENOMEM;
+
+	nvmf_register_transport(&nvme_tcp_transport);
+	return 0;
+}
+
+static void __exit nvme_tcp_cleanup_module(void)
+{
+	struct nvme_tcp_ctrl *ctrl;
+
+	nvmf_unregister_transport(&nvme_tcp_transport);
+
+	mutex_lock(&nvme_tcp_ctrl_mutex);
+	list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
+		nvme_delete_ctrl(&ctrl->ctrl);
+	mutex_unlock(&nvme_tcp_ctrl_mutex);
+	flush_workqueue(nvme_delete_wq);
+
+	destroy_workqueue(nvme_tcp_wq);
+}
+
+module_init(nvme_tcp_init_module);
+module_exit(nvme_tcp_cleanup_module);
+
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index 25b0e310f4a8..5566dda3237a 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -139,3 +139,6 @@ const char *nvme_trace_disk_name(struct trace_seq *p, char *name)
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(nvme_trace_disk_name);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_sq);
diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h
index 196d5bd56718..3564120aa7b3 100644
--- a/drivers/nvme/host/trace.h
+++ b/drivers/nvme/host/trace.h
@@ -115,8 +115,8 @@ TRACE_EVENT(nvme_setup_cmd,
 		__entry->nsid = le32_to_cpu(cmd->common.nsid);
 		__entry->metadata = le64_to_cpu(cmd->common.metadata);
 		__assign_disk_name(__entry->disk, req->rq_disk);
-		memcpy(__entry->cdw10, cmd->common.cdw10,
-		       sizeof(__entry->cdw10));
+		memcpy(__entry->cdw10, &cmd->common.cdw10,
+		       6 * sizeof(__entry->cdw10));
 	),
 	TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
 		__entry->ctrl_id, __print_disk_name(__entry->disk),
@@ -184,6 +184,29 @@ TRACE_EVENT(nvme_async_event,
 
 #undef aer_name
 
+TRACE_EVENT(nvme_sq,
+	TP_PROTO(struct request *req, __le16 sq_head, int sq_tail),
+	TP_ARGS(req, sq_head, sq_tail),
+	TP_STRUCT__entry(
+		__field(int, ctrl_id)
+		__array(char, disk, DISK_NAME_LEN)
+		__field(int, qid)
+		__field(u16, sq_head)
+		__field(u16, sq_tail)
+	),
+	TP_fast_assign(
+		__entry->ctrl_id = nvme_req(req)->ctrl->instance;
+		__assign_disk_name(__entry->disk, req->rq_disk);
+		__entry->qid = nvme_req_qid(req);
+		__entry->sq_head = le16_to_cpu(sq_head);
+		__entry->sq_tail = sq_tail;
+	),
+	TP_printk("nvme%d: %sqid=%d, head=%u, tail=%u",
+		__entry->ctrl_id, __print_disk_name(__entry->disk),
+		__entry->qid, __entry->sq_head, __entry->sq_tail
+	)
+);
+
 #endif /* _TRACE_NVME_H */
 
 #undef TRACE_INCLUDE_PATH
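As a rough guide to what the new nvme_sq tracepoint emits, the sketch below pushes hypothetical values through the same format string as the TP_printk() above; the instance number, disk name, and head/tail values are invented for illustration, and __print_disk_name() is assumed to expand to a "disk=<name>, " prefix when a namespace is attached.

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>

int main(void)
{
	/* hypothetical sample values */
	int ctrl_id = 0, qid = 1;
	const char *disk = "nvme0n1";
	unsigned int sq_head = 12, sq_tail = 15;

	/* mirrors: "nvme%d: %sqid=%d, head=%u, tail=%u" */
	printf("nvme%d: disk=%s, qid=%d, head=%u, tail=%u\n",
	       ctrl_id, disk, qid, sq_head, sq_tail);
	return 0;
}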