author      Jens Axboe <axboe@kernel.dk>    2019-08-30 22:21:27 +0200
committer   Jens Axboe <axboe@kernel.dk>    2019-08-30 22:21:27 +0200
commit      8f5914bceef03827c3feb413874b2e6f29c821be (patch)
tree        a4639c5ce9d83e5ebb980eb1e703ae5d9470ea6d
parent      writeback: add tracepoints for cgroup foreign writebacks (diff)
parent      nvme-rdma: Use rq_dma_dir macro (diff)
download    linux-8f5914bceef03827c3feb413874b2e6f29c821be.tar.xz
            linux-8f5914bceef03827c3feb413874b2e6f29c821be.zip
Merge branch 'nvme-5.4' of git://git.infradead.org/nvme into for-5.4/block
Pull NVMe changes from Sagi:
"The nvme updates include:
- ana log parse fix from Anton
- nvme quirks support for Apple devices from Ben
- fix missing bio completion tracing for multipath stack devices from
Hannes and Mikhail
- IP TOS settings for nvme rdma and tcp transports from Israel
- rq_dma_dir cleanups from Israel
- tracing for Get LBA Status command from Minwoo
- Some nvme-tcp cleanups from Minwoo, Potnuri, and myself
- Some consolidation between the fabrics transports for handling the CAP
register
- reset race with ns scanning fix for fabrics (move fabrics commands to
a dedicated request queue with a different lifetime from the admin
request queue)."
* 'nvme-5.4' of git://git.infradead.org/nvme: (30 commits)
nvme-rdma: Use rq_dma_dir macro
nvme-fc: Use rq_dma_dir macro
nvme-pci: Tidy up nvme_unmap_data
nvme: make fabrics command run on a separate request queue
nvme-pci: Support shared tags across queues for Apple 2018 controllers
nvme-pci: Add support for Apple 2018+ models
nvme-pci: Add support for variable IO SQ element size
nvme-pci: Pass the queue to SQ_SIZE/CQ_SIZE macros
nvme: trace bio completion
nvme-multipath: fix ana log nsid lookup when nsid is not found
nvmet-tcp: Add TOS for tcp transport
nvme-tcp: Add TOS for tcp transport
nvme-tcp: Use struct nvme_ctrl directly
nvme-rdma: Add TOS for rdma transport
nvme-fabrics: Add type of service (TOS) configuration
nvmet-tcp: fix possible memory leak
nvmet-tcp: fix possible NULL deref
nvmet: trace: parse Get LBA Status command in detail
nvme: trace: parse Get LBA Status command in detail
nvme: trace: support for Get LBA Status opcode parsed
...
-rw-r--r--   drivers/nvme/host/Kconfig        |   1
-rw-r--r--   drivers/nvme/host/core.c         |  37
-rw-r--r--   drivers/nvme/host/fabrics.c      |  26
-rw-r--r--   drivers/nvme/host/fabrics.h      |   3
-rw-r--r--   drivers/nvme/host/fc.c           |  34
-rw-r--r--   drivers/nvme/host/multipath.c    |   8
-rw-r--r--   drivers/nvme/host/nvme.h         |  36
-rw-r--r--   drivers/nvme/host/pci.c          |  99
-rw-r--r--   drivers/nvme/host/rdma.c         |  53
-rw-r--r--   drivers/nvme/host/tcp.c          | 132
-rw-r--r--   drivers/nvme/host/trace.c        |  18
-rw-r--r--   drivers/nvme/target/admin-cmd.c  |  14
-rw-r--r--   drivers/nvme/target/loop.c       |  28
-rw-r--r--   drivers/nvme/target/tcp.c        |  24
-rw-r--r--   drivers/nvme/target/trace.c      |  18
-rw-r--r--   include/linux/nvme.h             |   5
16 files changed, 379 insertions, 157 deletions
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index ec43ac9199e2..2b36f052bfb9 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -64,6 +64,7 @@ config NVME_TCP
 	depends on INET
 	depends on BLK_DEV_NVME
 	select NVME_FABRICS
+	select CRYPTO_CRC32C
 	help
 	  This provides support for the NVMe over Fabrics protocol using
 	  the TCP transport. This allows you to use remote block devices
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 4ba374633dc8..4660505eded9 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -22,12 +22,12 @@
 #include <linux/pm_qos.h>
 #include <asm/unaligned.h>
 
-#define CREATE_TRACE_POINTS
-#include "trace.h"
-
 #include "nvme.h"
 #include "fabrics.h"
 
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
 #define NVME_MINORS		(1U << MINORBITS)
 
 unsigned int admin_timeout = 60;
@@ -279,6 +279,8 @@ void nvme_complete_rq(struct request *req)
 			return;
 		}
 	}
+
+	nvme_trace_bio_complete(req, status);
 	blk_mq_end_request(req, status);
 }
 EXPORT_SYMBOL_GPL(nvme_complete_rq);
@@ -1950,7 +1952,7 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
  * bits', but doing so may cause the device to complete commands to the
  * admin queue ... and we don't know what memory that might be pointing at!
  */
-int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
+int nvme_disable_ctrl(struct nvme_ctrl *ctrl)
 {
 	int ret;
 
@@ -1964,20 +1966,27 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
 	if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
 		msleep(NVME_QUIRK_DELAY_AMOUNT);
 
-	return nvme_wait_ready(ctrl, cap, false);
+	return nvme_wait_ready(ctrl, ctrl->cap, false);
 }
 EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
 
-int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
+int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
 {
 	/*
 	 * Default to a 4K page size, with the intention to update this
 	 * path in the future to accomodate architectures with differing
 	 * kernel and IO page sizes.
 	 */
-	unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
+	unsigned dev_page_min, page_shift = 12;
 	int ret;
 
+	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
+	if (ret) {
+		dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
+		return ret;
+	}
+	dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
+
 	if (page_shift < dev_page_min) {
 		dev_err(ctrl->device,
 			"Minimum device page size %u too large for host (%u)\n",
@@ -1996,7 +2005,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
 	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
 	if (ret)
 		return ret;
-	return nvme_wait_ready(ctrl, cap, true);
+	return nvme_wait_ready(ctrl, ctrl->cap, true);
 }
 EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
 
@@ -2562,7 +2571,6 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
 int nvme_init_identify(struct nvme_ctrl *ctrl)
 {
 	struct nvme_id_ctrl *id;
-	u64 cap;
 	int ret, page_shift;
 	u32 max_hw_sectors;
 	bool prev_apst_enabled;
@@ -2572,16 +2580,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 		dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
 		return ret;
 	}
-
-	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
-	if (ret) {
-		dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
-		return ret;
-	}
-	page_shift = NVME_CAP_MPSMIN(cap) + 12;
+	page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
+	ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
 
 	if (ctrl->vs >= NVME_VS(1, 1, 0))
-		ctrl->subsystem = NVME_CAP_NSSRC(cap);
+		ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
 
 	ret = nvme_identify_ctrl(ctrl, &id);
 	if (ret) {
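The core.c change above caches the CAP register in ctrl->cap at enable time instead of each transport re-reading it. A hedged sketch of the field extraction involved, using the CAP accessor macros from <linux/nvme.h> (the helper name is hypothetical):

#include <linux/nvme.h>
#include <linux/types.h>

/* Hypothetical helper: pull the setup-relevant fields out of CAP. */
static void nvme_cap_fields_sketch(u64 cap)
{
    /* MPSMIN is an exponent relative to 4K, so +12 gives a page shift. */
    unsigned int dev_page_min = NVME_CAP_MPSMIN(cap) + 12;
    /* MQES is the 0's-based maximum number of entries per queue. */
    unsigned int max_entries = NVME_CAP_MQES(cap);
    /* NSSRC: NVM subsystem reset support (NVMe >= 1.1). */
    unsigned int nssrc = NVME_CAP_NSSRC(cap);

    (void)dev_page_min; (void)max_entries; (void)nssrc;
}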
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 1994d5b42f94..145c210edb03 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -150,7 +150,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
 	cmd.prop_get.fctype = nvme_fabrics_type_property_get;
 	cmd.prop_get.offset = cpu_to_le32(off);
 
-	ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0,
+	ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0,
 			NVME_QID_ANY, 0, 0, false);
 
 	if (ret >= 0)
@@ -197,7 +197,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
 	cmd.prop_get.attrib = 1;
 	cmd.prop_get.offset = cpu_to_le32(off);
 
-	ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0,
+	ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0,
 			NVME_QID_ANY, 0, 0, false);
 
 	if (ret >= 0)
@@ -243,7 +243,7 @@ int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
 	cmd.prop_set.offset = cpu_to_le32(off);
 	cmd.prop_set.value = cpu_to_le64(val);
 
-	ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, NULL, 0, 0,
+	ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0, 0,
 			NVME_QID_ANY, 0, 0, false);
 	if (unlikely(ret))
 		dev_err(ctrl->device,
@@ -396,7 +396,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
 	strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE);
 	strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE);
 
-	ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res,
+	ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res,
 			data, sizeof(*data), 0, NVME_QID_ANY, 1,
 			BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT, false);
 	if (ret) {
@@ -611,6 +611,7 @@ static const match_table_t opt_tokens = {
 	{ NVMF_OPT_DATA_DIGEST,		"data_digest"		},
 	{ NVMF_OPT_NR_WRITE_QUEUES,	"nr_write_queues=%d"	},
 	{ NVMF_OPT_NR_POLL_QUEUES,	"nr_poll_queues=%d"	},
+	{ NVMF_OPT_TOS,			"tos=%d"		},
 	{ NVMF_OPT_ERR,			NULL			}
 };
 
@@ -632,6 +633,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 	opts->duplicate_connect = false;
 	opts->hdr_digest = false;
 	opts->data_digest = false;
+	opts->tos = -1; /* < 0 == use transport default */
 
 	options = o = kstrdup(buf, GFP_KERNEL);
 	if (!options)
@@ -856,6 +858,22 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 			}
 			opts->nr_poll_queues = token;
 			break;
+		case NVMF_OPT_TOS:
+			if (match_int(args, &token)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			if (token < 0) {
+				pr_err("Invalid type of service %d\n", token);
+				ret = -EINVAL;
+				goto out;
+			}
+			if (token > 255) {
+				pr_warn("Clamping type of service to 255\n");
+				token = 255;
+			}
+			opts->tos = token;
+			break;
 		default:
 			pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
 				p);
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 3044d8b99a24..93f08d77c896 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -55,6 +55,7 @@ enum {
 	NVMF_OPT_DATA_DIGEST	= 1 << 16,
 	NVMF_OPT_NR_WRITE_QUEUES = 1 << 17,
 	NVMF_OPT_NR_POLL_QUEUES = 1 << 18,
+	NVMF_OPT_TOS		= 1 << 19,
 };
 
 /**
@@ -87,6 +88,7 @@ enum {
  * @data_digest: generate/verify data digest (TCP)
  * @nr_write_queues: number of queues for write I/O
  * @nr_poll_queues: number of queues for polling I/O
+ * @tos: type of service
  */
 struct nvmf_ctrl_options {
 	unsigned		mask;
@@ -108,6 +110,7 @@ struct nvmf_ctrl_options {
 	bool			data_digest;
 	unsigned int		nr_write_queues;
 	unsigned int		nr_poll_queues;
+	int			tos;
 };
 
 /*
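With NVMF_OPT_TOS wired into the option table above, a connection can request a type of service through the usual fabrics options string written to /dev/nvme-fabrics. A hypothetical userspace sketch (address and NQN are invented; the tos= token is the one added above):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    /* tos=16 asks the transport to mark packets with IP TOS 0x10. */
    const char *opts =
        "transport=tcp,traddr=192.0.2.10,trsvcid=4420,"
        "nqn=nqn.2019-08.org.example:testnqn,tos=16";
    int fd = open("/dev/nvme-fabrics", O_RDWR);

    if (fd < 0) {
        perror("open");
        return 1;
    }
    if (write(fd, opts, strlen(opts)) < 0)
        perror("connect");
    close(fd);
    return 0;
}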
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index f39ed8cc23a2..bafe35bdffac 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2006,6 +2006,7 @@ nvme_fc_ctrl_free(struct kref *ref)
 	blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
 	blk_cleanup_queue(ctrl->ctrl.admin_q);
+	blk_cleanup_queue(ctrl->ctrl.fabrics_q);
 	blk_mq_free_tag_set(&ctrl->admin_tag_set);
 
 	kfree(ctrl->queues);
@@ -2107,7 +2108,6 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
 		struct nvme_fc_fcp_op *op)
 {
 	struct nvmefc_fcp_req *freq = &op->fcp_req;
-	enum dma_data_direction dir;
 	int ret;
 
 	freq->sg_cnt = 0;
@@ -2124,9 +2124,8 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
 	op->nents = blk_rq_map_sg(rq->q, rq, freq->sg_table.sgl);
 	WARN_ON(op->nents > blk_rq_nr_phys_segments(rq));
-	dir = (rq_data_dir(rq) == WRITE) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
 	freq->sg_cnt = fc_dma_map_sg(ctrl->lport->dev, freq->sg_table.sgl,
-				op->nents, dir);
+				op->nents, rq_dma_dir(rq));
 	if (unlikely(freq->sg_cnt <= 0)) {
 		sg_free_table_chained(&freq->sg_table, SG_CHUNK_SIZE);
 		freq->sg_cnt = 0;
@@ -2149,8 +2148,7 @@ nvme_fc_unmap_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
 		return;
 
 	fc_dma_unmap_sg(ctrl->lport->dev, freq->sg_table.sgl, op->nents,
-			((rq_data_dir(rq) == WRITE) ?
-				DMA_TO_DEVICE : DMA_FROM_DEVICE));
+			rq_dma_dir(rq));
 
 	nvme_cleanup_cmd(rq);
 
@@ -2633,8 +2631,6 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
 	if (ret)
 		goto out_delete_hw_queue;
 
-	blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
-
 	ret = nvmf_connect_admin_queue(&ctrl->ctrl);
 	if (ret)
 		goto out_disconnect_admin_queue;
@@ -2648,23 +2644,15 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
 	 * prior connection values
 	 */
 
-	ret = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap);
-	if (ret) {
-		dev_err(ctrl->ctrl.device,
-			"prop_get NVME_REG_CAP failed\n");
-		goto out_disconnect_admin_queue;
-	}
-
-	ctrl->ctrl.sqsize =
-		min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize);
-
-	ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
+	ret = nvme_enable_ctrl(&ctrl->ctrl);
 	if (ret)
 		goto out_disconnect_admin_queue;
 
 	ctrl->ctrl.max_hw_sectors =
 		(ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9);
 
+	blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+
 	ret = nvme_init_identify(&ctrl->ctrl);
 	if (ret)
 		goto out_disconnect_admin_queue;
@@ -3111,10 +3099,16 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 		goto out_free_queues;
 	ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set;
 
+	ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
+	if (IS_ERR(ctrl->ctrl.fabrics_q)) {
+		ret = PTR_ERR(ctrl->ctrl.fabrics_q);
+		goto out_free_admin_tag_set;
+	}
+
 	ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
 	if (IS_ERR(ctrl->ctrl.admin_q)) {
 		ret = PTR_ERR(ctrl->ctrl.admin_q);
-		goto out_free_admin_tag_set;
+		goto out_cleanup_fabrics_q;
 	}
 
 	/*
@@ -3186,6 +3180,8 @@ fail_ctrl:
 
 out_cleanup_admin_q:
 	blk_cleanup_queue(ctrl->ctrl.admin_q);
+out_cleanup_fabrics_q:
+	blk_cleanup_queue(ctrl->ctrl.fabrics_q);
 out_free_admin_tag_set:
 	blk_mq_free_tag_set(&ctrl->admin_tag_set);
 out_free_queues:
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 4f0d0d12744e..961011abd2ab 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -444,14 +444,16 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
 
 	down_write(&ctrl->namespaces_rwsem);
 	list_for_each_entry(ns, &ctrl->namespaces, list) {
-		if (ns->head->ns_id != le32_to_cpu(desc->nsids[n]))
+		unsigned nsid = le32_to_cpu(desc->nsids[n]);
+
+		if (ns->head->ns_id < nsid)
 			continue;
-		nvme_update_ns_ana_state(desc, ns);
+		if (ns->head->ns_id == nsid)
+			nvme_update_ns_ana_state(desc, ns);
 		if (++n == nr_nsids)
 			break;
 	}
 	up_write(&ctrl->namespaces_rwsem);
-	WARN_ON_ONCE(n < nr_nsids);
 	return 0;
 }
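The multipath fix above relies on both the controller's namespace list and desc->nsids[] being sorted in ascending order, so one forward scan can pair them up and quietly skip nsids with no matching namespace on this controller. An illustrative standalone model of that scan (not the kernel code itself):

/* Both arrays sorted ascending, mirroring the kernel's invariants. */
static int ana_scan_model(const unsigned int *ns_ids, int nr_ns,
                          const unsigned int *nsids, int nr_nsids)
{
    int i, n = 0, updated = 0;

    for (i = 0; i < nr_ns && n < nr_nsids; i++) {
        if (ns_ids[i] < nsids[n])
            continue;       /* this namespace is not in the descriptor */
        if (ns_ids[i] == nsids[n])
            updated++;      /* would update the ANA state here */
        n++;                /* nsid consumed whether found or not */
    }
    return updated;
}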
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 26b563f9985b..a818313a1f15 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -16,6 +16,8 @@
 #include <linux/fault-inject.h>
 #include <linux/rcupdate.h>
 
+#include <trace/events/block.h>
+
 extern unsigned int nvme_io_timeout;
 #define NVME_IO_TIMEOUT	(nvme_io_timeout * HZ)
 
@@ -92,6 +94,21 @@ enum nvme_quirks {
 	 * Broken Write Zeroes.
 	 */
 	NVME_QUIRK_DISABLE_WRITE_ZEROES		= (1 << 9),
+
+	/*
+	 * Use only one interrupt vector for all queues
+	 */
+	NVME_QUIRK_SINGLE_VECTOR		= (1 << 10),
+
+	/*
+	 * Use non-standard 128 bytes SQEs.
+	 */
+	NVME_QUIRK_128_BYTES_SQES		= (1 << 11),
+
+	/*
+	 * Prevent tag overlap between queues
+	 */
+	NVME_QUIRK_SHARED_TAGS			= (1 << 12),
 };
 
 /*
@@ -164,6 +181,7 @@ struct nvme_ctrl {
 	const struct nvme_ctrl_ops *ops;
 	struct request_queue *admin_q;
 	struct request_queue *connect_q;
+	struct request_queue *fabrics_q;
 	struct device *dev;
 	int instance;
 	int numa_node;
@@ -426,8 +444,8 @@ void nvme_complete_rq(struct request *req);
 bool nvme_cancel_request(struct request *req, void *data, bool reserved);
 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 		enum nvme_ctrl_state new_state);
-int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap);
-int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap);
+int nvme_disable_ctrl(struct nvme_ctrl *ctrl);
+int nvme_enable_ctrl(struct nvme_ctrl *ctrl);
 int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl);
 int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 		const struct nvme_ctrl_ops *ops, unsigned long quirks);
@@ -511,6 +529,16 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
 		kblockd_schedule_work(&head->requeue_work);
 }
 
+static inline void nvme_trace_bio_complete(struct request *req,
+	blk_status_t status)
+{
+	struct nvme_ns *ns = req->q->queuedata;
+
+	if (req->cmd_flags & REQ_NVME_MPATH)
+		trace_block_bio_complete(ns->head->disk->queue,
+					 req->bio, status);
+}
+
 extern struct device_attribute dev_attr_ana_grpid;
 extern struct device_attribute dev_attr_ana_state;
 extern struct device_attribute subsys_attr_iopolicy;
@@ -554,6 +582,10 @@ static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
 static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
 {
 }
+static inline void nvme_trace_bio_complete(struct request *req,
+	blk_status_t status)
+{
+}
 static inline int nvme_mpath_init(struct nvme_ctrl *ctrl,
 		struct nvme_id_ctrl *id)
 {
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 45a80b708ef4..5c3732fd02bc 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -28,8 +28,8 @@
 #include "trace.h"
 #include "nvme.h"
 
-#define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
-#define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))
+#define SQ_SIZE(q)	((q)->q_depth << (q)->sqes)
+#define CQ_SIZE(q)	((q)->q_depth * sizeof(struct nvme_completion))
 
 #define SGES_PER_PAGE	(PAGE_SIZE / sizeof(struct nvme_sgl_desc))
 
@@ -100,6 +100,7 @@ struct nvme_dev {
 	unsigned io_queues[HCTX_MAX_TYPES];
 	unsigned int num_vecs;
 	int q_depth;
+	int io_sqes;
 	u32 db_stride;
 	void __iomem *bar;
 	unsigned long bar_mapped_size;
@@ -162,7 +163,7 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
 struct nvme_queue {
 	struct nvme_dev *dev;
 	spinlock_t sq_lock;
-	struct nvme_command *sq_cmds;
+	void *sq_cmds;
 	 /* only used for poll queues: */
 	spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
 	volatile struct nvme_completion *cqes;
@@ -178,6 +179,7 @@ struct nvme_queue {
 	u16 last_cq_head;
 	u16 qid;
 	u8 cq_phase;
+	u8 sqes;
 	unsigned long flags;
 #define NVMEQ_ENABLED		0
 #define NVMEQ_SQ_CMB		1
@@ -488,7 +490,8 @@ static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
 			    bool write_sq)
 {
 	spin_lock(&nvmeq->sq_lock);
-	memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd));
+	memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
+	       cmd, sizeof(*cmd));
 	if (++nvmeq->sq_tail == nvmeq->q_depth)
 		nvmeq->sq_tail = 0;
 	nvme_write_sq_db(nvmeq, write_sq);
@@ -534,14 +537,13 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
 static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-	enum dma_data_direction dma_dir = rq_data_dir(req) ?
-			DMA_TO_DEVICE : DMA_FROM_DEVICE;
 	const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
 	dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
 	int i;
 
 	if (iod->dma_len) {
-		dma_unmap_page(dev->dev, dma_addr, iod->dma_len, dma_dir);
+		dma_unmap_page(dev->dev, dma_addr, iod->dma_len,
+			       rq_dma_dir(req));
 		return;
 	}
 
@@ -1344,16 +1346,16 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 
 static void nvme_free_queue(struct nvme_queue *nvmeq)
 {
-	dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq->q_depth),
+	dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq),
 				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
 	if (!nvmeq->sq_cmds)
 		return;
 
 	if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
 		pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev),
-				nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth));
+				nvmeq->sq_cmds, SQ_SIZE(nvmeq));
 	} else {
-		dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq->q_depth),
+		dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq),
				nvmeq->sq_cmds, nvmeq->sq_dma_addr);
 	}
 }
@@ -1403,7 +1405,7 @@ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
 	if (shutdown)
 		nvme_shutdown_ctrl(&dev->ctrl);
 	else
-		nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
+		nvme_disable_ctrl(&dev->ctrl);
 
 	nvme_poll_irqdisable(nvmeq, -1);
 }
@@ -1433,12 +1435,12 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
 }
 
 static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
-				int qid, int depth)
+				int qid)
 {
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
 	if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
-		nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth));
+		nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(nvmeq));
 		if (nvmeq->sq_cmds) {
 			nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
							nvmeq->sq_cmds);
@@ -1447,11 +1449,11 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
 				return 0;
 			}
 
-			pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(depth));
+			pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(nvmeq));
 		}
 	}
 
-	nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
+	nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(nvmeq),
				&nvmeq->sq_dma_addr, GFP_KERNEL);
 	if (!nvmeq->sq_cmds)
 		return -ENOMEM;
@@ -1465,12 +1467,14 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
 	if (dev->ctrl.queue_count > qid)
 		return 0;
 
-	nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(depth),
+	nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES;
+	nvmeq->q_depth = depth;
+	nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq),
					 &nvmeq->cq_dma_addr, GFP_KERNEL);
 	if (!nvmeq->cqes)
 		goto free_nvmeq;
 
-	if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
+	if (nvme_alloc_sq_cmds(dev, nvmeq, qid))
 		goto free_cqdma;
 
 	nvmeq->dev = dev;
@@ -1479,15 +1483,14 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
 	nvmeq->cq_head = 0;
 	nvmeq->cq_phase = 1;
 	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
-	nvmeq->q_depth = depth;
 	nvmeq->qid = qid;
 	dev->ctrl.queue_count++;
 
 	return 0;
 
 free_cqdma:
-	dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
-							nvmeq->cq_dma_addr);
+	dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes,
+			  nvmeq->cq_dma_addr);
 free_nvmeq:
 	return -ENOMEM;
 }
@@ -1515,7 +1518,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
 	nvmeq->cq_head = 0;
 	nvmeq->cq_phase = 1;
 	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
-	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
+	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq));
 	nvme_dbbuf_init(dev, nvmeq, qid);
 	dev->online_queues++;
 	wmb(); /* ensure the first interrupt sees the initialization */
@@ -1679,7 +1682,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
 	    (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
 		writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
 
-	result = nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
+	result = nvme_disable_ctrl(&dev->ctrl);
 	if (result < 0)
 		return result;
 
@@ -1695,7 +1698,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
 	lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
 	lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);
 
-	result = nvme_enable_ctrl(&dev->ctrl, dev->ctrl.cap);
+	result = nvme_enable_ctrl(&dev->ctrl);
 	if (result)
 		return result;
 
@@ -2077,6 +2080,13 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
 	dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
 	dev->io_queues[HCTX_TYPE_READ] = 0;
 
+	/*
+	 * Some Apple controllers require all queues to use the
+	 * first vector.
+	 */
+	if (dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR)
+		irq_queues = 1;
+
 	return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
 			      PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
 }
@@ -2095,6 +2105,14 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	unsigned long size;
 
 	nr_io_queues = max_io_queues();
+
+	/*
+	 * If tags are shared with admin queue (Apple bug), then
+	 * make sure we only use one IO queue.
+	 */
+	if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
+		nr_io_queues = 1;
+
 	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
 	if (result < 0)
 		return result;
@@ -2265,6 +2283,14 @@ static int nvme_dev_add(struct nvme_dev *dev)
 		dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
 		dev->tagset.driver_data = dev;
 
+		/*
+		 * Some Apple controllers requires tags to be unique
+		 * across admin and IO queue, so reserve the first 32
+		 * tags of the IO queue.
+		 */
+		if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
+			dev->tagset.reserved_tags = NVME_AQ_DEPTH;
+
 		ret = blk_mq_alloc_tag_set(&dev->tagset);
 		if (ret) {
 			dev_warn(dev->ctrl.device,
@@ -2314,10 +2340,21 @@ static int nvme_pci_enable(struct nvme_dev *dev)
 
 	dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1,
 				io_queue_depth);
+	dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
 	dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
 	dev->dbs = dev->bar + 4096;
 
 	/*
+	 * Some Apple controllers require a non-standard SQE size.
+	 * Interestingly they also seem to ignore the CC:IOSQES register
+	 * so we don't bother updating it here.
+	 */
+	if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES)
+		dev->io_sqes = 7;
+	else
+		dev->io_sqes = NVME_NVM_IOSQES;
+
+	/*
 	 * Temporary fix for the Apple controller found in the MacBook8,1 and
 	 * some MacBook7,1 to avoid controller resets and data loss.
 	 */
@@ -2334,6 +2371,18 @@ static int nvme_pci_enable(struct nvme_dev *dev)
 			"set queue depth=%u\n", dev->q_depth);
 	}
 
+	/*
+	 * Controllers with the shared tags quirk need the IO queue to be
+	 * big enough so that we get 32 tags for the admin queue
+	 */
+	if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) &&
+	    (dev->q_depth < (NVME_AQ_DEPTH + 2))) {
+		dev->q_depth = NVME_AQ_DEPTH + 2;
+		dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n",
+			 dev->q_depth);
+	}
+
 	nvme_map_cmb(dev);
 
 	pci_enable_pcie_error_reporting(pdev);
@@ -3034,6 +3083,10 @@ static const struct pci_device_id nvme_id_table[] = {
 	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005),
+		.driver_data = NVME_QUIRK_SINGLE_VECTOR |
+				NVME_QUIRK_128_BYTES_SQES |
+				NVME_QUIRK_SHARED_TAGS },
 	{ 0, }
 };
 MODULE_DEVICE_TABLE(pci, nvme_id_table);
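To make the SQ_SIZE() change above concrete: the queue now stores the SQE size as a power-of-two exponent (nvmeq->sqes), so the Apple quirk only has to pick a different exponent. A small sketch of the arithmetic, with values taken from the hunks above:

#include <linux/nvme.h>
#include <linux/types.h>

/* nvmeq->sqes is log2 of the SQE size, so buffer size is a shift. */
static size_t sq_bytes_sketch(unsigned int q_depth, u8 sqes)
{
    return (size_t)q_depth << sqes;
}

/*
 * Standard NVM command set: 64-byte SQEs (exponent NVME_NVM_IOSQES = 6).
 * Apple 2018+ quirk:       128-byte SQEs (exponent 7).
 */
static void sqe_size_examples(void)
{
    size_t std   = sq_bytes_sketch(1024, NVME_NVM_IOSQES); /* 64 KiB */
    size_t apple = sq_bytes_sketch(1024, 7);               /* 128 KiB */

    (void)std; (void)apple;
}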
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index b313a60be1ca..5e30bcf3fe37 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -751,6 +751,7 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
 {
 	if (remove) {
 		blk_cleanup_queue(ctrl->ctrl.admin_q);
+		blk_cleanup_queue(ctrl->ctrl.fabrics_q);
 		blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
 	}
 	if (ctrl->async_event_sqe.data) {
@@ -792,10 +793,16 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
 			goto out_free_async_qe;
 		}
 
+		ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
+		if (IS_ERR(ctrl->ctrl.fabrics_q)) {
+			error = PTR_ERR(ctrl->ctrl.fabrics_q);
+			goto out_free_tagset;
+		}
+
 		ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
 		if (IS_ERR(ctrl->ctrl.admin_q)) {
 			error = PTR_ERR(ctrl->ctrl.admin_q);
-			goto out_free_tagset;
+			goto out_cleanup_fabrics_q;
 		}
 	}
 
@@ -803,24 +810,15 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
 	if (error)
 		goto out_cleanup_queue;
 
-	error = ctrl->ctrl.ops->reg_read64(&ctrl->ctrl, NVME_REG_CAP,
-			&ctrl->ctrl.cap);
-	if (error) {
-		dev_err(ctrl->ctrl.device,
-			"prop_get NVME_REG_CAP failed\n");
-		goto out_stop_queue;
-	}
-
-	ctrl->ctrl.sqsize =
-		min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize);
-
-	error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
+	error = nvme_enable_ctrl(&ctrl->ctrl);
 	if (error)
 		goto out_stop_queue;
 
 	ctrl->ctrl.max_hw_sectors =
 		(ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9);
 
+	blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+
 	error = nvme_init_identify(&ctrl->ctrl);
 	if (error)
 		goto out_stop_queue;
@@ -832,6 +830,9 @@ out_stop_queue:
 out_cleanup_queue:
 	if (new)
 		blk_cleanup_queue(ctrl->ctrl.admin_q);
+out_cleanup_fabrics_q:
+	if (new)
+		blk_cleanup_queue(ctrl->ctrl.fabrics_q);
 out_free_tagset:
 	if (new)
 		blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
@@ -906,7 +907,8 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
 			nvme_cancel_request, &ctrl->ctrl);
 		blk_mq_tagset_wait_completed_request(ctrl->ctrl.admin_tagset);
 	}
-	blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+	if (remove)
+		blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
 	nvme_rdma_destroy_admin_queue(ctrl, remove);
 }
 
@@ -1057,6 +1059,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
 	nvme_rdma_teardown_io_queues(ctrl, false);
 	nvme_start_queues(&ctrl->ctrl);
 	nvme_rdma_teardown_admin_queue(ctrl, false);
+	blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
 
 	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
 		/* state change failure is ok if we're in DELETING state */
@@ -1143,9 +1146,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
 		req->mr = NULL;
 	}
 
-	ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
-			req->nents, rq_data_dir(rq) ==
-				    WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+	ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq));
 
 	nvme_cleanup_cmd(rq);
 	sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE);
@@ -1271,7 +1272,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
 	req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl);
 
 	count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents,
-			rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+			rq_dma_dir(rq));
 	if (unlikely(count <= 0)) {
 		ret = -EIO;
 		goto out_free_table;
@@ -1300,9 +1301,7 @@ out:
 	return 0;
 
 out_unmap_sg:
-	ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
-			req->nents, rq_data_dir(rq) ==
-				    WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+	ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq));
 out_free_table:
 	sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE);
 	return ret;
@@ -1545,16 +1544,18 @@ static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue,
 
 static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue)
 {
+	struct nvme_ctrl *ctrl = &queue->ctrl->ctrl;
 	int ret;
 
 	ret = nvme_rdma_create_queue_ib(queue);
 	if (ret)
 		return ret;
 
+	if (ctrl->opts->tos >= 0)
+		rdma_set_service_type(queue->cm_id, ctrl->opts->tos);
 	ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS);
 	if (ret) {
-		dev_err(queue->ctrl->ctrl.device,
-			"rdma_resolve_route failed (%d).\n",
+		dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n",
 			queue->cm_error);
 		goto out_destroy_queue;
 	}
@@ -1867,10 +1868,11 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
 	cancel_delayed_work_sync(&ctrl->reconnect_work);
 
 	nvme_rdma_teardown_io_queues(ctrl, shutdown);
+	blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
 	if (shutdown)
 		nvme_shutdown_ctrl(&ctrl->ctrl);
 	else
-		nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
+		nvme_disable_ctrl(&ctrl->ctrl);
 	nvme_rdma_teardown_admin_queue(ctrl, shutdown);
 }
 
@@ -2049,7 +2051,8 @@ static struct nvmf_transport_ops nvme_rdma_transport = {
 	.required_opts	= NVMF_OPT_TRADDR,
 	.allowed_opts	= NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
 			  NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
-			  NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES,
+			  NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
+			  NVMF_OPT_TOS,
 	.create_ctrl	= nvme_rdma_create_ctrl,
 };
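rdma_set_service_type() is the RDMA CM hook used above; it has to run after the cm_id exists but before route resolution so the TOS can influence path selection. A minimal ordering sketch under those assumptions (error handling elided; function name is illustrative):

#include <rdma/rdma_cm.h>

/* Illustrative ordering only: set the TOS, then resolve the route. */
static int rdma_tos_order_sketch(struct rdma_cm_id *cm_id, int tos,
                                 unsigned long timeout_ms)
{
    if (tos >= 0)
        rdma_set_service_type(cm_id, tos);
    return rdma_resolve_route(cm_id, timeout_ms);
}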
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index cf2eaf834b36..2d8ba31cb691 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -13,6 +13,7 @@
 #include <net/tcp.h>
 #include <linux/blk-mq.h>
 #include <crypto/hash.h>
+#include <net/busy_poll.h>
 
 #include "nvme.h"
 #include "fabrics.h"
@@ -72,6 +73,7 @@ struct nvme_tcp_queue {
 	int			pdu_offset;
 	size_t			data_remaining;
 	size_t			ddgst_remaining;
+	unsigned int		nr_cqe;
 
 	/* send state */
 	struct nvme_tcp_request *request;
@@ -438,6 +440,7 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
 	}
 
 	nvme_end_request(rq, cqe->status, cqe->result);
+	queue->nr_cqe++;
 
 	return 0;
 }
@@ -608,23 +611,18 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 
 	switch (hdr->type) {
 	case nvme_tcp_c2h_data:
-		ret = nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
-		break;
+		return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
 	case nvme_tcp_rsp:
 		nvme_tcp_init_recv_ctx(queue);
-		ret = nvme_tcp_handle_comp(queue, (void *)queue->pdu);
-		break;
+		return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
 	case nvme_tcp_r2t:
 		nvme_tcp_init_recv_ctx(queue);
-		ret = nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
-		break;
+		return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
 	default:
 		dev_err(queue->ctrl->ctrl.device,
 			"unsupported pdu type (%d)\n", hdr->type);
 		return -EINVAL;
 	}
-
-	return ret;
 }
 
 static inline void nvme_tcp_end_request(struct request *rq, u16 status)
@@ -701,8 +699,10 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 		nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
 		queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
 	} else {
-		if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS)
+		if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
 			nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
+			queue->nr_cqe++;
+		}
 		nvme_tcp_init_recv_ctx(queue);
 	}
 }
@@ -742,6 +742,7 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
 				pdu->command_id);
 
 		nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
+		queue->nr_cqe++;
 	}
 
 	nvme_tcp_init_recv_ctx(queue);
@@ -1023,14 +1024,16 @@ done:
 
 static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
 {
-	struct sock *sk = queue->sock->sk;
+	struct socket *sock = queue->sock;
+	struct sock *sk = sock->sk;
 	read_descriptor_t rd_desc;
 	int consumed;
 
 	rd_desc.arg.data = queue;
 	rd_desc.count = 1;
 	lock_sock(sk);
-	consumed = tcp_read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
+	queue->nr_cqe = 0;
+	consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
 	release_sock(sk);
 	return consumed;
 }
@@ -1255,7 +1258,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	queue->queue_size = queue_size;
 
 	if (qid > 0)
-		queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
+		queue->cmnd_capsule_len = nctrl->ioccsz * 16;
 	else
 		queue->cmnd_capsule_len = sizeof(struct nvme_command) +
 						NVME_TCP_ADMIN_CCSZ;
@@ -1263,7 +1266,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
 			IPPROTO_TCP, &queue->sock);
 	if (ret) {
-		dev_err(ctrl->ctrl.device,
+		dev_err(nctrl->device,
			"failed to create socket: %d\n", ret);
 		return ret;
 	}
@@ -1273,7 +1276,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT,
			(char *)&opt, sizeof(opt));
 	if (ret) {
-		dev_err(ctrl->ctrl.device,
+		dev_err(nctrl->device,
			"failed to set TCP_SYNCNT sock opt %d\n", ret);
 		goto err_sock;
 	}
@@ -1283,7 +1286,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_NODELAY,
			(char *)&opt, sizeof(opt));
 	if (ret) {
-		dev_err(ctrl->ctrl.device,
+		dev_err(nctrl->device,
			"failed to set TCP_NODELAY sock opt %d\n", ret);
 		goto err_sock;
 	}
@@ -1296,11 +1299,23 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER,
			(char *)&sol, sizeof(sol));
 	if (ret) {
-		dev_err(ctrl->ctrl.device,
+		dev_err(nctrl->device,
			"failed to set SO_LINGER sock opt %d\n", ret);
 		goto err_sock;
 	}
 
+	/* Set socket type of service */
+	if (nctrl->opts->tos >= 0) {
+		opt = nctrl->opts->tos;
+		ret = kernel_setsockopt(queue->sock, SOL_IP, IP_TOS,
+				(char *)&opt, sizeof(opt));
+		if (ret) {
+			dev_err(nctrl->device,
+				"failed to set IP_TOS sock opt %d\n", ret);
+			goto err_sock;
+		}
+	}
+
 	queue->sock->sk->sk_allocation = GFP_ATOMIC;
 	if (!qid)
 		n = 0;
@@ -1314,11 +1329,11 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	queue->pdu_offset = 0;
 	sk_set_memalloc(queue->sock->sk);
 
-	if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) {
+	if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
 		ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
			sizeof(ctrl->src_addr));
 		if (ret) {
-			dev_err(ctrl->ctrl.device,
+			dev_err(nctrl->device,
				"failed to bind queue %d socket %d\n",
				qid, ret);
 			goto err_sock;
@@ -1330,7 +1345,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	if (queue->hdr_digest || queue->data_digest) {
 		ret = nvme_tcp_alloc_crypto(queue);
 		if (ret) {
-			dev_err(ctrl->ctrl.device,
+			dev_err(nctrl->device,
				"failed to allocate queue %d crypto\n", qid);
 			goto err_sock;
 		}
@@ -1344,13 +1359,13 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 		goto err_crypto;
 	}
 
-	dev_dbg(ctrl->ctrl.device, "connecting queue %d\n",
+	dev_dbg(nctrl->device, "connecting queue %d\n",
			nvme_tcp_queue_id(queue));
 
 	ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
		sizeof(ctrl->addr), 0);
 	if (ret) {
-		dev_err(ctrl->ctrl.device,
+		dev_err(nctrl->device,
			"failed to connect socket: %d\n", ret);
 		goto err_rcv_pdu;
 	}
@@ -1371,6 +1386,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
 	queue->sock->sk->sk_state_change = nvme_tcp_state_change;
 	queue->sock->sk->sk_write_space = nvme_tcp_write_space;
+	queue->sock->sk->sk_ll_usec = 1;
 	write_unlock_bh(&queue->sock->sk->sk_callback_lock);
 
 	return 0;
@@ -1469,7 +1485,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
 		set->driver_data = ctrl;
 		set->nr_hw_queues = nctrl->queue_count - 1;
 		set->timeout = NVME_IO_TIMEOUT;
-		set->nr_maps = 2 /* default + read */;
+		set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
 	}
 
 	ret = blk_mq_alloc_tag_set(set);
@@ -1568,6 +1584,7 @@ static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
 	nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
 	nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
+	nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus());
 
 	return nr_io_queues;
 }
@@ -1599,6 +1616,12 @@ static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
 			min(opts->nr_io_queues, nr_io_queues);
 		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
 	}
+
+	if (opts->nr_poll_queues && nr_io_queues) {
+		/* map dedicated poll queues only if we have queues left */
+		ctrl->io_queues[HCTX_TYPE_POLL] =
+			min(opts->nr_poll_queues, nr_io_queues);
+	}
 }
 
 static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
@@ -1680,6 +1703,7 @@ static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
 	nvme_tcp_stop_queue(ctrl, 0);
 	if (remove) {
 		blk_cleanup_queue(ctrl->admin_q);
+		blk_cleanup_queue(ctrl->fabrics_q);
 		blk_mq_free_tag_set(ctrl->admin_tagset);
 	}
 	nvme_tcp_free_admin_queue(ctrl);
@@ -1700,10 +1724,16 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
 			goto out_free_queue;
 		}
 
+		ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset);
+		if (IS_ERR(ctrl->fabrics_q)) {
+			error = PTR_ERR(ctrl->fabrics_q);
+			goto out_free_tagset;
+		}
+
 		ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
 		if (IS_ERR(ctrl->admin_q)) {
 			error = PTR_ERR(ctrl->admin_q);
-			goto out_free_tagset;
+			goto out_cleanup_fabrics_q;
 		}
 	}
 
@@ -1711,19 +1741,12 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
 	if (error)
 		goto out_cleanup_queue;
 
-	error = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
-	if (error) {
-		dev_err(ctrl->device,
-			"prop_get NVME_REG_CAP failed\n");
-		goto out_stop_queue;
-	}
-
-	ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
-
-	error = nvme_enable_ctrl(ctrl, ctrl->cap);
+	error = nvme_enable_ctrl(ctrl);
 	if (error)
 		goto out_stop_queue;
 
+	blk_mq_unquiesce_queue(ctrl->admin_q);
+
 	error = nvme_init_identify(ctrl);
 	if (error)
 		goto out_stop_queue;
@@ -1735,6 +1758,9 @@ out_stop_queue:
 out_cleanup_queue:
 	if (new)
 		blk_cleanup_queue(ctrl->admin_q);
+out_cleanup_fabrics_q:
+	if (new)
+		blk_cleanup_queue(ctrl->fabrics_q);
 out_free_tagset:
 	if (new)
 		blk_mq_free_tag_set(ctrl->admin_tagset);
@@ -1753,7 +1779,8 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
 			nvme_cancel_request, ctrl);
 		blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
 	}
-	blk_mq_unquiesce_queue(ctrl->admin_q);
+	if (remove)
+		blk_mq_unquiesce_queue(ctrl->admin_q);
 	nvme_tcp_destroy_admin_queue(ctrl, remove);
 }
 
@@ -1880,6 +1907,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
 	/* unquiesce to fail fast pending requests */
 	nvme_start_queues(ctrl);
 	nvme_tcp_teardown_admin_queue(ctrl, false);
+	blk_mq_unquiesce_queue(ctrl->admin_q);
 
 	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
 		/* state change failure is ok if we're in DELETING state */
@@ -1896,10 +1924,11 @@ static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
 	cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
 
 	nvme_tcp_teardown_io_queues(ctrl, shutdown);
+	blk_mq_quiesce_queue(ctrl->admin_q);
 	if (shutdown)
 		nvme_shutdown_ctrl(ctrl);
 	else
-		nvme_disable_ctrl(ctrl, ctrl->cap);
+		nvme_disable_ctrl(ctrl);
 	nvme_tcp_teardown_admin_queue(ctrl, shutdown);
 }
 
@@ -2155,14 +2184,36 @@ static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
 	blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
 	blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
 
+	if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
+		/* map dedicated poll queues only if we have queues left */
+		set->map[HCTX_TYPE_POLL].nr_queues =
+				ctrl->io_queues[HCTX_TYPE_POLL];
+		set->map[HCTX_TYPE_POLL].queue_offset =
+			ctrl->io_queues[HCTX_TYPE_DEFAULT] +
+			ctrl->io_queues[HCTX_TYPE_READ];
+		blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
+	}
+
 	dev_info(ctrl->ctrl.device,
-		"mapped %d/%d default/read queues.\n",
+		"mapped %d/%d/%d default/read/poll queues.\n",
 		ctrl->io_queues[HCTX_TYPE_DEFAULT],
-		ctrl->io_queues[HCTX_TYPE_READ]);
+		ctrl->io_queues[HCTX_TYPE_READ],
+		ctrl->io_queues[HCTX_TYPE_POLL]);
 
 	return 0;
 }
 
+static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
+{
+	struct nvme_tcp_queue *queue = hctx->driver_data;
+	struct sock *sk = queue->sock->sk;
+
+	if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue))
+		sk_busy_loop(sk, true);
+	nvme_tcp_try_recv(queue);
+	return queue->nr_cqe;
+}
+
 static struct blk_mq_ops nvme_tcp_mq_ops = {
 	.queue_rq	= nvme_tcp_queue_rq,
 	.complete	= nvme_complete_rq,
@@ -2171,6 +2222,7 @@ static struct blk_mq_ops nvme_tcp_mq_ops = {
 	.init_hctx	= nvme_tcp_init_hctx,
 	.timeout	= nvme_tcp_timeout,
 	.map_queues	= nvme_tcp_map_queues,
+	.poll		= nvme_tcp_poll,
 };
 
 static struct blk_mq_ops nvme_tcp_admin_mq_ops = {
@@ -2224,7 +2276,8 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
 
 	INIT_LIST_HEAD(&ctrl->list);
 	ctrl->ctrl.opts = opts;
-	ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1;
+	ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
+				opts->nr_poll_queues + 1;
 	ctrl->ctrl.sqsize = opts->queue_size - 1;
 	ctrl->ctrl.kato = opts->kato;
 
@@ -2318,7 +2371,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
 	.allowed_opts	= NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
 			  NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
 			  NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
-			  NVMF_OPT_NR_WRITE_QUEUES,
+			  NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
+			  NVMF_OPT_TOS,
 	.create_ctrl	= nvme_tcp_create_ctrl,
 };
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index 9778eb0406b3..5c3cb6928f3c 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -86,6 +86,22 @@ static const char *nvme_trace_admin_get_features(struct trace_seq *p,
 	return ret;
 }
 
+static const char *nvme_trace_get_lba_status(struct trace_seq *p,
+					     u8 *cdw10)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	u64 slba = get_unaligned_le64(cdw10);
+	u32 mndw = get_unaligned_le32(cdw10 + 8);
+	u16 rl = get_unaligned_le16(cdw10 + 12);
+	u8 atype = cdw10[15];
+
+	trace_seq_printf(p, "slba=0x%llx, mndw=0x%x, rl=0x%x, atype=%u",
+			slba, mndw, rl, atype);
+	trace_seq_putc(p, 0);
+
+	return ret;
+}
+
 static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10)
 {
 	const char *ret = trace_seq_buffer_ptr(p);
@@ -141,6 +157,8 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p,
 		return nvme_trace_admin_identify(p, cdw10);
 	case nvme_admin_get_features:
 		return nvme_trace_admin_get_features(p, cdw10);
+	case nvme_admin_get_lba_status:
+		return nvme_trace_get_lba_status(p, cdw10);
 	default:
 		return nvme_trace_common(p, cdw10);
 	}
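The Get LBA Status decoding above follows the command's dword layout: treating cdw10 as a little-endian byte array (as both trace parsers do), slba occupies bytes 0-7 (CDW10-11), mndw bytes 8-11 (CDW12), rl bytes 12-13 (CDW13 bits 15:0), and atype byte 15 (CDW13 bits 31:24). A hedged standalone decoder mirroring the same offsets (the struct is invented for illustration):

#include <asm/unaligned.h>
#include <linux/types.h>

struct lba_status_fields {
    u64 slba;   /* starting LBA */
    u32 mndw;   /* maximum number of dwords */
    u16 rl;     /* range length */
    u8  atype;  /* action type */
};

static struct lba_status_fields decode_get_lba_status(const u8 *cdw10)
{
    struct lba_status_fields f = {
        .slba  = get_unaligned_le64(cdw10),
        .mndw  = get_unaligned_le32(cdw10 + 8),
        .rl    = get_unaligned_le16(cdw10 + 12),
        .atype = cdw10[15],
    };
    return f;
}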
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 4dc12ea52f23..51800a9ce9a9 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -81,9 +81,11 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
 		goto out;
 
 	host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]);
-	data_units_read = part_stat_read(ns->bdev->bd_part, sectors[READ]);
+	data_units_read = DIV_ROUND_UP(part_stat_read(ns->bdev->bd_part,
+		sectors[READ]), 1000);
 	host_writes = part_stat_read(ns->bdev->bd_part, ios[WRITE]);
-	data_units_written = part_stat_read(ns->bdev->bd_part, sectors[WRITE]);
+	data_units_written = DIV_ROUND_UP(part_stat_read(ns->bdev->bd_part,
+		sectors[WRITE]), 1000);
 
 	put_unaligned_le64(host_reads, &slog->host_reads[0]);
 	put_unaligned_le64(data_units_read, &slog->data_units_read[0]);
@@ -111,11 +113,11 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
 		if (!ns->bdev)
 			continue;
 		host_reads += part_stat_read(ns->bdev->bd_part, ios[READ]);
-		data_units_read +=
-			part_stat_read(ns->bdev->bd_part, sectors[READ]);
+		data_units_read += DIV_ROUND_UP(
+			part_stat_read(ns->bdev->bd_part, sectors[READ]), 1000);
 		host_writes += part_stat_read(ns->bdev->bd_part, ios[WRITE]);
-		data_units_written +=
-			part_stat_read(ns->bdev->bd_part, sectors[WRITE]);
+		data_units_written += DIV_ROUND_UP(
+			part_stat_read(ns->bdev->bd_part, sectors[WRITE]), 1000);
 	}
 	rcu_read_unlock();
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 95c8f1732215..9ee093b9fc74 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -253,6 +253,7 @@ static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl)
 	clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags);
 	nvmet_sq_destroy(&ctrl->queues[0].nvme_sq);
 	blk_cleanup_queue(ctrl->ctrl.admin_q);
+	blk_cleanup_queue(ctrl->ctrl.fabrics_q);
 	blk_mq_free_tag_set(&ctrl->admin_tag_set);
 }
 
@@ -357,10 +358,16 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
 		goto out_free_sq;
 	ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set;
 
+	ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
+	if (IS_ERR(ctrl->ctrl.fabrics_q)) {
+		error = PTR_ERR(ctrl->ctrl.fabrics_q);
+		goto out_free_tagset;
+	}
+
 	ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
 	if (IS_ERR(ctrl->ctrl.admin_q)) {
 		error = PTR_ERR(ctrl->ctrl.admin_q);
-		goto out_free_tagset;
+		goto out_cleanup_fabrics_q;
 	}
 
 	error = nvmf_connect_admin_queue(&ctrl->ctrl);
@@ -369,23 +376,15 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
 
 	set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags);
 
-	error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap);
-	if (error) {
-		dev_err(ctrl->ctrl.device,
-			"prop_get NVME_REG_CAP failed\n");
-		goto out_cleanup_queue;
-	}
-
-	ctrl->ctrl.sqsize =
-		min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize);
-
-	error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
+	error = nvme_enable_ctrl(&ctrl->ctrl);
 	if (error)
 		goto out_cleanup_queue;
 
 	ctrl->ctrl.max_hw_sectors =
 		(NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9);
 
+	blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+
 	error = nvme_init_identify(&ctrl->ctrl);
 	if (error)
 		goto out_cleanup_queue;
@@ -394,6 +393,8 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
 
 out_cleanup_queue:
 	blk_cleanup_queue(ctrl->ctrl.admin_q);
+out_cleanup_fabrics_q:
+	blk_cleanup_queue(ctrl->ctrl.fabrics_q);
 out_free_tagset:
 	blk_mq_free_tag_set(&ctrl->admin_tag_set);
 out_free_sq:
@@ -411,14 +412,13 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl)
 		nvme_loop_destroy_io_queues(ctrl);
 	}
 
+	blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
 	if (ctrl->ctrl.state == NVME_CTRL_LIVE)
 		nvme_shutdown_ctrl(&ctrl->ctrl);
 
-	blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
 	blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
				nvme_cancel_request, &ctrl->ctrl);
 	blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set);
-	blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
 	nvme_loop_destroy_admin_queue(ctrl);
 }
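On the data-units change in admin-cmd.c above: the NVMe spec defines the SMART log's "Data Units Read/Written" fields in units of 1000 512-byte sectors, rounded up, which is what the DIV_ROUND_UP(..., 1000) conversion implements. A tiny sketch of the conversion (helper name is illustrative):

#include <linux/kernel.h>
#include <linux/types.h>

/* 1 data unit = 1000 sectors of 512 bytes, rounded up per the spec. */
static inline u64 sectors_to_data_units(u64 sectors)
{
    return DIV_ROUND_UP(sectors, 1000);
}

/* e.g. 1 GiB = 2097152 sectors of 512 bytes -> 2098 data units. */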
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 69b83fa0c76c..bf4f03474e89 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -348,7 +348,8 @@ static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
 
 	return 0;
 err:
-	sgl_free(cmd->req.sg);
+	if (cmd->req.sg_cnt)
+		sgl_free(cmd->req.sg);
 	return NVME_SC_INTERNAL;
 }
 
@@ -553,7 +554,8 @@ static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd)
 
 	if (queue->nvme_sq.sqhd_disabled) {
 		kfree(cmd->iov);
-		sgl_free(cmd->req.sg);
+		if (cmd->req.sg_cnt)
+			sgl_free(cmd->req.sg);
 	}
 
 	return 1;
 
@@ -584,7 +586,8 @@ static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
 		return -EAGAIN;
 
 	kfree(cmd->iov);
-	sgl_free(cmd->req.sg);
+	if (cmd->req.sg_cnt)
+		sgl_free(cmd->req.sg);
 	cmd->queue->snd_cmd = NULL;
 	nvmet_tcp_put_cmd(cmd);
 	return 1;
@@ -1306,7 +1309,9 @@ static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
 {
 	nvmet_req_uninit(&cmd->req);
 	nvmet_tcp_unmap_pdu_iovec(cmd);
-	sgl_free(cmd->req.sg);
+	kfree(cmd->iov);
+	if (cmd->req.sg_cnt)
+		sgl_free(cmd->req.sg);
 }
 
 static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
@@ -1410,6 +1415,7 @@ done:
 static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
 {
 	struct socket *sock = queue->sock;
+	struct inet_sock *inet = inet_sk(sock->sk);
 	struct linger sol = { .l_onoff = 1, .l_linger = 0 };
 	int ret;
 
@@ -1433,6 +1439,16 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
 	if (ret)
 		return ret;
 
+	/* Set socket type of service */
+	if (inet->rcv_tos > 0) {
+		int tos = inet->rcv_tos;
+
+		ret = kernel_setsockopt(sock, SOL_IP, IP_TOS,
+				(char *)&tos, sizeof(tos));
+		if (ret)
+			return ret;
+	}
+
 	write_lock_bh(&sock->sk->sk_callback_lock);
 	sock->sk->sk_user_data = queue;
 	queue->data_ready = sock->sk->sk_data_ready;
diff --git a/drivers/nvme/target/trace.c b/drivers/nvme/target/trace.c
index 6af11d493271..1373a3c67962 100644
--- a/drivers/nvme/target/trace.c
+++ b/drivers/nvme/target/trace.c
@@ -33,6 +33,22 @@ static const char *nvmet_trace_admin_get_features(struct trace_seq *p,
 	return ret;
 }
 
+static const char *nvmet_trace_get_lba_status(struct trace_seq *p,
+					      u8 *cdw10)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	u64 slba = get_unaligned_le64(cdw10);
+	u32 mndw = get_unaligned_le32(cdw10 + 8);
+	u16 rl = get_unaligned_le16(cdw10 + 12);
+	u8 atype = cdw10[15];
+
+	trace_seq_printf(p, "slba=0x%llx, mndw=0x%x, rl=0x%x, atype=%u",
+			slba, mndw, rl, atype);
+	trace_seq_putc(p, 0);
+
+	return ret;
+}
+
 static const char *nvmet_trace_read_write(struct trace_seq *p, u8 *cdw10)
 {
 	const char *ret = trace_seq_buffer_ptr(p);
@@ -80,6 +96,8 @@ const char *nvmet_trace_parse_admin_cmd(struct trace_seq *p,
 		return nvmet_trace_admin_identify(p, cdw10);
 	case nvme_admin_get_features:
 		return nvmet_trace_admin_get_features(p, cdw10);
+	case nvme_admin_get_lba_status:
+		return nvmet_trace_get_lba_status(p, cdw10);
 	default:
 		return nvmet_trace_common(p, cdw10);
 	}
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 01aa6a6c241d..f61d6906e59d 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -140,6 +140,7 @@ enum {
  * Submission and Completion Queue Entry Sizes for the NVM command set.
  * (In bytes and specified as a power of two (2^n)).
  */
+#define NVME_ADM_SQES	6
 #define NVME_NVM_IOSQES	6
 #define NVME_NVM_IOCQES	4
 
@@ -814,6 +815,7 @@ enum nvme_admin_opcode {
 	nvme_admin_security_send	= 0x81,
 	nvme_admin_security_recv	= 0x82,
 	nvme_admin_sanitize_nvm		= 0x84,
+	nvme_admin_get_lba_status	= 0x86,
 };
 
 #define nvme_admin_opcode_name(opcode)	{ opcode, #opcode }
@@ -840,7 +842,8 @@ enum nvme_admin_opcode {
 		nvme_admin_opcode_name(nvme_admin_format_nvm),		\
 		nvme_admin_opcode_name(nvme_admin_security_send),	\
 		nvme_admin_opcode_name(nvme_admin_security_recv),	\
-		nvme_admin_opcode_name(nvme_admin_sanitize_nvm))
+		nvme_admin_opcode_name(nvme_admin_sanitize_nvm),	\
+		nvme_admin_opcode_name(nvme_admin_get_lba_status))
 
 enum {
 	NVME_QUEUE_PHYS_CONTIG	= (1 << 0),
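A closing note on NVME_ADM_SQES: queue-entry sizes are carried as power-of-two exponents (matching the CC.IOSQES/IOCQES encoding), and the admin queue keeps the spec-fixed 64-byte entries even on hardware with the 128-byte I/O SQE quirk. A compile-time sanity sketch, assuming the standard structure sizes:

#include <linux/build_bug.h>
#include <linux/nvme.h>

static void nvme_entry_size_checks(void)
{
    /* 2^6 = 64-byte SQEs, 2^4 = 16-byte CQEs for the NVM command set. */
    BUILD_BUG_ON((1 << NVME_ADM_SQES) != sizeof(struct nvme_command));
    BUILD_BUG_ON((1 << NVME_NVM_IOCQES) != sizeof(struct nvme_completion));
}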