diff options
Diffstat (limited to 'drivers/nvme')
-rw-r--r-- | drivers/nvme/host/Kconfig | 4 | ||||
-rw-r--r-- | drivers/nvme/host/core.c | 11 | ||||
-rw-r--r-- | drivers/nvme/host/fabrics.c | 23 | ||||
-rw-r--r-- | drivers/nvme/host/fabrics.h | 2 | ||||
-rw-r--r-- | drivers/nvme/host/pci.c | 27 | ||||
-rw-r--r-- | drivers/nvme/host/rdma.c | 229 | ||||
-rw-r--r-- | drivers/nvme/target/Kconfig | 2 | ||||
-rw-r--r-- | drivers/nvme/target/admin-cmd.c | 6 | ||||
-rw-r--r-- | drivers/nvme/target/core.c | 4 | ||||
-rw-r--r-- | drivers/nvme/target/loop.c | 8 | ||||
-rw-r--r-- | drivers/nvme/target/nvmet.h | 1 | ||||
-rw-r--r-- | drivers/nvme/target/rdma.c | 105 |
12 files changed, 250 insertions, 172 deletions
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index db39d53cdfb9..f7d37a62f874 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -30,8 +30,8 @@ config NVME_FABRICS config NVME_RDMA tristate "NVM Express over Fabrics RDMA host driver" - depends on INFINIBAND - depends on BLK_DEV_NVME + depends on INFINIBAND && BLOCK + select NVME_CORE select NVME_FABRICS select SG_POOL help diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 7ff2e820bbf4..2feacc70bf61 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -81,10 +81,12 @@ EXPORT_SYMBOL_GPL(nvme_cancel_request); bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, enum nvme_ctrl_state new_state) { - enum nvme_ctrl_state old_state = ctrl->state; + enum nvme_ctrl_state old_state; bool changed = false; spin_lock_irq(&ctrl->lock); + + old_state = ctrl->state; switch (new_state) { case NVME_CTRL_LIVE: switch (old_state) { @@ -140,11 +142,12 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, default: break; } - spin_unlock_irq(&ctrl->lock); if (changed) ctrl->state = new_state; + spin_unlock_irq(&ctrl->lock); + return changed; } EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); @@ -608,7 +611,7 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0, NVME_QID_ANY, 0, 0); - if (ret >= 0) + if (ret >= 0 && result) *result = le32_to_cpu(cqe.result); return ret; } @@ -628,7 +631,7 @@ int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0, NVME_QID_ANY, 0, 0); - if (ret >= 0) + if (ret >= 0 && result) *result = le32_to_cpu(cqe.result); return ret; } diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index dc996761042f..4eff49174466 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -47,8 +47,10 @@ static struct nvmf_host *nvmf_host_add(const char *hostnqn) mutex_lock(&nvmf_hosts_mutex); host = __nvmf_host_find(hostnqn); - if (host) + if (host) { + kref_get(&host->ref); goto out_unlock; + } host = kmalloc(sizeof(*host), GFP_KERNEL); if (!host) @@ -56,7 +58,7 @@ static struct nvmf_host *nvmf_host_add(const char *hostnqn) kref_init(&host->ref); memcpy(host->nqn, hostnqn, NVMF_NQN_SIZE); - uuid_le_gen(&host->id); + uuid_be_gen(&host->id); list_add_tail(&host->list, &nvmf_hosts); out_unlock: @@ -73,9 +75,9 @@ static struct nvmf_host *nvmf_host_default(void) return NULL; kref_init(&host->ref); - uuid_le_gen(&host->id); + uuid_be_gen(&host->id); snprintf(host->nqn, NVMF_NQN_SIZE, - "nqn.2014-08.org.nvmexpress:NVMf:uuid:%pUl", &host->id); + "nqn.2014-08.org.nvmexpress:NVMf:uuid:%pUb", &host->id); mutex_lock(&nvmf_hosts_mutex); list_add_tail(&host->list, &nvmf_hosts); @@ -363,7 +365,14 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) cmd.connect.opcode = nvme_fabrics_command; cmd.connect.fctype = nvme_fabrics_type_connect; cmd.connect.qid = 0; - cmd.connect.sqsize = cpu_to_le16(ctrl->sqsize); + + /* + * fabrics spec sets a minimum of depth 32 for admin queue, + * so set the queue with this depth always until + * justification otherwise. + */ + cmd.connect.sqsize = cpu_to_le16(NVMF_AQ_DEPTH - 1); + /* * Set keep-alive timeout in seconds granularity (ms * 1000) * and add a grace period for controller kato enforcement @@ -375,7 +384,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) if (!data) return -ENOMEM; - memcpy(&data->hostid, &ctrl->opts->host->id, sizeof(uuid_le)); + memcpy(&data->hostid, &ctrl->opts->host->id, sizeof(uuid_be)); data->cntlid = cpu_to_le16(0xffff); strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE); strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); @@ -434,7 +443,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) if (!data) return -ENOMEM; - memcpy(&data->hostid, &ctrl->opts->host->id, sizeof(uuid_le)); + memcpy(&data->hostid, &ctrl->opts->host->id, sizeof(uuid_be)); data->cntlid = cpu_to_le16(ctrl->cntlid); strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE); strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 89df52c8be97..46e460aee52d 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -34,7 +34,7 @@ struct nvmf_host { struct kref ref; struct list_head list; char nqn[NVMF_NQN_SIZE]; - uuid_le id; + uuid_be id; }; /** diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d7c33f9361aa..60f7eab11865 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1543,15 +1543,10 @@ static void nvme_disable_io_queues(struct nvme_dev *dev) reinit_completion(&dev->ioq_wait); retry: timeout = ADMIN_TIMEOUT; - for (; i > 0; i--) { - struct nvme_queue *nvmeq = dev->queues[i]; - - if (!pass) - nvme_suspend_queue(nvmeq); - if (nvme_delete_queue(nvmeq, opcode)) + for (; i > 0; i--, sent++) + if (nvme_delete_queue(dev->queues[i], opcode)) break; - ++sent; - } + while (sent--) { timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout); if (timeout == 0) @@ -1693,11 +1688,17 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) nvme_stop_queues(&dev->ctrl); csts = readl(dev->bar + NVME_REG_CSTS); } + + for (i = dev->queue_count - 1; i > 0; i--) + nvme_suspend_queue(dev->queues[i]); + if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { - for (i = dev->queue_count - 1; i >= 0; i--) { - struct nvme_queue *nvmeq = dev->queues[i]; - nvme_suspend_queue(nvmeq); - } + /* A device might become IO incapable very soon during + * probe, before the admin queue is configured. Thus, + * queue_count can be 0 here. + */ + if (dev->queue_count) + nvme_suspend_queue(dev->queues[0]); } else { nvme_disable_io_queues(dev); nvme_disable_admin_queue(dev, shutdown); @@ -2116,6 +2117,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE(0x1c5f, 0x0540), /* Memblaze Pblaze4 adapter */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, { 0, } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 3e3ce2b0424e..fbdb2267e460 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -12,13 +12,11 @@ * more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/delay.h> #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/string.h> -#include <linux/jiffies.h> #include <linux/atomic.h> #include <linux/blk-mq.h> #include <linux/types.h> @@ -26,7 +24,6 @@ #include <linux/mutex.h> #include <linux/scatterlist.h> #include <linux/nvme.h> -#include <linux/t10-pi.h> #include <asm/unaligned.h> #include <rdma/ib_verbs.h> @@ -46,10 +43,6 @@ #define NVME_RDMA_MAX_INLINE_SEGMENTS 1 -#define NVME_RDMA_MAX_PAGES_PER_MR 512 - -#define NVME_RDMA_DEF_RECONNECT_DELAY 20 - /* * We handle AEN commands ourselves and don't even let the * block layer know about them. @@ -80,7 +73,6 @@ struct nvme_rdma_request { u32 num_sge; int nents; bool inline_data; - bool need_inval; struct ib_reg_wr reg_wr; struct ib_cqe reg_cqe; struct nvme_rdma_queue *queue; @@ -90,6 +82,8 @@ struct nvme_rdma_request { enum nvme_rdma_queue_flags { NVME_RDMA_Q_CONNECTED = (1 << 0), + NVME_RDMA_IB_QUEUE_ALLOCATED = (1 << 1), + NVME_RDMA_Q_DELETING = (1 << 2), }; struct nvme_rdma_queue { @@ -169,7 +163,6 @@ MODULE_PARM_DESC(register_always, static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event); static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc); -static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl); /* XXX: really should move to a generic header sooner or later.. */ static inline void put_unaligned_le24(u32 val, u8 *p) @@ -290,7 +283,7 @@ static int nvme_rdma_reinit_request(void *data, struct request *rq) struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); int ret = 0; - if (!req->need_inval) + if (!req->mr->need_inval) goto out; ib_dereg_mr(req->mr); @@ -300,9 +293,10 @@ static int nvme_rdma_reinit_request(void *data, struct request *rq) if (IS_ERR(req->mr)) { ret = PTR_ERR(req->mr); req->mr = NULL; + goto out; } - req->need_inval = false; + req->mr->need_inval = false; out: return ret; @@ -489,9 +483,14 @@ out_err: static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) { - struct nvme_rdma_device *dev = queue->device; - struct ib_device *ibdev = dev->dev; + struct nvme_rdma_device *dev; + struct ib_device *ibdev; + + if (!test_and_clear_bit(NVME_RDMA_IB_QUEUE_ALLOCATED, &queue->flags)) + return; + dev = queue->device; + ibdev = dev->dev; rdma_destroy_qp(queue->cm_id); ib_free_cq(queue->ib_cq); @@ -542,6 +541,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue, ret = -ENOMEM; goto out_destroy_qp; } + set_bit(NVME_RDMA_IB_QUEUE_ALLOCATED, &queue->flags); return 0; @@ -594,11 +594,13 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl, goto out_destroy_cm_id; } + clear_bit(NVME_RDMA_Q_DELETING, &queue->flags); set_bit(NVME_RDMA_Q_CONNECTED, &queue->flags); return 0; out_destroy_cm_id: + nvme_rdma_destroy_queue_ib(queue); rdma_destroy_id(queue->cm_id); return ret; } @@ -617,7 +619,7 @@ static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue) static void nvme_rdma_stop_and_free_queue(struct nvme_rdma_queue *queue) { - if (!test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags)) + if (test_and_set_bit(NVME_RDMA_Q_DELETING, &queue->flags)) return; nvme_rdma_stop_queue(queue); nvme_rdma_free_queue(queue); @@ -649,7 +651,8 @@ static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl) int i, ret; for (i = 1; i < ctrl->queue_count; i++) { - ret = nvme_rdma_init_queue(ctrl, i, ctrl->ctrl.sqsize); + ret = nvme_rdma_init_queue(ctrl, i, + ctrl->ctrl.opts->queue_size); if (ret) { dev_info(ctrl->ctrl.device, "failed to initialize i/o queue: %d\n", ret); @@ -660,7 +663,7 @@ static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl) return 0; out_free_queues: - for (; i >= 1; i--) + for (i--; i >= 1; i--) nvme_rdma_stop_and_free_queue(&ctrl->queues[i]); return ret; @@ -687,11 +690,6 @@ static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl) list_del(&ctrl->list); mutex_unlock(&nvme_rdma_ctrl_mutex); - if (ctrl->ctrl.tagset) { - blk_cleanup_queue(ctrl->ctrl.connect_q); - blk_mq_free_tag_set(&ctrl->tag_set); - nvme_rdma_dev_put(ctrl->device); - } kfree(ctrl->queues); nvmf_free_options(nctrl->opts); free_ctrl: @@ -748,8 +746,11 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); - if (ctrl->queue_count > 1) + if (ctrl->queue_count > 1) { nvme_start_queues(&ctrl->ctrl); + nvme_queue_scan(&ctrl->ctrl); + nvme_queue_async_events(&ctrl->ctrl); + } dev_info(ctrl->ctrl.device, "Successfully reconnected\n"); @@ -771,8 +772,13 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) { struct nvme_rdma_ctrl *ctrl = container_of(work, struct nvme_rdma_ctrl, err_work); + int i; nvme_stop_keep_alive(&ctrl->ctrl); + + for (i = 0; i < ctrl->queue_count; i++) + clear_bit(NVME_RDMA_Q_CONNECTED, &ctrl->queues[i].flags); + if (ctrl->queue_count > 1) nvme_stop_queues(&ctrl->ctrl); blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); @@ -855,7 +861,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, if (!blk_rq_bytes(rq)) return; - if (req->need_inval) { + if (req->mr->need_inval) { res = nvme_rdma_inv_rkey(queue, req); if (res < 0) { dev_err(ctrl->ctrl.device, @@ -941,7 +947,7 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue, IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; - req->need_inval = true; + req->mr->need_inval = true; sg->addr = cpu_to_le64(req->mr->iova); put_unaligned_le24(req->mr->length, sg->length); @@ -964,7 +970,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, req->num_sge = 1; req->inline_data = false; - req->need_inval = false; + req->mr->need_inval = false; c->common.flags |= NVME_CMD_SGL_METABUF; @@ -1151,7 +1157,7 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, if ((wc->wc_flags & IB_WC_WITH_INVALIDATE) && wc->ex.invalidate_rkey == req->mr->rkey) - req->need_inval = false; + req->mr->need_inval = false; blk_mq_complete_request(rq, status); @@ -1269,7 +1275,7 @@ static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue) { struct nvme_rdma_ctrl *ctrl = queue->ctrl; struct rdma_conn_param param = { }; - struct nvme_rdma_cm_req priv; + struct nvme_rdma_cm_req priv = { }; int ret; param.qp_num = queue->qp->qp_num; @@ -1284,8 +1290,22 @@ static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue) priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); priv.qid = cpu_to_le16(nvme_rdma_queue_idx(queue)); - priv.hrqsize = cpu_to_le16(queue->queue_size); - priv.hsqsize = cpu_to_le16(queue->queue_size); + /* + * set the admin queue depth to the minimum size + * specified by the Fabrics standard. + */ + if (priv.qid == 0) { + priv.hrqsize = cpu_to_le16(NVMF_AQ_DEPTH); + priv.hsqsize = cpu_to_le16(NVMF_AQ_DEPTH - 1); + } else { + /* + * current interpretation of the fabrics spec + * is at minimum you make hrqsize sqsize+1, or a + * 1's based representation of sqsize. + */ + priv.hrqsize = cpu_to_le16(queue->queue_size); + priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize); + } ret = rdma_connect(queue->cm_id, ¶m); if (ret) { @@ -1301,56 +1321,6 @@ out_destroy_queue_ib: return ret; } -/** - * nvme_rdma_device_unplug() - Handle RDMA device unplug - * @queue: Queue that owns the cm_id that caught the event - * - * DEVICE_REMOVAL event notifies us that the RDMA device is about - * to unplug so we should take care of destroying our RDMA resources. - * This event will be generated for each allocated cm_id. - * - * In our case, the RDMA resources are managed per controller and not - * only per queue. So the way we handle this is we trigger an implicit - * controller deletion upon the first DEVICE_REMOVAL event we see, and - * hold the event inflight until the controller deletion is completed. - * - * One exception that we need to handle is the destruction of the cm_id - * that caught the event. Since we hold the callout until the controller - * deletion is completed, we'll deadlock if the controller deletion will - * call rdma_destroy_id on this queue's cm_id. Thus, we claim ownership - * of destroying this queue before-hand, destroy the queue resources - * after the controller deletion completed with the exception of destroying - * the cm_id implicitely by returning a non-zero rc to the callout. - */ -static int nvme_rdma_device_unplug(struct nvme_rdma_queue *queue) -{ - struct nvme_rdma_ctrl *ctrl = queue->ctrl; - int ret, ctrl_deleted = 0; - - /* First disable the queue so ctrl delete won't free it */ - if (!test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags)) - goto out; - - /* delete the controller */ - ret = __nvme_rdma_del_ctrl(ctrl); - if (!ret) { - dev_warn(ctrl->ctrl.device, - "Got rdma device removal event, deleting ctrl\n"); - flush_work(&ctrl->delete_work); - - /* Return non-zero so the cm_id will destroy implicitly */ - ctrl_deleted = 1; - - /* Free this queue ourselves */ - rdma_disconnect(queue->cm_id); - ib_drain_qp(queue->qp); - nvme_rdma_destroy_queue_ib(queue); - } - -out: - return ctrl_deleted; -} - static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *ev) { @@ -1392,8 +1362,8 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, nvme_rdma_error_recovery(queue->ctrl); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: - /* return 1 means impliciy CM ID destroy */ - return nvme_rdma_device_unplug(queue); + /* device removal is handled via the ib_client API */ + break; default: dev_err(queue->ctrl->ctrl.device, "Unexpected RDMA CM event (%d)\n", ev->event); @@ -1465,7 +1435,7 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, if (rq->cmd_type == REQ_TYPE_FS && req_op(rq) == REQ_OP_FLUSH) flush = true; ret = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, - req->need_inval ? &req->reg_wr.wr : NULL, flush); + req->mr->need_inval ? &req->reg_wr.wr : NULL, flush); if (ret) { nvme_rdma_unmap_data(queue, rq); goto err; @@ -1648,7 +1618,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl) nvme_rdma_free_io_queues(ctrl); } - if (ctrl->ctrl.state == NVME_CTRL_LIVE) + if (test_bit(NVME_RDMA_Q_CONNECTED, &ctrl->queues[0].flags)) nvme_shutdown_ctrl(&ctrl->ctrl); blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); @@ -1657,15 +1627,27 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl) nvme_rdma_destroy_admin_queue(ctrl); } +static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) +{ + nvme_uninit_ctrl(&ctrl->ctrl); + if (shutdown) + nvme_rdma_shutdown_ctrl(ctrl); + + if (ctrl->ctrl.tagset) { + blk_cleanup_queue(ctrl->ctrl.connect_q); + blk_mq_free_tag_set(&ctrl->tag_set); + nvme_rdma_dev_put(ctrl->device); + } + + nvme_put_ctrl(&ctrl->ctrl); +} + static void nvme_rdma_del_ctrl_work(struct work_struct *work) { struct nvme_rdma_ctrl *ctrl = container_of(work, struct nvme_rdma_ctrl, delete_work); - nvme_remove_namespaces(&ctrl->ctrl); - nvme_rdma_shutdown_ctrl(ctrl); - nvme_uninit_ctrl(&ctrl->ctrl); - nvme_put_ctrl(&ctrl->ctrl); + __nvme_rdma_remove_ctrl(ctrl, true); } static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl) @@ -1682,15 +1664,19 @@ static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl) static int nvme_rdma_del_ctrl(struct nvme_ctrl *nctrl) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); - int ret; + int ret = 0; + /* + * Keep a reference until all work is flushed since + * __nvme_rdma_del_ctrl can free the ctrl mem + */ + if (!kref_get_unless_zero(&ctrl->ctrl.kref)) + return -EBUSY; ret = __nvme_rdma_del_ctrl(ctrl); - if (ret) - return ret; - - flush_work(&ctrl->delete_work); - - return 0; + if (!ret) + flush_work(&ctrl->delete_work); + nvme_put_ctrl(&ctrl->ctrl); + return ret; } static void nvme_rdma_remove_ctrl_work(struct work_struct *work) @@ -1698,9 +1684,7 @@ static void nvme_rdma_remove_ctrl_work(struct work_struct *work) struct nvme_rdma_ctrl *ctrl = container_of(work, struct nvme_rdma_ctrl, delete_work); - nvme_remove_namespaces(&ctrl->ctrl); - nvme_uninit_ctrl(&ctrl->ctrl); - nvme_put_ctrl(&ctrl->ctrl); + __nvme_rdma_remove_ctrl(ctrl, false); } static void nvme_rdma_reset_ctrl_work(struct work_struct *work) @@ -1739,6 +1723,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) if (ctrl->queue_count > 1) { nvme_start_queues(&ctrl->ctrl); nvme_queue_scan(&ctrl->ctrl); + nvme_queue_async_events(&ctrl->ctrl); } return; @@ -1809,7 +1794,7 @@ static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl) memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set)); ctrl->tag_set.ops = &nvme_rdma_mq_ops; - ctrl->tag_set.queue_depth = ctrl->ctrl.sqsize; + ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size; ctrl->tag_set.reserved_tags = 1; /* fabric connect */ ctrl->tag_set.numa_node = NUMA_NO_NODE; ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; @@ -1907,7 +1892,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, spin_lock_init(&ctrl->lock); ctrl->queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ - ctrl->ctrl.sqsize = opts->queue_size; + ctrl->ctrl.sqsize = opts->queue_size - 1; ctrl->ctrl.kato = opts->kato; ret = -ENOMEM; @@ -1988,27 +1973,57 @@ static struct nvmf_transport_ops nvme_rdma_transport = { .create_ctrl = nvme_rdma_create_ctrl, }; +static void nvme_rdma_add_one(struct ib_device *ib_device) +{ +} + +static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data) +{ + struct nvme_rdma_ctrl *ctrl; + + /* Delete all controllers using this device */ + mutex_lock(&nvme_rdma_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) { + if (ctrl->device->dev != ib_device) + continue; + dev_info(ctrl->ctrl.device, + "Removing ctrl: NQN \"%s\", addr %pISp\n", + ctrl->ctrl.opts->subsysnqn, &ctrl->addr); + __nvme_rdma_del_ctrl(ctrl); + } + mutex_unlock(&nvme_rdma_ctrl_mutex); + + flush_workqueue(nvme_rdma_wq); +} + +static struct ib_client nvme_rdma_ib_client = { + .name = "nvme_rdma", + .add = nvme_rdma_add_one, + .remove = nvme_rdma_remove_one +}; + static int __init nvme_rdma_init_module(void) { + int ret; + nvme_rdma_wq = create_workqueue("nvme_rdma_wq"); if (!nvme_rdma_wq) return -ENOMEM; + ret = ib_register_client(&nvme_rdma_ib_client); + if (ret) { + destroy_workqueue(nvme_rdma_wq); + return ret; + } + nvmf_register_transport(&nvme_rdma_transport); return 0; } static void __exit nvme_rdma_cleanup_module(void) { - struct nvme_rdma_ctrl *ctrl; - nvmf_unregister_transport(&nvme_rdma_transport); - - mutex_lock(&nvme_rdma_ctrl_mutex); - list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) - __nvme_rdma_del_ctrl(ctrl); - mutex_unlock(&nvme_rdma_ctrl_mutex); - + ib_unregister_client(&nvme_rdma_ib_client); destroy_workqueue(nvme_rdma_wq); } diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index a5c31cbeb481..3a5b9d0576cb 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -15,8 +15,8 @@ config NVME_TARGET config NVME_TARGET_LOOP tristate "NVMe loopback device support" - depends on BLK_DEV_NVME depends on NVME_TARGET + select NVME_CORE select NVME_FABRICS select SG_POOL help diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 2fac17a5ad53..47c564b5a289 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -13,7 +13,6 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> -#include <linux/random.h> #include <generated/utsrelease.h> #include "nvmet.h" @@ -83,7 +82,6 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; struct nvme_id_ctrl *id; - u64 serial; u16 status = 0; id = kzalloc(sizeof(*id), GFP_KERNEL); @@ -96,10 +94,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) id->vid = 0; id->ssvid = 0; - /* generate a random serial number as our controllers are ephemeral: */ - get_random_bytes(&serial, sizeof(serial)); memset(id->sn, ' ', sizeof(id->sn)); - snprintf(id->sn, sizeof(id->sn), "%llx", serial); + snprintf(id->sn, sizeof(id->sn), "%llx", ctrl->serial); memset(id->mn, ' ', sizeof(id->mn)); strncpy((char *)id->mn, "Linux", sizeof(id->mn)); diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 8a891ca53367..6559d5afa7bf 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -13,6 +13,7 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> +#include <linux/random.h> #include "nvmet.h" static struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX]; @@ -728,6 +729,9 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE); memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE); + /* generate a random serial number as our controllers are ephemeral: */ + get_random_bytes(&ctrl->serial, sizeof(ctrl->serial)); + kref_init(&ctrl->ref); ctrl->subsys = subsys; diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 94e782987cc9..395e60dad835 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -414,9 +414,8 @@ static void nvme_loop_del_ctrl_work(struct work_struct *work) struct nvme_loop_ctrl *ctrl = container_of(work, struct nvme_loop_ctrl, delete_work); - nvme_remove_namespaces(&ctrl->ctrl); - nvme_loop_shutdown_ctrl(ctrl); nvme_uninit_ctrl(&ctrl->ctrl); + nvme_loop_shutdown_ctrl(ctrl); nvme_put_ctrl(&ctrl->ctrl); } @@ -501,7 +500,6 @@ out_free_queues: nvme_loop_destroy_admin_queue(ctrl); out_disable: dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); - nvme_remove_namespaces(&ctrl->ctrl); nvme_uninit_ctrl(&ctrl->ctrl); nvme_put_ctrl(&ctrl->ctrl); } @@ -558,7 +556,7 @@ static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set)); ctrl->tag_set.ops = &nvme_loop_mq_ops; - ctrl->tag_set.queue_depth = ctrl->ctrl.sqsize; + ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size; ctrl->tag_set.reserved_tags = 1; /* fabric connect */ ctrl->tag_set.numa_node = NUMA_NO_NODE; ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; @@ -622,7 +620,7 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, ret = -ENOMEM; - ctrl->ctrl.sqsize = opts->queue_size; + ctrl->ctrl.sqsize = opts->queue_size - 1; ctrl->ctrl.kato = opts->kato; ctrl->queues = kcalloc(opts->nr_io_queues + 1, sizeof(*ctrl->queues), diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 57dd6d834c28..76b6eedccaf9 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -113,6 +113,7 @@ struct nvmet_ctrl { struct mutex lock; u64 cap; + u64 serial; u32 cc; u32 csts; diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index e06d504bdf0c..1cbe6e053b5b 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -77,6 +77,7 @@ enum nvmet_rdma_queue_state { NVMET_RDMA_Q_CONNECTING, NVMET_RDMA_Q_LIVE, NVMET_RDMA_Q_DISCONNECTING, + NVMET_RDMA_IN_DEVICE_REMOVAL, }; struct nvmet_rdma_queue { @@ -615,15 +616,10 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp, if (!len) return 0; - /* use the already allocated data buffer if possible */ - if (len <= NVMET_RDMA_INLINE_DATA_SIZE && rsp->queue->host_qid) { - nvmet_rdma_use_inline_sg(rsp, len, 0); - } else { - status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt, - len); - if (status) - return status; - } + status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt, + len); + if (status) + return status; ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num, rsp->req.sg, rsp->req.sg_cnt, 0, addr, key, @@ -982,9 +978,13 @@ static void nvmet_rdma_release_queue_work(struct work_struct *w) container_of(w, struct nvmet_rdma_queue, release_work); struct rdma_cm_id *cm_id = queue->cm_id; struct nvmet_rdma_device *dev = queue->dev; + enum nvmet_rdma_queue_state state = queue->state; nvmet_rdma_free_queue(queue); - rdma_destroy_id(cm_id); + + if (state != NVMET_RDMA_IN_DEVICE_REMOVAL) + rdma_destroy_id(cm_id); + kref_put(&dev->ref, nvmet_rdma_free_dev); } @@ -1004,10 +1004,10 @@ nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn, queue->host_qid = le16_to_cpu(req->qid); /* - * req->hsqsize corresponds to our recv queue size + * req->hsqsize corresponds to our recv queue size plus 1 * req->hrqsize corresponds to our send queue size */ - queue->recv_queue_size = le16_to_cpu(req->hsqsize); + queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1; queue->send_queue_size = le16_to_cpu(req->hrqsize); if (!queue->host_qid && queue->recv_queue_size > NVMF_AQ_DEPTH) @@ -1233,8 +1233,9 @@ static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue) switch (queue->state) { case NVMET_RDMA_Q_CONNECTING: case NVMET_RDMA_Q_LIVE: - disconnect = true; queue->state = NVMET_RDMA_Q_DISCONNECTING; + case NVMET_RDMA_IN_DEVICE_REMOVAL: + disconnect = true; break; case NVMET_RDMA_Q_DISCONNECTING: break; @@ -1272,6 +1273,62 @@ static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id, schedule_work(&queue->release_work); } +/** + * nvme_rdma_device_removal() - Handle RDMA device removal + * @queue: nvmet rdma queue (cm id qp_context) + * @addr: nvmet address (cm_id context) + * + * DEVICE_REMOVAL event notifies us that the RDMA device is about + * to unplug so we should take care of destroying our RDMA resources. + * This event will be generated for each allocated cm_id. + * + * Note that this event can be generated on a normal queue cm_id + * and/or a device bound listener cm_id (where in this case + * queue will be null). + * + * we claim ownership on destroying the cm_id. For queues we move + * the queue state to NVMET_RDMA_IN_DEVICE_REMOVAL and for port + * we nullify the priv to prevent double cm_id destruction and destroying + * the cm_id implicitely by returning a non-zero rc to the callout. + */ +static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id, + struct nvmet_rdma_queue *queue) +{ + unsigned long flags; + + if (!queue) { + struct nvmet_port *port = cm_id->context; + + /* + * This is a listener cm_id. Make sure that + * future remove_port won't invoke a double + * cm_id destroy. use atomic xchg to make sure + * we don't compete with remove_port. + */ + if (xchg(&port->priv, NULL) != cm_id) + return 0; + } else { + /* + * This is a queue cm_id. Make sure that + * release queue will not destroy the cm_id + * and schedule all ctrl queues removal (only + * if the queue is not disconnecting already). + */ + spin_lock_irqsave(&queue->state_lock, flags); + if (queue->state != NVMET_RDMA_Q_DISCONNECTING) + queue->state = NVMET_RDMA_IN_DEVICE_REMOVAL; + spin_unlock_irqrestore(&queue->state_lock, flags); + nvmet_rdma_queue_disconnect(queue); + flush_scheduled_work(); + } + + /* + * We need to return 1 so that the core will destroy + * it's own ID. What a great API design.. + */ + return 1; +} + static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { @@ -1294,20 +1351,11 @@ static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, break; case RDMA_CM_EVENT_ADDR_CHANGE: case RDMA_CM_EVENT_DISCONNECTED: - case RDMA_CM_EVENT_DEVICE_REMOVAL: case RDMA_CM_EVENT_TIMEWAIT_EXIT: - /* - * We can get the device removal callback even for a - * CM ID that we aren't actually using. In that case - * the context pointer is NULL, so we shouldn't try - * to disconnect a non-existing queue. But we also - * need to return 1 so that the core will destroy - * it's own ID. What a great API design.. - */ - if (queue) - nvmet_rdma_queue_disconnect(queue); - else - ret = 1; + nvmet_rdma_queue_disconnect(queue); + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + ret = nvmet_rdma_device_removal(cm_id, queue); break; case RDMA_CM_EVENT_REJECTED: case RDMA_CM_EVENT_UNREACHABLE: @@ -1396,9 +1444,10 @@ out_destroy_id: static void nvmet_rdma_remove_port(struct nvmet_port *port) { - struct rdma_cm_id *cm_id = port->priv; + struct rdma_cm_id *cm_id = xchg(&port->priv, NULL); - rdma_destroy_id(cm_id); + if (cm_id) + rdma_destroy_id(cm_id); } static struct nvmet_fabrics_ops nvmet_rdma_ops = { |