diff options
Diffstat (limited to 'drivers/nvme/host')
-rw-r--r-- | drivers/nvme/host/core.c | 324 | ||||
-rw-r--r-- | drivers/nvme/host/fc.c | 577 | ||||
-rw-r--r-- | drivers/nvme/host/fc.h | 227 | ||||
-rw-r--r-- | drivers/nvme/host/lightnvm.c | 7 | ||||
-rw-r--r-- | drivers/nvme/host/multipath.c | 16 | ||||
-rw-r--r-- | drivers/nvme/host/nvme.h | 28 | ||||
-rw-r--r-- | drivers/nvme/host/pci.c | 117 | ||||
-rw-r--r-- | drivers/nvme/host/rdma.c | 321 | ||||
-rw-r--r-- | drivers/nvme/host/tcp.c | 64 |
9 files changed, 1328 insertions, 353 deletions
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f3c037f5a9ba..0585efa47d8f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -19,7 +19,6 @@ #include <linux/pr.h> #include <linux/ptrace.h> #include <linux/nvme_ioctl.h> -#include <linux/t10-pi.h> #include <linux/pm_qos.h> #include <asm/unaligned.h> @@ -204,11 +203,6 @@ static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) nvme_put_ctrl(ctrl); } -static inline bool nvme_ns_has_pi(struct nvme_ns *ns) -{ - return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple); -} - static blk_status_t nvme_error_status(u16 status) { switch (status & 0x7ff) { @@ -310,7 +304,7 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved) return true; nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD; - blk_mq_complete_request(req); + blk_mq_force_complete_rq(req); return true; } EXPORT_SYMBOL_GPL(nvme_cancel_request); @@ -433,7 +427,6 @@ static void nvme_free_ns_head(struct kref *ref) nvme_mpath_remove_disk(head); ida_simple_remove(&head->subsys->ns_ida, head->instance); - list_del_init(&head->entry); cleanup_srcu_struct(&head->srcu); nvme_put_subsystem(head->subsys); kfree(head); @@ -530,7 +523,7 @@ static int nvme_get_stream_params(struct nvme_ctrl *ctrl, c.directive.opcode = nvme_admin_directive_recv; c.directive.nsid = cpu_to_le32(nsid); - c.directive.numd = cpu_to_le32((sizeof(*s) >> 2) - 1); + c.directive.numd = cpu_to_le32(nvme_bytes_to_numd(sizeof(*s))); c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM; c.directive.dtype = NVME_DIR_STREAMS; @@ -553,19 +546,22 @@ static int nvme_configure_directives(struct nvme_ctrl *ctrl) ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL); if (ret) - return ret; + goto out_disable_stream; ctrl->nssa = le16_to_cpu(s.nssa); if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) { dev_info(ctrl->device, "too few streams (%u) available\n", ctrl->nssa); - nvme_disable_streams(ctrl); - return 0; + goto out_disable_stream; } ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1); dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams); return 0; + +out_disable_stream: + nvme_disable_streams(ctrl); + return ret; } /* @@ -1027,6 +1023,19 @@ void nvme_stop_keep_alive(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_stop_keep_alive); +/* + * In NVMe 1.0 the CNS field was just a binary controller or namespace + * flag, thus sending any new CNS opcodes has a big chance of not working. + * Qemu unfortunately had that bug after reporting a 1.1 version compliance + * (but not for any later version). + */ +static bool nvme_ctrl_limited_cns(struct nvme_ctrl *ctrl) +{ + if (ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS) + return ctrl->vs < NVME_VS(1, 2, 0); + return ctrl->vs < NVME_VS(1, 1, 0); +} + static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) { struct nvme_command c = { }; @@ -1290,7 +1299,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) meta_len = (io.nblocks + 1) * ns->ms; metadata = nvme_to_user_ptr(io.metadata); - if (ns->ext) { + if (ns->features & NVME_NS_EXT_LBAS) { length += meta_len; meta_len = 0; } else if (meta_len) { @@ -1392,8 +1401,10 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) } if (effects & NVME_CMD_EFFECTS_CCC) nvme_init_identify(ctrl); - if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) + if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) { nvme_queue_scan(ctrl); + flush_work(&ctrl->scan_work); + } } static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, @@ -1682,7 +1693,8 @@ static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) } #ifdef CONFIG_BLK_DEV_INTEGRITY -static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type) +static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type, + u32 max_integrity_segments) { struct blk_integrity integrity; @@ -1705,20 +1717,15 @@ static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type) } integrity.tuple_size = ms; blk_integrity_register(disk, &integrity); - blk_queue_max_integrity_segments(disk->queue, 1); + blk_queue_max_integrity_segments(disk->queue, max_integrity_segments); } #else -static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type) +static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type, + u32 max_integrity_segments) { } #endif /* CONFIG_BLK_DEV_INTEGRITY */ -static void nvme_set_chunk_size(struct nvme_ns *ns) -{ - u32 chunk_size = nvme_lba_to_sect(ns, ns->noiob); - blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size)); -} - static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns) { struct nvme_ctrl *ctrl = ns->ctrl; @@ -1804,12 +1811,37 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0; } +static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + u32 *phys_bs, u32 *io_opt) +{ + struct streams_directive_params s; + int ret; + + if (!ctrl->nr_streams) + return 0; + + ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id); + if (ret) + return ret; + + ns->sws = le32_to_cpu(s.sws); + ns->sgs = le16_to_cpu(s.sgs); + + if (ns->sws) { + *phys_bs = ns->sws * (1 << ns->lba_shift); + if (ns->sgs) + *io_opt = *phys_bs * ns->sgs; + } + + return 0; +} + static void nvme_update_disk_info(struct gendisk *disk, struct nvme_ns *ns, struct nvme_id_ns *id) { sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze)); unsigned short bs = 1 << ns->lba_shift; - u32 atomic_bs, phys_bs, io_opt; + u32 atomic_bs, phys_bs, io_opt = 0; if (ns->lba_shift > PAGE_SHIFT) { /* unsupported block size, set capacity to 0 later */ @@ -1818,26 +1850,25 @@ static void nvme_update_disk_info(struct gendisk *disk, blk_mq_freeze_queue(disk->queue); blk_integrity_unregister(disk); + atomic_bs = phys_bs = bs; + nvme_setup_streams_ns(ns->ctrl, ns, &phys_bs, &io_opt); if (id->nabo == 0) { /* * Bit 1 indicates whether NAWUPF is defined for this namespace * and whether it should be used instead of AWUPF. If NAWUPF == * 0 then AWUPF must be used instead. */ - if (id->nsfeat & (1 << 1) && id->nawupf) + if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; else atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs; - } else { - atomic_bs = bs; } - phys_bs = bs; - io_opt = bs; - if (id->nsfeat & (1 << 4)) { + + if (id->nsfeat & NVME_NS_FEAT_IO_OPT) { /* NPWG = Namespace Preferred Write Granularity */ - phys_bs *= 1 + le16_to_cpu(id->npwg); + phys_bs = bs * (1 + le16_to_cpu(id->npwg)); /* NOWS = Namespace Optimal Write Size */ - io_opt *= 1 + le16_to_cpu(id->nows); + io_opt = bs * (1 + le16_to_cpu(id->nows)); } blk_queue_logical_block_size(disk->queue, bs); @@ -1850,19 +1881,34 @@ static void nvme_update_disk_info(struct gendisk *disk, blk_queue_io_min(disk->queue, phys_bs); blk_queue_io_opt(disk->queue, io_opt); - if (ns->ms && !ns->ext && - (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) - nvme_init_integrity(disk, ns->ms, ns->pi_type); - if ((ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk)) || - ns->lba_shift > PAGE_SHIFT) + /* + * The block layer can't support LBA sizes larger than the page size + * yet, so catch this early and don't allow block I/O. + */ + if (ns->lba_shift > PAGE_SHIFT) capacity = 0; + /* + * Register a metadata profile for PI, or the plain non-integrity NVMe + * metadata masquerading as Type 0 if supported, otherwise reject block + * I/O to namespaces with metadata except when the namespace supports + * PI, as it can strip/insert in that case. + */ + if (ns->ms) { + if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && + (ns->features & NVME_NS_METADATA_SUPPORTED)) + nvme_init_integrity(disk, ns->ms, ns->pi_type, + ns->ctrl->max_integrity_segments); + else if (!nvme_ns_has_pi(ns)) + capacity = 0; + } + set_capacity_revalidate_and_notify(disk, capacity, false); nvme_config_discard(disk, ns); nvme_config_write_zeroes(disk, ns); - if (id->nsattr & (1 << 0)) + if (id->nsattr & NVME_NS_ATTR_RO) set_disk_ro(disk, true); else set_disk_ro(disk, false); @@ -1870,9 +1916,11 @@ static void nvme_update_disk_info(struct gendisk *disk, blk_mq_unfreeze_queue(disk->queue); } -static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) +static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) { struct nvme_ns *ns = disk->private_data; + struct nvme_ctrl *ctrl = ns->ctrl; + u32 iob; /* * If identify namespace failed, use default 512 byte block size so @@ -1881,32 +1929,55 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds; if (ns->lba_shift == 0) ns->lba_shift = 9; - ns->noiob = le16_to_cpu(id->noiob); + + if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && + is_power_of_2(ctrl->max_hw_sectors)) + iob = ctrl->max_hw_sectors; + else + iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob)); + + ns->features = 0; ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); - ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); /* the PI implementation requires metadata equal t10 pi tuple size */ if (ns->ms == sizeof(struct t10_pi_tuple)) ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK; else ns->pi_type = 0; - if (ns->noiob) - nvme_set_chunk_size(ns); + if (ns->ms) { + /* + * For PCIe only the separate metadata pointer is supported, + * as the block layer supplies metadata in a separate bio_vec + * chain. For Fabrics, only metadata as part of extended data + * LBA is supported on the wire per the Fabrics specification, + * but the HBA/HCA will do the remapping from the separate + * metadata buffers for us. + */ + if (id->flbas & NVME_NS_FLBAS_META_EXT) { + ns->features |= NVME_NS_EXT_LBAS; + if ((ctrl->ops->flags & NVME_F_FABRICS) && + (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) && + ctrl->max_integrity_segments) + ns->features |= NVME_NS_METADATA_SUPPORTED; + } else { + if (WARN_ON_ONCE(ctrl->ops->flags & NVME_F_FABRICS)) + return -EINVAL; + if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) + ns->features |= NVME_NS_METADATA_SUPPORTED; + } + } + + if (iob) + blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(iob)); nvme_update_disk_info(disk, ns, id); #ifdef CONFIG_NVME_MULTIPATH if (ns->head->disk) { nvme_update_disk_info(ns->head->disk, ns, id); blk_queue_stack_limits(ns->head->disk->queue, ns->queue); - if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) { - struct backing_dev_info *info = - ns->head->disk->queue->backing_dev_info; - - info->capabilities |= BDI_CAP_STABLE_WRITES; - } - revalidate_disk(ns->head->disk); } #endif + return 0; } static int nvme_revalidate_disk(struct gendisk *disk) @@ -1931,7 +2002,6 @@ static int nvme_revalidate_disk(struct gendisk *disk) goto free_id; } - __nvme_revalidate_disk(disk, id); ret = nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids); if (ret) goto free_id; @@ -1940,8 +2010,10 @@ static int nvme_revalidate_disk(struct gendisk *disk) dev_err(ctrl->device, "identifiers changed for nsid %d\n", ns->head->ns_id); ret = -ENODEV; + goto free_id; } + ret = __nvme_revalidate_disk(disk, id); free_id: kfree(id); out: @@ -2249,10 +2321,8 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); } - if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && - is_power_of_2(ctrl->max_hw_sectors)) - blk_queue_chunk_sectors(q, ctrl->max_hw_sectors); blk_queue_virt_boundary(q, ctrl->page_size - 1); + blk_queue_dma_alignment(q, 7); if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) vwc = true; blk_queue_write_cache(q, vwc, vwc); @@ -2655,7 +2725,7 @@ static bool nvme_validate_cntlid(struct nvme_subsystem *subsys, return false; } - if ((id->cmic & (1 << 1)) || + if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || (ctrl->opts && ctrl->opts->discovery_nqn)) continue; @@ -2746,7 +2816,7 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, void *log, size_t size, u64 offset) { struct nvme_command c = { }; - unsigned long dwlen = size / 4 - 1; + u32 dwlen = nvme_bytes_to_numd(size); c.get_log_page.opcode = nvme_admin_get_log_page; c.get_log_page.nsid = cpu_to_le32(nsid); @@ -3401,7 +3471,6 @@ static int __nvme_check_ids(struct nvme_subsystem *subsys, list_for_each_entry(h, &subsys->nsheads, entry) { if (nvme_ns_ids_valid(&new->ids) && - !list_empty(&h->list) && nvme_ns_ids_equal(&new->ids, &h->ids)) return -EINVAL; } @@ -3410,8 +3479,7 @@ static int __nvme_check_ids(struct nvme_subsystem *subsys, } static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, - unsigned nsid, struct nvme_id_ns *id, - struct nvme_ns_ids *ids) + unsigned nsid, struct nvme_ns_ids *ids) { struct nvme_ns_head *head; size_t size = sizeof(*head); @@ -3469,42 +3537,51 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, struct nvme_id_ns *id) { struct nvme_ctrl *ctrl = ns->ctrl; - bool is_shared = id->nmic & (1 << 0); + bool is_shared = id->nmic & NVME_NS_NMIC_SHARED; struct nvme_ns_head *head = NULL; struct nvme_ns_ids ids; int ret = 0; ret = nvme_report_ns_ids(ctrl, nsid, id, &ids); - if (ret) - goto out; + if (ret) { + if (ret < 0) + return ret; + return blk_status_to_errno(nvme_error_status(ret)); + } mutex_lock(&ctrl->subsys->lock); - if (is_shared) - head = nvme_find_ns_head(ctrl->subsys, nsid); + head = nvme_find_ns_head(ctrl->subsys, nsid); if (!head) { - head = nvme_alloc_ns_head(ctrl, nsid, id, &ids); + head = nvme_alloc_ns_head(ctrl, nsid, &ids); if (IS_ERR(head)) { ret = PTR_ERR(head); goto out_unlock; } + head->shared = is_shared; } else { + ret = -EINVAL; + if (!is_shared || !head->shared) { + dev_err(ctrl->device, + "Duplicate unshared namespace %d\n", nsid); + goto out_put_ns_head; + } if (!nvme_ns_ids_equal(&head->ids, &ids)) { dev_err(ctrl->device, "IDs don't match for shared namespace %d\n", nsid); - ret = -EINVAL; - goto out_unlock; + goto out_put_ns_head; } } list_add_tail(&ns->siblings, &head->list); ns->head = head; + mutex_unlock(&ctrl->subsys->lock); + return 0; +out_put_ns_head: + nvme_put_ns_head(head); out_unlock: mutex_unlock(&ctrl->subsys->lock); -out: - if (ret > 0) - ret = blk_status_to_errno(nvme_error_status(ret)); return ret; } @@ -3535,32 +3612,6 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) return ret; } -static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns) -{ - struct streams_directive_params s; - int ret; - - if (!ctrl->nr_streams) - return 0; - - ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id); - if (ret) - return ret; - - ns->sws = le32_to_cpu(s.sws); - ns->sgs = le16_to_cpu(s.sgs); - - if (ns->sws) { - unsigned int bs = 1 << ns->lba_shift; - - blk_queue_io_min(ns->queue, bs * ns->sws); - if (ns->sgs) - blk_queue_io_opt(ns->queue, bs * ns->sws * ns->sgs); - } - - return 0; -} - static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns *ns; @@ -3604,7 +3655,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) ret = nvme_init_ns_head(ns, nsid, id); if (ret) goto out_free_id; - nvme_setup_streams_ns(ctrl, ns); nvme_set_disk_name(disk_name, ns, ctrl, &flags); disk = alloc_disk_node(0, node); @@ -3618,7 +3668,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); ns->disk = disk; - __nvme_revalidate_disk(disk, id); + if (__nvme_revalidate_disk(disk, id)) + goto out_free_disk; if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { ret = nvme_nvm_register(ns, disk_name, node); @@ -3645,9 +3696,13 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) /* prevent double queue cleanup */ ns->disk->queue = NULL; put_disk(ns->disk); + out_free_disk: + del_gendisk(ns->disk); out_unlink_ns: mutex_lock(&ctrl->subsys->lock); list_del_rcu(&ns->siblings); + if (list_empty(&ns->head->list)) + list_del_init(&ns->head->entry); mutex_unlock(&ctrl->subsys->lock); nvme_put_ns_head(ns->head); out_free_id: @@ -3667,7 +3722,10 @@ static void nvme_ns_remove(struct nvme_ns *ns) mutex_lock(&ns->ctrl->subsys->lock); list_del_rcu(&ns->siblings); + if (list_empty(&ns->head->list)) + list_del_init(&ns->head->entry); mutex_unlock(&ns->ctrl->subsys->lock); + synchronize_rcu(); /* guarantee not available in head->list */ nvme_mpath_clear_current_path(ns); synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */ @@ -3687,6 +3745,16 @@ static void nvme_ns_remove(struct nvme_ns *ns) nvme_put_ns(ns); } +static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid) +{ + struct nvme_ns *ns = nvme_find_get_ns(ctrl, nsid); + + if (ns) { + nvme_ns_remove(ns); + nvme_put_ns(ns); + } +} + static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns *ns; @@ -3718,39 +3786,34 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, } -static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn) +static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) { - struct nvme_ns *ns; + const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32); __le32 *ns_list; - unsigned i, j, nsid, prev = 0; - unsigned num_lists = DIV_ROUND_UP_ULL((u64)nn, 1024); - int ret = 0; + u32 prev = 0; + int ret = 0, i; + + if (nvme_ctrl_limited_cns(ctrl)) + return -EOPNOTSUPP; ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); if (!ns_list) return -ENOMEM; - for (i = 0; i < num_lists; i++) { + for (;;) { ret = nvme_identify_ns_list(ctrl, prev, ns_list); if (ret) goto free; - for (j = 0; j < min(nn, 1024U); j++) { - nsid = le32_to_cpu(ns_list[j]); - if (!nsid) - goto out; + for (i = 0; i < nr_entries; i++) { + u32 nsid = le32_to_cpu(ns_list[i]); + if (!nsid) /* end of the list? */ + goto out; nvme_validate_ns(ctrl, nsid); - - while (++prev < nsid) { - ns = nvme_find_get_ns(ctrl, prev); - if (ns) { - nvme_ns_remove(ns); - nvme_put_ns(ns); - } - } + while (++prev < nsid) + nvme_ns_remove_by_nsid(ctrl, prev); } - nn -= j; } out: nvme_remove_invalid_namespaces(ctrl, prev); @@ -3759,9 +3822,15 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn) return ret; } -static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn) +static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl) { - unsigned i; + struct nvme_id_ctrl *id; + u32 nn, i; + + if (nvme_identify_ctrl(ctrl, &id)) + return; + nn = le32_to_cpu(id->nn); + kfree(id); for (i = 1; i <= nn; i++) nvme_validate_ns(ctrl, i); @@ -3798,8 +3867,6 @@ static void nvme_scan_work(struct work_struct *work) { struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, scan_work); - struct nvme_id_ctrl *id; - unsigned nn; /* No tagset on a live ctrl means IO queues could not created */ if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset) @@ -3810,20 +3877,11 @@ static void nvme_scan_work(struct work_struct *work) nvme_clear_changed_ns_log(ctrl); } - if (nvme_identify_ctrl(ctrl, &id)) - return; - mutex_lock(&ctrl->scan_lock); - nn = le32_to_cpu(id->nn); - if (ctrl->vs >= NVME_VS(1, 1, 0) && - !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) { - if (!nvme_scan_ns_list(ctrl, nn)) - goto out_free_id; - } - nvme_scan_ns_sequential(ctrl, nn); -out_free_id: + if (nvme_scan_ns_list(ctrl) != 0) + nvme_scan_ns_sequential(ctrl); mutex_unlock(&ctrl->scan_lock); - kfree(id); + down_write(&ctrl->namespaces_rwsem); list_sort(NULL, &ctrl->namespaces, ns_cmp); up_write(&ctrl->namespaces_rwsem); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 7dfc4a2ecf1e..cb0007592c12 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -14,6 +14,7 @@ #include "fabrics.h" #include <linux/nvme-fc-driver.h> #include <linux/nvme-fc.h> +#include "fc.h" #include <scsi/scsi_transport_fc.h> /* *************************** Data Structures/Defines ****************** */ @@ -61,6 +62,17 @@ struct nvmefc_ls_req_op { bool req_queued; }; +struct nvmefc_ls_rcv_op { + struct nvme_fc_rport *rport; + struct nvmefc_ls_rsp *lsrsp; + union nvmefc_ls_requests *rqstbuf; + union nvmefc_ls_responses *rspbuf; + u16 rqstdatalen; + bool handled; + dma_addr_t rspdma; + struct list_head lsrcv_list; /* rport->ls_rcv_list */ +} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ + enum nvme_fcpop_state { FCPOP_STATE_UNINIT = 0, FCPOP_STATE_IDLE = 1, @@ -96,7 +108,7 @@ struct nvme_fc_fcp_op { struct nvme_fcp_op_w_sgl { struct nvme_fc_fcp_op op; struct scatterlist sgl[NVME_INLINE_SG_CNT]; - uint8_t priv[0]; + uint8_t priv[]; }; struct nvme_fc_lport { @@ -117,6 +129,7 @@ struct nvme_fc_rport { struct list_head endp_list; /* for lport->endp_list */ struct list_head ctrl_list; struct list_head ls_req_list; + struct list_head ls_rcv_list; struct list_head disc_list; struct device *dev; /* physical device for dma */ struct nvme_fc_lport *lport; @@ -124,11 +137,12 @@ struct nvme_fc_rport { struct kref ref; atomic_t act_ctrl_cnt; unsigned long dev_loss_end; + struct work_struct lsrcv_work; } __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ -enum nvme_fcctrl_flags { - FCCTRL_TERMIO = (1 << 0), -}; +/* fc_ctrl flags values - specified as bit positions */ +#define ASSOC_ACTIVE 0 +#define FCCTRL_TERMIO 1 struct nvme_fc_ctrl { spinlock_t lock; @@ -139,9 +153,9 @@ struct nvme_fc_ctrl { u32 cnum; bool ioq_live; - bool assoc_active; atomic_t err_work_active; u64 association_id; + struct nvmefc_ls_rcv_op *rcv_disconn; struct list_head ctrl_list; /* rport->ctrl_list */ @@ -152,7 +166,7 @@ struct nvme_fc_ctrl { struct work_struct err_work; struct kref ref; - u32 flags; + unsigned long flags; u32 iocnt; wait_queue_head_t ioabort_wait; @@ -219,6 +233,9 @@ static struct device *fc_udev_device; static void __nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *, struct nvme_fc_queue *, unsigned int); +static void nvme_fc_handle_ls_rqst_work(struct work_struct *work); + + static void nvme_fc_free_lport(struct kref *ref) { @@ -394,7 +411,10 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo, newrec->ops = template; newrec->dev = dev; ida_init(&newrec->endp_cnt); - newrec->localport.private = &newrec[1]; + if (template->local_priv_sz) + newrec->localport.private = &newrec[1]; + else + newrec->localport.private = NULL; newrec->localport.node_name = pinfo->node_name; newrec->localport.port_name = pinfo->port_name; newrec->localport.port_role = pinfo->port_role; @@ -701,9 +721,13 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, atomic_set(&newrec->act_ctrl_cnt, 0); spin_lock_init(&newrec->lock); newrec->remoteport.localport = &lport->localport; + INIT_LIST_HEAD(&newrec->ls_rcv_list); newrec->dev = lport->dev; newrec->lport = lport; - newrec->remoteport.private = &newrec[1]; + if (lport->ops->remote_priv_sz) + newrec->remoteport.private = &newrec[1]; + else + newrec->remoteport.private = NULL; newrec->remoteport.port_role = pinfo->port_role; newrec->remoteport.node_name = pinfo->node_name; newrec->remoteport.port_name = pinfo->port_name; @@ -711,6 +735,7 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, newrec->remoteport.port_state = FC_OBJSTATE_ONLINE; newrec->remoteport.port_num = idx; __nvme_fc_set_dev_loss_tmo(newrec, pinfo); + INIT_WORK(&newrec->lsrcv_work, nvme_fc_handle_ls_rqst_work); spin_lock_irqsave(&nvme_fc_lock, flags); list_add_tail(&newrec->endp_list, &lport->endp_list); @@ -1000,6 +1025,7 @@ fc_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *); static int nvme_fc_ctrl_get(struct nvme_fc_ctrl *); +static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg); static void __nvme_fc_finish_ls_req(struct nvmefc_ls_req_op *lsop) @@ -1140,41 +1166,6 @@ nvme_fc_send_ls_req_async(struct nvme_fc_rport *rport, return __nvme_fc_send_ls_req(rport, lsop, done); } -/* Validation Error indexes into the string table below */ -enum { - VERR_NO_ERROR = 0, - VERR_LSACC = 1, - VERR_LSDESC_RQST = 2, - VERR_LSDESC_RQST_LEN = 3, - VERR_ASSOC_ID = 4, - VERR_ASSOC_ID_LEN = 5, - VERR_CONN_ID = 6, - VERR_CONN_ID_LEN = 7, - VERR_CR_ASSOC = 8, - VERR_CR_ASSOC_ACC_LEN = 9, - VERR_CR_CONN = 10, - VERR_CR_CONN_ACC_LEN = 11, - VERR_DISCONN = 12, - VERR_DISCONN_ACC_LEN = 13, -}; - -static char *validation_errors[] = { - "OK", - "Not LS_ACC", - "Not LSDESC_RQST", - "Bad LSDESC_RQST Length", - "Not Association ID", - "Bad Association ID Length", - "Not Connection ID", - "Bad Connection ID Length", - "Not CR_ASSOC Rqst", - "Bad CR_ASSOC ACC Length", - "Not CR_CONN Rqst", - "Bad CR_CONN ACC Length", - "Not Disconnect Rqst", - "Bad Disconnect ACC Length", -}; - static int nvme_fc_connect_admin_queue(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, u16 qsize, u16 ersp_ratio) @@ -1183,21 +1174,27 @@ nvme_fc_connect_admin_queue(struct nvme_fc_ctrl *ctrl, struct nvmefc_ls_req *lsreq; struct fcnvme_ls_cr_assoc_rqst *assoc_rqst; struct fcnvme_ls_cr_assoc_acc *assoc_acc; + unsigned long flags; int ret, fcret = 0; lsop = kzalloc((sizeof(*lsop) + - ctrl->lport->ops->lsrqst_priv_sz + - sizeof(*assoc_rqst) + sizeof(*assoc_acc)), GFP_KERNEL); + sizeof(*assoc_rqst) + sizeof(*assoc_acc) + + ctrl->lport->ops->lsrqst_priv_sz), GFP_KERNEL); if (!lsop) { + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: send Create Association failed: ENOMEM\n", + ctrl->cnum); ret = -ENOMEM; goto out_no_memory; } - lsreq = &lsop->ls_req; - lsreq->private = (void *)&lsop[1]; - assoc_rqst = (struct fcnvme_ls_cr_assoc_rqst *) - (lsreq->private + ctrl->lport->ops->lsrqst_priv_sz); + assoc_rqst = (struct fcnvme_ls_cr_assoc_rqst *)&lsop[1]; assoc_acc = (struct fcnvme_ls_cr_assoc_acc *)&assoc_rqst[1]; + lsreq = &lsop->ls_req; + if (ctrl->lport->ops->lsrqst_priv_sz) + lsreq->private = &assoc_acc[1]; + else + lsreq->private = NULL; assoc_rqst->w0.ls_cmd = FCNVME_LS_CREATE_ASSOCIATION; assoc_rqst->desc_list_len = @@ -1267,11 +1264,13 @@ nvme_fc_connect_admin_queue(struct nvme_fc_ctrl *ctrl, "q %d Create Association LS failed: %s\n", queue->qnum, validation_errors[fcret]); } else { + spin_lock_irqsave(&ctrl->lock, flags); ctrl->association_id = be64_to_cpu(assoc_acc->associd.association_id); queue->connection_id = be64_to_cpu(assoc_acc->connectid.connection_id); set_bit(NVME_FC_Q_CONNECTED, &queue->flags); + spin_unlock_irqrestore(&ctrl->lock, flags); } out_free_buffer: @@ -1295,18 +1294,23 @@ nvme_fc_connect_queue(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, int ret, fcret = 0; lsop = kzalloc((sizeof(*lsop) + - ctrl->lport->ops->lsrqst_priv_sz + - sizeof(*conn_rqst) + sizeof(*conn_acc)), GFP_KERNEL); + sizeof(*conn_rqst) + sizeof(*conn_acc) + + ctrl->lport->ops->lsrqst_priv_sz), GFP_KERNEL); if (!lsop) { + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: send Create Connection failed: ENOMEM\n", + ctrl->cnum); ret = -ENOMEM; goto out_no_memory; } - lsreq = &lsop->ls_req; - lsreq->private = (void *)&lsop[1]; - conn_rqst = (struct fcnvme_ls_cr_conn_rqst *) - (lsreq->private + ctrl->lport->ops->lsrqst_priv_sz); + conn_rqst = (struct fcnvme_ls_cr_conn_rqst *)&lsop[1]; conn_acc = (struct fcnvme_ls_cr_conn_acc *)&conn_rqst[1]; + lsreq = &lsop->ls_req; + if (ctrl->lport->ops->lsrqst_priv_sz) + lsreq->private = (void *)&conn_acc[1]; + else + lsreq->private = NULL; conn_rqst->w0.ls_cmd = FCNVME_LS_CREATE_CONNECTION; conn_rqst->desc_list_len = cpu_to_be32( @@ -1420,54 +1424,385 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl) int ret; lsop = kzalloc((sizeof(*lsop) + - ctrl->lport->ops->lsrqst_priv_sz + - sizeof(*discon_rqst) + sizeof(*discon_acc)), - GFP_KERNEL); - if (!lsop) - /* couldn't sent it... too bad */ + sizeof(*discon_rqst) + sizeof(*discon_acc) + + ctrl->lport->ops->lsrqst_priv_sz), GFP_KERNEL); + if (!lsop) { + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: send Disconnect Association " + "failed: ENOMEM\n", + ctrl->cnum); return; + } + discon_rqst = (struct fcnvme_ls_disconnect_assoc_rqst *)&lsop[1]; + discon_acc = (struct fcnvme_ls_disconnect_assoc_acc *)&discon_rqst[1]; lsreq = &lsop->ls_req; + if (ctrl->lport->ops->lsrqst_priv_sz) + lsreq->private = (void *)&discon_acc[1]; + else + lsreq->private = NULL; - lsreq->private = (void *)&lsop[1]; - discon_rqst = (struct fcnvme_ls_disconnect_assoc_rqst *) - (lsreq->private + ctrl->lport->ops->lsrqst_priv_sz); - discon_acc = (struct fcnvme_ls_disconnect_assoc_acc *)&discon_rqst[1]; + nvmefc_fmt_lsreq_discon_assoc(lsreq, discon_rqst, discon_acc, + ctrl->association_id); - discon_rqst->w0.ls_cmd = FCNVME_LS_DISCONNECT_ASSOC; - discon_rqst->desc_list_len = cpu_to_be32( - sizeof(struct fcnvme_lsdesc_assoc_id) + - sizeof(struct fcnvme_lsdesc_disconn_cmd)); + ret = nvme_fc_send_ls_req_async(ctrl->rport, lsop, + nvme_fc_disconnect_assoc_done); + if (ret) + kfree(lsop); +} - discon_rqst->associd.desc_tag = cpu_to_be32(FCNVME_LSDESC_ASSOC_ID); - discon_rqst->associd.desc_len = - fcnvme_lsdesc_len( - sizeof(struct fcnvme_lsdesc_assoc_id)); +static void +nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp) +{ + struct nvmefc_ls_rcv_op *lsop = lsrsp->nvme_fc_private; + struct nvme_fc_rport *rport = lsop->rport; + struct nvme_fc_lport *lport = rport->lport; + unsigned long flags; + + spin_lock_irqsave(&rport->lock, flags); + list_del(&lsop->lsrcv_list); + spin_unlock_irqrestore(&rport->lock, flags); + + fc_dma_sync_single_for_cpu(lport->dev, lsop->rspdma, + sizeof(*lsop->rspbuf), DMA_TO_DEVICE); + fc_dma_unmap_single(lport->dev, lsop->rspdma, + sizeof(*lsop->rspbuf), DMA_TO_DEVICE); + + kfree(lsop); + + nvme_fc_rport_put(rport); +} + +static void +nvme_fc_xmt_ls_rsp(struct nvmefc_ls_rcv_op *lsop) +{ + struct nvme_fc_rport *rport = lsop->rport; + struct nvme_fc_lport *lport = rport->lport; + struct fcnvme_ls_rqst_w0 *w0 = &lsop->rqstbuf->w0; + int ret; + + fc_dma_sync_single_for_device(lport->dev, lsop->rspdma, + sizeof(*lsop->rspbuf), DMA_TO_DEVICE); + + ret = lport->ops->xmt_ls_rsp(&lport->localport, &rport->remoteport, + lsop->lsrsp); + if (ret) { + dev_warn(lport->dev, + "LLDD rejected LS RSP xmt: LS %d status %d\n", + w0->ls_cmd, ret); + nvme_fc_xmt_ls_rsp_done(lsop->lsrsp); + return; + } +} + +static struct nvme_fc_ctrl * +nvme_fc_match_disconn_ls(struct nvme_fc_rport *rport, + struct nvmefc_ls_rcv_op *lsop) +{ + struct fcnvme_ls_disconnect_assoc_rqst *rqst = + &lsop->rqstbuf->rq_dis_assoc; + struct nvme_fc_ctrl *ctrl, *ret = NULL; + struct nvmefc_ls_rcv_op *oldls = NULL; + u64 association_id = be64_to_cpu(rqst->associd.association_id); + unsigned long flags; + + spin_lock_irqsave(&rport->lock, flags); + + list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) { + if (!nvme_fc_ctrl_get(ctrl)) + continue; + spin_lock(&ctrl->lock); + if (association_id == ctrl->association_id) { + oldls = ctrl->rcv_disconn; + ctrl->rcv_disconn = lsop; + ret = ctrl; + } + spin_unlock(&ctrl->lock); + if (ret) + /* leave the ctrl get reference */ + break; + nvme_fc_ctrl_put(ctrl); + } + + spin_unlock_irqrestore(&rport->lock, flags); + + /* transmit a response for anything that was pending */ + if (oldls) { + dev_info(rport->lport->dev, + "NVME-FC{%d}: Multiple Disconnect Association " + "LS's received\n", ctrl->cnum); + /* overwrite good response with bogus failure */ + oldls->lsrsp->rsplen = nvme_fc_format_rjt(oldls->rspbuf, + sizeof(*oldls->rspbuf), + rqst->w0.ls_cmd, + FCNVME_RJT_RC_UNAB, + FCNVME_RJT_EXP_NONE, 0); + nvme_fc_xmt_ls_rsp(oldls); + } + + return ret; +} + +/* + * returns true to mean LS handled and ls_rsp can be sent + * returns false to defer ls_rsp xmt (will be done as part of + * association termination) + */ +static bool +nvme_fc_ls_disconnect_assoc(struct nvmefc_ls_rcv_op *lsop) +{ + struct nvme_fc_rport *rport = lsop->rport; + struct fcnvme_ls_disconnect_assoc_rqst *rqst = + &lsop->rqstbuf->rq_dis_assoc; + struct fcnvme_ls_disconnect_assoc_acc *acc = + &lsop->rspbuf->rsp_dis_assoc; + struct nvme_fc_ctrl *ctrl = NULL; + int ret = 0; + + memset(acc, 0, sizeof(*acc)); + + ret = nvmefc_vldt_lsreq_discon_assoc(lsop->rqstdatalen, rqst); + if (!ret) { + /* match an active association */ + ctrl = nvme_fc_match_disconn_ls(rport, lsop); + if (!ctrl) + ret = VERR_NO_ASSOC; + } + + if (ret) { + dev_info(rport->lport->dev, + "Disconnect LS failed: %s\n", + validation_errors[ret]); + lsop->lsrsp->rsplen = nvme_fc_format_rjt(acc, + sizeof(*acc), rqst->w0.ls_cmd, + (ret == VERR_NO_ASSOC) ? + FCNVME_RJT_RC_INV_ASSOC : + FCNVME_RJT_RC_LOGIC, + FCNVME_RJT_EXP_NONE, 0); + return true; + } - discon_rqst->associd.association_id = cpu_to_be64(ctrl->association_id); + /* format an ACCept response */ - discon_rqst->discon_cmd.desc_tag = cpu_to_be32( - FCNVME_LSDESC_DISCONN_CMD); - discon_rqst->discon_cmd.desc_len = + lsop->lsrsp->rsplen = sizeof(*acc); + + nvme_fc_format_rsp_hdr(acc, FCNVME_LS_ACC, fcnvme_lsdesc_len( - sizeof(struct fcnvme_lsdesc_disconn_cmd)); + sizeof(struct fcnvme_ls_disconnect_assoc_acc)), + FCNVME_LS_DISCONNECT_ASSOC); - lsreq->rqstaddr = discon_rqst; - lsreq->rqstlen = sizeof(*discon_rqst); - lsreq->rspaddr = discon_acc; - lsreq->rsplen = sizeof(*discon_acc); - lsreq->timeout = NVME_FC_LS_TIMEOUT_SEC; + /* + * the transmit of the response will occur after the exchanges + * for the association have been ABTS'd by + * nvme_fc_delete_association(). + */ - ret = nvme_fc_send_ls_req_async(ctrl->rport, lsop, - nvme_fc_disconnect_assoc_done); - if (ret) - kfree(lsop); + /* fail the association */ + nvme_fc_error_recovery(ctrl, "Disconnect Association LS received"); + + /* release the reference taken by nvme_fc_match_disconn_ls() */ + nvme_fc_ctrl_put(ctrl); + + return false; } +/* + * Actual Processing routine for received FC-NVME LS Requests from the LLD + * returns true if a response should be sent afterward, false if rsp will + * be sent asynchronously. + */ +static bool +nvme_fc_handle_ls_rqst(struct nvmefc_ls_rcv_op *lsop) +{ + struct fcnvme_ls_rqst_w0 *w0 = &lsop->rqstbuf->w0; + bool ret = true; + + lsop->lsrsp->nvme_fc_private = lsop; + lsop->lsrsp->rspbuf = lsop->rspbuf; + lsop->lsrsp->rspdma = lsop->rspdma; + lsop->lsrsp->done = nvme_fc_xmt_ls_rsp_done; + /* Be preventative. handlers will later set to valid length */ + lsop->lsrsp->rsplen = 0; -/* *********************** NVME Ctrl Routines **************************** */ + /* + * handlers: + * parse request input, execute the request, and format the + * LS response + */ + switch (w0->ls_cmd) { + case FCNVME_LS_DISCONNECT_ASSOC: + ret = nvme_fc_ls_disconnect_assoc(lsop); + break; + case FCNVME_LS_DISCONNECT_CONN: + lsop->lsrsp->rsplen = nvme_fc_format_rjt(lsop->rspbuf, + sizeof(*lsop->rspbuf), w0->ls_cmd, + FCNVME_RJT_RC_UNSUP, FCNVME_RJT_EXP_NONE, 0); + break; + case FCNVME_LS_CREATE_ASSOCIATION: + case FCNVME_LS_CREATE_CONNECTION: + lsop->lsrsp->rsplen = nvme_fc_format_rjt(lsop->rspbuf, + sizeof(*lsop->rspbuf), w0->ls_cmd, + FCNVME_RJT_RC_LOGIC, FCNVME_RJT_EXP_NONE, 0); + break; + default: + lsop->lsrsp->rsplen = nvme_fc_format_rjt(lsop->rspbuf, + sizeof(*lsop->rspbuf), w0->ls_cmd, + FCNVME_RJT_RC_INVAL, FCNVME_RJT_EXP_NONE, 0); + break; + } -static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg); + return(ret); +} + +static void +nvme_fc_handle_ls_rqst_work(struct work_struct *work) +{ + struct nvme_fc_rport *rport = + container_of(work, struct nvme_fc_rport, lsrcv_work); + struct fcnvme_ls_rqst_w0 *w0; + struct nvmefc_ls_rcv_op *lsop; + unsigned long flags; + bool sendrsp; + +restart: + sendrsp = true; + spin_lock_irqsave(&rport->lock, flags); + list_for_each_entry(lsop, &rport->ls_rcv_list, lsrcv_list) { + if (lsop->handled) + continue; + + lsop->handled = true; + if (rport->remoteport.port_state == FC_OBJSTATE_ONLINE) { + spin_unlock_irqrestore(&rport->lock, flags); + sendrsp = nvme_fc_handle_ls_rqst(lsop); + } else { + spin_unlock_irqrestore(&rport->lock, flags); + w0 = &lsop->rqstbuf->w0; + lsop->lsrsp->rsplen = nvme_fc_format_rjt( + lsop->rspbuf, + sizeof(*lsop->rspbuf), + w0->ls_cmd, + FCNVME_RJT_RC_UNAB, + FCNVME_RJT_EXP_NONE, 0); + } + if (sendrsp) + nvme_fc_xmt_ls_rsp(lsop); + goto restart; + } + spin_unlock_irqrestore(&rport->lock, flags); +} + +/** + * nvme_fc_rcv_ls_req - transport entry point called by an LLDD + * upon the reception of a NVME LS request. + * + * The nvme-fc layer will copy payload to an internal structure for + * processing. As such, upon completion of the routine, the LLDD may + * immediately free/reuse the LS request buffer passed in the call. + * + * If this routine returns error, the LLDD should abort the exchange. + * + * @remoteport: pointer to the (registered) remote port that the LS + * was received from. The remoteport is associated with + * a specific localport. + * @lsrsp: pointer to a nvmefc_ls_rsp response structure to be + * used to reference the exchange corresponding to the LS + * when issuing an ls response. + * @lsreqbuf: pointer to the buffer containing the LS Request + * @lsreqbuf_len: length, in bytes, of the received LS request + */ +int +nvme_fc_rcv_ls_req(struct nvme_fc_remote_port *portptr, + struct nvmefc_ls_rsp *lsrsp, + void *lsreqbuf, u32 lsreqbuf_len) +{ + struct nvme_fc_rport *rport = remoteport_to_rport(portptr); + struct nvme_fc_lport *lport = rport->lport; + struct fcnvme_ls_rqst_w0 *w0 = (struct fcnvme_ls_rqst_w0 *)lsreqbuf; + struct nvmefc_ls_rcv_op *lsop; + unsigned long flags; + int ret; + + nvme_fc_rport_get(rport); + + /* validate there's a routine to transmit a response */ + if (!lport->ops->xmt_ls_rsp) { + dev_info(lport->dev, + "RCV %s LS failed: no LLDD xmt_ls_rsp\n", + (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ? + nvmefc_ls_names[w0->ls_cmd] : ""); + ret = -EINVAL; + goto out_put; + } + + if (lsreqbuf_len > sizeof(union nvmefc_ls_requests)) { + dev_info(lport->dev, + "RCV %s LS failed: payload too large\n", + (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ? + nvmefc_ls_names[w0->ls_cmd] : ""); + ret = -E2BIG; + goto out_put; + } + + lsop = kzalloc(sizeof(*lsop) + + sizeof(union nvmefc_ls_requests) + + sizeof(union nvmefc_ls_responses), + GFP_KERNEL); + if (!lsop) { + dev_info(lport->dev, + "RCV %s LS failed: No memory\n", + (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ? + nvmefc_ls_names[w0->ls_cmd] : ""); + ret = -ENOMEM; + goto out_put; + } + lsop->rqstbuf = (union nvmefc_ls_requests *)&lsop[1]; + lsop->rspbuf = (union nvmefc_ls_responses *)&lsop->rqstbuf[1]; + + lsop->rspdma = fc_dma_map_single(lport->dev, lsop->rspbuf, + sizeof(*lsop->rspbuf), + DMA_TO_DEVICE); + if (fc_dma_mapping_error(lport->dev, lsop->rspdma)) { + dev_info(lport->dev, + "RCV %s LS failed: DMA mapping failure\n", + (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ? + nvmefc_ls_names[w0->ls_cmd] : ""); + ret = -EFAULT; + goto out_free; + } + + lsop->rport = rport; + lsop->lsrsp = lsrsp; + + memcpy(lsop->rqstbuf, lsreqbuf, lsreqbuf_len); + lsop->rqstdatalen = lsreqbuf_len; + + spin_lock_irqsave(&rport->lock, flags); + if (rport->remoteport.port_state != FC_OBJSTATE_ONLINE) { + spin_unlock_irqrestore(&rport->lock, flags); + ret = -ENOTCONN; + goto out_unmap; + } + list_add_tail(&lsop->lsrcv_list, &rport->ls_rcv_list); + spin_unlock_irqrestore(&rport->lock, flags); + + schedule_work(&rport->lsrcv_work); + + return 0; + +out_unmap: + fc_dma_unmap_single(lport->dev, lsop->rspdma, + sizeof(*lsop->rspbuf), DMA_TO_DEVICE); +out_free: + kfree(lsop); +out_put: + nvme_fc_rport_put(rport); + return ret; +} +EXPORT_SYMBOL_GPL(nvme_fc_rcv_ls_req); + + +/* *********************** NVME Ctrl Routines **************************** */ static void __nvme_fc_exit_request(struct nvme_fc_ctrl *ctrl, @@ -1500,7 +1835,7 @@ __nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op) opstate = atomic_xchg(&op->state, FCPOP_STATE_ABORTED); if (opstate != FCPOP_STATE_ACTIVE) atomic_set(&op->state, opstate); - else if (ctrl->flags & FCCTRL_TERMIO) + else if (test_bit(FCCTRL_TERMIO, &ctrl->flags)) ctrl->iocnt++; spin_unlock_irqrestore(&ctrl->lock, flags); @@ -1537,7 +1872,7 @@ __nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl, if (opstate == FCPOP_STATE_ABORTED) { spin_lock_irqsave(&ctrl->lock, flags); - if (ctrl->flags & FCCTRL_TERMIO) { + if (test_bit(FCCTRL_TERMIO, &ctrl->flags)) { if (!--ctrl->iocnt) wake_up(&ctrl->ioabort_wait); } @@ -1771,7 +2106,7 @@ nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq, res = __nvme_fc_init_request(ctrl, queue, &op->op, rq, queue->rqcnt++); if (res) return res; - op->op.fcp_req.first_sgl = &op->sgl[0]; + op->op.fcp_req.first_sgl = op->sgl; op->op.fcp_req.private = &op->priv[0]; nvme_req(rq)->ctrl = &ctrl->ctrl; return res; @@ -1783,15 +2118,17 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl) struct nvme_fc_fcp_op *aen_op; struct nvme_fc_cmd_iu *cmdiu; struct nvme_command *sqe; - void *private; + void *private = NULL; int i, ret; aen_op = ctrl->aen_ops; for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { - private = kzalloc(ctrl->lport->ops->fcprqst_priv_sz, + if (ctrl->lport->ops->fcprqst_priv_sz) { + private = kzalloc(ctrl->lport->ops->fcprqst_priv_sz, GFP_KERNEL); - if (!private) - return -ENOMEM; + if (!private) + return -ENOMEM; + } cmdiu = &aen_op->cmd_iu; sqe = &cmdiu->sqe; @@ -1822,9 +2159,6 @@ nvme_fc_term_aen_ops(struct nvme_fc_ctrl *ctrl) aen_op = ctrl->aen_ops; for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { - if (!aen_op->fcp_req.private) - continue; - __nvme_fc_exit_request(ctrl, aen_op); kfree(aen_op->fcp_req.private); @@ -2366,16 +2700,9 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg) { struct nvme_fc_ctrl *ctrl = to_fc_ctrl(arg); struct nvme_fc_fcp_op *aen_op; - unsigned long flags; - bool terminating = false; blk_status_t ret; - spin_lock_irqsave(&ctrl->lock, flags); - if (ctrl->flags & FCCTRL_TERMIO) - terminating = true; - spin_unlock_irqrestore(&ctrl->lock, flags); - - if (terminating) + if (test_bit(FCCTRL_TERMIO, &ctrl->flags)) return; aen_op = &ctrl->aen_ops[0]; @@ -2584,10 +2911,9 @@ nvme_fc_ctlr_active_on_rport(struct nvme_fc_ctrl *ctrl) struct nvme_fc_rport *rport = ctrl->rport; u32 cnt; - if (ctrl->assoc_active) + if (test_and_set_bit(ASSOC_ACTIVE, &ctrl->flags)) return 1; - ctrl->assoc_active = true; cnt = atomic_inc_return(&rport->act_ctrl_cnt); if (cnt == 1) nvme_fc_rport_active_on_lport(rport); @@ -2602,7 +2928,7 @@ nvme_fc_ctlr_inactive_on_rport(struct nvme_fc_ctrl *ctrl) struct nvme_fc_lport *lport = rport->lport; u32 cnt; - /* ctrl->assoc_active=false will be set independently */ + /* clearing of ctrl->flags ASSOC_ACTIVE bit is in association delete */ cnt = atomic_dec_return(&rport->act_ctrl_cnt); if (cnt == 0) { @@ -2622,6 +2948,8 @@ static int nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) { struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + struct nvmefc_ls_rcv_op *disls = NULL; + unsigned long flags; int ret; bool changed; @@ -2739,12 +3067,18 @@ out_term_aen_ops: out_disconnect_admin_queue: /* send a Disconnect(association) LS to fc-nvme target */ nvme_fc_xmt_disconnect_assoc(ctrl); + spin_lock_irqsave(&ctrl->lock, flags); ctrl->association_id = 0; + disls = ctrl->rcv_disconn; + ctrl->rcv_disconn = NULL; + spin_unlock_irqrestore(&ctrl->lock, flags); + if (disls) + nvme_fc_xmt_ls_rsp(disls); out_delete_hw_queue: __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); out_free_queue: nvme_fc_free_queue(&ctrl->queues[0]); - ctrl->assoc_active = false; + clear_bit(ASSOC_ACTIVE, &ctrl->flags); nvme_fc_ctlr_inactive_on_rport(ctrl); return ret; @@ -2759,14 +3093,14 @@ out_free_queue: static void nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) { + struct nvmefc_ls_rcv_op *disls = NULL; unsigned long flags; - if (!ctrl->assoc_active) + if (!test_and_clear_bit(ASSOC_ACTIVE, &ctrl->flags)) return; - ctrl->assoc_active = false; spin_lock_irqsave(&ctrl->lock, flags); - ctrl->flags |= FCCTRL_TERMIO; + set_bit(FCCTRL_TERMIO, &ctrl->flags); ctrl->iocnt = 0; spin_unlock_irqrestore(&ctrl->lock, flags); @@ -2817,7 +3151,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) /* wait for all io that had to be aborted */ spin_lock_irq(&ctrl->lock); wait_event_lock_irq(ctrl->ioabort_wait, ctrl->iocnt == 0, ctrl->lock); - ctrl->flags &= ~FCCTRL_TERMIO; + clear_bit(FCCTRL_TERMIO, &ctrl->flags); spin_unlock_irq(&ctrl->lock); nvme_fc_term_aen_ops(ctrl); @@ -2831,7 +3165,17 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) if (ctrl->association_id) nvme_fc_xmt_disconnect_assoc(ctrl); + spin_lock_irqsave(&ctrl->lock, flags); ctrl->association_id = 0; + disls = ctrl->rcv_disconn; + ctrl->rcv_disconn = NULL; + spin_unlock_irqrestore(&ctrl->lock, flags); + if (disls) + /* + * if a Disconnect Request was waiting for a response, send + * now that all ABTS's have been issued (and are complete). + */ + nvme_fc_xmt_ls_rsp(disls); if (ctrl->ctrl.tagset) { nvme_fc_delete_hw_io_queues(ctrl); @@ -2902,7 +3246,9 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: dev_loss_tmo (%d) expired " "while waiting for remoteport connectivity.\n", - ctrl->cnum, portptr->dev_loss_tmo); + ctrl->cnum, min_t(int, portptr->dev_loss_tmo, + (ctrl->ctrl.opts->max_reconnects * + ctrl->ctrl.opts->reconnect_delay))); WARN_ON(nvme_delete_ctrl(&ctrl->ctrl)); } } @@ -3089,7 +3435,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->dev = lport->dev; ctrl->cnum = idx; ctrl->ioq_live = false; - ctrl->assoc_active = false; atomic_set(&ctrl->err_work_active, 0); init_waitqueue_head(&ctrl->ioabort_wait); diff --git a/drivers/nvme/host/fc.h b/drivers/nvme/host/fc.h new file mode 100644 index 000000000000..05ce566f2caf --- /dev/null +++ b/drivers/nvme/host/fc.h @@ -0,0 +1,227 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2016, Avago Technologies + */ + +#ifndef _NVME_FC_TRANSPORT_H +#define _NVME_FC_TRANSPORT_H 1 + + +/* + * Common definitions between the nvme_fc (host) transport and + * nvmet_fc (target) transport implementation. + */ + +/* + * ****************** FC-NVME LS HANDLING ****************** + */ + +union nvmefc_ls_requests { + struct fcnvme_ls_rqst_w0 w0; + struct fcnvme_ls_cr_assoc_rqst rq_cr_assoc; + struct fcnvme_ls_cr_conn_rqst rq_cr_conn; + struct fcnvme_ls_disconnect_assoc_rqst rq_dis_assoc; + struct fcnvme_ls_disconnect_conn_rqst rq_dis_conn; +} __aligned(128); /* alignment for other things alloc'd with */ + +union nvmefc_ls_responses { + struct fcnvme_ls_rjt rsp_rjt; + struct fcnvme_ls_cr_assoc_acc rsp_cr_assoc; + struct fcnvme_ls_cr_conn_acc rsp_cr_conn; + struct fcnvme_ls_disconnect_assoc_acc rsp_dis_assoc; + struct fcnvme_ls_disconnect_conn_acc rsp_dis_conn; +} __aligned(128); /* alignment for other things alloc'd with */ + +static inline void +nvme_fc_format_rsp_hdr(void *buf, u8 ls_cmd, __be32 desc_len, u8 rqst_ls_cmd) +{ + struct fcnvme_ls_acc_hdr *acc = buf; + + acc->w0.ls_cmd = ls_cmd; + acc->desc_list_len = desc_len; + acc->rqst.desc_tag = cpu_to_be32(FCNVME_LSDESC_RQST); + acc->rqst.desc_len = + fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rqst)); + acc->rqst.w0.ls_cmd = rqst_ls_cmd; +} + +static inline int +nvme_fc_format_rjt(void *buf, u16 buflen, u8 ls_cmd, + u8 reason, u8 explanation, u8 vendor) +{ + struct fcnvme_ls_rjt *rjt = buf; + + nvme_fc_format_rsp_hdr(buf, FCNVME_LSDESC_RQST, + fcnvme_lsdesc_len(sizeof(struct fcnvme_ls_rjt)), + ls_cmd); + rjt->rjt.desc_tag = cpu_to_be32(FCNVME_LSDESC_RJT); + rjt->rjt.desc_len = fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rjt)); + rjt->rjt.reason_code = reason; + rjt->rjt.reason_explanation = explanation; + rjt->rjt.vendor = vendor; + + return sizeof(struct fcnvme_ls_rjt); +} + +/* Validation Error indexes into the string table below */ +enum { + VERR_NO_ERROR = 0, + VERR_CR_ASSOC_LEN = 1, + VERR_CR_ASSOC_RQST_LEN = 2, + VERR_CR_ASSOC_CMD = 3, + VERR_CR_ASSOC_CMD_LEN = 4, + VERR_ERSP_RATIO = 5, + VERR_ASSOC_ALLOC_FAIL = 6, + VERR_QUEUE_ALLOC_FAIL = 7, + VERR_CR_CONN_LEN = 8, + VERR_CR_CONN_RQST_LEN = 9, + VERR_ASSOC_ID = 10, + VERR_ASSOC_ID_LEN = 11, + VERR_NO_ASSOC = 12, + VERR_CONN_ID = 13, + VERR_CONN_ID_LEN = 14, + VERR_INVAL_CONN = 15, + VERR_CR_CONN_CMD = 16, + VERR_CR_CONN_CMD_LEN = 17, + VERR_DISCONN_LEN = 18, + VERR_DISCONN_RQST_LEN = 19, + VERR_DISCONN_CMD = 20, + VERR_DISCONN_CMD_LEN = 21, + VERR_DISCONN_SCOPE = 22, + VERR_RS_LEN = 23, + VERR_RS_RQST_LEN = 24, + VERR_RS_CMD = 25, + VERR_RS_CMD_LEN = 26, + VERR_RS_RCTL = 27, + VERR_RS_RO = 28, + VERR_LSACC = 29, + VERR_LSDESC_RQST = 30, + VERR_LSDESC_RQST_LEN = 31, + VERR_CR_ASSOC = 32, + VERR_CR_ASSOC_ACC_LEN = 33, + VERR_CR_CONN = 34, + VERR_CR_CONN_ACC_LEN = 35, + VERR_DISCONN = 36, + VERR_DISCONN_ACC_LEN = 37, +}; + +static char *validation_errors[] = { + "OK", + "Bad CR_ASSOC Length", + "Bad CR_ASSOC Rqst Length", + "Not CR_ASSOC Cmd", + "Bad CR_ASSOC Cmd Length", + "Bad Ersp Ratio", + "Association Allocation Failed", + "Queue Allocation Failed", + "Bad CR_CONN Length", + "Bad CR_CONN Rqst Length", + "Not Association ID", + "Bad Association ID Length", + "No Association", + "Not Connection ID", + "Bad Connection ID Length", + "Invalid Connection ID", + "Not CR_CONN Cmd", + "Bad CR_CONN Cmd Length", + "Bad DISCONN Length", + "Bad DISCONN Rqst Length", + "Not DISCONN Cmd", + "Bad DISCONN Cmd Length", + "Bad Disconnect Scope", + "Bad RS Length", + "Bad RS Rqst Length", + "Not RS Cmd", + "Bad RS Cmd Length", + "Bad RS R_CTL", + "Bad RS Relative Offset", + "Not LS_ACC", + "Not LSDESC_RQST", + "Bad LSDESC_RQST Length", + "Not CR_ASSOC Rqst", + "Bad CR_ASSOC ACC Length", + "Not CR_CONN Rqst", + "Bad CR_CONN ACC Length", + "Not Disconnect Rqst", + "Bad Disconnect ACC Length", +}; + +#define NVME_FC_LAST_LS_CMD_VALUE FCNVME_LS_DISCONNECT_CONN + +static char *nvmefc_ls_names[] = { + "Reserved (0)", + "RJT (1)", + "ACC (2)", + "Create Association", + "Create Connection", + "Disconnect Association", + "Disconnect Connection", +}; + +static inline void +nvmefc_fmt_lsreq_discon_assoc(struct nvmefc_ls_req *lsreq, + struct fcnvme_ls_disconnect_assoc_rqst *discon_rqst, + struct fcnvme_ls_disconnect_assoc_acc *discon_acc, + u64 association_id) +{ + lsreq->rqstaddr = discon_rqst; + lsreq->rqstlen = sizeof(*discon_rqst); + lsreq->rspaddr = discon_acc; + lsreq->rsplen = sizeof(*discon_acc); + lsreq->timeout = NVME_FC_LS_TIMEOUT_SEC; + + discon_rqst->w0.ls_cmd = FCNVME_LS_DISCONNECT_ASSOC; + discon_rqst->desc_list_len = cpu_to_be32( + sizeof(struct fcnvme_lsdesc_assoc_id) + + sizeof(struct fcnvme_lsdesc_disconn_cmd)); + + discon_rqst->associd.desc_tag = cpu_to_be32(FCNVME_LSDESC_ASSOC_ID); + discon_rqst->associd.desc_len = + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_assoc_id)); + + discon_rqst->associd.association_id = cpu_to_be64(association_id); + + discon_rqst->discon_cmd.desc_tag = cpu_to_be32( + FCNVME_LSDESC_DISCONN_CMD); + discon_rqst->discon_cmd.desc_len = + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_disconn_cmd)); +} + +static inline int +nvmefc_vldt_lsreq_discon_assoc(u32 rqstlen, + struct fcnvme_ls_disconnect_assoc_rqst *rqst) +{ + int ret = 0; + + if (rqstlen < sizeof(struct fcnvme_ls_disconnect_assoc_rqst)) + ret = VERR_DISCONN_LEN; + else if (rqst->desc_list_len != + fcnvme_lsdesc_len( + sizeof(struct fcnvme_ls_disconnect_assoc_rqst))) + ret = VERR_DISCONN_RQST_LEN; + else if (rqst->associd.desc_tag != cpu_to_be32(FCNVME_LSDESC_ASSOC_ID)) + ret = VERR_ASSOC_ID; + else if (rqst->associd.desc_len != + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_assoc_id))) + ret = VERR_ASSOC_ID_LEN; + else if (rqst->discon_cmd.desc_tag != + cpu_to_be32(FCNVME_LSDESC_DISCONN_CMD)) + ret = VERR_DISCONN_CMD; + else if (rqst->discon_cmd.desc_len != + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_disconn_cmd))) + ret = VERR_DISCONN_CMD_LEN; + /* + * As the standard changed on the LS, check if old format and scope + * something other than Association (e.g. 0). + */ + else if (rqst->discon_cmd.rsvd8[0]) + ret = VERR_DISCONN_SCOPE; + + return ret; +} + +#endif /* _NVME_FC_TRANSPORT_H */ diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index ec46693f6b64..69608755d415 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -171,7 +171,7 @@ struct nvme_nvm_bb_tbl { __le32 tdresv; __le32 thresv; __le32 rsvd2[8]; - __u8 blk[0]; + __u8 blk[]; }; struct nvme_nvm_id20_addrf { @@ -961,7 +961,10 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) geo = &dev->geo; geo->csecs = 1 << ns->lba_shift; geo->sos = ns->ms; - geo->ext = ns->ext; + if (ns->features & NVME_NS_EXT_LBAS) + geo->ext = true; + else + geo->ext = false; geo->mdts = ns->ctrl->max_hw_sectors; dev->q = q; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 54603bd3e02d..da78e499947a 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -3,6 +3,7 @@ * Copyright (c) 2017-2018 Christoph Hellwig. */ +#include <linux/backing-dev.h> #include <linux/moduleparam.h> #include <trace/events/block.h> #include "nvme.h" @@ -293,7 +294,7 @@ static bool nvme_available_path(struct nvme_ns_head *head) static blk_qc_t nvme_ns_head_make_request(struct request_queue *q, struct bio *bio) { - struct nvme_ns_head *head = q->queuedata; + struct nvme_ns_head *head = bio->bi_disk->private_data; struct device *dev = disk_to_dev(head->disk); struct nvme_ns *ns; blk_qc_t ret = BLK_QC_T_NONE; @@ -371,13 +372,12 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) * We also do this for private namespaces as the namespace sharing data could * change after a rescan. */ - if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath) + if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || !multipath) return 0; q = blk_alloc_queue(nvme_ns_head_make_request, ctrl->numa_node); if (!q) goto out; - q->queuedata = head; blk_queue_flag_set(QUEUE_FLAG_NONROT, q); /* set to a default value for 512 until disk is validated */ blk_queue_logical_block_size(q, 512); @@ -666,6 +666,13 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id) nvme_mpath_set_live(ns); mutex_unlock(&ns->head->lock); } + + if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) { + struct backing_dev_info *info = + ns->head->disk->queue->backing_dev_info; + + info->capabilities |= BDI_CAP_STABLE_WRITES; + } } void nvme_mpath_remove_disk(struct nvme_ns_head *head) @@ -687,7 +694,8 @@ int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) int error; /* check if multipath is enabled and we have the capability */ - if (!multipath || !ctrl->subsys || !(ctrl->subsys->cmic & (1 << 3))) + if (!multipath || !ctrl->subsys || + !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA)) return 0; ctrl->anacap = id->anacap; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 2e04a36296d9..fa5c75501049 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -16,6 +16,7 @@ #include <linux/fault-inject.h> #include <linux/rcupdate.h> #include <linux/wait.h> +#include <linux/t10-pi.h> #include <trace/events/block.h> @@ -30,8 +31,10 @@ extern unsigned int admin_timeout; #ifdef CONFIG_ARCH_NO_SG_CHAIN #define NVME_INLINE_SG_CNT 0 +#define NVME_INLINE_METADATA_SG_CNT 0 #else #define NVME_INLINE_SG_CNT 2 +#define NVME_INLINE_METADATA_SG_CNT 1 #endif extern struct workqueue_struct *nvme_wq; @@ -228,6 +231,7 @@ struct nvme_ctrl { u32 page_size; u32 max_hw_sectors; u32 max_segments; + u32 max_integrity_segments; u16 crdt[3]; u16 oncs; u16 oacs; @@ -352,6 +356,7 @@ struct nvme_ns_head { struct nvme_ns_ids ids; struct list_head entry; struct kref ref; + bool shared; int instance; #ifdef CONFIG_NVME_MULTIPATH struct gendisk *disk; @@ -363,6 +368,11 @@ struct nvme_ns_head { #endif }; +enum nvme_ns_features { + NVME_NS_EXT_LBAS = 1 << 0, /* support extended LBA format */ + NVME_NS_METADATA_SUPPORTED = 1 << 1, /* support getting generated md */ +}; + struct nvme_ns { struct list_head list; @@ -382,18 +392,23 @@ struct nvme_ns { u16 ms; u16 sgs; u32 sws; - bool ext; u8 pi_type; + unsigned long features; unsigned long flags; #define NVME_NS_REMOVING 0 #define NVME_NS_DEAD 1 #define NVME_NS_ANA_PENDING 2 - u16 noiob; struct nvme_fault_inject fault_inject; }; +/* NVMe ns supports metadata actions by the controller (generate/strip) */ +static inline bool nvme_ns_has_pi(struct nvme_ns *ns) +{ + return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple); +} + struct nvme_ctrl_ops { const char *name; struct module *module; @@ -449,6 +464,14 @@ static inline sector_t nvme_lba_to_sect(struct nvme_ns *ns, u64 lba) return lba << (ns->lba_shift - SECTOR_SHIFT); } +/* + * Convert byte length to nvme's 0-based num dwords + */ +static inline u32 nvme_bytes_to_numd(size_t len) +{ + return (len >> 2) - 1; +} + static inline void nvme_end_request(struct request *req, __le16 status, union nvme_result result) { @@ -489,7 +512,6 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, void nvme_uninit_ctrl(struct nvme_ctrl *ctrl); void nvme_start_ctrl(struct nvme_ctrl *ctrl); void nvme_stop_ctrl(struct nvme_ctrl *ctrl); -void nvme_put_ctrl(struct nvme_ctrl *ctrl); int nvme_init_identify(struct nvme_ctrl *ctrl); void nvme_remove_namespaces(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index cc46e250fcac..d690d5593a80 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -68,14 +68,30 @@ static int io_queue_depth = 1024; module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); +static int io_queue_count_set(const char *val, const struct kernel_param *kp) +{ + unsigned int n; + int ret; + + ret = kstrtouint(val, 10, &n); + if (ret != 0 || n > num_possible_cpus()) + return -EINVAL; + return param_set_uint(val, kp); +} + +static const struct kernel_param_ops io_queue_count_ops = { + .set = io_queue_count_set, + .get = param_get_uint, +}; + static unsigned int write_queues; -module_param(write_queues, uint, 0644); +module_param_cb(write_queues, &io_queue_count_ops, &write_queues, 0644); MODULE_PARM_DESC(write_queues, "Number of queues to use for writes. If not set, reads and writes " "will share a queue set."); static unsigned int poll_queues; -module_param(poll_queues, uint, 0644); +module_param_cb(poll_queues, &io_queue_count_ops, &poll_queues, 0644); MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO."); struct nvme_dev; @@ -128,6 +144,9 @@ struct nvme_dev { dma_addr_t host_mem_descs_dma; struct nvme_host_mem_buf_desc *host_mem_descs; void **host_mem_desc_bufs; + unsigned int nr_allocated_queues; + unsigned int nr_write_queues; + unsigned int nr_poll_queues; }; static int io_queue_depth_set(const char *val, const struct kernel_param *kp) @@ -166,14 +185,13 @@ struct nvme_queue { void *sq_cmds; /* only used for poll queues: */ spinlock_t cq_poll_lock ____cacheline_aligned_in_smp; - volatile struct nvme_completion *cqes; + struct nvme_completion *cqes; dma_addr_t sq_dma_addr; dma_addr_t cq_dma_addr; u32 __iomem *q_db; u16 q_depth; u16 cq_vector; u16 sq_tail; - u16 last_sq_tail; u16 cq_head; u16 qid; u8 cq_phase; @@ -209,25 +227,14 @@ struct nvme_iod { struct scatterlist *sg; }; -static unsigned int max_io_queues(void) +static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev) { - return num_possible_cpus() + write_queues + poll_queues; -} - -static unsigned int max_queue_count(void) -{ - /* IO queues + admin queue */ - return 1 + max_io_queues(); -} - -static inline unsigned int nvme_dbbuf_size(u32 stride) -{ - return (max_queue_count() * 8 * stride); + return dev->nr_allocated_queues * 8 * dev->db_stride; } static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev) { - unsigned int mem_size = nvme_dbbuf_size(dev->db_stride); + unsigned int mem_size = nvme_dbbuf_size(dev); if (dev->dbbuf_dbs) return 0; @@ -252,7 +259,7 @@ static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev) static void nvme_dbbuf_dma_free(struct nvme_dev *dev) { - unsigned int mem_size = nvme_dbbuf_size(dev->db_stride); + unsigned int mem_size = nvme_dbbuf_size(dev); if (dev->dbbuf_dbs) { dma_free_coherent(dev->dev, mem_size, @@ -446,24 +453,11 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set) return 0; } -/* - * Write sq tail if we are asked to, or if the next command would wrap. - */ -static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq) +static inline void nvme_write_sq_db(struct nvme_queue *nvmeq) { - if (!write_sq) { - u16 next_tail = nvmeq->sq_tail + 1; - - if (next_tail == nvmeq->q_depth) - next_tail = 0; - if (next_tail != nvmeq->last_sq_tail) - return; - } - if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail, nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei)) writel(nvmeq->sq_tail, nvmeq->q_db); - nvmeq->last_sq_tail = nvmeq->sq_tail; } /** @@ -480,7 +474,8 @@ static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, cmd, sizeof(*cmd)); if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; - nvme_write_sq_db(nvmeq, write_sq); + if (write_sq) + nvme_write_sq_db(nvmeq); spin_unlock(&nvmeq->sq_lock); } @@ -489,8 +484,7 @@ static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx) struct nvme_queue *nvmeq = hctx->driver_data; spin_lock(&nvmeq->sq_lock); - if (nvmeq->sq_tail != nvmeq->last_sq_tail) - nvme_write_sq_db(nvmeq, true); + nvme_write_sq_db(nvmeq); spin_unlock(&nvmeq->sq_lock); } @@ -922,8 +916,9 @@ static void nvme_pci_complete_rq(struct request *req) /* We read the CQE phase first to check if the rest of the entry is valid */ static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq) { - return (le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) == - nvmeq->cq_phase; + struct nvme_completion *hcqe = &nvmeq->cqes[nvmeq->cq_head]; + + return (le16_to_cpu(READ_ONCE(hcqe->status)) & 1) == nvmeq->cq_phase; } static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq) @@ -944,7 +939,7 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq) static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) { - volatile struct nvme_completion *cqe = &nvmeq->cqes[idx]; + struct nvme_completion *cqe = &nvmeq->cqes[idx]; struct request *req; if (unlikely(cqe->command_id >= nvmeq->q_depth)) { @@ -1501,7 +1496,6 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) struct nvme_dev *dev = nvmeq->dev; nvmeq->sq_tail = 0; - nvmeq->last_sq_tail = 0; nvmeq->cq_head = 0; nvmeq->cq_phase = 1; nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; @@ -2003,7 +1997,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs) { struct nvme_dev *dev = affd->priv; - unsigned int nr_read_queues; + unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues; /* * If there is no interupt available for queues, ensure that @@ -2019,12 +2013,12 @@ static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs) if (!nrirqs) { nrirqs = 1; nr_read_queues = 0; - } else if (nrirqs == 1 || !write_queues) { + } else if (nrirqs == 1 || !nr_write_queues) { nr_read_queues = 0; - } else if (write_queues >= nrirqs) { + } else if (nr_write_queues >= nrirqs) { nr_read_queues = 1; } else { - nr_read_queues = nrirqs - write_queues; + nr_read_queues = nrirqs - nr_write_queues; } dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues; @@ -2048,7 +2042,7 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) * Poll queues don't need interrupts, but we need at least one IO * queue left over for non-polled IO. */ - this_p_queues = poll_queues; + this_p_queues = dev->nr_poll_queues; if (this_p_queues >= nr_io_queues) { this_p_queues = nr_io_queues - 1; irq_queues = 1; @@ -2078,14 +2072,25 @@ static void nvme_disable_io_queues(struct nvme_dev *dev) __nvme_disable_io_queues(dev, nvme_admin_delete_cq); } +static unsigned int nvme_max_io_queues(struct nvme_dev *dev) +{ + return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues; +} + static int nvme_setup_io_queues(struct nvme_dev *dev) { struct nvme_queue *adminq = &dev->queues[0]; struct pci_dev *pdev = to_pci_dev(dev->dev); - int result, nr_io_queues; + unsigned int nr_io_queues; unsigned long size; + int result; - nr_io_queues = max_io_queues(); + /* + * Sample the module parameters once at reset time so that we have + * stable values to work with. + */ + dev->nr_write_queues = write_queues; + dev->nr_poll_queues = poll_queues; /* * If tags are shared with admin queue (Apple bug), then @@ -2093,6 +2098,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) */ if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) nr_io_queues = 1; + else + nr_io_queues = min(nvme_max_io_queues(dev), + dev->nr_allocated_queues - 1); result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); if (result < 0) @@ -2565,6 +2573,12 @@ static void nvme_reset_work(struct work_struct *work) goto out; } + /* + * We do not support an SGL for metadata (yet), so we are limited to a + * single integrity segment for the separate metadata pointer. + */ + dev->ctrl.max_integrity_segments = 1; + result = nvme_init_identify(&dev->ctrl); if (result) goto out; @@ -2767,8 +2781,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (!dev) return -ENOMEM; - dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue), - GFP_KERNEL, node); + dev->nr_write_queues = write_queues; + dev->nr_poll_queues = poll_queues; + dev->nr_allocated_queues = nvme_max_io_queues(dev) + 1; + dev->queues = kcalloc_node(dev->nr_allocated_queues, + sizeof(struct nvme_queue), GFP_KERNEL, node); if (!dev->queues) goto free; @@ -3131,8 +3148,6 @@ static int __init nvme_init(void) BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2); - write_queues = min(write_queues, num_possible_cpus()); - poll_queues = min(poll_queues, num_possible_cpus()); return pci_register_driver(&nvme_driver); } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index cac8a930396a..f8f856dc0c67 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -34,6 +34,11 @@ #define NVME_RDMA_MAX_INLINE_SEGMENTS 4 +#define NVME_RDMA_DATA_SGL_SIZE \ + (sizeof(struct scatterlist) * NVME_INLINE_SG_CNT) +#define NVME_RDMA_METADATA_SGL_SIZE \ + (sizeof(struct scatterlist) * NVME_INLINE_METADATA_SG_CNT) + struct nvme_rdma_device { struct ib_device *dev; struct ib_pd *pd; @@ -48,6 +53,11 @@ struct nvme_rdma_qe { u64 dma; }; +struct nvme_rdma_sgl { + int nents; + struct sg_table sg_table; +}; + struct nvme_rdma_queue; struct nvme_rdma_request { struct nvme_request req; @@ -58,12 +68,12 @@ struct nvme_rdma_request { refcount_t ref; struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS]; u32 num_sge; - int nents; struct ib_reg_wr reg_wr; struct ib_cqe reg_cqe; struct nvme_rdma_queue *queue; - struct sg_table sg_table; - struct scatterlist first_sgl[]; + struct nvme_rdma_sgl data_sgl; + struct nvme_rdma_sgl *metadata_sgl; + bool use_sig_mr; }; enum nvme_rdma_queue_flags { @@ -85,6 +95,7 @@ struct nvme_rdma_queue { struct rdma_cm_id *cm_id; int cm_error; struct completion cm_done; + bool pi_support; }; struct nvme_rdma_ctrl { @@ -261,6 +272,8 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor) init_attr.qp_type = IB_QPT_RC; init_attr.send_cq = queue->ib_cq; init_attr.recv_cq = queue->ib_cq; + if (queue->pi_support) + init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN; ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr); @@ -290,6 +303,12 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set, if (!req->sqe.data) return -ENOMEM; + /* metadata nvme_rdma_sgl struct is located after command's data SGL */ + if (queue->pi_support) + req->metadata_sgl = (void *)nvme_req(rq) + + sizeof(struct nvme_rdma_request) + + NVME_RDMA_DATA_SGL_SIZE; + req->queue = queue; return 0; @@ -400,6 +419,8 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) dev = queue->device; ibdev = dev->dev; + if (queue->pi_support) + ib_mr_pool_destroy(queue->qp, &queue->qp->sig_mrs); ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs); /* @@ -416,10 +437,16 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) nvme_rdma_dev_put(dev); } -static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev) +static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev, bool pi_support) { - return min_t(u32, NVME_RDMA_MAX_SEGMENTS, - ibdev->attrs.max_fast_reg_page_list_len - 1); + u32 max_page_list_len; + + if (pi_support) + max_page_list_len = ibdev->attrs.max_pi_fast_reg_page_list_len; + else + max_page_list_len = ibdev->attrs.max_fast_reg_page_list_len; + + return min_t(u32, NVME_RDMA_MAX_SEGMENTS, max_page_list_len - 1); } static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) @@ -476,7 +503,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) * misaligned we'll end up using two entries for a single data page, * so one additional entry is required. */ - pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev) + 1; + pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev, queue->pi_support) + 1; ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs, queue->queue_size, IB_MR_TYPE_MEM_REG, @@ -488,10 +515,24 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) goto out_destroy_ring; } + if (queue->pi_support) { + ret = ib_mr_pool_init(queue->qp, &queue->qp->sig_mrs, + queue->queue_size, IB_MR_TYPE_INTEGRITY, + pages_per_mr, pages_per_mr); + if (ret) { + dev_err(queue->ctrl->ctrl.device, + "failed to initialize PI MR pool sized %d for QID %d\n", + queue->queue_size, idx); + goto out_destroy_mr_pool; + } + } + set_bit(NVME_RDMA_Q_TR_READY, &queue->flags); return 0; +out_destroy_mr_pool: + ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs); out_destroy_ring: nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size, sizeof(struct nvme_completion), DMA_FROM_DEVICE); @@ -513,6 +554,10 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, queue = &ctrl->queues[idx]; queue->ctrl = ctrl; + if (idx && ctrl->ctrl.max_integrity_segments) + queue->pi_support = true; + else + queue->pi_support = false; init_completion(&queue->cm_done); if (idx > 0) @@ -723,7 +768,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, set->reserved_tags = 2; /* connect + keep-alive */ set->numa_node = nctrl->numa_node; set->cmd_size = sizeof(struct nvme_rdma_request) + - NVME_INLINE_SG_CNT * sizeof(struct scatterlist); + NVME_RDMA_DATA_SGL_SIZE; set->driver_data = ctrl; set->nr_hw_queues = 1; set->timeout = ADMIN_TIMEOUT; @@ -737,7 +782,10 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, set->numa_node = nctrl->numa_node; set->flags = BLK_MQ_F_SHOULD_MERGE; set->cmd_size = sizeof(struct nvme_rdma_request) + - NVME_INLINE_SG_CNT * sizeof(struct scatterlist); + NVME_RDMA_DATA_SGL_SIZE; + if (nctrl->max_integrity_segments) + set->cmd_size += sizeof(struct nvme_rdma_sgl) + + NVME_RDMA_METADATA_SGL_SIZE; set->driver_data = ctrl; set->nr_hw_queues = nctrl->queue_count - 1; set->timeout = NVME_IO_TIMEOUT; @@ -770,6 +818,7 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, bool new) { + bool pi_capable = false; int error; error = nvme_rdma_alloc_queue(ctrl, 0, NVME_AQ_DEPTH); @@ -779,7 +828,13 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, ctrl->device = ctrl->queues[0].device; ctrl->ctrl.numa_node = dev_to_node(ctrl->device->dev->dma_device); - ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev); + /* T10-PI support */ + if (ctrl->device->dev->attrs.device_cap_flags & + IB_DEVICE_INTEGRITY_HANDOVER) + pi_capable = true; + + ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev, + pi_capable); /* * Bind the async event SQE DMA mapping to the admin queue lifetime. @@ -821,6 +876,10 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, ctrl->ctrl.max_segments = ctrl->max_fr_pages; ctrl->ctrl.max_hw_sectors = ctrl->max_fr_pages << (ilog2(SZ_4K) - 9); + if (pi_capable) + ctrl->ctrl.max_integrity_segments = ctrl->max_fr_pages; + else + ctrl->ctrl.max_integrity_segments = 0; blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); @@ -1149,17 +1208,29 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); struct nvme_rdma_device *dev = queue->device; struct ib_device *ibdev = dev->dev; + struct list_head *pool = &queue->qp->rdma_mrs; if (!blk_rq_nr_phys_segments(rq)) return; + if (blk_integrity_rq(rq)) { + ib_dma_unmap_sg(ibdev, req->metadata_sgl->sg_table.sgl, + req->metadata_sgl->nents, rq_dma_dir(rq)); + sg_free_table_chained(&req->metadata_sgl->sg_table, + NVME_INLINE_METADATA_SG_CNT); + } + + if (req->use_sig_mr) + pool = &queue->qp->sig_mrs; + if (req->mr) { - ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr); + ib_mr_pool_put(queue->qp, pool, req->mr); req->mr = NULL; } - ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq)); - sg_free_table_chained(&req->sg_table, NVME_INLINE_SG_CNT); + ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents, + rq_dma_dir(rq)); + sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT); } static int nvme_rdma_set_sg_null(struct nvme_command *c) @@ -1178,7 +1249,7 @@ static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue, int count) { struct nvme_sgl_desc *sg = &c->common.dptr.sgl; - struct scatterlist *sgl = req->sg_table.sgl; + struct scatterlist *sgl = req->data_sgl.sg_table.sgl; struct ib_sge *sge = &req->sge[1]; u32 len = 0; int i; @@ -1203,8 +1274,8 @@ static int nvme_rdma_map_sg_single(struct nvme_rdma_queue *queue, { struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; - sg->addr = cpu_to_le64(sg_dma_address(req->sg_table.sgl)); - put_unaligned_le24(sg_dma_len(req->sg_table.sgl), sg->length); + sg->addr = cpu_to_le64(sg_dma_address(req->data_sgl.sg_table.sgl)); + put_unaligned_le24(sg_dma_len(req->data_sgl.sg_table.sgl), sg->length); put_unaligned_le32(queue->device->pd->unsafe_global_rkey, sg->key); sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4; return 0; @@ -1225,7 +1296,8 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue, * Align the MR to a 4K page size to match the ctrl page size and * the block virtual boundary. */ - nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, SZ_4K); + nr = ib_map_mr_sg(req->mr, req->data_sgl.sg_table.sgl, count, NULL, + SZ_4K); if (unlikely(nr < count)) { ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr); req->mr = NULL; @@ -1256,12 +1328,125 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue, return 0; } +static void nvme_rdma_set_sig_domain(struct blk_integrity *bi, + struct nvme_command *cmd, struct ib_sig_domain *domain, + u16 control, u8 pi_type) +{ + domain->sig_type = IB_SIG_TYPE_T10_DIF; + domain->sig.dif.bg_type = IB_T10DIF_CRC; + domain->sig.dif.pi_interval = 1 << bi->interval_exp; + domain->sig.dif.ref_tag = le32_to_cpu(cmd->rw.reftag); + if (control & NVME_RW_PRINFO_PRCHK_REF) + domain->sig.dif.ref_remap = true; + + domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag); + domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask); + domain->sig.dif.app_escape = true; + if (pi_type == NVME_NS_DPS_PI_TYPE3) + domain->sig.dif.ref_escape = true; +} + +static void nvme_rdma_set_sig_attrs(struct blk_integrity *bi, + struct nvme_command *cmd, struct ib_sig_attrs *sig_attrs, + u8 pi_type) +{ + u16 control = le16_to_cpu(cmd->rw.control); + + memset(sig_attrs, 0, sizeof(*sig_attrs)); + if (control & NVME_RW_PRINFO_PRACT) { + /* for WRITE_INSERT/READ_STRIP no memory domain */ + sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE; + nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control, + pi_type); + /* Clear the PRACT bit since HCA will generate/verify the PI */ + control &= ~NVME_RW_PRINFO_PRACT; + cmd->rw.control = cpu_to_le16(control); + } else { + /* for WRITE_PASS/READ_PASS both wire/memory domains exist */ + nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control, + pi_type); + nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem, control, + pi_type); + } +} + +static void nvme_rdma_set_prot_checks(struct nvme_command *cmd, u8 *mask) +{ + *mask = 0; + if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_REF) + *mask |= IB_SIG_CHECK_REFTAG; + if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_GUARD) + *mask |= IB_SIG_CHECK_GUARD; +} + +static void nvme_rdma_sig_done(struct ib_cq *cq, struct ib_wc *wc) +{ + if (unlikely(wc->status != IB_WC_SUCCESS)) + nvme_rdma_wr_error(cq, wc, "SIG"); +} + +static int nvme_rdma_map_sg_pi(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req, struct nvme_command *c, + int count, int pi_count) +{ + struct nvme_rdma_sgl *sgl = &req->data_sgl; + struct ib_reg_wr *wr = &req->reg_wr; + struct request *rq = blk_mq_rq_from_pdu(req); + struct nvme_ns *ns = rq->q->queuedata; + struct bio *bio = rq->bio; + struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; + int nr; + + req->mr = ib_mr_pool_get(queue->qp, &queue->qp->sig_mrs); + if (WARN_ON_ONCE(!req->mr)) + return -EAGAIN; + + nr = ib_map_mr_sg_pi(req->mr, sgl->sg_table.sgl, count, NULL, + req->metadata_sgl->sg_table.sgl, pi_count, NULL, + SZ_4K); + if (unlikely(nr)) + goto mr_put; + + nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_disk), c, + req->mr->sig_attrs, ns->pi_type); + nvme_rdma_set_prot_checks(c, &req->mr->sig_attrs->check_mask); + + ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey)); + + req->reg_cqe.done = nvme_rdma_sig_done; + memset(wr, 0, sizeof(*wr)); + wr->wr.opcode = IB_WR_REG_MR_INTEGRITY; + wr->wr.wr_cqe = &req->reg_cqe; + wr->wr.num_sge = 0; + wr->wr.send_flags = 0; + wr->mr = req->mr; + wr->key = req->mr->rkey; + wr->access = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; + + sg->addr = cpu_to_le64(req->mr->iova); + put_unaligned_le24(req->mr->length, sg->length); + put_unaligned_le32(req->mr->rkey, sg->key); + sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4; + + return 0; + +mr_put: + ib_mr_pool_put(queue->qp, &queue->qp->sig_mrs, req->mr); + req->mr = NULL; + if (nr < 0) + return nr; + return -EINVAL; +} + static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, struct request *rq, struct nvme_command *c) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); struct nvme_rdma_device *dev = queue->device; struct ib_device *ibdev = dev->dev; + int pi_count = 0; int count, ret; req->num_sge = 1; @@ -1272,22 +1457,52 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, if (!blk_rq_nr_phys_segments(rq)) return nvme_rdma_set_sg_null(c); - req->sg_table.sgl = req->first_sgl; - ret = sg_alloc_table_chained(&req->sg_table, - blk_rq_nr_phys_segments(rq), req->sg_table.sgl, + req->data_sgl.sg_table.sgl = (struct scatterlist *)(req + 1); + ret = sg_alloc_table_chained(&req->data_sgl.sg_table, + blk_rq_nr_phys_segments(rq), req->data_sgl.sg_table.sgl, NVME_INLINE_SG_CNT); if (ret) return -ENOMEM; - req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl); + req->data_sgl.nents = blk_rq_map_sg(rq->q, rq, + req->data_sgl.sg_table.sgl); - count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents, - rq_dma_dir(rq)); + count = ib_dma_map_sg(ibdev, req->data_sgl.sg_table.sgl, + req->data_sgl.nents, rq_dma_dir(rq)); if (unlikely(count <= 0)) { ret = -EIO; goto out_free_table; } + if (blk_integrity_rq(rq)) { + req->metadata_sgl->sg_table.sgl = + (struct scatterlist *)(req->metadata_sgl + 1); + ret = sg_alloc_table_chained(&req->metadata_sgl->sg_table, + blk_rq_count_integrity_sg(rq->q, rq->bio), + req->metadata_sgl->sg_table.sgl, + NVME_INLINE_METADATA_SG_CNT); + if (unlikely(ret)) { + ret = -ENOMEM; + goto out_unmap_sg; + } + + req->metadata_sgl->nents = blk_rq_map_integrity_sg(rq->q, + rq->bio, req->metadata_sgl->sg_table.sgl); + pi_count = ib_dma_map_sg(ibdev, + req->metadata_sgl->sg_table.sgl, + req->metadata_sgl->nents, + rq_dma_dir(rq)); + if (unlikely(pi_count <= 0)) { + ret = -EIO; + goto out_free_pi_table; + } + } + + if (req->use_sig_mr) { + ret = nvme_rdma_map_sg_pi(queue, req, c, count, pi_count); + goto out; + } + if (count <= dev->num_inline_segments) { if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) && queue->ctrl->use_inline_data && @@ -1306,14 +1521,23 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, ret = nvme_rdma_map_sg_fr(queue, req, c, count); out: if (unlikely(ret)) - goto out_unmap_sg; + goto out_unmap_pi_sg; return 0; +out_unmap_pi_sg: + if (blk_integrity_rq(rq)) + ib_dma_unmap_sg(ibdev, req->metadata_sgl->sg_table.sgl, + req->metadata_sgl->nents, rq_dma_dir(rq)); +out_free_pi_table: + if (blk_integrity_rq(rq)) + sg_free_table_chained(&req->metadata_sgl->sg_table, + NVME_INLINE_METADATA_SG_CNT); out_unmap_sg: - ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq)); + ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents, + rq_dma_dir(rq)); out_free_table: - sg_free_table_chained(&req->sg_table, NVME_INLINE_SG_CNT); + sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT); return ret; } @@ -1761,6 +1985,15 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(rq); + if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && + queue->pi_support && + (c->common.opcode == nvme_cmd_write || + c->common.opcode == nvme_cmd_read) && + nvme_ns_has_pi(ns)) + req->use_sig_mr = true; + else + req->use_sig_mr = false; + err = nvme_rdma_map_data(queue, rq, c); if (unlikely(err < 0)) { dev_err(queue->ctrl->ctrl.device, @@ -1801,12 +2034,46 @@ static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx) return ib_process_cq_direct(queue->ib_cq, -1); } +static void nvme_rdma_check_pi_status(struct nvme_rdma_request *req) +{ + struct request *rq = blk_mq_rq_from_pdu(req); + struct ib_mr_status mr_status; + int ret; + + ret = ib_check_mr_status(req->mr, IB_MR_CHECK_SIG_STATUS, &mr_status); + if (ret) { + pr_err("ib_check_mr_status failed, ret %d\n", ret); + nvme_req(rq)->status = NVME_SC_INVALID_PI; + return; + } + + if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) { + switch (mr_status.sig_err.err_type) { + case IB_SIG_BAD_GUARD: + nvme_req(rq)->status = NVME_SC_GUARD_CHECK; + break; + case IB_SIG_BAD_REFTAG: + nvme_req(rq)->status = NVME_SC_REFTAG_CHECK; + break; + case IB_SIG_BAD_APPTAG: + nvme_req(rq)->status = NVME_SC_APPTAG_CHECK; + break; + } + pr_err("PI error found type %d expected 0x%x vs actual 0x%x\n", + mr_status.sig_err.err_type, mr_status.sig_err.expected, + mr_status.sig_err.actual); + } +} + static void nvme_rdma_complete_rq(struct request *rq) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); struct nvme_rdma_queue *queue = req->queue; struct ib_device *ibdev = queue->device->dev; + if (req->use_sig_mr) + nvme_rdma_check_pi_status(req); + nvme_rdma_unmap_data(queue, rq); ib_dma_unmap_single(ibdev, req->sqe.dma, sizeof(struct nvme_command), DMA_TO_DEVICE); @@ -1926,7 +2193,7 @@ out_fail: static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .name = "rdma", .module = THIS_MODULE, - .flags = NVME_F_FABRICS, + .flags = NVME_F_FABRICS | NVME_F_METADATA_SUPPORTED, .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 4c972d8abf31..1843110ec34f 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -60,6 +60,7 @@ struct nvme_tcp_request { enum nvme_tcp_queue_flags { NVME_TCP_Q_ALLOCATED = 0, NVME_TCP_Q_LIVE = 1, + NVME_TCP_Q_POLLING = 2, }; enum nvme_tcp_recv_state { @@ -75,6 +76,7 @@ struct nvme_tcp_queue { int io_cpu; spinlock_t lock; + struct mutex send_mutex; struct list_head send_list; /* recv state */ @@ -131,6 +133,7 @@ static DEFINE_MUTEX(nvme_tcp_ctrl_mutex); static struct workqueue_struct *nvme_tcp_wq; static struct blk_mq_ops nvme_tcp_mq_ops; static struct blk_mq_ops nvme_tcp_admin_mq_ops; +static int nvme_tcp_try_send(struct nvme_tcp_queue *queue); static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl) { @@ -257,15 +260,29 @@ static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req, } } -static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req) +static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, + bool sync) { struct nvme_tcp_queue *queue = req->queue; + bool empty; spin_lock(&queue->lock); + empty = list_empty(&queue->send_list) && !queue->request; list_add_tail(&req->entry, &queue->send_list); spin_unlock(&queue->lock); - queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); + /* + * if we're the first on the send_list and we can try to send + * directly, otherwise queue io_work. Also, only do that if we + * are on the same cpu, so we don't introduce contention. + */ + if (queue->io_cpu == smp_processor_id() && + sync && empty && mutex_trylock(&queue->send_mutex)) { + nvme_tcp_try_send(queue); + mutex_unlock(&queue->send_mutex); + } else { + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); + } } static inline struct nvme_tcp_request * @@ -578,7 +595,7 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, req->state = NVME_TCP_SEND_H2C_PDU; req->offset = 0; - nvme_tcp_queue_request(req); + nvme_tcp_queue_request(req, false); return 0; } @@ -794,11 +811,12 @@ static void nvme_tcp_data_ready(struct sock *sk) { struct nvme_tcp_queue *queue; - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); queue = sk->sk_user_data; - if (likely(queue && queue->rd_enabled)) + if (likely(queue && queue->rd_enabled) && + !test_bit(NVME_TCP_Q_POLLING, &queue->flags)) queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } static void nvme_tcp_write_space(struct sock *sk) @@ -867,7 +885,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) if (last && !queue->data_digest) flags |= MSG_EOR; else - flags |= MSG_MORE; + flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; /* can't zcopy slab pages */ if (unlikely(PageSlab(page))) { @@ -906,11 +924,16 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) struct nvme_tcp_queue *queue = req->queue; struct nvme_tcp_cmd_pdu *pdu = req->pdu; bool inline_data = nvme_tcp_has_inline_data(req); - int flags = MSG_DONTWAIT | (inline_data ? MSG_MORE : MSG_EOR); u8 hdgst = nvme_tcp_hdgst_len(queue); int len = sizeof(*pdu) + hdgst - req->offset; + int flags = MSG_DONTWAIT; int ret; + if (inline_data) + flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; + else + flags |= MSG_EOR; + if (queue->hdr_digest && !req->offset) nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); @@ -949,7 +972,7 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) ret = kernel_sendpage(queue->sock, virt_to_page(pdu), offset_in_page(pdu) + req->offset, len, - MSG_DONTWAIT | MSG_MORE); + MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST); if (unlikely(ret <= 0)) return ret; @@ -1063,11 +1086,14 @@ static void nvme_tcp_io_work(struct work_struct *w) bool pending = false; int result; - result = nvme_tcp_try_send(queue); - if (result > 0) - pending = true; - else if (unlikely(result < 0)) - break; + if (mutex_trylock(&queue->send_mutex)) { + result = nvme_tcp_try_send(queue); + mutex_unlock(&queue->send_mutex); + if (result > 0) + pending = true; + else if (unlikely(result < 0)) + break; + } result = nvme_tcp_try_recv(queue); if (result > 0) @@ -1318,6 +1344,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, queue->ctrl = ctrl; INIT_LIST_HEAD(&queue->send_list); spin_lock_init(&queue->lock); + mutex_init(&queue->send_mutex); INIT_WORK(&queue->io_work, nvme_tcp_io_work); queue->queue_size = queue_size; @@ -1506,6 +1533,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl, set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; set->reserved_tags = 2; /* connect + keep-alive */ set->numa_node = NUMA_NO_NODE; + set->flags = BLK_MQ_F_BLOCKING; set->cmd_size = sizeof(struct nvme_tcp_request); set->driver_data = ctrl; set->nr_hw_queues = 1; @@ -1517,7 +1545,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl, set->queue_depth = nctrl->sqsize + 1; set->reserved_tags = 1; /* fabric connect */ set->numa_node = NUMA_NO_NODE; - set->flags = BLK_MQ_F_SHOULD_MERGE; + set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; set->cmd_size = sizeof(struct nvme_tcp_request); set->driver_data = ctrl; set->nr_hw_queues = nctrl->queue_count - 1; @@ -2076,7 +2104,7 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg) ctrl->async_req.curr_bio = NULL; ctrl->async_req.data_len = 0; - nvme_tcp_queue_request(&ctrl->async_req); + nvme_tcp_queue_request(&ctrl->async_req, true); } static enum blk_eh_timer_return @@ -2207,7 +2235,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(rq); - nvme_tcp_queue_request(req); + nvme_tcp_queue_request(req, true); return BLK_STS_OK; } @@ -2265,9 +2293,11 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx) if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags)) return 0; + set_bit(NVME_TCP_Q_POLLING, &queue->flags); if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue)) sk_busy_loop(sk, true); nvme_tcp_try_recv(queue); + clear_bit(NVME_TCP_Q_POLLING, &queue->flags); return queue->nr_cqe; } |