summaryrefslogtreecommitdiffstats
path: root/drivers/nvme/host
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-07-15 23:20:22 +0200
committerLinus Torvalds <torvalds@linux-foundation.org>2024-07-15 23:20:22 +0200
commit3e7819886281e077e82006fe4804b0d6b0f5643b (patch)
tree40766af623d8a1dde0edaee8b6abc496efbcc615 /drivers/nvme/host
parentMerge tag 'for-6.11/io_uring-20240714' of git://git.kernel.dk/linux (diff)
parentfloppy: add missing MODULE_DESCRIPTION() macro (diff)
downloadlinux-3e7819886281e077e82006fe4804b0d6b0f5643b.tar.xz
linux-3e7819886281e077e82006fe4804b0d6b0f5643b.zip
Merge tag 'for-6.11/block-20240710' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe: - NVMe updates via Keith: - Device initialization memory leak fixes (Keith) - More constants defined (Weiwen) - Target debugfs support (Hannes) - PCIe subsystem reset enhancements (Keith) - Queue-depth multipath policy (Redhat and PureStorage) - Implement get_unique_id (Christoph) - Authentication error fixes (Gaosheng) - MD updates via Song - sync_action fix and refactoring (Yu Kuai) - Various small fixes (Christoph Hellwig, Li Nan, and Ofir Gal, Yu Kuai, Benjamin Marzinski, Christophe JAILLET, Yang Li) - Fix loop detach/open race (Gulam) - Fix lower control limit for blk-throttle (Yu) - Add module descriptions to various drivers (Jeff) - Add support for atomic writes for block devices, and statx reporting for same. Includes SCSI and NVMe (John, Prasad, Alan) - Add IO priority information to block trace points (Dongliang) - Various zone improvements and tweaks (Damien) - mq-deadline tag reservation improvements (Bart) - Ignore direct reclaim swap writes in writeback throttling (Baokun) - Block integrity improvements and fixes (Anuj) - Add basic support for rust based block drivers. Has a dummy null_blk variant for now (Andreas) - Series converting driver settings to queue limits, and cleanups and fixes related to that (Christoph) - Cleanup for poking too deeply into the bvec internals, in preparation for DMA mapping API changes (Christoph) - Various minor tweaks and fixes (Jiapeng, John, Kanchan, Mikulas, Ming, Zhu, Damien, Christophe, Chaitanya) * tag 'for-6.11/block-20240710' of git://git.kernel.dk/linux: (206 commits) floppy: add missing MODULE_DESCRIPTION() macro loop: add missing MODULE_DESCRIPTION() macro ublk_drv: add missing MODULE_DESCRIPTION() macro xen/blkback: add missing MODULE_DESCRIPTION() macro block/rnbd: Constify struct kobj_type block: take offset into account in blk_bvec_map_sg again block: fix get_max_segment_size() warning loop: Don't bother validating blocksize virtio_blk: Don't bother validating blocksize null_blk: Don't bother validating blocksize block: Validate logical block size in blk_validate_limits() virtio_blk: Fix default logical block size fallback nvmet-auth: fix nvmet_auth hash error handling nvme: implement ->get_unique_id block: pass a phys_addr_t to get_max_segment_size block: add a bvec_phys helper blk-lib: check for kill signal in ioctl BLKZEROOUT block: limit the Write Zeroes to manually writing zeroes fallback block: refacto blkdev_issue_zeroout block: move read-only and supported checks into (__)blkdev_issue_zeroout ...
Diffstat (limited to 'drivers/nvme/host')
-rw-r--r--drivers/nvme/host/Kconfig1
-rw-r--r--drivers/nvme/host/apple.c32
-rw-r--r--drivers/nvme/host/constants.c2
-rw-r--r--drivers/nvme/host/core.c286
-rw-r--r--drivers/nvme/host/fabrics.c25
-rw-r--r--drivers/nvme/host/fabrics.h1
-rw-r--r--drivers/nvme/host/fault_inject.c2
-rw-r--r--drivers/nvme/host/fc.c55
-rw-r--r--drivers/nvme/host/multipath.c144
-rw-r--r--drivers/nvme/host/nvme.h28
-rw-r--r--drivers/nvme/host/pci.c47
-rw-r--r--drivers/nvme/host/pr.c10
-rw-r--r--drivers/nvme/host/rdma.c34
-rw-r--r--drivers/nvme/host/tcp.c31
-rw-r--r--drivers/nvme/host/zns.c3
15 files changed, 506 insertions, 195 deletions
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index b309c8be720f..a3caef75aa0a 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -1,7 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
config NVME_CORE
tristate
- select BLK_DEV_INTEGRITY_T10 if BLK_DEV_INTEGRITY
config BLK_DEV_NVME
tristate "NVM Express block device"
diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c
index 0cfa39361d3b..b1387dc459a3 100644
--- a/drivers/nvme/host/apple.c
+++ b/drivers/nvme/host/apple.c
@@ -1388,7 +1388,7 @@ static void devm_apple_nvme_mempool_destroy(void *data)
mempool_destroy(data);
}
-static int apple_nvme_probe(struct platform_device *pdev)
+static struct apple_nvme *apple_nvme_alloc(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
struct apple_nvme *anv;
@@ -1396,7 +1396,7 @@ static int apple_nvme_probe(struct platform_device *pdev)
anv = devm_kzalloc(dev, sizeof(*anv), GFP_KERNEL);
if (!anv)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
anv->dev = get_device(dev);
anv->adminq.is_adminq = true;
@@ -1516,10 +1516,30 @@ static int apple_nvme_probe(struct platform_device *pdev)
goto put_dev;
}
+ return anv;
+put_dev:
+ put_device(anv->dev);
+ return ERR_PTR(ret);
+}
+
+static int apple_nvme_probe(struct platform_device *pdev)
+{
+ struct apple_nvme *anv;
+ int ret;
+
+ anv = apple_nvme_alloc(pdev);
+ if (IS_ERR(anv))
+ return PTR_ERR(anv);
+
+ ret = nvme_add_ctrl(&anv->ctrl);
+ if (ret)
+ goto out_put_ctrl;
+
anv->ctrl.admin_q = blk_mq_alloc_queue(&anv->admin_tagset, NULL, NULL);
if (IS_ERR(anv->ctrl.admin_q)) {
ret = -ENOMEM;
- goto put_dev;
+ anv->ctrl.admin_q = NULL;
+ goto out_uninit_ctrl;
}
nvme_reset_ctrl(&anv->ctrl);
@@ -1527,8 +1547,10 @@ static int apple_nvme_probe(struct platform_device *pdev)
return 0;
-put_dev:
- put_device(anv->dev);
+out_uninit_ctrl:
+ nvme_uninit_ctrl(&anv->ctrl);
+out_put_ctrl:
+ nvme_put_ctrl(&anv->ctrl);
return ret;
}
diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c
index 6f2ebb5fcdb0..2b9e6cfaf2a8 100644
--- a/drivers/nvme/host/constants.c
+++ b/drivers/nvme/host/constants.c
@@ -173,7 +173,7 @@ static const char * const nvme_statuses[] = {
const char *nvme_get_error_status_str(u16 status)
{
- status &= 0x7ff;
+ status &= NVME_SCT_SC_MASK;
if (status < ARRAY_SIZE(nvme_statuses) && nvme_statuses[status])
return nvme_statuses[status];
return "Unknown";
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 782090ce0bc1..19917253ba7b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -110,7 +110,7 @@ struct workqueue_struct *nvme_delete_wq;
EXPORT_SYMBOL_GPL(nvme_delete_wq);
static LIST_HEAD(nvme_subsystems);
-static DEFINE_MUTEX(nvme_subsystems_lock);
+DEFINE_MUTEX(nvme_subsystems_lock);
static DEFINE_IDA(nvme_instance_ida);
static dev_t nvme_ctrl_base_chr_devt;
@@ -261,7 +261,7 @@ void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
static blk_status_t nvme_error_status(u16 status)
{
- switch (status & 0x7ff) {
+ switch (status & NVME_SCT_SC_MASK) {
case NVME_SC_SUCCESS:
return BLK_STS_OK;
case NVME_SC_CAP_EXCEEDED:
@@ -307,7 +307,7 @@ static void nvme_retry_req(struct request *req)
u16 crd;
/* The mask and shift result must be <= 3 */
- crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
+ crd = (nvme_req(req)->status & NVME_STATUS_CRD) >> 11;
if (crd)
delay = nvme_req(req)->ctrl->crdt[crd - 1] * 100;
@@ -329,10 +329,10 @@ static void nvme_log_error(struct request *req)
nvme_sect_to_lba(ns->head, blk_rq_pos(req)),
blk_rq_bytes(req) >> ns->head->lba_shift,
nvme_get_error_status_str(nr->status),
- nr->status >> 8 & 7, /* Status Code Type */
- nr->status & 0xff, /* Status Code */
- nr->status & NVME_SC_MORE ? "MORE " : "",
- nr->status & NVME_SC_DNR ? "DNR " : "");
+ NVME_SCT(nr->status), /* Status Code Type */
+ nr->status & NVME_SC_MASK, /* Status Code */
+ nr->status & NVME_STATUS_MORE ? "MORE " : "",
+ nr->status & NVME_STATUS_DNR ? "DNR " : "");
return;
}
@@ -341,10 +341,10 @@ static void nvme_log_error(struct request *req)
nvme_get_admin_opcode_str(nr->cmd->common.opcode),
nr->cmd->common.opcode,
nvme_get_error_status_str(nr->status),
- nr->status >> 8 & 7, /* Status Code Type */
- nr->status & 0xff, /* Status Code */
- nr->status & NVME_SC_MORE ? "MORE " : "",
- nr->status & NVME_SC_DNR ? "DNR " : "");
+ NVME_SCT(nr->status), /* Status Code Type */
+ nr->status & NVME_SC_MASK, /* Status Code */
+ nr->status & NVME_STATUS_MORE ? "MORE " : "",
+ nr->status & NVME_STATUS_DNR ? "DNR " : "");
}
static void nvme_log_err_passthru(struct request *req)
@@ -359,10 +359,10 @@ static void nvme_log_err_passthru(struct request *req)
nvme_get_admin_opcode_str(nr->cmd->common.opcode),
nr->cmd->common.opcode,
nvme_get_error_status_str(nr->status),
- nr->status >> 8 & 7, /* Status Code Type */
- nr->status & 0xff, /* Status Code */
- nr->status & NVME_SC_MORE ? "MORE " : "",
- nr->status & NVME_SC_DNR ? "DNR " : "",
+ NVME_SCT(nr->status), /* Status Code Type */
+ nr->status & NVME_SC_MASK, /* Status Code */
+ nr->status & NVME_STATUS_MORE ? "MORE " : "",
+ nr->status & NVME_STATUS_DNR ? "DNR " : "",
nr->cmd->common.cdw10,
nr->cmd->common.cdw11,
nr->cmd->common.cdw12,
@@ -384,11 +384,11 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
return COMPLETE;
if (blk_noretry_request(req) ||
- (nvme_req(req)->status & NVME_SC_DNR) ||
+ (nvme_req(req)->status & NVME_STATUS_DNR) ||
nvme_req(req)->retries >= nvme_max_retries)
return COMPLETE;
- if ((nvme_req(req)->status & 0x7ff) == NVME_SC_AUTH_REQUIRED)
+ if ((nvme_req(req)->status & NVME_SCT_SC_MASK) == NVME_SC_AUTH_REQUIRED)
return AUTHENTICATE;
if (req->cmd_flags & REQ_NVME_MPATH) {
@@ -927,6 +927,36 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
return BLK_STS_OK;
}
+/*
+ * NVMe does not support a dedicated command to issue an atomic write. A write
+ * which does adhere to the device atomic limits will silently be executed
+ * non-atomically. The request issuer should ensure that the write is within
+ * the queue atomic writes limits, but just validate this in case it is not.
+ */
+static bool nvme_valid_atomic_write(struct request *req)
+{
+ struct request_queue *q = req->q;
+ u32 boundary_bytes = queue_atomic_write_boundary_bytes(q);
+
+ if (blk_rq_bytes(req) > queue_atomic_write_unit_max_bytes(q))
+ return false;
+
+ if (boundary_bytes) {
+ u64 mask = boundary_bytes - 1, imask = ~mask;
+ u64 start = blk_rq_pos(req) << SECTOR_SHIFT;
+ u64 end = start + blk_rq_bytes(req) - 1;
+
+ /* If greater then must be crossing a boundary */
+ if (blk_rq_bytes(req) > boundary_bytes)
+ return false;
+
+ if ((start & imask) != (end & imask))
+ return false;
+ }
+
+ return true;
+}
+
static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
struct request *req, struct nvme_command *cmnd,
enum nvme_opcode op)
@@ -942,6 +972,9 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
if (req->cmd_flags & REQ_RAHEAD)
dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
+ if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req))
+ return BLK_STS_INVAL;
+
cmnd->rw.opcode = op;
cmnd->rw.flags = 0;
cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
@@ -1224,7 +1257,7 @@ EXPORT_SYMBOL_NS_GPL(nvme_passthru_end, NVME_TARGET_PASSTHRU);
/*
* Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1:
- *
+ *
* The host should send Keep Alive commands at half of the Keep Alive Timeout
* accounting for transport roundtrip times [..].
*/
@@ -1724,11 +1757,12 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return 0;
}
-static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head)
+static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head,
+ struct queue_limits *lim)
{
- struct blk_integrity integrity = { };
+ struct blk_integrity *bi = &lim->integrity;
- blk_integrity_unregister(disk);
+ memset(bi, 0, sizeof(*bi));
if (!head->ms)
return true;
@@ -1745,17 +1779,16 @@ static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head)
case NVME_NS_DPS_PI_TYPE3:
switch (head->guard_type) {
case NVME_NVM_NS_16B_GUARD:
- integrity.profile = &t10_pi_type3_crc;
- integrity.tag_size = sizeof(u16) + sizeof(u32);
- integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
+ bi->csum_type = BLK_INTEGRITY_CSUM_CRC;
+ bi->tag_size = sizeof(u16) + sizeof(u32);
+ bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
break;
case NVME_NVM_NS_64B_GUARD:
- integrity.profile = &ext_pi_type3_crc64;
- integrity.tag_size = sizeof(u16) + 6;
- integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
+ bi->csum_type = BLK_INTEGRITY_CSUM_CRC64;
+ bi->tag_size = sizeof(u16) + 6;
+ bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
break;
default:
- integrity.profile = NULL;
break;
}
break;
@@ -1763,28 +1796,27 @@ static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head)
case NVME_NS_DPS_PI_TYPE2:
switch (head->guard_type) {
case NVME_NVM_NS_16B_GUARD:
- integrity.profile = &t10_pi_type1_crc;
- integrity.tag_size = sizeof(u16);
- integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
+ bi->csum_type = BLK_INTEGRITY_CSUM_CRC;
+ bi->tag_size = sizeof(u16);
+ bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE |
+ BLK_INTEGRITY_REF_TAG;
break;
case NVME_NVM_NS_64B_GUARD:
- integrity.profile = &ext_pi_type1_crc64;
- integrity.tag_size = sizeof(u16);
- integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
+ bi->csum_type = BLK_INTEGRITY_CSUM_CRC64;
+ bi->tag_size = sizeof(u16);
+ bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE |
+ BLK_INTEGRITY_REF_TAG;
break;
default:
- integrity.profile = NULL;
break;
}
break;
default:
- integrity.profile = NULL;
break;
}
- integrity.tuple_size = head->ms;
- integrity.pi_offset = head->pi_offset;
- blk_integrity_register(disk, &integrity);
+ bi->tuple_size = head->ms;
+ bi->pi_offset = head->pi_offset;
return true;
}
@@ -1922,6 +1954,23 @@ static void nvme_configure_metadata(struct nvme_ctrl *ctrl,
}
}
+
+static void nvme_update_atomic_write_disk_info(struct nvme_ns *ns,
+ struct nvme_id_ns *id, struct queue_limits *lim,
+ u32 bs, u32 atomic_bs)
+{
+ unsigned int boundary = 0;
+
+ if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) {
+ if (le16_to_cpu(id->nabspf))
+ boundary = (le16_to_cpu(id->nabspf) + 1) * bs;
+ }
+ lim->atomic_write_hw_max = atomic_bs;
+ lim->atomic_write_hw_boundary = boundary;
+ lim->atomic_write_hw_unit_min = bs;
+ lim->atomic_write_hw_unit_max = rounddown_pow_of_two(atomic_bs);
+}
+
static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl)
{
return ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> SECTOR_SHIFT) + 1;
@@ -1968,13 +2017,16 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
else
atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
+
+ nvme_update_atomic_write_disk_info(ns, id, lim, bs, atomic_bs);
}
if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
/* NPWG = Namespace Preferred Write Granularity */
phys_bs = bs * (1 + le16_to_cpu(id->npwg));
/* NOWS = Namespace Optimal Write Size */
- io_opt = bs * (1 + le16_to_cpu(id->nows));
+ if (id->nows)
+ io_opt = bs * (1 + le16_to_cpu(id->nows));
}
/*
@@ -2058,7 +2110,6 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns,
static int nvme_update_ns_info_block(struct nvme_ns *ns,
struct nvme_ns_info *info)
{
- bool vwc = ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT;
struct queue_limits lim;
struct nvme_id_ns_nvm *nvm = NULL;
struct nvme_zone_info zi = {};
@@ -2107,11 +2158,11 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
ns->head->ids.csi == NVME_CSI_ZNS)
nvme_update_zone_info(ns, &lim, &zi);
- ret = queue_limits_commit_update(ns->disk->queue, &lim);
- if (ret) {
- blk_mq_unfreeze_queue(ns->disk->queue);
- goto out;
- }
+
+ if (ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT)
+ lim.features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA;
+ else
+ lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA);
/*
* Register a metadata profile for PI, or the plain non-integrity NVMe
@@ -2119,9 +2170,15 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
* I/O to namespaces with metadata except when the namespace supports
* PI, as it can strip/insert in that case.
*/
- if (!nvme_init_integrity(ns->disk, ns->head))
+ if (!nvme_init_integrity(ns->disk, ns->head, &lim))
capacity = 0;
+ ret = queue_limits_commit_update(ns->disk->queue, &lim);
+ if (ret) {
+ blk_mq_unfreeze_queue(ns->disk->queue);
+ goto out;
+ }
+
set_capacity_and_notify(ns->disk, capacity);
/*
@@ -2133,7 +2190,6 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
ns->head->features |= NVME_NS_DEAC;
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
- blk_queue_write_cache(ns->disk->queue, vwc, vwc);
set_bit(NVME_NS_READY, &ns->flags);
blk_mq_unfreeze_queue(ns->disk->queue);
@@ -2193,14 +2249,6 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
struct queue_limits lim;
blk_mq_freeze_queue(ns->head->disk->queue);
- if (unsupported)
- ns->head->disk->flags |= GENHD_FL_HIDDEN;
- else
- nvme_init_integrity(ns->head->disk, ns->head);
- set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk));
- set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
- nvme_mpath_revalidate_paths(ns);
-
/*
* queue_limits mixes values that are the hardware limitations
* for bio splitting with what is the device configuration.
@@ -2223,13 +2271,48 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
lim.io_opt = ns_lim->io_opt;
queue_limits_stack_bdev(&lim, ns->disk->part0, 0,
ns->head->disk->disk_name);
+ if (unsupported)
+ ns->head->disk->flags |= GENHD_FL_HIDDEN;
+ else
+ nvme_init_integrity(ns->head->disk, ns->head, &lim);
ret = queue_limits_commit_update(ns->head->disk->queue, &lim);
+
+ set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk));
+ set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
+ nvme_mpath_revalidate_paths(ns);
+
blk_mq_unfreeze_queue(ns->head->disk->queue);
}
return ret;
}
+int nvme_ns_get_unique_id(struct nvme_ns *ns, u8 id[16],
+ enum blk_unique_id type)
+{
+ struct nvme_ns_ids *ids = &ns->head->ids;
+
+ if (type != BLK_UID_EUI64)
+ return -EINVAL;
+
+ if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) {
+ memcpy(id, &ids->nguid, sizeof(ids->nguid));
+ return sizeof(ids->nguid);
+ }
+ if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) {
+ memcpy(id, &ids->eui64, sizeof(ids->eui64));
+ return sizeof(ids->eui64);
+ }
+
+ return -EINVAL;
+}
+
+static int nvme_get_unique_id(struct gendisk *disk, u8 id[16],
+ enum blk_unique_id type)
+{
+ return nvme_ns_get_unique_id(disk->private_data, id, type);
+}
+
#ifdef CONFIG_BLK_SED_OPAL
static int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
bool send)
@@ -2285,6 +2368,7 @@ const struct block_device_operations nvme_bdev_ops = {
.open = nvme_open,
.release = nvme_release,
.getgeo = nvme_getgeo,
+ .get_unique_id = nvme_get_unique_id,
.report_zones = nvme_report_zones,
.pr_ops = &nvme_pr_ops,
};
@@ -3721,6 +3805,7 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
{
+ struct queue_limits lim = { };
struct nvme_ns *ns;
struct gendisk *disk;
int node = ctrl->numa_node;
@@ -3729,7 +3814,13 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
if (!ns)
return;
- disk = blk_mq_alloc_disk(ctrl->tagset, NULL, ns);
+ if (ctrl->opts && ctrl->opts->data_digest)
+ lim.features |= BLK_FEAT_STABLE_WRITES;
+ if (ctrl->ops->supports_pci_p2pdma &&
+ ctrl->ops->supports_pci_p2pdma(ctrl))
+ lim.features |= BLK_FEAT_PCI_P2PDMA;
+
+ disk = blk_mq_alloc_disk(ctrl->tagset, &lim, ns);
if (IS_ERR(disk))
goto out_free_ns;
disk->fops = &nvme_bdev_ops;
@@ -3737,15 +3828,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
ns->disk = disk;
ns->queue = disk->queue;
-
- if (ctrl->opts && ctrl->opts->data_digest)
- blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue);
-
- blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
- if (ctrl->ops->supports_pci_p2pdma &&
- ctrl->ops->supports_pci_p2pdma(ctrl))
- blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
-
ns->ctrl = ctrl;
kref_init(&ns->kref);
@@ -3887,7 +3969,7 @@ static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
{
- int ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
+ int ret = NVME_SC_INVALID_NS | NVME_STATUS_DNR;
if (!nvme_ns_ids_equal(&ns->head->ids, &info->ids)) {
dev_err(ns->ctrl->device,
@@ -3903,7 +3985,7 @@ out:
*
* TODO: we should probably schedule a delayed retry here.
*/
- if (ret > 0 && (ret & NVME_SC_DNR))
+ if (ret > 0 && (ret & NVME_STATUS_DNR))
nvme_ns_remove(ns);
}
@@ -4095,7 +4177,7 @@ static void nvme_scan_work(struct work_struct *work)
* they report) but don't actually support it.
*/
ret = nvme_scan_ns_list(ctrl);
- if (ret > 0 && ret & NVME_SC_DNR)
+ if (ret > 0 && ret & NVME_STATUS_DNR)
nvme_scan_ns_sequential(ctrl);
}
mutex_unlock(&ctrl->scan_lock);
@@ -4489,13 +4571,15 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
return ret;
if (ctrl->ops->flags & NVME_F_FABRICS) {
- ctrl->connect_q = blk_mq_alloc_queue(set, NULL, NULL);
+ struct queue_limits lim = {
+ .features = BLK_FEAT_SKIP_TAGSET_QUIESCE,
+ };
+
+ ctrl->connect_q = blk_mq_alloc_queue(set, &lim, NULL);
if (IS_ERR(ctrl->connect_q)) {
ret = PTR_ERR(ctrl->connect_q);
goto out_free_tag_set;
}
- blk_queue_flag_set(QUEUE_FLAG_SKIP_TAGSET_QUIESCE,
- ctrl->connect_q);
}
ctrl->tagset = set;
@@ -4613,6 +4697,9 @@ static void nvme_free_ctrl(struct device *dev)
* Initialize a NVMe controller structures. This needs to be called during
* earliest initialization so that we have the initialized structured around
* during probing.
+ *
+ * On success, the caller must use the nvme_put_ctrl() to release this when
+ * needed, which also invokes the ops->free_ctrl() callback.
*/
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
const struct nvme_ctrl_ops *ops, unsigned long quirks)
@@ -4661,6 +4748,12 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
goto out;
ctrl->instance = ret;
+ ret = nvme_auth_init_ctrl(ctrl);
+ if (ret)
+ goto out_release_instance;
+
+ nvme_mpath_init_ctrl(ctrl);
+
device_initialize(&ctrl->ctrl_device);
ctrl->device = &ctrl->ctrl_device;
ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt),
@@ -4673,16 +4766,36 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
ctrl->device->groups = nvme_dev_attr_groups;
ctrl->device->release = nvme_free_ctrl;
dev_set_drvdata(ctrl->device, ctrl);
+
+ return ret;
+
+out_release_instance:
+ ida_free(&nvme_instance_ida, ctrl->instance);
+out:
+ if (ctrl->discard_page)
+ __free_page(ctrl->discard_page);
+ cleanup_srcu_struct(&ctrl->srcu);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_init_ctrl);
+
+/*
+ * On success, returns with an elevated controller reference and caller must
+ * use nvme_uninit_ctrl() to properly free resources associated with the ctrl.
+ */
+int nvme_add_ctrl(struct nvme_ctrl *ctrl)
+{
+ int ret;
+
ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
if (ret)
- goto out_release_instance;
+ return ret;
- nvme_get_ctrl(ctrl);
cdev_init(&ctrl->cdev, &nvme_dev_fops);
- ctrl->cdev.owner = ops->module;
+ ctrl->cdev.owner = ctrl->ops->module;
ret = cdev_device_add(&ctrl->cdev, ctrl->device);
if (ret)
- goto out_free_name;
+ return ret;
/*
* Initialize latency tolerance controls. The sysfs files won't
@@ -4693,28 +4806,11 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
min(default_ps_max_latency_us, (unsigned long)S32_MAX));
nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
- nvme_mpath_init_ctrl(ctrl);
- ret = nvme_auth_init_ctrl(ctrl);
- if (ret)
- goto out_free_cdev;
+ nvme_get_ctrl(ctrl);
return 0;
-out_free_cdev:
- nvme_fault_inject_fini(&ctrl->fault_inject);
- dev_pm_qos_hide_latency_tolerance(ctrl->device);
- cdev_device_del(&ctrl->cdev, ctrl->device);
-out_free_name:
- nvme_put_ctrl(ctrl);
- kfree_const(ctrl->device->kobj.name);
-out_release_instance:
- ida_free(&nvme_instance_ida, ctrl->instance);
-out:
- if (ctrl->discard_page)
- __free_page(ctrl->discard_page);
- cleanup_srcu_struct(&ctrl->srcu);
- return ret;
}
-EXPORT_SYMBOL_GPL(nvme_init_ctrl);
+EXPORT_SYMBOL_GPL(nvme_add_ctrl);
/* let I/O to all namespaces fail in preparation for surprise removal */
void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index ceb9c0ed3120..44e342a46f39 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -187,7 +187,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
if (unlikely(ret != 0))
dev_err(ctrl->device,
"Property Get error: %d, offset %#x\n",
- ret > 0 ? ret & ~NVME_SC_DNR : ret, off);
+ ret > 0 ? ret & ~NVME_STATUS_DNR : ret, off);
return ret;
}
@@ -233,7 +233,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
if (unlikely(ret != 0))
dev_err(ctrl->device,
"Property Get error: %d, offset %#x\n",
- ret > 0 ? ret & ~NVME_SC_DNR : ret, off);
+ ret > 0 ? ret & ~NVME_STATUS_DNR : ret, off);
return ret;
}
EXPORT_SYMBOL_GPL(nvmf_reg_read64);
@@ -275,11 +275,26 @@ int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
if (unlikely(ret))
dev_err(ctrl->device,
"Property Set error: %d, offset %#x\n",
- ret > 0 ? ret & ~NVME_SC_DNR : ret, off);
+ ret > 0 ? ret & ~NVME_STATUS_DNR : ret, off);
return ret;
}
EXPORT_SYMBOL_GPL(nvmf_reg_write32);
+int nvmf_subsystem_reset(struct nvme_ctrl *ctrl)
+{
+ int ret;
+
+ if (!nvme_wait_reset(ctrl))
+ return -EBUSY;
+
+ ret = ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, NVME_SUBSYS_RESET);
+ if (ret)
+ return ret;
+
+ return nvme_try_sched_reset(ctrl);
+}
+EXPORT_SYMBOL_GPL(nvmf_subsystem_reset);
+
/**
* nvmf_log_connect_error() - Error-parsing-diagnostic print out function for
* connect() errors.
@@ -295,7 +310,7 @@ static void nvmf_log_connect_error(struct nvme_ctrl *ctrl,
int errval, int offset, struct nvme_command *cmd,
struct nvmf_connect_data *data)
{
- int err_sctype = errval & ~NVME_SC_DNR;
+ int err_sctype = errval & ~NVME_STATUS_DNR;
if (errval < 0) {
dev_err(ctrl->device,
@@ -573,7 +588,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_io_queue);
*/
bool nvmf_should_reconnect(struct nvme_ctrl *ctrl, int status)
{
- if (status > 0 && (status & NVME_SC_DNR))
+ if (status > 0 && (status & NVME_STATUS_DNR))
return false;
if (status == -EKEYREJECTED)
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 602135910ae9..21d75dc4a3a0 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -217,6 +217,7 @@ static inline unsigned int nvmf_nr_io_queues(struct nvmf_ctrl_options *opts)
int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val);
int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val);
int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val);
+int nvmf_subsystem_reset(struct nvme_ctrl *ctrl);
int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl);
int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid);
int nvmf_register_transport(struct nvmf_transport_ops *ops);
diff --git a/drivers/nvme/host/fault_inject.c b/drivers/nvme/host/fault_inject.c
index 1ba10a5c656d..1d1b6441a339 100644
--- a/drivers/nvme/host/fault_inject.c
+++ b/drivers/nvme/host/fault_inject.c
@@ -75,7 +75,7 @@ void nvme_should_fail(struct request *req)
/* inject status code and DNR bit */
status = fault_inject->status;
if (fault_inject->dont_retry)
- status |= NVME_SC_DNR;
+ status |= NVME_STATUS_DNR;
nvme_req(req)->status = status;
}
}
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index f0b081332749..b81af7919e94 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -3132,7 +3132,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
if (ctrl->ctrl.icdoff) {
dev_err(ctrl->ctrl.device, "icdoff %d is not supported!\n",
ctrl->ctrl.icdoff);
- ret = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+ ret = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
goto out_stop_keep_alive;
}
@@ -3140,7 +3140,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
if (!nvme_ctrl_sgl_supported(&ctrl->ctrl)) {
dev_err(ctrl->ctrl.device,
"Mandatory sgls are not supported!\n");
- ret = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+ ret = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
goto out_stop_keep_alive;
}
@@ -3325,7 +3325,7 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status)
queue_delayed_work(nvme_wq, &ctrl->connect_work, recon_delay);
} else {
if (portptr->port_state == FC_OBJSTATE_ONLINE) {
- if (status > 0 && (status & NVME_SC_DNR))
+ if (status > 0 && (status & NVME_STATUS_DNR))
dev_warn(ctrl->ctrl.device,
"NVME-FC{%d}: reconnect failure\n",
ctrl->cnum);
@@ -3382,6 +3382,7 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
.reg_read32 = nvmf_reg_read32,
.reg_read64 = nvmf_reg_read64,
.reg_write32 = nvmf_reg_write32,
+ .subsystem_reset = nvmf_subsystem_reset,
.free_ctrl = nvme_fc_free_ctrl,
.submit_async_event = nvme_fc_submit_async_event,
.delete_ctrl = nvme_fc_delete_ctrl,
@@ -3444,12 +3445,11 @@ nvme_fc_existing_controller(struct nvme_fc_rport *rport,
return found;
}
-static struct nvme_ctrl *
-nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
+static struct nvme_fc_ctrl *
+nvme_fc_alloc_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
struct nvme_fc_lport *lport, struct nvme_fc_rport *rport)
{
struct nvme_fc_ctrl *ctrl;
- unsigned long flags;
int ret, idx, ctrl_loss_tmo;
if (!(rport->remoteport.port_role &
@@ -3538,7 +3538,35 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
if (lport->dev)
ctrl->ctrl.numa_node = dev_to_node(lport->dev);
- /* at this point, teardown path changes to ref counting on nvme ctrl */
+ return ctrl;
+
+out_free_queues:
+ kfree(ctrl->queues);
+out_free_ida:
+ put_device(ctrl->dev);
+ ida_free(&nvme_fc_ctrl_cnt, ctrl->cnum);
+out_free_ctrl:
+ kfree(ctrl);
+out_fail:
+ /* exit via here doesn't follow ctlr ref points */
+ return ERR_PTR(ret);
+}
+
+static struct nvme_ctrl *
+nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
+ struct nvme_fc_lport *lport, struct nvme_fc_rport *rport)
+{
+ struct nvme_fc_ctrl *ctrl;
+ unsigned long flags;
+ int ret;
+
+ ctrl = nvme_fc_alloc_ctrl(dev, opts, lport, rport);
+ if (IS_ERR(ctrl))
+ return ERR_CAST(ctrl);
+
+ ret = nvme_add_ctrl(&ctrl->ctrl);
+ if (ret)
+ goto out_put_ctrl;
ret = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set,
&nvme_fc_admin_mq_ops,
@@ -3584,6 +3612,7 @@ fail_ctrl:
/* initiate nvme ctrl ref counting teardown */
nvme_uninit_ctrl(&ctrl->ctrl);
+out_put_ctrl:
/* Remove core ctrl ref. */
nvme_put_ctrl(&ctrl->ctrl);
@@ -3597,20 +3626,8 @@ fail_ctrl:
nvme_fc_rport_get(rport);
return ERR_PTR(-EIO);
-
-out_free_queues:
- kfree(ctrl->queues);
-out_free_ida:
- put_device(ctrl->dev);
- ida_free(&nvme_fc_ctrl_cnt, ctrl->cnum);
-out_free_ctrl:
- kfree(ctrl);
-out_fail:
- /* exit via here doesn't follow ctlr ref points */
- return ERR_PTR(ret);
}
-
struct nvmet_fc_traddr {
u64 nn;
u64 pn;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index d8b6b4648eaf..91d9eb3c22ef 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -17,6 +17,7 @@ MODULE_PARM_DESC(multipath,
static const char *nvme_iopolicy_names[] = {
[NVME_IOPOLICY_NUMA] = "numa",
[NVME_IOPOLICY_RR] = "round-robin",
+ [NVME_IOPOLICY_QD] = "queue-depth",
};
static int iopolicy = NVME_IOPOLICY_NUMA;
@@ -29,6 +30,8 @@ static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
iopolicy = NVME_IOPOLICY_NUMA;
else if (!strncmp(val, "round-robin", 11))
iopolicy = NVME_IOPOLICY_RR;
+ else if (!strncmp(val, "queue-depth", 11))
+ iopolicy = NVME_IOPOLICY_QD;
else
return -EINVAL;
@@ -43,7 +46,7 @@ static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
&iopolicy, 0644);
MODULE_PARM_DESC(iopolicy,
- "Default multipath I/O policy; 'numa' (default) or 'round-robin'");
+ "Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");
void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
{
@@ -83,7 +86,7 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
void nvme_failover_req(struct request *req)
{
struct nvme_ns *ns = req->q->queuedata;
- u16 status = nvme_req(req)->status & 0x7ff;
+ u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
unsigned long flags;
struct bio *bio;
@@ -128,6 +131,11 @@ void nvme_mpath_start_request(struct request *rq)
struct nvme_ns *ns = rq->q->queuedata;
struct gendisk *disk = ns->head->disk;
+ if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD) {
+ atomic_inc(&ns->ctrl->nr_active);
+ nvme_req(rq)->flags |= NVME_MPATH_CNT_ACTIVE;
+ }
+
if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
return;
@@ -141,6 +149,9 @@ void nvme_mpath_end_request(struct request *rq)
{
struct nvme_ns *ns = rq->q->queuedata;
+ if (nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE)
+ atomic_dec_if_positive(&ns->ctrl->nr_active);
+
if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
return;
bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
@@ -291,10 +302,15 @@ static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
}
-static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
- int node, struct nvme_ns *old)
+static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head)
{
struct nvme_ns *ns, *found = NULL;
+ int node = numa_node_id();
+ struct nvme_ns *old = srcu_dereference(head->current_path[node],
+ &head->srcu);
+
+ if (unlikely(!old))
+ return __nvme_find_path(head, node);
if (list_is_singular(&head->list)) {
if (nvme_path_is_disabled(old))
@@ -334,13 +350,49 @@ out:
return found;
}
+static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
+{
+ struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
+ unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
+ unsigned int depth;
+
+ list_for_each_entry_rcu(ns, &head->list, siblings) {
+ if (nvme_path_is_disabled(ns))
+ continue;
+
+ depth = atomic_read(&ns->ctrl->nr_active);
+
+ switch (ns->ana_state) {
+ case NVME_ANA_OPTIMIZED:
+ if (depth < min_depth_opt) {
+ min_depth_opt = depth;
+ best_opt = ns;
+ }
+ break;
+ case NVME_ANA_NONOPTIMIZED:
+ if (depth < min_depth_nonopt) {
+ min_depth_nonopt = depth;
+ best_nonopt = ns;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (min_depth_opt == 0)
+ return best_opt;
+ }
+
+ return best_opt ? best_opt : best_nonopt;
+}
+
static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
return nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE &&
ns->ana_state == NVME_ANA_OPTIMIZED;
}
-inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
+static struct nvme_ns *nvme_numa_path(struct nvme_ns_head *head)
{
int node = numa_node_id();
struct nvme_ns *ns;
@@ -348,14 +400,23 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
ns = srcu_dereference(head->current_path[node], &head->srcu);
if (unlikely(!ns))
return __nvme_find_path(head, node);
-
- if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
- return nvme_round_robin_path(head, node, ns);
if (unlikely(!nvme_path_is_optimized(ns)))
return __nvme_find_path(head, node);
return ns;
}
+inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
+{
+ switch (READ_ONCE(head->subsys->iopolicy)) {
+ case NVME_IOPOLICY_QD:
+ return nvme_queue_depth_path(head);
+ case NVME_IOPOLICY_RR:
+ return nvme_round_robin_path(head);
+ default:
+ return nvme_numa_path(head);
+ }
+}
+
static bool nvme_available_path(struct nvme_ns_head *head)
{
struct nvme_ns *ns;
@@ -427,6 +488,21 @@ static void nvme_ns_head_release(struct gendisk *disk)
nvme_put_ns_head(disk->private_data);
}
+static int nvme_ns_head_get_unique_id(struct gendisk *disk, u8 id[16],
+ enum blk_unique_id type)
+{
+ struct nvme_ns_head *head = disk->private_data;
+ struct nvme_ns *ns;
+ int srcu_idx, ret = -EWOULDBLOCK;
+
+ srcu_idx = srcu_read_lock(&head->srcu);
+ ns = nvme_find_path(head);
+ if (ns)
+ ret = nvme_ns_get_unique_id(ns, id, type);
+ srcu_read_unlock(&head->srcu, srcu_idx);
+ return ret;
+}
+
#ifdef CONFIG_BLK_DEV_ZONED
static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data)
@@ -454,6 +530,7 @@ const struct block_device_operations nvme_ns_head_ops = {
.ioctl = nvme_ns_head_ioctl,
.compat_ioctl = blkdev_compat_ptr_ioctl,
.getgeo = nvme_getgeo,
+ .get_unique_id = nvme_ns_head_get_unique_id,
.report_zones = nvme_ns_head_report_zones,
.pr_ops = &nvme_pr_ops,
};
@@ -521,7 +598,6 @@ static void nvme_requeue_work(struct work_struct *work)
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
struct queue_limits lim;
- bool vwc = false;
mutex_init(&head->lock);
bio_list_init(&head->requeue_list);
@@ -539,6 +615,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
blk_set_stacking_limits(&lim);
lim.dma_alignment = 3;
+ lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT | BLK_FEAT_POLL;
if (head->ids.csi != NVME_CSI_ZNS)
lim.max_zone_append_sectors = 0;
@@ -549,24 +626,6 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
head->disk->private_data = head;
sprintf(head->disk->disk_name, "nvme%dn%d",
ctrl->subsys->instance, head->instance);
-
- blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue);
- blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue);
- blk_queue_flag_set(QUEUE_FLAG_IO_STAT, head->disk->queue);
- /*
- * This assumes all controllers that refer to a namespace either
- * support poll queues or not. That is not a strict guarantee,
- * but if the assumption is wrong the effect is only suboptimal
- * performance but not correctness problem.
- */
- if (ctrl->tagset->nr_maps > HCTX_TYPE_POLL &&
- ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues)
- blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue);
-
- /* we need to propagate up the VMC settings */
- if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
- vwc = true;
- blk_queue_write_cache(head->disk->queue, vwc, vwc);
return 0;
}
@@ -803,6 +862,29 @@ static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
}
+static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys,
+ int iopolicy)
+{
+ struct nvme_ctrl *ctrl;
+ int old_iopolicy = READ_ONCE(subsys->iopolicy);
+
+ if (old_iopolicy == iopolicy)
+ return;
+
+ WRITE_ONCE(subsys->iopolicy, iopolicy);
+
+ /* iopolicy changes clear the mpath by design */
+ mutex_lock(&nvme_subsystems_lock);
+ list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
+ nvme_mpath_clear_ctrl_paths(ctrl);
+ mutex_unlock(&nvme_subsystems_lock);
+
+ pr_notice("subsysnqn %s iopolicy changed from %s to %s\n",
+ subsys->subnqn,
+ nvme_iopolicy_names[old_iopolicy],
+ nvme_iopolicy_names[iopolicy]);
+}
+
static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
@@ -812,7 +894,7 @@ static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
- WRITE_ONCE(subsys->iopolicy, i);
+ nvme_subsys_iopolicy_update(subsys, i);
return count;
}
}
@@ -875,9 +957,6 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
nvme_mpath_set_live(ns);
}
- if (blk_queue_stable_writes(ns->queue) && ns->head->disk)
- blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES,
- ns->head->disk->queue);
#ifdef CONFIG_BLK_DEV_ZONED
if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
ns->head->disk->nr_zones = ns->disk->nr_zones;
@@ -923,6 +1002,9 @@ int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
!(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
return 0;
+ /* initialize this in the identify path to cover controller resets */
+ atomic_set(&ctrl->nr_active, 0);
+
if (!ctrl->max_namespaces ||
ctrl->max_namespaces > le32_to_cpu(id->nn)) {
dev_err(ctrl->device,
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 68b400f9c42d..f900e44243ae 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -49,6 +49,7 @@ extern unsigned int admin_timeout;
extern struct workqueue_struct *nvme_wq;
extern struct workqueue_struct *nvme_reset_wq;
extern struct workqueue_struct *nvme_delete_wq;
+extern struct mutex nvme_subsystems_lock;
/*
* List of workarounds for devices that required behavior not specified in
@@ -195,6 +196,7 @@ enum {
NVME_REQ_CANCELLED = (1 << 0),
NVME_REQ_USERCMD = (1 << 1),
NVME_MPATH_IO_STATS = (1 << 2),
+ NVME_MPATH_CNT_ACTIVE = (1 << 3),
};
static inline struct nvme_request *nvme_req(struct request *req)
@@ -360,6 +362,7 @@ struct nvme_ctrl {
size_t ana_log_size;
struct timer_list anatt_timer;
struct work_struct ana_work;
+ atomic_t nr_active;
#endif
#ifdef CONFIG_NVME_HOST_AUTH
@@ -408,6 +411,7 @@ static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
enum nvme_iopolicy {
NVME_IOPOLICY_NUMA,
NVME_IOPOLICY_RR,
+ NVME_IOPOLICY_QD,
};
struct nvme_subsystem {
@@ -551,6 +555,7 @@ struct nvme_ctrl_ops {
int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
void (*free_ctrl)(struct nvme_ctrl *ctrl);
void (*submit_async_event)(struct nvme_ctrl *ctrl);
+ int (*subsystem_reset)(struct nvme_ctrl *ctrl);
void (*delete_ctrl)(struct nvme_ctrl *ctrl);
void (*stop_ctrl)(struct nvme_ctrl *ctrl);
int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
@@ -649,18 +654,9 @@ int nvme_try_sched_reset(struct nvme_ctrl *ctrl);
static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl)
{
- int ret;
-
- if (!ctrl->subsystem)
+ if (!ctrl->subsystem || !ctrl->ops->subsystem_reset)
return -ENOTTY;
- if (!nvme_wait_reset(ctrl))
- return -EBUSY;
-
- ret = ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65);
- if (ret)
- return ret;
-
- return nvme_try_sched_reset(ctrl);
+ return ctrl->ops->subsystem_reset(ctrl);
}
/*
@@ -689,7 +685,7 @@ static inline u32 nvme_bytes_to_numd(size_t len)
static inline bool nvme_is_ana_error(u16 status)
{
- switch (status & 0x7ff) {
+ switch (status & NVME_SCT_SC_MASK) {
case NVME_SC_ANA_TRANSITION:
case NVME_SC_ANA_INACCESSIBLE:
case NVME_SC_ANA_PERSISTENT_LOSS:
@@ -702,7 +698,7 @@ static inline bool nvme_is_ana_error(u16 status)
static inline bool nvme_is_path_error(u16 status)
{
/* check for a status code type of 'path related status' */
- return (status & 0x700) == 0x300;
+ return (status & NVME_SCT_MASK) == NVME_SCT_PATH;
}
/*
@@ -792,6 +788,7 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown);
int nvme_enable_ctrl(struct nvme_ctrl *ctrl);
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
const struct nvme_ctrl_ops *ops, unsigned long quirks);
+int nvme_add_ctrl(struct nvme_ctrl *ctrl);
void nvme_uninit_ctrl(struct nvme_ctrl *ctrl);
void nvme_start_ctrl(struct nvme_ctrl *ctrl);
void nvme_stop_ctrl(struct nvme_ctrl *ctrl);
@@ -877,7 +874,7 @@ enum {
NVME_SUBMIT_NOWAIT = (__force nvme_submit_flags_t)(1 << 1),
/* Set BLK_MQ_REQ_RESERVED when allocating request */
NVME_SUBMIT_RESERVED = (__force nvme_submit_flags_t)(1 << 2),
- /* Retry command when NVME_SC_DNR is not set in the result */
+ /* Retry command when NVME_STATUS_DNR is not set in the result */
NVME_SUBMIT_RETRY = (__force nvme_submit_flags_t)(1 << 3),
};
@@ -1062,6 +1059,9 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
}
#endif /* CONFIG_NVME_MULTIPATH */
+int nvme_ns_get_unique_id(struct nvme_ns *ns, u8 id[16],
+ enum blk_unique_id type);
+
struct nvme_zone_info {
u64 zone_size;
unsigned int max_open_zones;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 102a9fb0c65f..21f8e1b9801f 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -826,9 +826,9 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
struct nvme_command *cmnd)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct bio_vec bv = rq_integrity_vec(req);
- iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req),
- rq_dma_dir(req), 0);
+ iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
if (dma_mapping_error(dev->dev, iod->meta_dma))
return BLK_STS_IOERR;
cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
@@ -967,7 +967,7 @@ static __always_inline void nvme_pci_unmap_rq(struct request *req)
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
dma_unmap_page(dev->dev, iod->meta_dma,
- rq_integrity_vec(req)->bv_len, rq_dma_dir(req));
+ rq_integrity_vec(req).bv_len, rq_dma_dir(req));
}
if (blk_rq_nr_phys_segments(req))
@@ -1143,6 +1143,41 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
spin_unlock(&nvmeq->sq_lock);
}
+static int nvme_pci_subsystem_reset(struct nvme_ctrl *ctrl)
+{
+ struct nvme_dev *dev = to_nvme_dev(ctrl);
+ int ret = 0;
+
+ /*
+ * Taking the shutdown_lock ensures the BAR mapping is not being
+ * altered by reset_work. Holding this lock before the RESETTING state
+ * change, if successful, also ensures nvme_remove won't be able to
+ * proceed to iounmap until we're done.
+ */
+ mutex_lock(&dev->shutdown_lock);
+ if (!dev->bar_mapped_size) {
+ ret = -ENODEV;
+ goto unlock;
+ }
+
+ if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
+ ret = -EBUSY;
+ goto unlock;
+ }
+
+ writel(NVME_SUBSYS_RESET, dev->bar + NVME_REG_NSSR);
+ nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE);
+
+ /*
+ * Read controller status to flush the previous write and trigger a
+ * pcie read error.
+ */
+ readl(dev->bar + NVME_REG_CSTS);
+unlock:
+ mutex_unlock(&dev->shutdown_lock);
+ return ret;
+}
+
static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
struct nvme_command c = { };
@@ -2859,6 +2894,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.reg_read64 = nvme_pci_reg_read64,
.free_ctrl = nvme_pci_free_ctrl,
.submit_async_event = nvme_pci_submit_async_event,
+ .subsystem_reset = nvme_pci_subsystem_reset,
.get_address = nvme_pci_get_address,
.print_device_info = nvme_pci_print_device_info,
.supports_pci_p2pdma = nvme_pci_supports_pci_p2pdma,
@@ -3015,6 +3051,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (IS_ERR(dev))
return PTR_ERR(dev);
+ result = nvme_add_ctrl(&dev->ctrl);
+ if (result)
+ goto out_put_ctrl;
+
result = nvme_dev_map(dev);
if (result)
goto out_uninit_ctrl;
@@ -3101,6 +3141,7 @@ out_dev_unmap:
nvme_dev_unmap(dev);
out_uninit_ctrl:
nvme_uninit_ctrl(&dev->ctrl);
+out_put_ctrl:
nvme_put_ctrl(&dev->ctrl);
return result;
}
diff --git a/drivers/nvme/host/pr.c b/drivers/nvme/host/pr.c
index 8fa1ffcdaed4..7347ddf85f00 100644
--- a/drivers/nvme/host/pr.c
+++ b/drivers/nvme/host/pr.c
@@ -72,12 +72,12 @@ static int nvme_send_ns_pr_command(struct nvme_ns *ns, struct nvme_command *c,
return nvme_submit_sync_cmd(ns->queue, c, data, data_len);
}
-static int nvme_sc_to_pr_err(int nvme_sc)
+static int nvme_status_to_pr_err(int status)
{
- if (nvme_is_path_error(nvme_sc))
+ if (nvme_is_path_error(status))
return PR_STS_PATH_FAILED;
- switch (nvme_sc & 0x7ff) {
+ switch (status & NVME_SCT_SC_MASK) {
case NVME_SC_SUCCESS:
return PR_STS_SUCCESS;
case NVME_SC_RESERVATION_CONFLICT:
@@ -121,7 +121,7 @@ static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
if (ret < 0)
return ret;
- return nvme_sc_to_pr_err(ret);
+ return nvme_status_to_pr_err(ret);
}
static int nvme_pr_register(struct block_device *bdev, u64 old,
@@ -196,7 +196,7 @@ retry:
if (ret < 0)
return ret;
- return nvme_sc_to_pr_err(ret);
+ return nvme_status_to_pr_err(ret);
}
static int nvme_pr_read_keys(struct block_device *bdev,
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 51a62b0c645a..2eb33842f971 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -2201,6 +2201,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
.reg_read32 = nvmf_reg_read32,
.reg_read64 = nvmf_reg_read64,
.reg_write32 = nvmf_reg_write32,
+ .subsystem_reset = nvmf_subsystem_reset,
.free_ctrl = nvme_rdma_free_ctrl,
.submit_async_event = nvme_rdma_submit_async_event,
.delete_ctrl = nvme_rdma_delete_ctrl,
@@ -2237,12 +2238,11 @@ nvme_rdma_existing_controller(struct nvmf_ctrl_options *opts)
return found;
}
-static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
+static struct nvme_rdma_ctrl *nvme_rdma_alloc_ctrl(struct device *dev,
struct nvmf_ctrl_options *opts)
{
struct nvme_rdma_ctrl *ctrl;
int ret;
- bool changed;
ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
if (!ctrl)
@@ -2304,6 +2304,30 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
if (ret)
goto out_kfree_queues;
+ return ctrl;
+
+out_kfree_queues:
+ kfree(ctrl->queues);
+out_free_ctrl:
+ kfree(ctrl);
+ return ERR_PTR(ret);
+}
+
+static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
+ struct nvmf_ctrl_options *opts)
+{
+ struct nvme_rdma_ctrl *ctrl;
+ bool changed;
+ int ret;
+
+ ctrl = nvme_rdma_alloc_ctrl(dev, opts);
+ if (IS_ERR(ctrl))
+ return ERR_CAST(ctrl);
+
+ ret = nvme_add_ctrl(&ctrl->ctrl);
+ if (ret)
+ goto out_put_ctrl;
+
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING);
WARN_ON_ONCE(!changed);
@@ -2322,15 +2346,11 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
out_uninit_ctrl:
nvme_uninit_ctrl(&ctrl->ctrl);
+out_put_ctrl:
nvme_put_ctrl(&ctrl->ctrl);
if (ret > 0)
ret = -EIO;
return ERR_PTR(ret);
-out_kfree_queues:
- kfree(ctrl->queues);
-out_free_ctrl:
- kfree(ctrl);
- return ERR_PTR(ret);
}
static struct nvmf_transport_ops nvme_rdma_transport = {
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 8b5e4327fe83..a2a47d3ab99f 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2662,6 +2662,7 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
.reg_read32 = nvmf_reg_read32,
.reg_read64 = nvmf_reg_read64,
.reg_write32 = nvmf_reg_write32,
+ .subsystem_reset = nvmf_subsystem_reset,
.free_ctrl = nvme_tcp_free_ctrl,
.submit_async_event = nvme_tcp_submit_async_event,
.delete_ctrl = nvme_tcp_delete_ctrl,
@@ -2686,7 +2687,7 @@ nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
return found;
}
-static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
+static struct nvme_tcp_ctrl *nvme_tcp_alloc_ctrl(struct device *dev,
struct nvmf_ctrl_options *opts)
{
struct nvme_tcp_ctrl *ctrl;
@@ -2761,6 +2762,28 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
if (ret)
goto out_kfree_queues;
+ return ctrl;
+out_kfree_queues:
+ kfree(ctrl->queues);
+out_free_ctrl:
+ kfree(ctrl);
+ return ERR_PTR(ret);
+}
+
+static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
+ struct nvmf_ctrl_options *opts)
+{
+ struct nvme_tcp_ctrl *ctrl;
+ int ret;
+
+ ctrl = nvme_tcp_alloc_ctrl(dev, opts);
+ if (IS_ERR(ctrl))
+ return ERR_CAST(ctrl);
+
+ ret = nvme_add_ctrl(&ctrl->ctrl);
+ if (ret)
+ goto out_put_ctrl;
+
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
WARN_ON_ONCE(1);
ret = -EINTR;
@@ -2782,15 +2805,11 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
out_uninit_ctrl:
nvme_uninit_ctrl(&ctrl->ctrl);
+out_put_ctrl:
nvme_put_ctrl(&ctrl->ctrl);
if (ret > 0)
ret = -EIO;
return ERR_PTR(ret);
-out_kfree_queues:
- kfree(ctrl->queues);
-out_free_ctrl:
- kfree(ctrl);
- return ERR_PTR(ret);
}
static struct nvmf_transport_ops nvme_tcp_transport = {
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index 77aa0f440a6d..9a06f9d98cd6 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -108,13 +108,12 @@ free_data:
void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim,
struct nvme_zone_info *zi)
{
- lim->zoned = 1;
+ lim->features |= BLK_FEAT_ZONED;
lim->max_open_zones = zi->max_open_zones;
lim->max_active_zones = zi->max_active_zones;
lim->max_zone_append_sectors = ns->ctrl->max_zone_append;
lim->chunk_sectors = ns->head->zsze =
nvme_lba_to_sect(ns->head, zi->zone_size);
- blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ns->queue);
}
static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,