Diffstat (limited to 'drivers/nvme/host/core.c')
 drivers/nvme/host/core.c | 390 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 368 insertions(+), 22 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 8a3c3e32a704..9b3b57fef446 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -26,6 +26,7 @@
#include <linux/ptrace.h>
#include <linux/nvme_ioctl.h>
#include <linux/t10-pi.h>
+#include <linux/pm_qos.h>
#include <scsi/sg.h>
#include <asm/unaligned.h>
@@ -56,6 +57,11 @@ EXPORT_SYMBOL_GPL(nvme_max_retries);
static int nvme_char_major;
module_param(nvme_char_major, int, 0);
+static unsigned long default_ps_max_latency_us = 25000;
+module_param(default_ps_max_latency_us, ulong, 0644);
+MODULE_PARM_DESC(default_ps_max_latency_us,
+ "max power saving latency for new devices; use PM QOS to change per device");
+
static LIST_HEAD(nvme_ctrl_list);
static DEFINE_SPINLOCK(dev_list_lock);
@@ -208,18 +214,18 @@ EXPORT_SYMBOL_GPL(nvme_requeue_req);
struct request *nvme_alloc_request(struct request_queue *q,
struct nvme_command *cmd, unsigned int flags, int qid)
{
+ unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN;
struct request *req;
if (qid == NVME_QID_ANY) {
- req = blk_mq_alloc_request(q, nvme_is_write(cmd), flags);
+ req = blk_mq_alloc_request(q, op, flags);
} else {
- req = blk_mq_alloc_request_hctx(q, nvme_is_write(cmd), flags,
+ req = blk_mq_alloc_request_hctx(q, op, flags,
qid ? qid - 1 : 0);
}
if (IS_ERR(req))
return req;
- req->cmd_type = REQ_TYPE_DRV_PRIV;
req->cmd_flags |= REQ_FAILFAST_DRIVER;
nvme_req(req)->cmd = cmd;
@@ -238,26 +244,38 @@ static inline void nvme_setup_flush(struct nvme_ns *ns,
static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmnd)
{
+ unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
struct nvme_dsm_range *range;
- unsigned int nr_bytes = blk_rq_bytes(req);
+ struct bio *bio;
- range = kmalloc(sizeof(*range), GFP_ATOMIC);
+ range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
if (!range)
return BLK_MQ_RQ_QUEUE_BUSY;
- range->cattr = cpu_to_le32(0);
- range->nlb = cpu_to_le32(nr_bytes >> ns->lba_shift);
- range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
+ __rq_for_each_bio(bio, req) {
+ u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector);
+ u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
+
+ range[n].cattr = cpu_to_le32(0);
+ range[n].nlb = cpu_to_le32(nlb);
+ range[n].slba = cpu_to_le64(slba);
+ n++;
+ }
+
+ if (WARN_ON_ONCE(n != segments)) {
+ kfree(range);
+ return BLK_MQ_RQ_QUEUE_ERROR;
+ }
memset(cmnd, 0, sizeof(*cmnd));
cmnd->dsm.opcode = nvme_cmd_dsm;
cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
- cmnd->dsm.nr = 0;
+ cmnd->dsm.nr = segments - 1;
cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
req->special_vec.bv_page = virt_to_page(range);
req->special_vec.bv_offset = offset_in_page(range);
- req->special_vec.bv_len = sizeof(*range);
+ req->special_vec.bv_len = sizeof(*range) * segments;
req->rq_flags |= RQF_SPECIAL_PAYLOAD;
return BLK_MQ_RQ_QUEUE_OK;
@@ -309,17 +327,27 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
{
int ret = BLK_MQ_RQ_QUEUE_OK;
- if (req->cmd_type == REQ_TYPE_DRV_PRIV)
+ switch (req_op(req)) {
+ case REQ_OP_DRV_IN:
+ case REQ_OP_DRV_OUT:
memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
- else if (req_op(req) == REQ_OP_FLUSH)
+ break;
+ case REQ_OP_FLUSH:
nvme_setup_flush(ns, cmd);
- else if (req_op(req) == REQ_OP_DISCARD)
+ break;
+ case REQ_OP_DISCARD:
ret = nvme_setup_discard(ns, req, cmd);
- else
+ break;
+ case REQ_OP_READ:
+ case REQ_OP_WRITE:
nvme_setup_rw(ns, req, cmd);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return BLK_MQ_RQ_QUEUE_ERROR;
+ }
cmd->common.command_id = req->tag;
-
return ret;
}
EXPORT_SYMBOL_GPL(nvme_setup_cmd);
@@ -538,7 +566,7 @@ int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
c.identify.opcode = nvme_admin_identify;
- c.identify.cns = cpu_to_le32(NVME_ID_CNS_CTRL);
+ c.identify.cns = NVME_ID_CNS_CTRL;
*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
if (!*id)
@@ -556,7 +584,7 @@ static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *n
struct nvme_command c = { };
c.identify.opcode = nvme_admin_identify;
- c.identify.cns = cpu_to_le32(NVME_ID_CNS_NS_ACTIVE_LIST);
+ c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST;
c.identify.nsid = cpu_to_le32(nsid);
return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000);
}
@@ -568,8 +596,9 @@ int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid,
int error;
/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
- c.identify.opcode = nvme_admin_identify,
- c.identify.nsid = cpu_to_le32(nsid),
+ c.identify.opcode = nvme_admin_identify;
+ c.identify.nsid = cpu_to_le32(nsid);
+ c.identify.cns = NVME_ID_CNS_NS;
*id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
if (!*id)
@@ -784,6 +813,13 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
return nvme_sg_io(ns, (void __user *)arg);
#endif
default:
+#ifdef CONFIG_NVM
+ if (ns->ndev)
+ return nvme_nvm_ioctl(ns, cmd, arg);
+#endif
+ if (is_sed_ioctl(cmd))
+ return sed_ioctl(ns->ctrl->opal_dev, cmd,
+ (void __user *) arg);
return -ENOTTY;
}
}
@@ -861,6 +897,9 @@ static void nvme_config_discard(struct nvme_ns *ns)
struct nvme_ctrl *ctrl = ns->ctrl;
u32 logical_block_size = queue_logical_block_size(ns->queue);
+ BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
+ NVME_DSM_MAX_RANGES);
+
if (ctrl->quirks & NVME_QUIRK_DISCARD_ZEROES)
ns->queue->limits.discard_zeroes_data = 1;
else
@@ -869,6 +908,7 @@ static void nvme_config_discard(struct nvme_ns *ns)
ns->queue->limits.discard_alignment = logical_block_size;
ns->queue->limits.discard_granularity = logical_block_size;
blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
+ blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES);
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
}
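A quick sanity check on the BUILD_BUG_ON above (an illustrative, userspace-style sketch; it assumes 4 KiB pages and NVME_DSM_MAX_RANGES == 256, its current definition): each Dataset Management range descriptor is 16 bytes, so 256 of them fill exactly one page and the kmalloc_array() in nvme_setup_discard() never needs more than a single page.

#include <stdint.h>

struct dsm_range_example {		/* mirrors struct nvme_dsm_range */
	uint32_t	cattr;
	uint32_t	nlb;
	uint64_t	slba;
};

_Static_assert(sizeof(struct dsm_range_example) == 16,
	       "descriptor is 16 bytes");
_Static_assert(256 * sizeof(struct dsm_range_example) == 4096,
	       "256 ranges fit a 4 KiB page");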
@@ -1051,6 +1091,28 @@ static const struct pr_ops nvme_pr_ops = {
.pr_clear = nvme_pr_clear,
};
+#ifdef CONFIG_BLK_SED_OPAL
+int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
+ bool send)
+{
+ struct nvme_ctrl *ctrl = data;
+ struct nvme_command cmd;
+
+ memset(&cmd, 0, sizeof(cmd));
+ if (send)
+ cmd.common.opcode = nvme_admin_security_send;
+ else
+ cmd.common.opcode = nvme_admin_security_recv;
+ cmd.common.nsid = 0;
+ cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
+ cmd.common.cdw10[1] = cpu_to_le32(len);
+
+ return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
+ ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0);
+}
+EXPORT_SYMBOL_GPL(nvme_sec_submit);
+#endif /* CONFIG_BLK_SED_OPAL */
+
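The dword packing in nvme_sec_submit() follows the NVMe Security Send/Receive layout: SECP in CDW10 bits 31:24, SPSP in bits 23:08, and the transfer length in CDW11 (cmd.common.cdw10[1] above). A minimal sketch of the same packing; the SECP/SPSP values below are illustrative, not taken from this patch:

#include <stdint.h>
#include <stdio.h>

/* Pack SECP/SPSP the same way nvme_sec_submit() builds CDW10. */
static uint32_t sec_cdw10(uint8_t secp, uint16_t spsp)
{
	/* SECP in bits 31:24, SPSP in bits 23:08, bits 07:00 left zero */
	return ((uint32_t)secp << 24) | ((uint32_t)spsp << 8);
}

int main(void)
{
	/* e.g. TCG security protocol 0x01 with SPSP 0x0001 -> 0x01000100 */
	printf("0x%08x\n", sec_cdw10(0x01, 0x0001));
	return 0;
}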
static const struct block_device_operations nvme_fops = {
.owner = THIS_MODULE,
.ioctl = nvme_ioctl,
@@ -1196,6 +1258,176 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
blk_queue_write_cache(q, vwc, vwc);
}
+static void nvme_configure_apst(struct nvme_ctrl *ctrl)
+{
+ /*
+ * APST (Autonomous Power State Transition) lets us program a
+ * table of power state transitions that the controller will
+ * perform automatically. We configure it with a simple
+ * heuristic: we are willing to spend at most 2% of the time
+ * transitioning between power states. Therefore, when running
+ * in any given state, we will enter the next lower-power
+ * non-operational state after waiting 50 * (enlat + exlat)
+ * microseconds, as long as that state's total latency is under
+ * the requested maximum latency.
+ *
+ * We will not autonomously enter any non-operational state for
+ * which the total latency exceeds ps_max_latency_us. Users
+ * can set ps_max_latency_us to zero to turn off APST.
+ */
+
+ unsigned apste;
+ struct nvme_feat_auto_pst *table;
+ int ret;
+
+ /*
+ * If APST isn't supported or if we haven't been initialized yet,
+ * then don't do anything.
+ */
+ if (!ctrl->apsta)
+ return;
+
+ if (ctrl->npss > 31) {
+ dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
+ return;
+ }
+
+ table = kzalloc(sizeof(*table), GFP_KERNEL);
+ if (!table)
+ return;
+
+ if (ctrl->ps_max_latency_us == 0) {
+ /* Turn off APST. */
+ apste = 0;
+ } else {
+ __le64 target = cpu_to_le64(0);
+ int state;
+
+ /*
+ * Walk through all states from lowest- to highest-power.
+ * According to the spec, lower-numbered states use more
+ * power. NPSS, despite the name, is the index of the
+ * lowest-power state, not the number of states.
+ */
+ for (state = (int)ctrl->npss; state >= 0; state--) {
+ u64 total_latency_us, transition_ms;
+
+ if (target)
+ table->entries[state] = target;
+
+ /*
+ * Is this state a useful non-operational state for
+ * higher-power states to autonomously transition to?
+ */
+ if (!(ctrl->psd[state].flags &
+ NVME_PS_FLAGS_NON_OP_STATE))
+ continue;
+
+ total_latency_us =
+ (u64)le32_to_cpu(ctrl->psd[state].entry_lat) +
+ le32_to_cpu(ctrl->psd[state].exit_lat);
+ if (total_latency_us > ctrl->ps_max_latency_us)
+ continue;
+
+ /*
+ * This state is good. Use it as the APST idle
+ * target for higher power states.
+ */
+ transition_ms = total_latency_us + 19;
+ do_div(transition_ms, 20);
+ if (transition_ms > (1 << 24) - 1)
+ transition_ms = (1 << 24) - 1;
+
+ target = cpu_to_le64((state << 3) |
+ (transition_ms << 8));
+ }
+
+ apste = 1;
+ }
+
+ ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
+ table, sizeof(*table), NULL);
+ if (ret)
+ dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
+
+ kfree(table);
+}
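A worked example of the heuristic (a standalone sketch, not kernel code): for a non-operational state with enlat = exlat = 1000 us, the idle time becomes (2000 + 19) / 20 = 100 ms, so at most 2 ms of transition latency is paid per 100 ms spent idle, which is the intended 2% bound. The entry layout assumed here, matching the shift-and-or above, is the target state in bits 07:03 and the idle time in milliseconds in bits 31:08.

#include <stdint.h>
#include <stdio.h>

/* Mirror the arithmetic in nvme_configure_apst() for one table entry:
 * idle for 50 * (enlat + exlat), i.e. the total latency / 20 rounded up
 * to milliseconds, clamped to the 24-bit idle-time field. */
static uint64_t apst_entry(unsigned int target_state,
			   uint32_t enlat_us, uint32_t exlat_us)
{
	uint64_t total_us = (uint64_t)enlat_us + exlat_us;
	uint64_t itpt_ms = (total_us + 19) / 20;

	if (itpt_ms > (1 << 24) - 1)
		itpt_ms = (1 << 24) - 1;

	return ((uint64_t)target_state << 3) | (itpt_ms << 8);
}

int main(void)
{
	/* state 4 with enlat = exlat = 1000 us -> idle 100 ms before entering */
	printf("0x%llx\n", (unsigned long long)apst_entry(4, 1000, 1000));
	return 0;
}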
+
+static void nvme_set_latency_tolerance(struct device *dev, s32 val)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ u64 latency;
+
+ switch (val) {
+ case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
+ case PM_QOS_LATENCY_ANY:
+ latency = U64_MAX;
+ break;
+
+ default:
+ latency = val;
+ }
+
+ if (ctrl->ps_max_latency_us != latency) {
+ ctrl->ps_max_latency_us = latency;
+ nvme_configure_apst(ctrl);
+ }
+}
+
+struct nvme_core_quirk_entry {
+ /*
+ * NVMe model and firmware strings are padded with spaces. For
+ * simplicity, strings in the quirk table are padded with NULLs
+ * instead.
+ */
+ u16 vid;
+ const char *mn;
+ const char *fr;
+ unsigned long quirks;
+};
+
+static const struct nvme_core_quirk_entry core_quirks[] = {
+ /*
+ * Seen on a Samsung "SM951 NVMe SAMSUNG 256GB": using APST causes
+ * the controller to go out to lunch. It dies when the watchdog
+ * timer reads CSTS and gets 0xffffffff.
+ */
+ {
+ .vid = 0x144d,
+ .fr = "BXW75D0Q",
+ .quirks = NVME_QUIRK_NO_APST,
+ },
+};
+
+/* match is null-terminated but idstr is space-padded. */
+static bool string_matches(const char *idstr, const char *match, size_t len)
+{
+ size_t matchlen;
+
+ if (!match)
+ return true;
+
+ matchlen = strlen(match);
+ WARN_ON_ONCE(matchlen > len);
+
+ if (memcmp(idstr, match, matchlen))
+ return false;
+
+ for (; matchlen < len; matchlen++)
+ if (idstr[matchlen] != ' ')
+ return false;
+
+ return true;
+}
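A userspace illustration of the padded-string comparison (string_matches() is copied as-is minus the WARN_ON_ONCE; the 8-byte firmware string is a made-up example):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool string_matches(const char *idstr, const char *match, size_t len)
{
	size_t matchlen;

	if (!match)
		return true;

	matchlen = strlen(match);
	if (memcmp(idstr, match, matchlen))
		return false;

	for (; matchlen < len; matchlen++)
		if (idstr[matchlen] != ' ')
			return false;

	return true;
}

int main(void)
{
	/* A space-padded 8-byte field, as found in Identify Controller data. */
	const char fr[8] = { 'B', 'X', 'W', '7', ' ', ' ', ' ', ' ' };

	printf("%d\n", string_matches(fr, "BXW7", sizeof(fr)));     /* 1 */
	printf("%d\n", string_matches(fr, "BXW75D0Q", sizeof(fr))); /* 0 */
	printf("%d\n", string_matches(fr, NULL, sizeof(fr)));       /* 1 */
	return 0;
}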
+
+static bool quirk_matches(const struct nvme_id_ctrl *id,
+ const struct nvme_core_quirk_entry *q)
+{
+ return q->vid == le16_to_cpu(id->vid) &&
+ string_matches(id->mn, q->mn, sizeof(id->mn)) &&
+ string_matches(id->fr, q->fr, sizeof(id->fr));
+}
+
/*
* Initialize the cached copies of the Identify data and various controller
* register in our nvme_ctrl structure. This should be called as soon as
@@ -1207,6 +1439,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
u64 cap;
int ret, page_shift;
u32 max_hw_sectors;
+ u8 prev_apsta;
ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
if (ret) {
@@ -1230,6 +1463,25 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
return -EIO;
}
+ if (!ctrl->identified) {
+ /*
+ * Check for quirks. Quirk can depend on firmware version,
+ * so, in principle, the set of quirks present can change
+ * across a reset. As a possible future enhancement, we
+ * could re-scan for quirks every time we reinitialize
+ * the device, but we'd have to make sure that the driver
+ * behaves intelligently if the quirks change.
+ */
+
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
+ if (quirk_matches(id, &core_quirks[i]))
+ ctrl->quirks |= core_quirks[i].quirks;
+ }
+ }
+
+ ctrl->oacs = le16_to_cpu(id->oacs);
ctrl->vid = le16_to_cpu(id->vid);
ctrl->oncs = le16_to_cpup(&id->oncs);
atomic_set(&ctrl->abort_limit, id->acl + 1);
@@ -1249,6 +1501,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
ctrl->sgls = le32_to_cpu(id->sgls);
ctrl->kas = le16_to_cpu(id->kas);
+ ctrl->npss = id->npss;
+ prev_apsta = ctrl->apsta;
+ ctrl->apsta = (ctrl->quirks & NVME_QUIRK_NO_APST) ? 0 : id->apsta;
+ memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
+
if (ctrl->ops->is_fabrics) {
ctrl->icdoff = le16_to_cpu(id->icdoff);
ctrl->ioccsz = le32_to_cpu(id->ioccsz);
@@ -1272,6 +1529,16 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
}
kfree(id);
+
+ if (ctrl->apsta && !prev_apsta)
+ dev_pm_qos_expose_latency_tolerance(ctrl->device);
+ else if (!ctrl->apsta && prev_apsta)
+ dev_pm_qos_hide_latency_tolerance(ctrl->device);
+
+ nvme_configure_apst(ctrl);
+
+ ctrl->identified = true;
+
return ret;
}
EXPORT_SYMBOL_GPL(nvme_init_identify);
@@ -1521,6 +1788,29 @@ static ssize_t nvme_sysfs_show_transport(struct device *dev,
}
static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);
+static ssize_t nvme_sysfs_show_state(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ static const char *const state_name[] = {
+ [NVME_CTRL_NEW] = "new",
+ [NVME_CTRL_LIVE] = "live",
+ [NVME_CTRL_RESETTING] = "resetting",
+ [NVME_CTRL_RECONNECTING] = "reconnecting",
+ [NVME_CTRL_DELETING] = "deleting",
+ [NVME_CTRL_DEAD] = "dead",
+ };
+
+ if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) &&
+ state_name[ctrl->state])
+ return sprintf(buf, "%s\n", state_name[ctrl->state]);
+
+ return sprintf(buf, "unknown state\n");
+}
+
+static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL);
+
static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
struct device_attribute *attr,
char *buf)
@@ -1553,6 +1843,7 @@ static struct attribute *nvme_dev_attrs[] = {
&dev_attr_transport.attr,
&dev_attr_subsysnqn.attr,
&dev_attr_address.attr,
+ &dev_attr_state.attr,
NULL
};
@@ -2009,6 +2300,14 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
list_add_tail(&ctrl->node, &nvme_ctrl_list);
spin_unlock(&dev_list_lock);
+ /*
+ * Initialize latency tolerance controls. The sysfs files won't
+ * be visible to userspace unless the device actually supports APST.
+ */
+ ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
+ dev_pm_qos_update_user_latency_tolerance(ctrl->device,
+ min(default_ps_max_latency_us, (unsigned long)S32_MAX));
+
return 0;
out_release_instance:
nvme_release_instance(ctrl);
@@ -2034,9 +2333,9 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
* Revalidating a dead namespace sets capacity to 0. This will
* end buffered writers dirtying pages that can't be synced.
*/
- if (ns->disk && !test_and_set_bit(NVME_NS_DEAD, &ns->flags))
- revalidate_disk(ns->disk);
-
+ if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags))
+ continue;
+ revalidate_disk(ns->disk);
blk_set_queue_dying(ns->queue);
blk_mq_abort_requeue_list(ns->queue);
blk_mq_start_stopped_hw_queues(ns->queue, true);
@@ -2045,6 +2344,53 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_kill_queues);
+void nvme_unfreeze(struct nvme_ctrl *ctrl)
+{
+ struct nvme_ns *ns;
+
+ mutex_lock(&ctrl->namespaces_mutex);
+ list_for_each_entry(ns, &ctrl->namespaces, list)
+ blk_mq_unfreeze_queue(ns->queue);
+ mutex_unlock(&ctrl->namespaces_mutex);
+}
+EXPORT_SYMBOL_GPL(nvme_unfreeze);
+
+void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
+{
+ struct nvme_ns *ns;
+
+ mutex_lock(&ctrl->namespaces_mutex);
+ list_for_each_entry(ns, &ctrl->namespaces, list) {
+ timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
+ if (timeout <= 0)
+ break;
+ }
+ mutex_unlock(&ctrl->namespaces_mutex);
+}
+EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
+
+void nvme_wait_freeze(struct nvme_ctrl *ctrl)
+{
+ struct nvme_ns *ns;
+
+ mutex_lock(&ctrl->namespaces_mutex);
+ list_for_each_entry(ns, &ctrl->namespaces, list)
+ blk_mq_freeze_queue_wait(ns->queue);
+ mutex_unlock(&ctrl->namespaces_mutex);
+}
+EXPORT_SYMBOL_GPL(nvme_wait_freeze);
+
+void nvme_start_freeze(struct nvme_ctrl *ctrl)
+{
+ struct nvme_ns *ns;
+
+ mutex_lock(&ctrl->namespaces_mutex);
+ list_for_each_entry(ns, &ctrl->namespaces, list)
+ blk_mq_freeze_queue_start(ns->queue);
+ mutex_unlock(&ctrl->namespaces_mutex);
+}
+EXPORT_SYMBOL_GPL(nvme_start_freeze);
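The four helpers above are intended to bracket a controller reset. A hedged sketch of the expected call pattern; the surrounding function names are illustrative, the real callers live in the transport drivers:

static void example_disable_for_reset(struct nvme_ctrl *ctrl)
{
	/* Block new I/O and give in-flight requests a bounded time to
	 * drain before the queues are torn down. */
	nvme_start_freeze(ctrl);
	nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT);
	/* ... quiesce queues, disable the controller, reap stuck requests ... */
}

static void example_finish_reset(struct nvme_ctrl *ctrl)
{
	/* ... controller re-enabled and I/O queues re-created ... */
	nvme_wait_freeze(ctrl);
	nvme_unfreeze(ctrl);
}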
+
void nvme_stop_queues(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;