diff options
Diffstat (limited to 'drivers/nvme/host/pci.c')
-rw-r--r-- | drivers/nvme/host/pci.c | 270 |
1 files changed, 155 insertions, 115 deletions
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index fbc71fac6f1e..e526437bacbf 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -13,6 +13,7 @@ */ #include <linux/aer.h> +#include <linux/async.h> #include <linux/blkdev.h> #include <linux/blk-mq.h> #include <linux/blk-mq-pci.h> @@ -68,7 +69,6 @@ MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); struct nvme_dev; struct nvme_queue; -static void nvme_process_cq(struct nvme_queue *nvmeq); static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); /* @@ -147,9 +147,10 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) struct nvme_queue { struct device *q_dmadev; struct nvme_dev *dev; - spinlock_t q_lock; + spinlock_t sq_lock; struct nvme_command *sq_cmds; struct nvme_command __iomem *sq_cmds_io; + spinlock_t cq_lock ____cacheline_aligned_in_smp; volatile struct nvme_completion *cqes; struct blk_mq_tags **tags; dma_addr_t sq_dma_addr; @@ -159,9 +160,9 @@ struct nvme_queue { s16 cq_vector; u16 sq_tail; u16 cq_head; + u16 last_cq_head; u16 qid; u8 cq_phase; - u8 cqe_seen; u32 *dbbuf_sq_db; u32 *dbbuf_cq_db; u32 *dbbuf_sq_ei; @@ -420,28 +421,25 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set) } /** - * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell + * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell * @nvmeq: The queue to use * @cmd: The command to send - * - * Safe to use from interrupt context */ -static void __nvme_submit_cmd(struct nvme_queue *nvmeq, - struct nvme_command *cmd) +static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) { - u16 tail = nvmeq->sq_tail; - + spin_lock(&nvmeq->sq_lock); if (nvmeq->sq_cmds_io) - memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd)); + memcpy_toio(&nvmeq->sq_cmds_io[nvmeq->sq_tail], cmd, + sizeof(*cmd)); else - memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); + memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd)); - if (++tail == nvmeq->q_depth) - tail = 0; - if (nvme_dbbuf_update_and_check_event(tail, nvmeq->dbbuf_sq_db, - nvmeq->dbbuf_sq_ei)) - writel(tail, nvmeq->q_db); - nvmeq->sq_tail = tail; + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; + if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail, + nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei)) + writel(nvmeq->sq_tail, nvmeq->q_db); + spin_unlock(&nvmeq->sq_lock); } static void **nvme_pci_iod_list(struct request *req) @@ -872,6 +870,13 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_command cmnd; blk_status_t ret; + /* + * We should not need to do this, but we're still using this to + * ensure we can drain requests on a dying queue. + */ + if (unlikely(nvmeq->cq_vector < 0)) + return BLK_STS_IOERR; + ret = nvme_setup_cmd(ns, req, &cmnd); if (ret) return ret; @@ -887,16 +892,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, } blk_mq_start_request(req); - - spin_lock_irq(&nvmeq->q_lock); - if (unlikely(nvmeq->cq_vector < 0)) { - ret = BLK_STS_IOERR; - spin_unlock_irq(&nvmeq->q_lock); - goto out_cleanup_iod; - } - __nvme_submit_cmd(nvmeq, &cmnd); - nvme_process_cq(nvmeq); - spin_unlock_irq(&nvmeq->q_lock); + nvme_submit_cmd(nvmeq, &cmnd); return BLK_STS_OK; out_cleanup_iod: nvme_free_iod(dev, req); @@ -914,10 +910,10 @@ static void nvme_pci_complete_rq(struct request *req) } /* We read the CQE phase first to check if the rest of the entry is valid */ -static inline bool nvme_cqe_valid(struct nvme_queue *nvmeq, u16 head, - u16 phase) +static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq) { - return (le16_to_cpu(nvmeq->cqes[head].status) & 1) == phase; + return (le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) == + nvmeq->cq_phase; } static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq) @@ -931,9 +927,9 @@ static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq) } } -static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, - struct nvme_completion *cqe) +static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) { + volatile struct nvme_completion *cqe = &nvmeq->cqes[idx]; struct request *req; if (unlikely(cqe->command_id >= nvmeq->q_depth)) { @@ -956,83 +952,87 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, return; } - nvmeq->cqe_seen = 1; req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id); nvme_end_request(req, cqe->status, cqe->result); } -static inline bool nvme_read_cqe(struct nvme_queue *nvmeq, - struct nvme_completion *cqe) +static void nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end) { - if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) { - *cqe = nvmeq->cqes[nvmeq->cq_head]; + while (start != end) { + nvme_handle_cqe(nvmeq, start); + if (++start == nvmeq->q_depth) + start = 0; + } +} - if (++nvmeq->cq_head == nvmeq->q_depth) { - nvmeq->cq_head = 0; - nvmeq->cq_phase = !nvmeq->cq_phase; - } - return true; +static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) +{ + if (++nvmeq->cq_head == nvmeq->q_depth) { + nvmeq->cq_head = 0; + nvmeq->cq_phase = !nvmeq->cq_phase; } - return false; } -static void nvme_process_cq(struct nvme_queue *nvmeq) +static inline bool nvme_process_cq(struct nvme_queue *nvmeq, u16 *start, + u16 *end, int tag) { - struct nvme_completion cqe; - int consumed = 0; + bool found = false; - while (nvme_read_cqe(nvmeq, &cqe)) { - nvme_handle_cqe(nvmeq, &cqe); - consumed++; + *start = nvmeq->cq_head; + while (!found && nvme_cqe_pending(nvmeq)) { + if (nvmeq->cqes[nvmeq->cq_head].command_id == tag) + found = true; + nvme_update_cq_head(nvmeq); } + *end = nvmeq->cq_head; - if (consumed) + if (*start != *end) nvme_ring_cq_doorbell(nvmeq); + return found; } static irqreturn_t nvme_irq(int irq, void *data) { - irqreturn_t result; struct nvme_queue *nvmeq = data; - spin_lock(&nvmeq->q_lock); - nvme_process_cq(nvmeq); - result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE; - nvmeq->cqe_seen = 0; - spin_unlock(&nvmeq->q_lock); - return result; + irqreturn_t ret = IRQ_NONE; + u16 start, end; + + spin_lock(&nvmeq->cq_lock); + if (nvmeq->cq_head != nvmeq->last_cq_head) + ret = IRQ_HANDLED; + nvme_process_cq(nvmeq, &start, &end, -1); + nvmeq->last_cq_head = nvmeq->cq_head; + spin_unlock(&nvmeq->cq_lock); + + if (start != end) { + nvme_complete_cqes(nvmeq, start, end); + return IRQ_HANDLED; + } + + return ret; } static irqreturn_t nvme_irq_check(int irq, void *data) { struct nvme_queue *nvmeq = data; - if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) + if (nvme_cqe_pending(nvmeq)) return IRQ_WAKE_THREAD; return IRQ_NONE; } static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag) { - struct nvme_completion cqe; - int found = 0, consumed = 0; + u16 start, end; + bool found; - if (!nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) + if (!nvme_cqe_pending(nvmeq)) return 0; - spin_lock_irq(&nvmeq->q_lock); - while (nvme_read_cqe(nvmeq, &cqe)) { - nvme_handle_cqe(nvmeq, &cqe); - consumed++; - - if (tag == cqe.command_id) { - found = 1; - break; - } - } - - if (consumed) - nvme_ring_cq_doorbell(nvmeq); - spin_unlock_irq(&nvmeq->q_lock); + spin_lock_irq(&nvmeq->cq_lock); + found = nvme_process_cq(nvmeq, &start, &end, tag); + spin_unlock_irq(&nvmeq->cq_lock); + nvme_complete_cqes(nvmeq, start, end); return found; } @@ -1052,10 +1052,7 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) memset(&c, 0, sizeof(c)); c.common.opcode = nvme_admin_async_event; c.common.command_id = NVME_AQ_BLK_MQ_DEPTH; - - spin_lock_irq(&nvmeq->q_lock); - __nvme_submit_cmd(nvmeq, &c); - spin_unlock_irq(&nvmeq->q_lock); + nvme_submit_cmd(nvmeq, &c); } static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) @@ -1070,7 +1067,7 @@ static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) } static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, - struct nvme_queue *nvmeq) + struct nvme_queue *nvmeq, s16 vector) { struct nvme_command c; int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; @@ -1085,7 +1082,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, c.create_cq.cqid = cpu_to_le16(qid); c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); c.create_cq.cq_flags = cpu_to_le16(flags); - c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); + c.create_cq.irq_vector = cpu_to_le16(vector); return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); } @@ -1093,10 +1090,19 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, struct nvme_queue *nvmeq) { + struct nvme_ctrl *ctrl = &dev->ctrl; struct nvme_command c; int flags = NVME_QUEUE_PHYS_CONTIG; /* + * Some drives have a bug that auto-enables WRRU if MEDIUM isn't + * set. Since URGENT priority is zeroes, it makes all queues + * URGENT. + */ + if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ) + flags |= NVME_SQ_PRIO_MEDIUM; + + /* * Note: we (ab)use the fact that the prp fields survive if no data * is attached to the request. */ @@ -1199,7 +1205,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) nvme_warn_reset(dev, csts); nvme_dev_disable(dev, false); nvme_reset_ctrl(&dev->ctrl); - return BLK_EH_HANDLED; + return BLK_EH_DONE; } /* @@ -1209,24 +1215,24 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) dev_warn(dev->ctrl.device, "I/O %d QID %d timeout, completion polled\n", req->tag, nvmeq->qid); - return BLK_EH_HANDLED; + return BLK_EH_DONE; } /* * Shutdown immediately if controller times out while starting. The * reset work will see the pci device disabled when it gets the forced * cancellation error. All outstanding requests are completed on - * shutdown, so we return BLK_EH_HANDLED. + * shutdown, so we return BLK_EH_DONE. */ switch (dev->ctrl.state) { case NVME_CTRL_CONNECTING: case NVME_CTRL_RESETTING: - dev_warn(dev->ctrl.device, + dev_warn_ratelimited(dev->ctrl.device, "I/O %d QID %d timeout, disable controller\n", req->tag, nvmeq->qid); nvme_dev_disable(dev, false); nvme_req(req)->flags |= NVME_REQ_CANCELLED; - return BLK_EH_HANDLED; + return BLK_EH_DONE; default: break; } @@ -1243,12 +1249,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) nvme_dev_disable(dev, false); nvme_reset_ctrl(&dev->ctrl); - /* - * Mark the request as handled, since the inline shutdown - * forces all outstanding requests to complete. - */ nvme_req(req)->flags |= NVME_REQ_CANCELLED; - return BLK_EH_HANDLED; + return BLK_EH_DONE; } if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) { @@ -1312,15 +1314,21 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) { int vector; - spin_lock_irq(&nvmeq->q_lock); + spin_lock_irq(&nvmeq->cq_lock); if (nvmeq->cq_vector == -1) { - spin_unlock_irq(&nvmeq->q_lock); + spin_unlock_irq(&nvmeq->cq_lock); return 1; } vector = nvmeq->cq_vector; nvmeq->dev->online_queues--; nvmeq->cq_vector = -1; - spin_unlock_irq(&nvmeq->q_lock); + spin_unlock_irq(&nvmeq->cq_lock); + + /* + * Ensure that nvme_queue_rq() sees it ->cq_vector == -1 without + * having to grab the lock. + */ + mb(); if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q); @@ -1333,15 +1341,18 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) { struct nvme_queue *nvmeq = &dev->queues[0]; + u16 start, end; if (shutdown) nvme_shutdown_ctrl(&dev->ctrl); else nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); - spin_lock_irq(&nvmeq->q_lock); - nvme_process_cq(nvmeq); - spin_unlock_irq(&nvmeq->q_lock); + spin_lock_irq(&nvmeq->cq_lock); + nvme_process_cq(nvmeq, &start, &end, -1); + spin_unlock_irq(&nvmeq->cq_lock); + + nvme_complete_cqes(nvmeq, start, end); } static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, @@ -1399,7 +1410,8 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth) nvmeq->q_dmadev = dev->dev; nvmeq->dev = dev; - spin_lock_init(&nvmeq->q_lock); + spin_lock_init(&nvmeq->sq_lock); + spin_lock_init(&nvmeq->cq_lock); nvmeq->cq_head = 0; nvmeq->cq_phase = 1; nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; @@ -1435,7 +1447,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) { struct nvme_dev *dev = nvmeq->dev; - spin_lock_irq(&nvmeq->q_lock); + spin_lock_irq(&nvmeq->cq_lock); nvmeq->sq_tail = 0; nvmeq->cq_head = 0; nvmeq->cq_phase = 1; @@ -1443,13 +1455,14 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); nvme_dbbuf_init(dev, nvmeq, qid); dev->online_queues++; - spin_unlock_irq(&nvmeq->q_lock); + spin_unlock_irq(&nvmeq->cq_lock); } static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) { struct nvme_dev *dev = nvmeq->dev; int result; + s16 vector; if (dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { unsigned offset = (qid - 1) * roundup(SQ_SIZE(nvmeq->q_depth), @@ -1462,15 +1475,21 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) * A queue's vector matches the queue identifier unless the controller * has only one vector available. */ - nvmeq->cq_vector = dev->num_vecs == 1 ? 0 : qid; - result = adapter_alloc_cq(dev, qid, nvmeq); + vector = dev->num_vecs == 1 ? 0 : qid; + result = adapter_alloc_cq(dev, qid, nvmeq, vector); if (result < 0) - goto release_vector; + goto out; result = adapter_alloc_sq(dev, qid, nvmeq); if (result < 0) goto release_cq; + /* + * Set cq_vector after alloc cq/sq, otherwise nvme_suspend_queue will + * invoke free_irq for it and cause a 'Trying to free already-free IRQ + * xxx' warning if the create CQ/SQ command times out. + */ + nvmeq->cq_vector = vector; nvme_init_queue(nvmeq, qid); result = queue_request_irq(nvmeq); if (result < 0) @@ -1478,13 +1497,13 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) return result; - release_sq: +release_sq: + nvmeq->cq_vector = -1; dev->online_queues--; adapter_delete_sq(dev, qid); - release_cq: +release_cq: adapter_delete_cq(dev, qid); - release_vector: - nvmeq->cq_vector = -1; +out: return result; } @@ -1988,19 +2007,22 @@ static void nvme_del_queue_end(struct request *req, blk_status_t error) static void nvme_del_cq_end(struct request *req, blk_status_t error) { struct nvme_queue *nvmeq = req->end_io_data; + u16 start, end; if (!error) { unsigned long flags; /* - * We might be called with the AQ q_lock held - * and the I/O queue q_lock should always + * We might be called with the AQ cq_lock held + * and the I/O queue cq_lock should always * nest inside the AQ one. */ - spin_lock_irqsave_nested(&nvmeq->q_lock, flags, + spin_lock_irqsave_nested(&nvmeq->cq_lock, flags, SINGLE_DEPTH_NESTING); - nvme_process_cq(nvmeq); - spin_unlock_irqrestore(&nvmeq->q_lock, flags); + nvme_process_cq(nvmeq, &start, &end, -1); + spin_unlock_irqrestore(&nvmeq->cq_lock, flags); + + nvme_complete_cqes(nvmeq, start, end); } nvme_del_queue_end(req, error); @@ -2488,6 +2510,15 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) return 0; } +static void nvme_async_probe(void *data, async_cookie_t cookie) +{ + struct nvme_dev *dev = data; + + nvme_reset_ctrl_sync(&dev->ctrl); + flush_work(&dev->ctrl.scan_work); + nvme_put_ctrl(&dev->ctrl); +} + static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { int node, result = -ENOMEM; @@ -2532,7 +2563,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); - nvme_reset_ctrl(&dev->ctrl); + nvme_get_ctrl(&dev->ctrl); + async_schedule(nvme_async_probe, dev); return 0; @@ -2676,6 +2708,9 @@ static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev) static void nvme_error_resume(struct pci_dev *pdev) { + struct nvme_dev *dev = pci_get_drvdata(pdev); + + flush_work(&dev->ctrl.reset_work); pci_cleanup_aer_uncorrect_error_status(pdev); } @@ -2701,9 +2736,12 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_STRIPE_SIZE | NVME_QUIRK_DEALLOCATE_ZEROES, }, { PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */ - .driver_data = NVME_QUIRK_NO_DEEPEST_PS }, + .driver_data = NVME_QUIRK_NO_DEEPEST_PS | + NVME_QUIRK_MEDIUM_PRIO_SQ }, { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, + { PCI_DEVICE(0x1bb1, 0x0100), /* Seagate Nytro Flash Storage */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE(0x1c58, 0x0023), /* WDC SN200 adapter */ @@ -2718,6 +2756,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_LIGHTNVM, }, { PCI_DEVICE(0x1d1d, 0x2807), /* CNEX WL */ .driver_data = NVME_QUIRK_LIGHTNVM, }, + { PCI_DEVICE(0x1d1d, 0x2601), /* CNEX Granby */ + .driver_data = NVME_QUIRK_LIGHTNVM, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, |