From 1616d6c3717bae9041a4240d381ec56ccdaafedc Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Wed, 3 May 2023 18:57:33 +0300
Subject: nvme-pci: add NVME_QUIRK_BOGUS_NID for HS-SSD-FUTURE 2048G

Add a quirk to fix HS-SSD-FUTURE 2048G SSD drives reporting duplicate
nsids.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=217384
Reported-by: Andrey God
Signed-off-by: Sagi Grimberg
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/pci.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers')

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 593f86323e25..d8b660daf7c2 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -3433,6 +3433,8 @@ static const struct pci_device_id nvme_id_table[] = {
 		.driver_data = NVME_QUIRK_BOGUS_NID, },
 	{ PCI_DEVICE(0x1d97, 0x2269), /* Lexar NM760 */
 		.driver_data = NVME_QUIRK_BOGUS_NID, },
+	{ PCI_DEVICE(0x1e4b, 0x1602), /* HS-SSD-FUTURE 2048G */
+		.driver_data = NVME_QUIRK_BOGUS_NID, },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061),
 		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0065),
--
cgit v1.2.3

From bd375feeaf3408ed00e08c3bc918d6be15f691ad Mon Sep 17 00:00:00 2001
From: Hristo Venev
Date: Tue, 25 Apr 2023 22:58:54 +0300
Subject: nvme-pci: add quirk for missing secondary temperature thresholds

On Kingston KC3000 and Kingston FURY Renegade (both have the same PCI
IDs) accessing temp3_{min,max} fails with an invalid field error (note
that there is no problem setting the thresholds for temp1).

This contradicts the NVM Express Base Specification 2.0b, page 292:

  The over temperature threshold and under temperature threshold
  features shall be implemented for all implemented temperature sensors
  (i.e., all Temperature Sensor fields that report a non-zero value).

Define NVME_QUIRK_NO_SECONDARY_TEMP_THRESH, which disables the
thresholds for all but the composite temperature, and set it for this
device.

Signed-off-by: Hristo Venev
Reviewed-by: Guenter Roeck
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/hwmon.c | 4 +++-
 drivers/nvme/host/nvme.h  | 5 +++++
 drivers/nvme/host/pci.c   | 2 ++
 3 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/nvme/host/hwmon.c b/drivers/nvme/host/hwmon.c
index 9e6e56c20ec9..316f3e4ca7cc 100644
--- a/drivers/nvme/host/hwmon.c
+++ b/drivers/nvme/host/hwmon.c
@@ -163,7 +163,9 @@ static umode_t nvme_hwmon_is_visible(const void *_data,
 	case hwmon_temp_max:
 	case hwmon_temp_min:
 		if ((!channel && data->ctrl->wctemp) ||
-		    (channel && data->log->temp_sensor[channel - 1])) {
+		    (channel && data->log->temp_sensor[channel - 1] &&
+		     !(data->ctrl->quirks &
+		       NVME_QUIRK_NO_SECONDARY_TEMP_THRESH))) {
 			if (data->ctrl->quirks &
 			    NVME_QUIRK_NO_TEMP_THRESH_CHANGE)
 				return 0444;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index bf46f122e9e1..a2d4f59e0535 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -149,6 +149,11 @@ enum nvme_quirks {
 	 * Reports garbage in the namespace identifiers (eui64, nguid, uuid).
 	 */
 	NVME_QUIRK_BOGUS_NID = (1 << 18),
+
+	/*
+	 * No temperature thresholds for channels other than 0 (Composite).
+	 */
+	NVME_QUIRK_NO_SECONDARY_TEMP_THRESH = (1 << 19),
 };
 
 /*
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d8b660daf7c2..18ca1e3ae070 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -3401,6 +3401,8 @@ static const struct pci_device_id nvme_id_table[] = {
 		.driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
 	{ PCI_DEVICE(0x2646, 0x2263), /* KINGSTON A2000 NVMe SSD */
 		.driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
+	{ PCI_DEVICE(0x2646, 0x5013), /* Kingston KC3000, Kingston FURY Renegade */
+		.driver_data = NVME_QUIRK_NO_SECONDARY_TEMP_THRESH, },
 	{ PCI_DEVICE(0x2646, 0x5018), /* KINGSTON OM8SFP4xxxxP OS21012 NVMe SSD */
 		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
 	{ PCI_DEVICE(0x2646, 0x5016), /* KINGSTON OM3PGP4xxxxP OS21011 NVMe SSD */
--
cgit v1.2.3

From 3710e2b056cb92ad816e4d79fa54a6a5b6ad8cbd Mon Sep 17 00:00:00 2001
From: Adrian Huang
Date: Fri, 21 Apr 2023 16:08:00 +0800
Subject: nvme-pci: clamp max_hw_sectors based on DMA optimized limitation

When running the fio test on a 448-core AMD server + an NVMe disk, a
soft lockup or a hard lockup call trace is shown:

[soft lockup]
watchdog: BUG: soft lockup - CPU#126 stuck for 23s! [swapper/126:0]
RIP: 0010:_raw_spin_unlock_irqrestore+0x21/0x50
...
Call Trace:
 fq_flush_timeout+0x7d/0xd0
 ? __pfx_fq_flush_timeout+0x10/0x10
 call_timer_fn+0x2e/0x150
 run_timer_softirq+0x48a/0x560
 ? __pfx_fq_flush_timeout+0x10/0x10
 ? clockevents_program_event+0xaf/0x130
 __do_softirq+0xf1/0x335
 irq_exit_rcu+0x9f/0xd0
 sysvec_apic_timer_interrupt+0xb4/0xd0
 asm_sysvec_apic_timer_interrupt+0x1f/0x30
...

Obviously, fq_flush_timeout spends over 20 seconds. Here is the ftrace
log:

                |    fq_flush_timeout() {
                |      fq_ring_free() {
                |        put_pages_list() {
     0.170 us   |          free_unref_page_list();
     0.810 us   |        }
                |        free_iova_fast() {
                |          free_iova() {
  * 85622.66 us |            _raw_spin_lock_irqsave();
     2.860 us   |            remove_iova();
     0.600 us   |            _raw_spin_unlock_irqrestore();
     0.470 us   |            lock_info_report();
     2.420 us   |            free_iova_mem.part.0();
  * 85638.27 us |          }
  * 85638.84 us |        }
                |        put_pages_list() {
     0.230 us   |          free_unref_page_list();
     0.470 us   |        }
  ...              ...
  $ 31017069 us |    }

Most of the cores are under lock contention for acquiring
iova_rbtree_lock due to the iova flush queue mechanism.

[hard lockup]
NMI watchdog: Watchdog detected hard LOCKUP on cpu 351
RIP: 0010:native_queued_spin_lock_slowpath+0x2d8/0x330
Call Trace:
 _raw_spin_lock_irqsave+0x4f/0x60
 free_iova+0x27/0xd0
 free_iova_fast+0x4d/0x1d0
 fq_ring_free+0x9b/0x150
 iommu_dma_free_iova+0xb4/0x2e0
 __iommu_dma_unmap+0x10b/0x140
 iommu_dma_unmap_sg+0x90/0x110
 dma_unmap_sg_attrs+0x4a/0x50
 nvme_unmap_data+0x5d/0x120 [nvme]
 nvme_pci_complete_batch+0x77/0xc0 [nvme]
 nvme_irq+0x2ee/0x350 [nvme]
 ? __pfx_nvme_pci_complete_batch+0x10/0x10 [nvme]
 __handle_irq_event_percpu+0x53/0x1a0
 handle_irq_event_percpu+0x19/0x60
 handle_irq_event+0x3d/0x60
 handle_edge_irq+0xb3/0x210
 __common_interrupt+0x7f/0x150
 common_interrupt+0xc5/0xf0
 asm_common_interrupt+0x2b/0x40
...

ftrace shows that fq_ring_free spends over 10 seconds [1]. Again, most
of the cores are under lock contention for acquiring iova_rbtree_lock
due to the iova flush queue mechanism.

[Root Cause]
The root cause is that the max_hw_sectors_kb of the NVMe disk (mdts=10)
is 4096kb, so its streaming DMA mappings cannot benefit from the
scalable IOVA mechanism introduced by commit 9257b4a206fc ("iommu/iova:
introduce per-cpu caching to iova allocation"), which does not cache
IOVAs for mappings larger than 128kb.
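The numbers are easy to sanity-check outside the kernel. Below is a
minimal userspace sketch of the clamp this patch applies, not the
driver code itself; NVME_MAX_KB_SZ = 4096 is an assumption matching the
driver's own transfer cap around this release, and 128 KB stands in for
what dma_opt_mapping_size() reports when IOMMU IOVA caching is in
effect:

  #include <stdint.h>
  #include <stdio.h>

  /* Assumption: the driver's own cap in KB (NVME_MAX_KB_SZ in
   * drivers/nvme/host/pci.c around this release). */
  #define NVME_MAX_KB_SZ 4096u

  /* Mirrors the min_t() clamp in the hunk below: both operands are
   * converted to 512-byte sectors (1 KB = 2 sectors, hence "<< 1";
   * bytes >> 9 = sectors). */
  static uint32_t clamp_max_hw_sectors(uint64_t dma_mapping_bytes)
  {
          uint32_t cap = NVME_MAX_KB_SZ << 1;
          uint64_t dma = dma_mapping_bytes >> 9;
          return cap < dma ? cap : (uint32_t)dma;
  }

  int main(void)
  {
          /* dma_opt_mapping_size(): the per-CPU IOVA caches only serve
           * mappings up to 128kb, so the limit drops accordingly. */
          printf("new max_hw_sectors_kb = %u\n",
                 clamp_max_hw_sectors(128 * 1024) / 2);   /* 128 */
          /* dma_max_mapping_size(): effectively unbounded here, which
           * is why the mdts=10 disk kept issuing 4096kb mappings. */
          printf("old max_hw_sectors_kb = %u\n",
                 clamp_max_hw_sectors(UINT64_MAX) / 2);   /* 4096 */
          return 0;
  }

With the clamp in place every mapping fits the per-CPU IOVA cache fast
path, avoiding the iova_rbtree_lock round trips seen in the traces
above.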
To fix the lock contention issue, clamp max_hw_sectors based on the
DMA optimized limit in order to leverage the scalable IOVA mechanism.

Note: The issue does not happen with another NVMe disk (mdts = 5 and
max_hw_sectors_kb = 128).

[1] https://gist.github.com/AdrianHuang/bf8ec7338204837631fbdaed25d19cc4

Suggested-by: Keith Busch
Reported-and-tested-by: Jiwei Sun
Signed-off-by: Adrian Huang
Reviewed-by: Keith Busch
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 18ca1e3ae070..922ffe4e2822 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2956,7 +2956,7 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
 	 * over a single page.
 	 */
 	dev->ctrl.max_hw_sectors = min_t(u32,
-		NVME_MAX_KB_SZ << 1, dma_max_mapping_size(&pdev->dev) >> 9);
+		NVME_MAX_KB_SZ << 1, dma_opt_mapping_size(&pdev->dev) >> 9);
 	dev->ctrl.max_segments = NVME_MAX_SEGS;
 
 	/*
--
cgit v1.2.3

From 1743e5f6000901a11f4e1cd741bfa9136f3ec9b1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Wed, 17 May 2023 09:53:45 +0200
Subject: nvme-multipath: don't call blk_mark_disk_dead in nvme_mpath_remove_disk

nvme_mpath_remove_disk is called after del_gendisk, at which point a
blk_mark_disk_dead call doesn't make any sense.

Signed-off-by: Christoph Hellwig
Reviewed-by: Sagi Grimberg
Signed-off-by: Keith Busch
---
 drivers/nvme/host/multipath.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index fc39d01e7b63..187c9de0d6d5 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -884,7 +884,6 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head)
 {
 	if (!head->disk)
 		return;
-	blk_mark_disk_dead(head->disk);
 	/* make sure all pending bios are cleaned up */
 	kblockd_schedule_work(&head->requeue_work);
 	flush_work(&head->requeue_work);
--
cgit v1.2.3

From 2eb94dd56a4a4e3fe286def3e2ba207804a37345 Mon Sep 17 00:00:00 2001
From: Maurizio Lombardi
Date: Thu, 11 May 2023 13:07:41 +0200
Subject: nvme: do not let the user delete a ctrl before a complete initialization

If a userspace application performs a "delete_controller" command
early during the ctrl initialization, the delete operation may race
against the init code and the kernel will crash.

nvme nvme5: Connect command failed: host path error
nvme nvme5: failed to connect queue: 0 ret=880
PF: supervisor write access in kernel mode
PF: error_code(0x0002) - not-present page
 blk_mq_quiesce_queue+0x18/0x90
 nvme_tcp_delete_ctrl+0x24/0x40 [nvme_tcp]
 nvme_do_delete_ctrl+0x7f/0x8b [nvme_core]
 nvme_sysfs_delete.cold+0x8/0xd [nvme_core]
 kernfs_fop_write_iter+0x124/0x1b0
 new_sync_write+0xff/0x190
 vfs_write+0x1ef/0x280

Fix the crash by checking the NVME_CTRL_STARTED_ONCE bit; if it's not
set, the nvme controller is still in the process of being initialized
and the kernel will return -EBUSY to userspace.

Set the NVME_CTRL_STARTED_ONCE bit later in nvme_start_ctrl(), after
the controller start operation has completed.
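The ordering is the point: the flag is published only after
nvme_start_ctrl() has finished, so a concurrent delete either sees it
unset and bails out with -EBUSY, or sees it set and runs against a
fully initialized controller. A compilable userspace sketch of the same
gate (hypothetical names; C11 atomics stand in for the kernel's
test_bit()/set_bit()):

  #include <stdatomic.h>
  #include <stdbool.h>
  #include <stdio.h>

  /* Hypothetical stand-in for NVME_CTRL_STARTED_ONCE in ctrl->flags. */
  static atomic_bool started_once;

  /* Mirrors the new check in nvme_sysfs_delete(). */
  static int sysfs_delete(void)
  {
          if (!atomic_load(&started_once))
                  return -16;     /* -EBUSY: init still in flight, refuse */
          /* ... safe to tear the controller down here ... */
          return 0;
  }

  /* Mirrors nvme_start_ctrl(): publish the flag only as the last step. */
  static void start_ctrl(void)
  {
          /* ... queue setup, namespace scanning, uevents ... */
          atomic_store(&started_once, true);
  }

  int main(void)
  {
          printf("delete before start: %d\n", sysfs_delete());  /* -16 */
          start_ctrl();
          printf("delete after start: %d\n", sysfs_delete());   /* 0 */
          return 0;
  }

Note that replacing test_and_set_bit() with test_bit() in the
rediscover check is what allows the late set_bit(): the check no longer
sets the flag as a side effect.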
Signed-off-by: Maurizio Lombardi
Reviewed-by: Sagi Grimberg
Reviewed-by: Christoph Hellwig
Signed-off-by: Keith Busch
---
 drivers/nvme/host/core.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 6c1e7d6709e0..22e9e90aa671 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3574,6 +3574,9 @@ static ssize_t nvme_sysfs_delete(struct device *dev,
 {
 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
 
+	if (!test_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags))
+		return -EBUSY;
+
 	if (device_remove_file_self(dev, attr))
 		nvme_delete_ctrl_sync(ctrl);
 	return count;
@@ -5034,7 +5037,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
 	 * that were missed. We identify persistent discovery controllers by
 	 * checking that they started once before, hence are reconnecting back.
 	 */
-	if (test_and_set_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags) &&
+	if (test_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags) &&
 	    nvme_discovery_ctrl(ctrl))
 		nvme_change_uevent(ctrl, "NVME_EVENT=rediscover");
 
@@ -5045,6 +5048,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
 	}
 
 	nvme_change_uevent(ctrl, "NVME_EVENT=connected");
+	set_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags);
 }
 EXPORT_SYMBOL_GPL(nvme_start_ctrl);
--
cgit v1.2.3

From 0649728123cf6a5518e154b4e1735fc85ea4f55c Mon Sep 17 00:00:00 2001
From: Daniel Smith
Date: Wed, 17 May 2023 14:32:32 -0700
Subject: nvme-pci: Add quirk for Teamgroup MP33 SSD

Add a quirk for the Teamgroup MP33, which reports duplicate IDs for its
disk.

Signed-off-by: Daniel Smith
[kch: patch formatting]
Signed-off-by: Chaitanya Kulkarni
Tested-by: Daniel Smith
Signed-off-by: Keith Busch
---
 drivers/nvme/host/pci.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers')

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 922ffe4e2822..e90f824ffed8 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -3437,6 +3437,8 @@ static const struct pci_device_id nvme_id_table[] = {
 		.driver_data = NVME_QUIRK_BOGUS_NID, },
 	{ PCI_DEVICE(0x1e4b, 0x1602), /* HS-SSD-FUTURE 2048G */
 		.driver_data = NVME_QUIRK_BOGUS_NID, },
+	{ PCI_DEVICE(0x10ec, 0x5765), /* TEAMGROUP MP33 2TB SSD */
+		.driver_data = NVME_QUIRK_BOGUS_NID, },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061),
 		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0065),
--
cgit v1.2.3
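Three of the patches above do nothing more than add a row to
nvme_id_table. For readers unfamiliar with the pattern, here is a
self-contained sketch of how such a table is consulted at probe time.
It is a simplified model, not the kernel's pci_match_id() (which also
matches subvendor, subdevice, and class); the quirk bit values are
taken from the nvme.h hunk above:

  #include <stdint.h>
  #include <stdio.h>

  /* Simplified model of struct pci_device_id: driver_data carries the
   * quirks the driver should apply to a matching device. */
  struct pci_device_id {
          uint16_t vendor;
          uint16_t device;
          unsigned long driver_data;
  };

  #define NVME_QUIRK_BOGUS_NID                  (1ul << 18)
  #define NVME_QUIRK_NO_SECONDARY_TEMP_THRESH   (1ul << 19)

  static const struct pci_device_id nvme_id_table[] = {
          { 0x1e4b, 0x1602, NVME_QUIRK_BOGUS_NID },  /* HS-SSD-FUTURE 2048G */
          { 0x2646, 0x5013, NVME_QUIRK_NO_SECONDARY_TEMP_THRESH },
                                          /* Kingston KC3000 / FURY Renegade */
          { 0x10ec, 0x5765, NVME_QUIRK_BOGUS_NID },  /* TEAMGROUP MP33 2TB SSD */
          { 0, 0, 0 },                               /* terminator */
  };

  /* Walk the table the way the PCI core does on probe: first match wins. */
  static unsigned long quirks_for(uint16_t vendor, uint16_t device)
  {
          const struct pci_device_id *id;

          for (id = nvme_id_table; id->vendor; id++)
                  if (id->vendor == vendor && id->device == device)
                          return id->driver_data;
          return 0;
  }

  int main(void)
  {
          printf("MP33 quirks: %#lx\n", quirks_for(0x10ec, 0x5765));
          return 0;
  }

Keeping per-device workarounds as data in a single table, rather than
as conditionals scattered through the driver, is what makes each of
these fixes a two-line patch.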