summaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-12-28 22:19:59 +0100
committerLinus Torvalds <torvalds@linux-foundation.org>2018-12-28 22:19:59 +0100
commit0e9da3fbf7d81f0f913b491c8de1ba7883d4f217 (patch)
tree2b3d25e3be60bf4ee40b4690c7bb9d6fa499ae69 /drivers
parentMerge tag 'y2038-for-4.21' of ssh://gitolite.kernel.org:/pub/scm/linux/kernel... (diff)
parentkyber: use sbitmap add_wait_queue/list_del wait helpers (diff)
downloadlinux-0e9da3fbf7d81f0f913b491c8de1ba7883d4f217.tar.xz
linux-0e9da3fbf7d81f0f913b491c8de1ba7883d4f217.zip
Merge tag 'for-4.21/block-20181221' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe: "This is the main pull request for block/storage for 4.21. Larger than usual, it was a busy round with lots of goodies queued up. Most notable is the removal of the old IO stack, which has been a long time coming. No new features for a while, everything coming in this week has all been fixes for things that were previously merged. This contains: - Use atomic counters instead of semaphores for mtip32xx (Arnd) - Cleanup of the mtip32xx request setup (Christoph) - Fix for circular locking dependency in loop (Jan, Tetsuo) - bcache (Coly, Guoju, Shenghui) * Optimizations for writeback caching * Various fixes and improvements - nvme (Chaitanya, Christoph, Sagi, Jay, me, Keith) * host and target support for NVMe over TCP * Error log page support * Support for separate read/write/poll queues * Much improved polling * discard OOM fallback * Tracepoint improvements - lightnvm (Hans, Hua, Igor, Matias, Javier) * Igor added packed metadata to pblk. Now drives without metadata per LBA can be used as well. * Fix from Geert on uninitialized value on chunk metadata reads. * Fixes from Hans and Javier to pblk recovery and write path. * Fix from Hua Su to fix a race condition in the pblk recovery code. * Scan optimization added to pblk recovery from Zhoujie. * Small geometry cleanup from me. - Conversion of the last few drivers that used the legacy path to blk-mq (me) - Removal of legacy IO path in SCSI (me, Christoph) - Removal of legacy IO stack and schedulers (me) - Support for much better polling, now without interrupts at all. blk-mq adds support for multiple queue maps, which enables us to have a map per type. This in turn enables nvme to have separate completion queues for polling, which can then be interrupt-less. Also means we're ready for async polled IO, which is hopefully coming in the next release. - Killing of (now) unused block exports (Christoph) - Unification of the blk-rq-qos and blk-wbt wait handling (Josef) - Support for zoned testing with null_blk (Masato) - sx8 conversion to per-host tag sets (Christoph) - IO priority improvements (Damien) - mq-deadline zoned fix (Damien) - Ref count blkcg series (Dennis) - Lots of blk-mq improvements and speedups (me) - sbitmap scalability improvements (me) - Make core inflight IO accounting per-cpu (Mikulas) - Export timeout setting in sysfs (Weiping) - Cleanup the direct issue path (Jianchao) - Export blk-wbt internals in block debugfs for easier debugging (Ming) - Lots of other fixes and improvements" * tag 'for-4.21/block-20181221' of git://git.kernel.dk/linux-block: (364 commits) kyber: use sbitmap add_wait_queue/list_del wait helpers sbitmap: add helpers for add/del wait queue handling block: save irq state in blkg_lookup_create() dm: don't reuse bio for flushes nvme-pci: trace SQ status on completions nvme-rdma: implement polling queue map nvme-fabrics: allow user to pass in nr_poll_queues nvme-fabrics: allow nvmf_connect_io_queue to poll nvme-core: optionally poll sync commands block: make request_to_qc_t public nvme-tcp: fix spelling mistake "attepmpt" -> "attempt" nvme-tcp: fix endianess annotations nvmet-tcp: fix endianess annotations nvme-pci: refactor nvme_poll_irqdisable to make sparse happy nvme-pci: only set nr_maps to 2 if poll queues are supported nvmet: use a macro for default error location nvmet: fix comparison of a u16 with -1 blk-mq: enable IO poll if .nr_queues of type poll > 0 blk-mq: change blk_mq_queue_busy() to blk_mq_queue_inflight() blk-mq: skip zero-queue maps in blk_mq_map_swqueue ...
Diffstat (limited to 'drivers')
-rw-r--r--drivers/ata/libata-eh.c4
-rw-r--r--drivers/block/aoe/aoe.h4
-rw-r--r--drivers/block/aoe/aoeblk.c1
-rw-r--r--drivers/block/aoe/aoecmd.c27
-rw-r--r--drivers/block/aoe/aoedev.c11
-rw-r--r--drivers/block/aoe/aoemain.c2
-rw-r--r--drivers/block/ataflop.c26
-rw-r--r--drivers/block/drbd/drbd_main.c2
-rw-r--r--drivers/block/floppy.c6
-rw-r--r--drivers/block/loop.c415
-rw-r--r--drivers/block/loop.h1
-rw-r--r--drivers/block/mtip32xx/mtip32xx.c226
-rw-r--r--drivers/block/mtip32xx/mtip32xx.h48
-rw-r--r--drivers/block/nbd.c3
-rw-r--r--drivers/block/null_blk.h1
-rw-r--r--drivers/block/null_blk_main.c21
-rw-r--r--drivers/block/null_blk_zoned.c27
-rw-r--r--drivers/block/paride/pd.c30
-rw-r--r--drivers/block/pktcdvd.c2
-rw-r--r--drivers/block/skd_main.c16
-rw-r--r--drivers/block/sunvdc.c153
-rw-r--r--drivers/block/sx8.c434
-rw-r--r--drivers/block/umem.c3
-rw-r--r--drivers/block/virtio_blk.c17
-rw-r--r--drivers/ide/ide-atapi.c27
-rw-r--r--drivers/ide/ide-cd.c179
-rw-r--r--drivers/ide/ide-devsets.c4
-rw-r--r--drivers/ide/ide-disk.c15
-rw-r--r--drivers/ide/ide-eh.c2
-rw-r--r--drivers/ide/ide-floppy.c2
-rw-r--r--drivers/ide/ide-io.c112
-rw-r--r--drivers/ide/ide-park.c8
-rw-r--r--drivers/ide/ide-pm.c46
-rw-r--r--drivers/ide/ide-probe.c69
-rw-r--r--drivers/ide/ide-tape.c2
-rw-r--r--drivers/ide/ide-taskfile.c2
-rw-r--r--drivers/lightnvm/core.c25
-rw-r--r--drivers/lightnvm/pblk-core.c77
-rw-r--r--drivers/lightnvm/pblk-init.c103
-rw-r--r--drivers/lightnvm/pblk-map.c63
-rw-r--r--drivers/lightnvm/pblk-rb.c5
-rw-r--r--drivers/lightnvm/pblk-read.c66
-rw-r--r--drivers/lightnvm/pblk-recovery.c46
-rw-r--r--drivers/lightnvm/pblk-rl.c5
-rw-r--r--drivers/lightnvm/pblk-sysfs.c7
-rw-r--r--drivers/lightnvm/pblk-write.c64
-rw-r--r--drivers/lightnvm/pblk.h43
-rw-r--r--drivers/md/bcache/bcache.h20
-rw-r--r--drivers/md/bcache/btree.c5
-rw-r--r--drivers/md/bcache/btree.h18
-rw-r--r--drivers/md/bcache/debug.c3
-rw-r--r--drivers/md/bcache/journal.c2
-rw-r--r--drivers/md/bcache/request.c6
-rw-r--r--drivers/md/bcache/super.c48
-rw-r--r--drivers/md/bcache/sysfs.c61
-rw-r--r--drivers/md/bcache/writeback.c30
-rw-r--r--drivers/md/bcache/writeback.h12
-rw-r--r--drivers/md/dm-core.h5
-rw-r--r--drivers/md/dm-rq.c7
-rw-r--r--drivers/md/dm-table.c4
-rw-r--r--drivers/md/dm.c79
-rw-r--r--drivers/md/md.c7
-rw-r--r--drivers/md/raid0.c2
-rw-r--r--drivers/memstick/core/ms_block.c109
-rw-r--r--drivers/memstick/core/ms_block.h1
-rw-r--r--drivers/memstick/core/mspro_block.c121
-rw-r--r--drivers/mmc/core/block.c26
-rw-r--r--drivers/mmc/core/queue.c110
-rw-r--r--drivers/mmc/core/queue.h4
-rw-r--r--drivers/net/wireless/ath/ath6kl/cfg80211.c2
-rw-r--r--drivers/net/wireless/ath/ath6kl/common.h2
-rw-r--r--drivers/net/wireless/ath/ath6kl/wmi.c6
-rw-r--r--drivers/net/wireless/ath/ath6kl/wmi.h6
-rw-r--r--drivers/nvdimm/pmem.c2
-rw-r--r--drivers/nvme/host/Kconfig15
-rw-r--r--drivers/nvme/host/Makefile3
-rw-r--r--drivers/nvme/host/core.c191
-rw-r--r--drivers/nvme/host/fabrics.c61
-rw-r--r--drivers/nvme/host/fabrics.h17
-rw-r--r--drivers/nvme/host/fc.c43
-rw-r--r--drivers/nvme/host/lightnvm.c33
-rw-r--r--drivers/nvme/host/multipath.c20
-rw-r--r--drivers/nvme/host/nvme.h24
-rw-r--r--drivers/nvme/host/pci.c518
-rw-r--r--drivers/nvme/host/rdma.c119
-rw-r--r--drivers/nvme/host/tcp.c2278
-rw-r--r--drivers/nvme/host/trace.c3
-rw-r--r--drivers/nvme/host/trace.h27
-rw-r--r--drivers/nvme/target/Kconfig10
-rw-r--r--drivers/nvme/target/Makefile2
-rw-r--r--drivers/nvme/target/admin-cmd.c146
-rw-r--r--drivers/nvme/target/configfs.c43
-rw-r--r--drivers/nvme/target/core.c220
-rw-r--r--drivers/nvme/target/discovery.c139
-rw-r--r--drivers/nvme/target/fabrics-cmd.c64
-rw-r--r--drivers/nvme/target/fc.c66
-rw-r--r--drivers/nvme/target/io-cmd-bdev.c89
-rw-r--r--drivers/nvme/target/io-cmd-file.c165
-rw-r--r--drivers/nvme/target/loop.c2
-rw-r--r--drivers/nvme/target/nvmet.h68
-rw-r--r--drivers/nvme/target/rdma.c12
-rw-r--r--drivers/nvme/target/tcp.c1737
-rw-r--r--drivers/s390/block/dasd_ioctl.c22
-rw-r--r--drivers/scsi/Kconfig12
-rw-r--r--drivers/scsi/bnx2i/bnx2i_hwi.c8
-rw-r--r--drivers/scsi/csiostor/csio_scsi.c8
-rw-r--r--drivers/scsi/cxlflash/main.c6
-rw-r--r--drivers/scsi/device_handler/scsi_dh_alua.c21
-rw-r--r--drivers/scsi/device_handler/scsi_dh_emc.c8
-rw-r--r--drivers/scsi/device_handler/scsi_dh_hp_sw.c7
-rw-r--r--drivers/scsi/device_handler/scsi_dh_rdac.c7
-rw-r--r--drivers/scsi/fnic/fnic_scsi.c4
-rw-r--r--drivers/scsi/hosts.c29
-rw-r--r--drivers/scsi/libsas/sas_ata.c5
-rw-r--r--drivers/scsi/libsas/sas_scsi_host.c10
-rw-r--r--drivers/scsi/lpfc/lpfc_scsi.c2
-rw-r--r--drivers/scsi/osd/osd_initiator.c4
-rw-r--r--drivers/scsi/osst.c2
-rw-r--r--drivers/scsi/qedi/qedi_main.c3
-rw-r--r--drivers/scsi/qla2xxx/qla_nvme.c12
-rw-r--r--drivers/scsi/qla2xxx/qla_os.c37
-rw-r--r--drivers/scsi/scsi.c5
-rw-r--r--drivers/scsi/scsi_debug.c3
-rw-r--r--drivers/scsi/scsi_error.c24
-rw-r--r--drivers/scsi/scsi_lib.c806
-rw-r--r--drivers/scsi/scsi_priv.h1
-rw-r--r--drivers/scsi/scsi_scan.c10
-rw-r--r--drivers/scsi/scsi_sysfs.c8
-rw-r--r--drivers/scsi/scsi_transport_fc.c71
-rw-r--r--drivers/scsi/scsi_transport_iscsi.c7
-rw-r--r--drivers/scsi/scsi_transport_sas.c10
-rw-r--r--drivers/scsi/sd.c85
-rw-r--r--drivers/scsi/sd.h6
-rw-r--r--drivers/scsi/sd_zbc.c10
-rw-r--r--drivers/scsi/sg.c2
-rw-r--r--drivers/scsi/smartpqi/smartpqi_init.c3
-rw-r--r--drivers/scsi/sr.c12
-rw-r--r--drivers/scsi/st.c2
-rw-r--r--drivers/scsi/ufs/ufs_bsg.c4
-rw-r--r--drivers/scsi/virtio_scsi.c3
-rw-r--r--drivers/target/iscsi/iscsi_target_util.c12
-rw-r--r--drivers/target/target_core_pscsi.c2
142 files changed, 7570 insertions, 2988 deletions
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 01306c018398..938ed513b070 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -919,8 +919,6 @@ static void ata_eh_set_pending(struct ata_port *ap, int fastdrain)
void ata_qc_schedule_eh(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
- struct request_queue *q = qc->scsicmd->device->request_queue;
- unsigned long flags;
WARN_ON(!ap->ops->error_handler);
@@ -932,9 +930,7 @@ void ata_qc_schedule_eh(struct ata_queued_cmd *qc)
* Note that ATA_QCFLAG_FAILED is unconditionally set after
* this function completes.
*/
- spin_lock_irqsave(q->queue_lock, flags);
blk_abort_request(qc->scsicmd->request);
- spin_unlock_irqrestore(q->queue_lock, flags);
}
/**
diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
index 7ca76ed2e71a..84d0fcebd6af 100644
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -100,6 +100,10 @@ enum {
MAX_TAINT = 1000, /* cap on aoetgt taint */
};
+struct aoe_req {
+ unsigned long nr_bios;
+};
+
struct buf {
ulong nframesout;
struct bio *bio;
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index ed26b7287256..e2c6aae2d636 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -387,6 +387,7 @@ aoeblk_gdalloc(void *vp)
set = &d->tag_set;
set->ops = &aoeblk_mq_ops;
+ set->cmd_size = sizeof(struct aoe_req);
set->nr_hw_queues = 1;
set->queue_depth = 128;
set->numa_node = NUMA_NO_NODE;
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index bb2fba651bd2..3cf9bc5d8d95 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -822,17 +822,6 @@ out:
spin_unlock_irqrestore(&d->lock, flags);
}
-static unsigned long
-rqbiocnt(struct request *r)
-{
- struct bio *bio;
- unsigned long n = 0;
-
- __rq_for_each_bio(bio, r)
- n++;
- return n;
-}
-
static void
bufinit(struct buf *buf, struct request *rq, struct bio *bio)
{
@@ -847,6 +836,7 @@ nextbuf(struct aoedev *d)
{
struct request *rq;
struct request_queue *q;
+ struct aoe_req *req;
struct buf *buf;
struct bio *bio;
@@ -865,7 +855,11 @@ nextbuf(struct aoedev *d)
blk_mq_start_request(rq);
d->ip.rq = rq;
d->ip.nxbio = rq->bio;
- rq->special = (void *) rqbiocnt(rq);
+
+ req = blk_mq_rq_to_pdu(rq);
+ req->nr_bios = 0;
+ __rq_for_each_bio(bio, rq)
+ req->nr_bios++;
}
buf = mempool_alloc(d->bufpool, GFP_ATOMIC);
if (buf == NULL) {
@@ -1069,16 +1063,13 @@ aoe_end_request(struct aoedev *d, struct request *rq, int fastfail)
static void
aoe_end_buf(struct aoedev *d, struct buf *buf)
{
- struct request *rq;
- unsigned long n;
+ struct request *rq = buf->rq;
+ struct aoe_req *req = blk_mq_rq_to_pdu(rq);
if (buf == d->ip.buf)
d->ip.buf = NULL;
- rq = buf->rq;
mempool_free(buf, d->bufpool);
- n = (unsigned long) rq->special;
- rq->special = (void *) --n;
- if (n == 0)
+ if (--req->nr_bios == 0)
aoe_end_request(d, rq, 0);
}
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index 9063f8efbd3b..5b49f1b33ebe 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -160,21 +160,22 @@ static void
aoe_failip(struct aoedev *d)
{
struct request *rq;
+ struct aoe_req *req;
struct bio *bio;
- unsigned long n;
aoe_failbuf(d, d->ip.buf);
-
rq = d->ip.rq;
if (rq == NULL)
return;
+
+ req = blk_mq_rq_to_pdu(rq);
while ((bio = d->ip.nxbio)) {
bio->bi_status = BLK_STS_IOERR;
d->ip.nxbio = bio->bi_next;
- n = (unsigned long) rq->special;
- rq->special = (void *) --n;
+ req->nr_bios--;
}
- if ((unsigned long) rq->special == 0)
+
+ if (!req->nr_bios)
aoe_end_request(d, rq, 0);
}
diff --git a/drivers/block/aoe/aoemain.c b/drivers/block/aoe/aoemain.c
index 251482066977..1e4e2971171c 100644
--- a/drivers/block/aoe/aoemain.c
+++ b/drivers/block/aoe/aoemain.c
@@ -24,7 +24,7 @@ static void discover_timer(struct timer_list *t)
aoecmd_cfg(0xffff, 0xff);
}
-static void
+static void __exit
aoe_exit(void)
{
del_timer_sync(&timer);
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index f88b4c26d422..b0dbbdfeb33e 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -1471,6 +1471,15 @@ static void setup_req_params( int drive )
ReqTrack, ReqSector, (unsigned long)ReqData ));
}
+static void ataflop_commit_rqs(struct blk_mq_hw_ctx *hctx)
+{
+ spin_lock_irq(&ataflop_lock);
+ atari_disable_irq(IRQ_MFP_FDC);
+ finish_fdc();
+ atari_enable_irq(IRQ_MFP_FDC);
+ spin_unlock_irq(&ataflop_lock);
+}
+
static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
@@ -1947,6 +1956,7 @@ static const struct block_device_operations floppy_fops = {
static const struct blk_mq_ops ataflop_mq_ops = {
.queue_rq = ataflop_queue_rq,
+ .commit_rqs = ataflop_commit_rqs,
};
static struct kobject *floppy_find(dev_t dev, int *part, void *data)
@@ -1982,6 +1992,7 @@ static int __init atari_floppy_init (void)
&ataflop_mq_ops, 2,
BLK_MQ_F_SHOULD_MERGE);
if (IS_ERR(unit[i].disk->queue)) {
+ put_disk(unit[i].disk);
ret = PTR_ERR(unit[i].disk->queue);
unit[i].disk->queue = NULL;
goto err;
@@ -2033,18 +2044,13 @@ static int __init atari_floppy_init (void)
return 0;
err:
- do {
+ while (--i >= 0) {
struct gendisk *disk = unit[i].disk;
- if (disk) {
- if (disk->queue) {
- blk_cleanup_queue(disk->queue);
- disk->queue = NULL;
- }
- blk_mq_free_tag_set(&unit[i].tag_set);
- put_disk(unit[i].disk);
- }
- } while (i--);
+ blk_cleanup_queue(disk->queue);
+ blk_mq_free_tag_set(&unit[i].tag_set);
+ put_disk(unit[i].disk);
+ }
unregister_blkdev(FLOPPY_MAJOR, "fd");
return ret;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index fa8204214ac0..f973a2a845c8 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2792,7 +2792,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
drbd_init_set_defaults(device);
- q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, &resource->req_lock);
+ q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE);
if (!q)
goto out_no_q;
device->rq_queue = q;
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index fb23578e9a41..6f2856c6d0f2 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -2231,7 +2231,6 @@ static void request_done(int uptodate)
{
struct request *req = current_req;
struct request_queue *q;
- unsigned long flags;
int block;
char msg[sizeof("request done ") + sizeof(int) * 3];
@@ -2254,10 +2253,7 @@ static void request_done(int uptodate)
if (block > _floppy->sect)
DRS->maxtrack = 1;
- /* unlock chained buffers */
- spin_lock_irqsave(q->queue_lock, flags);
floppy_end_request(req, 0);
- spin_unlock_irqrestore(q->queue_lock, flags);
} else {
if (rq_data_dir(req) == WRITE) {
/* record write error information */
@@ -2269,9 +2265,7 @@ static void request_done(int uptodate)
DRWE->last_error_sector = blk_rq_pos(req);
DRWE->last_error_generation = DRS->generation;
}
- spin_lock_irqsave(q->queue_lock, flags);
floppy_end_request(req, BLK_STS_IOERR);
- spin_unlock_irqrestore(q->queue_lock, flags);
}
}
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index cb0cc8685076..0939f36548c9 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -77,13 +77,14 @@
#include <linux/falloc.h>
#include <linux/uio.h>
#include <linux/ioprio.h>
+#include <linux/blk-cgroup.h>
#include "loop.h"
#include <linux/uaccess.h>
static DEFINE_IDR(loop_index_idr);
-static DEFINE_MUTEX(loop_index_mutex);
+static DEFINE_MUTEX(loop_ctl_mutex);
static int max_part;
static int part_shift;
@@ -630,18 +631,7 @@ static void loop_reread_partitions(struct loop_device *lo,
{
int rc;
- /*
- * bd_mutex has been held already in release path, so don't
- * acquire it if this function is called in such case.
- *
- * If the reread partition isn't from release path, lo_refcnt
- * must be at least one and it can only become zero when the
- * current holder is released.
- */
- if (!atomic_read(&lo->lo_refcnt))
- rc = __blkdev_reread_part(bdev);
- else
- rc = blkdev_reread_part(bdev);
+ rc = blkdev_reread_part(bdev);
if (rc)
pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n",
__func__, lo->lo_number, lo->lo_file_name, rc);
@@ -688,26 +678,30 @@ static int loop_validate_file(struct file *file, struct block_device *bdev)
static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
unsigned int arg)
{
- struct file *file, *old_file;
+ struct file *file = NULL, *old_file;
int error;
+ bool partscan;
+ error = mutex_lock_killable(&loop_ctl_mutex);
+ if (error)
+ return error;
error = -ENXIO;
if (lo->lo_state != Lo_bound)
- goto out;
+ goto out_err;
/* the loop device has to be read-only */
error = -EINVAL;
if (!(lo->lo_flags & LO_FLAGS_READ_ONLY))
- goto out;
+ goto out_err;
error = -EBADF;
file = fget(arg);
if (!file)
- goto out;
+ goto out_err;
error = loop_validate_file(file, bdev);
if (error)
- goto out_putf;
+ goto out_err;
old_file = lo->lo_backing_file;
@@ -715,7 +709,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
/* size of the new backing store needs to be the same */
if (get_loop_size(lo, file) != get_loop_size(lo, old_file))
- goto out_putf;
+ goto out_err;
/* and ... switch */
blk_mq_freeze_queue(lo->lo_queue);
@@ -726,15 +720,22 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
loop_update_dio(lo);
blk_mq_unfreeze_queue(lo->lo_queue);
-
+ partscan = lo->lo_flags & LO_FLAGS_PARTSCAN;
+ mutex_unlock(&loop_ctl_mutex);
+ /*
+ * We must drop file reference outside of loop_ctl_mutex as dropping
+ * the file ref can take bd_mutex which creates circular locking
+ * dependency.
+ */
fput(old_file);
- if (lo->lo_flags & LO_FLAGS_PARTSCAN)
+ if (partscan)
loop_reread_partitions(lo, bdev);
return 0;
- out_putf:
- fput(file);
- out:
+out_err:
+ mutex_unlock(&loop_ctl_mutex);
+ if (file)
+ fput(file);
return error;
}
@@ -909,6 +910,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
int lo_flags = 0;
int error;
loff_t size;
+ bool partscan;
/* This is safe, since we have a reference from open(). */
__module_get(THIS_MODULE);
@@ -918,13 +920,17 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
if (!file)
goto out;
+ error = mutex_lock_killable(&loop_ctl_mutex);
+ if (error)
+ goto out_putf;
+
error = -EBUSY;
if (lo->lo_state != Lo_unbound)
- goto out_putf;
+ goto out_unlock;
error = loop_validate_file(file, bdev);
if (error)
- goto out_putf;
+ goto out_unlock;
mapping = file->f_mapping;
inode = mapping->host;
@@ -936,10 +942,10 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
error = -EFBIG;
size = get_loop_size(lo, file);
if ((loff_t)(sector_t)size != size)
- goto out_putf;
+ goto out_unlock;
error = loop_prepare_queue(lo);
if (error)
- goto out_putf;
+ goto out_unlock;
error = 0;
@@ -971,18 +977,22 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
lo->lo_state = Lo_bound;
if (part_shift)
lo->lo_flags |= LO_FLAGS_PARTSCAN;
- if (lo->lo_flags & LO_FLAGS_PARTSCAN)
- loop_reread_partitions(lo, bdev);
+ partscan = lo->lo_flags & LO_FLAGS_PARTSCAN;
/* Grab the block_device to prevent its destruction after we
- * put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev).
+ * put /dev/loopXX inode. Later in __loop_clr_fd() we bdput(bdev).
*/
bdgrab(bdev);
+ mutex_unlock(&loop_ctl_mutex);
+ if (partscan)
+ loop_reread_partitions(lo, bdev);
return 0;
- out_putf:
+out_unlock:
+ mutex_unlock(&loop_ctl_mutex);
+out_putf:
fput(file);
- out:
+out:
/* This is safe: open() is still holding a reference. */
module_put(THIS_MODULE);
return error;
@@ -1025,39 +1035,31 @@ loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer,
return err;
}
-static int loop_clr_fd(struct loop_device *lo)
+static int __loop_clr_fd(struct loop_device *lo, bool release)
{
- struct file *filp = lo->lo_backing_file;
+ struct file *filp = NULL;
gfp_t gfp = lo->old_gfp_mask;
struct block_device *bdev = lo->lo_device;
+ int err = 0;
+ bool partscan = false;
+ int lo_number;
- if (lo->lo_state != Lo_bound)
- return -ENXIO;
-
- /*
- * If we've explicitly asked to tear down the loop device,
- * and it has an elevated reference count, set it for auto-teardown when
- * the last reference goes away. This stops $!~#$@ udev from
- * preventing teardown because it decided that it needs to run blkid on
- * the loopback device whenever they appear. xfstests is notorious for
- * failing tests because blkid via udev races with a losetup
- * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d
- * command to fail with EBUSY.
- */
- if (atomic_read(&lo->lo_refcnt) > 1) {
- lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
- mutex_unlock(&lo->lo_ctl_mutex);
- return 0;
+ mutex_lock(&loop_ctl_mutex);
+ if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) {
+ err = -ENXIO;
+ goto out_unlock;
}
- if (filp == NULL)
- return -EINVAL;
+ filp = lo->lo_backing_file;
+ if (filp == NULL) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
/* freeze request queue during the transition */
blk_mq_freeze_queue(lo->lo_queue);
spin_lock_irq(&lo->lo_lock);
- lo->lo_state = Lo_rundown;
lo->lo_backing_file = NULL;
spin_unlock_irq(&lo->lo_lock);
@@ -1093,21 +1095,73 @@ static int loop_clr_fd(struct loop_device *lo)
module_put(THIS_MODULE);
blk_mq_unfreeze_queue(lo->lo_queue);
- if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev)
- loop_reread_partitions(lo, bdev);
+ partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev;
+ lo_number = lo->lo_number;
lo->lo_flags = 0;
if (!part_shift)
lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
loop_unprepare_queue(lo);
- mutex_unlock(&lo->lo_ctl_mutex);
+out_unlock:
+ mutex_unlock(&loop_ctl_mutex);
+ if (partscan) {
+ /*
+ * bd_mutex has been held already in release path, so don't
+ * acquire it if this function is called in such case.
+ *
+ * If the reread partition isn't from release path, lo_refcnt
+ * must be at least one and it can only become zero when the
+ * current holder is released.
+ */
+ if (release)
+ err = __blkdev_reread_part(bdev);
+ else
+ err = blkdev_reread_part(bdev);
+ pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
+ __func__, lo_number, err);
+ /* Device is gone, no point in returning error */
+ err = 0;
+ }
/*
- * Need not hold lo_ctl_mutex to fput backing file.
- * Calling fput holding lo_ctl_mutex triggers a circular
+ * Need not hold loop_ctl_mutex to fput backing file.
+ * Calling fput holding loop_ctl_mutex triggers a circular
* lock dependency possibility warning as fput can take
- * bd_mutex which is usually taken before lo_ctl_mutex.
+ * bd_mutex which is usually taken before loop_ctl_mutex.
*/
- fput(filp);
- return 0;
+ if (filp)
+ fput(filp);
+ return err;
+}
+
+static int loop_clr_fd(struct loop_device *lo)
+{
+ int err;
+
+ err = mutex_lock_killable(&loop_ctl_mutex);
+ if (err)
+ return err;
+ if (lo->lo_state != Lo_bound) {
+ mutex_unlock(&loop_ctl_mutex);
+ return -ENXIO;
+ }
+ /*
+ * If we've explicitly asked to tear down the loop device,
+ * and it has an elevated reference count, set it for auto-teardown when
+ * the last reference goes away. This stops $!~#$@ udev from
+ * preventing teardown because it decided that it needs to run blkid on
+ * the loopback device whenever they appear. xfstests is notorious for
+ * failing tests because blkid via udev races with a losetup
+ * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d
+ * command to fail with EBUSY.
+ */
+ if (atomic_read(&lo->lo_refcnt) > 1) {
+ lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
+ mutex_unlock(&loop_ctl_mutex);
+ return 0;
+ }
+ lo->lo_state = Lo_rundown;
+ mutex_unlock(&loop_ctl_mutex);
+
+ return __loop_clr_fd(lo, false);
}
static int
@@ -1116,47 +1170,58 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
int err;
struct loop_func_table *xfer;
kuid_t uid = current_uid();
+ struct block_device *bdev;
+ bool partscan = false;
+ err = mutex_lock_killable(&loop_ctl_mutex);
+ if (err)
+ return err;
if (lo->lo_encrypt_key_size &&
!uid_eq(lo->lo_key_owner, uid) &&
- !capable(CAP_SYS_ADMIN))
- return -EPERM;
- if (lo->lo_state != Lo_bound)
- return -ENXIO;
- if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE)
- return -EINVAL;
+ !capable(CAP_SYS_ADMIN)) {
+ err = -EPERM;
+ goto out_unlock;
+ }
+ if (lo->lo_state != Lo_bound) {
+ err = -ENXIO;
+ goto out_unlock;
+ }
+ if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
/* I/O need to be drained during transfer transition */
blk_mq_freeze_queue(lo->lo_queue);
err = loop_release_xfer(lo);
if (err)
- goto exit;
+ goto out_unfreeze;
if (info->lo_encrypt_type) {
unsigned int type = info->lo_encrypt_type;
if (type >= MAX_LO_CRYPT) {
err = -EINVAL;
- goto exit;
+ goto out_unfreeze;
}
xfer = xfer_funcs[type];
if (xfer == NULL) {
err = -EINVAL;
- goto exit;
+ goto out_unfreeze;
}
} else
xfer = NULL;
err = loop_init_xfer(lo, xfer, info);
if (err)
- goto exit;
+ goto out_unfreeze;
if (lo->lo_offset != info->lo_offset ||
lo->lo_sizelimit != info->lo_sizelimit) {
if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) {
err = -EFBIG;
- goto exit;
+ goto out_unfreeze;
}
}
@@ -1188,15 +1253,20 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
/* update dio if lo_offset or transfer is changed */
__loop_update_dio(lo, lo->use_dio);
- exit:
+out_unfreeze:
blk_mq_unfreeze_queue(lo->lo_queue);
if (!err && (info->lo_flags & LO_FLAGS_PARTSCAN) &&
!(lo->lo_flags & LO_FLAGS_PARTSCAN)) {
lo->lo_flags |= LO_FLAGS_PARTSCAN;
lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN;
- loop_reread_partitions(lo, lo->lo_device);
+ bdev = lo->lo_device;
+ partscan = true;
}
+out_unlock:
+ mutex_unlock(&loop_ctl_mutex);
+ if (partscan)
+ loop_reread_partitions(lo, bdev);
return err;
}
@@ -1204,12 +1274,15 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
static int
loop_get_status(struct loop_device *lo, struct loop_info64 *info)
{
- struct file *file;
+ struct path path;
struct kstat stat;
int ret;
+ ret = mutex_lock_killable(&loop_ctl_mutex);
+ if (ret)
+ return ret;
if (lo->lo_state != Lo_bound) {
- mutex_unlock(&lo->lo_ctl_mutex);
+ mutex_unlock(&loop_ctl_mutex);
return -ENXIO;
}
@@ -1228,17 +1301,17 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info)
lo->lo_encrypt_key_size);
}
- /* Drop lo_ctl_mutex while we call into the filesystem. */
- file = get_file(lo->lo_backing_file);
- mutex_unlock(&lo->lo_ctl_mutex);
- ret = vfs_getattr(&file->f_path, &stat, STATX_INO,
- AT_STATX_SYNC_AS_STAT);
+ /* Drop loop_ctl_mutex while we call into the filesystem. */
+ path = lo->lo_backing_file->f_path;
+ path_get(&path);
+ mutex_unlock(&loop_ctl_mutex);
+ ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT);
if (!ret) {
info->lo_device = huge_encode_dev(stat.dev);
info->lo_inode = stat.ino;
info->lo_rdevice = huge_encode_dev(stat.rdev);
}
- fput(file);
+ path_put(&path);
return ret;
}
@@ -1322,10 +1395,8 @@ loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) {
struct loop_info64 info64;
int err;
- if (!arg) {
- mutex_unlock(&lo->lo_ctl_mutex);
+ if (!arg)
return -EINVAL;
- }
err = loop_get_status(lo, &info64);
if (!err)
err = loop_info64_to_old(&info64, &info);
@@ -1340,10 +1411,8 @@ loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) {
struct loop_info64 info64;
int err;
- if (!arg) {
- mutex_unlock(&lo->lo_ctl_mutex);
+ if (!arg)
return -EINVAL;
- }
err = loop_get_status(lo, &info64);
if (!err && copy_to_user(arg, &info64, sizeof(info64)))
err = -EFAULT;
@@ -1393,70 +1462,73 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
return 0;
}
+static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,
+ unsigned long arg)
+{
+ int err;
+
+ err = mutex_lock_killable(&loop_ctl_mutex);
+ if (err)
+ return err;
+ switch (cmd) {
+ case LOOP_SET_CAPACITY:
+ err = loop_set_capacity(lo);
+ break;
+ case LOOP_SET_DIRECT_IO:
+ err = loop_set_dio(lo, arg);
+ break;
+ case LOOP_SET_BLOCK_SIZE:
+ err = loop_set_block_size(lo, arg);
+ break;
+ default:
+ err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
+ }
+ mutex_unlock(&loop_ctl_mutex);
+ return err;
+}
+
static int lo_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
struct loop_device *lo = bdev->bd_disk->private_data;
int err;
- err = mutex_lock_killable_nested(&lo->lo_ctl_mutex, 1);
- if (err)
- goto out_unlocked;
-
switch (cmd) {
case LOOP_SET_FD:
- err = loop_set_fd(lo, mode, bdev, arg);
- break;
+ return loop_set_fd(lo, mode, bdev, arg);
case LOOP_CHANGE_FD:
- err = loop_change_fd(lo, bdev, arg);
- break;
+ return loop_change_fd(lo, bdev, arg);
case LOOP_CLR_FD:
- /* loop_clr_fd would have unlocked lo_ctl_mutex on success */
- err = loop_clr_fd(lo);
- if (!err)
- goto out_unlocked;
- break;
+ return loop_clr_fd(lo);
case LOOP_SET_STATUS:
err = -EPERM;
- if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
+ if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) {
err = loop_set_status_old(lo,
(struct loop_info __user *)arg);
+ }
break;
case LOOP_GET_STATUS:
- err = loop_get_status_old(lo, (struct loop_info __user *) arg);
- /* loop_get_status() unlocks lo_ctl_mutex */
- goto out_unlocked;
+ return loop_get_status_old(lo, (struct loop_info __user *) arg);
case LOOP_SET_STATUS64:
err = -EPERM;
- if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
+ if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) {
err = loop_set_status64(lo,
(struct loop_info64 __user *) arg);
+ }
break;
case LOOP_GET_STATUS64:
- err = loop_get_status64(lo, (struct loop_info64 __user *) arg);
- /* loop_get_status() unlocks lo_ctl_mutex */
- goto out_unlocked;
+ return loop_get_status64(lo, (struct loop_info64 __user *) arg);
case LOOP_SET_CAPACITY:
- err = -EPERM;
- if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
- err = loop_set_capacity(lo);
- break;
case LOOP_SET_DIRECT_IO:
- err = -EPERM;
- if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
- err = loop_set_dio(lo, arg);
- break;
case LOOP_SET_BLOCK_SIZE:
- err = -EPERM;
- if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
- err = loop_set_block_size(lo, arg);
- break;
+ if (!(mode & FMODE_WRITE) && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ /* Fall through */
default:
- err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
+ err = lo_simple_ioctl(lo, cmd, arg);
+ break;
}
- mutex_unlock(&lo->lo_ctl_mutex);
-out_unlocked:
return err;
}
@@ -1570,10 +1642,8 @@ loop_get_status_compat(struct loop_device *lo,
struct loop_info64 info64;
int err;
- if (!arg) {
- mutex_unlock(&lo->lo_ctl_mutex);
+ if (!arg)
return -EINVAL;
- }
err = loop_get_status(lo, &info64);
if (!err)
err = loop_info64_to_compat(&info64, arg);
@@ -1588,20 +1658,12 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
switch(cmd) {
case LOOP_SET_STATUS:
- err = mutex_lock_killable(&lo->lo_ctl_mutex);
- if (!err) {
- err = loop_set_status_compat(lo,
- (const struct compat_loop_info __user *)arg);
- mutex_unlock(&lo->lo_ctl_mutex);
- }
+ err = loop_set_status_compat(lo,
+ (const struct compat_loop_info __user *)arg);
break;
case LOOP_GET_STATUS:
- err = mutex_lock_killable(&lo->lo_ctl_mutex);
- if (!err) {
- err = loop_get_status_compat(lo,
- (struct compat_loop_info __user *)arg);
- /* loop_get_status() unlocks lo_ctl_mutex */
- }
+ err = loop_get_status_compat(lo,
+ (struct compat_loop_info __user *)arg);
break;
case LOOP_SET_CAPACITY:
case LOOP_CLR_FD:
@@ -1625,9 +1687,11 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
static int lo_open(struct block_device *bdev, fmode_t mode)
{
struct loop_device *lo;
- int err = 0;
+ int err;
- mutex_lock(&loop_index_mutex);
+ err = mutex_lock_killable(&loop_ctl_mutex);
+ if (err)
+ return err;
lo = bdev->bd_disk->private_data;
if (!lo) {
err = -ENXIO;
@@ -1636,26 +1700,30 @@ static int lo_open(struct block_device *bdev, fmode_t mode)
atomic_inc(&lo->lo_refcnt);
out:
- mutex_unlock(&loop_index_mutex);
+ mutex_unlock(&loop_ctl_mutex);
return err;
}
-static void __lo_release(struct loop_device *lo)
+static void lo_release(struct gendisk *disk, fmode_t mode)
{
- int err;
+ struct loop_device *lo;
+ mutex_lock(&loop_ctl_mutex);
+ lo = disk->private_data;
if (atomic_dec_return(&lo->lo_refcnt))
- return;
+ goto out_unlock;
- mutex_lock(&lo->lo_ctl_mutex);
if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
+ if (lo->lo_state != Lo_bound)
+ goto out_unlock;
+ lo->lo_state = Lo_rundown;
+ mutex_unlock(&loop_ctl_mutex);
/*
* In autoclear mode, stop the loop thread
* and remove configuration after last close.
*/
- err = loop_clr_fd(lo);
- if (!err)
- return;
+ __loop_clr_fd(lo, true);
+ return;
} else if (lo->lo_state == Lo_bound) {
/*
* Otherwise keep thread (if running) and config,
@@ -1665,14 +1733,8 @@ static void __lo_release(struct loop_device *lo)
blk_mq_unfreeze_queue(lo->lo_queue);
}
- mutex_unlock(&lo->lo_ctl_mutex);
-}
-
-static void lo_release(struct gendisk *disk, fmode_t mode)
-{
- mutex_lock(&loop_index_mutex);
- __lo_release(disk->private_data);
- mutex_unlock(&loop_index_mutex);
+out_unlock:
+ mutex_unlock(&loop_ctl_mutex);
}
static const struct block_device_operations lo_fops = {
@@ -1711,10 +1773,10 @@ static int unregister_transfer_cb(int id, void *ptr, void *data)
struct loop_device *lo = ptr;
struct loop_func_table *xfer = data;
- mutex_lock(&lo->lo_ctl_mutex);
+ mutex_lock(&loop_ctl_mutex);
if (lo->lo_encryption == xfer)
loop_release_xfer(lo);
- mutex_unlock(&lo->lo_ctl_mutex);
+ mutex_unlock(&loop_ctl_mutex);
return 0;
}
@@ -1759,8 +1821,8 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
/* always use the first bio's css */
#ifdef CONFIG_BLK_CGROUP
- if (cmd->use_aio && rq->bio && rq->bio->bi_css) {
- cmd->css = rq->bio->bi_css;
+ if (cmd->use_aio && rq->bio && rq->bio->bi_blkg) {
+ cmd->css = &bio_blkcg(rq->bio)->css;
css_get(cmd->css);
} else
#endif
@@ -1853,7 +1915,7 @@ static int loop_add(struct loop_device **l, int i)
goto out_free_idr;
lo->lo_queue = blk_mq_init_queue(&lo->tag_set);
- if (IS_ERR_OR_NULL(lo->lo_queue)) {
+ if (IS_ERR(lo->lo_queue)) {
err = PTR_ERR(lo->lo_queue);
goto out_cleanup_tags;
}
@@ -1895,7 +1957,6 @@ static int loop_add(struct loop_device **l, int i)
if (!part_shift)
disk->flags |= GENHD_FL_NO_PART_SCAN;
disk->flags |= GENHD_FL_EXT_DEVT;
- mutex_init(&lo->lo_ctl_mutex);
atomic_set(&lo->lo_refcnt, 0);
lo->lo_number = i;
spin_lock_init(&lo->lo_lock);
@@ -1974,7 +2035,7 @@ static struct kobject *loop_probe(dev_t dev, int *part, void *data)
struct kobject *kobj;
int err;
- mutex_lock(&loop_index_mutex);
+ mutex_lock(&loop_ctl_mutex);
err = loop_lookup(&lo, MINOR(dev) >> part_shift);
if (err < 0)
err = loop_add(&lo, MINOR(dev) >> part_shift);
@@ -1982,7 +2043,7 @@ static struct kobject *loop_probe(dev_t dev, int *part, void *data)
kobj = NULL;
else
kobj = get_disk_and_module(lo->lo_disk);
- mutex_unlock(&loop_index_mutex);
+ mutex_unlock(&loop_ctl_mutex);
*part = 0;
return kobj;
@@ -1992,9 +2053,13 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd,
unsigned long parm)
{
struct loop_device *lo;
- int ret = -ENOSYS;
+ int ret;
- mutex_lock(&loop_index_mutex);
+ ret = mutex_lock_killable(&loop_ctl_mutex);
+ if (ret)
+ return ret;
+
+ ret = -ENOSYS;
switch (cmd) {
case LOOP_CTL_ADD:
ret = loop_lookup(&lo, parm);
@@ -2008,21 +2073,15 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd,
ret = loop_lookup(&lo, parm);
if (ret < 0)
break;
- ret = mutex_lock_killable(&lo->lo_ctl_mutex);
- if (ret)
- break;
if (lo->lo_state != Lo_unbound) {
ret = -EBUSY;
- mutex_unlock(&lo->lo_ctl_mutex);
break;
}
if (atomic_read(&lo->lo_refcnt) > 0) {
ret = -EBUSY;
- mutex_unlock(&lo->lo_ctl_mutex);
break;
}
lo->lo_disk->private_data = NULL;
- mutex_unlock(&lo->lo_ctl_mutex);
idr_remove(&loop_index_idr, lo->lo_number);
loop_remove(lo);
break;
@@ -2032,7 +2091,7 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd,
break;
ret = loop_add(&lo, -1);
}
- mutex_unlock(&loop_index_mutex);
+ mutex_unlock(&loop_ctl_mutex);
return ret;
}
@@ -2116,10 +2175,10 @@ static int __init loop_init(void)
THIS_MODULE, loop_probe, NULL, NULL);
/* pre-create number of devices given by config or max_loop */
- mutex_lock(&loop_index_mutex);
+ mutex_lock(&loop_ctl_mutex);
for (i = 0; i < nr; i++)
loop_add(&lo, i);
- mutex_unlock(&loop_index_mutex);
+ mutex_unlock(&loop_ctl_mutex);
printk(KERN_INFO "loop: module loaded\n");
return 0;
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 4d42c7af7de7..af75a5ee4094 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -54,7 +54,6 @@ struct loop_device {
spinlock_t lo_lock;
int lo_state;
- struct mutex lo_ctl_mutex;
struct kthread_worker worker;
struct task_struct *worker_task;
bool use_dio;
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index a7daa8acbab3..88e8440e75c3 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -168,41 +168,6 @@ static bool mtip_check_surprise_removal(struct pci_dev *pdev)
return false; /* device present */
}
-/* we have to use runtime tag to setup command header */
-static void mtip_init_cmd_header(struct request *rq)
-{
- struct driver_data *dd = rq->q->queuedata;
- struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
-
- /* Point the command headers at the command tables. */
- cmd->command_header = dd->port->command_list +
- (sizeof(struct mtip_cmd_hdr) * rq->tag);
- cmd->command_header_dma = dd->port->command_list_dma +
- (sizeof(struct mtip_cmd_hdr) * rq->tag);
-
- if (test_bit(MTIP_PF_HOST_CAP_64, &dd->port->flags))
- cmd->command_header->ctbau = __force_bit2int cpu_to_le32((cmd->command_dma >> 16) >> 16);
-
- cmd->command_header->ctba = __force_bit2int cpu_to_le32(cmd->command_dma & 0xFFFFFFFF);
-}
-
-static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd)
-{
- struct request *rq;
-
- if (mtip_check_surprise_removal(dd->pdev))
- return NULL;
-
- rq = blk_mq_alloc_request(dd->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_RESERVED);
- if (IS_ERR(rq))
- return NULL;
-
- /* Internal cmd isn't submitted via .queue_rq */
- mtip_init_cmd_header(rq);
-
- return blk_mq_rq_to_pdu(rq);
-}
-
static struct mtip_cmd *mtip_cmd_from_tag(struct driver_data *dd,
unsigned int tag)
{
@@ -1023,13 +988,14 @@ static int mtip_exec_internal_command(struct mtip_port *port,
return -EFAULT;
}
- int_cmd = mtip_get_int_command(dd);
- if (!int_cmd) {
+ if (mtip_check_surprise_removal(dd->pdev))
+ return -EFAULT;
+
+ rq = blk_mq_alloc_request(dd->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_RESERVED);
+ if (IS_ERR(rq)) {
dbg_printk(MTIP_DRV_NAME "Unable to allocate tag for PIO cmd\n");
return -EFAULT;
}
- rq = blk_mq_rq_from_pdu(int_cmd);
- rq->special = &icmd;
set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
@@ -1050,6 +1016,8 @@ static int mtip_exec_internal_command(struct mtip_port *port,
}
/* Copy the command to the command table */
+ int_cmd = blk_mq_rq_to_pdu(rq);
+ int_cmd->icmd = &icmd;
memcpy(int_cmd->command, fis, fis_len*4);
rq->timeout = timeout;
@@ -1423,23 +1391,19 @@ static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id,
* @dd pointer to driver_data structure
* @lba starting lba
* @len # of 512b sectors to trim
- *
- * return value
- * -ENOMEM Out of dma memory
- * -EINVAL Invalid parameters passed in, trim not supported
- * -EIO Error submitting trim request to hw
*/
-static int mtip_send_trim(struct driver_data *dd, unsigned int lba,
- unsigned int len)
+static blk_status_t mtip_send_trim(struct driver_data *dd, unsigned int lba,
+ unsigned int len)
{
- int i, rv = 0;
u64 tlba, tlen, sect_left;
struct mtip_trim_entry *buf;
dma_addr_t dma_addr;
struct host_to_dev_fis fis;
+ blk_status_t ret = BLK_STS_OK;
+ int i;
if (!len || dd->trim_supp == false)
- return -EINVAL;
+ return BLK_STS_IOERR;
/* Trim request too big */
WARN_ON(len > (MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES));
@@ -1454,7 +1418,7 @@ static int mtip_send_trim(struct driver_data *dd, unsigned int lba,
buf = dmam_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr,
GFP_KERNEL);
if (!buf)
- return -ENOMEM;
+ return BLK_STS_RESOURCE;
memset(buf, 0, ATA_SECT_SIZE);
for (i = 0, sect_left = len, tlba = lba;
@@ -1463,8 +1427,8 @@ static int mtip_send_trim(struct driver_data *dd, unsigned int lba,
tlen = (sect_left >= MTIP_MAX_TRIM_ENTRY_LEN ?
MTIP_MAX_TRIM_ENTRY_LEN :
sect_left);
- buf[i].lba = __force_bit2int cpu_to_le32(tlba);
- buf[i].range = __force_bit2int cpu_to_le16(tlen);
+ buf[i].lba = cpu_to_le32(tlba);
+ buf[i].range = cpu_to_le16(tlen);
tlba += tlen;
sect_left -= tlen;
}
@@ -1486,10 +1450,10 @@ static int mtip_send_trim(struct driver_data *dd, unsigned int lba,
ATA_SECT_SIZE,
0,
MTIP_TRIM_TIMEOUT_MS) < 0)
- rv = -EIO;
+ ret = BLK_STS_IOERR;
dmam_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr);
- return rv;
+ return ret;
}
/*
@@ -1585,23 +1549,20 @@ static inline void fill_command_sg(struct driver_data *dd,
int n;
unsigned int dma_len;
struct mtip_cmd_sg *command_sg;
- struct scatterlist *sg = command->sg;
+ struct scatterlist *sg;
command_sg = command->command + AHCI_CMD_TBL_HDR_SZ;
- for (n = 0; n < nents; n++) {
+ for_each_sg(command->sg, sg, nents, n) {
dma_len = sg_dma_len(sg);
if (dma_len > 0x400000)
dev_err(&dd->pdev->dev,
"DMA segment length truncated\n");
- command_sg->info = __force_bit2int
- cpu_to_le32((dma_len-1) & 0x3FFFFF);
- command_sg->dba = __force_bit2int
- cpu_to_le32(sg_dma_address(sg));
- command_sg->dba_upper = __force_bit2int
+ command_sg->info = cpu_to_le32((dma_len-1) & 0x3FFFFF);
+ command_sg->dba = cpu_to_le32(sg_dma_address(sg));
+ command_sg->dba_upper =
cpu_to_le32((sg_dma_address(sg) >> 16) >> 16);
command_sg++;
- sg++;
}
}
@@ -2171,7 +2132,6 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd,
* @dd Pointer to the driver data structure.
* @start First sector to read.
* @nsect Number of sectors to read.
- * @nents Number of entries in scatter list for the read command.
* @tag The tag of this read command.
* @callback Pointer to the function that should be called
* when the read completes.
@@ -2183,16 +2143,20 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd,
* None
*/
static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq,
- struct mtip_cmd *command, int nents,
+ struct mtip_cmd *command,
struct blk_mq_hw_ctx *hctx)
{
+ struct mtip_cmd_hdr *hdr =
+ dd->port->command_list + sizeof(struct mtip_cmd_hdr) * rq->tag;
struct host_to_dev_fis *fis;
struct mtip_port *port = dd->port;
int dma_dir = rq_data_dir(rq) == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
u64 start = blk_rq_pos(rq);
unsigned int nsect = blk_rq_sectors(rq);
+ unsigned int nents;
/* Map the scatter list for DMA access */
+ nents = blk_rq_map_sg(hctx->queue, rq, command->sg);
nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir);
prefetch(&port->flags);
@@ -2233,10 +2197,11 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq,
fis->device |= 1 << 7;
/* Populate the command header */
- command->command_header->opts =
- __force_bit2int cpu_to_le32(
- (nents << 16) | 5 | AHCI_CMD_PREFETCH);
- command->command_header->byte_count = 0;
+ hdr->ctba = cpu_to_le32(command->command_dma & 0xFFFFFFFF);
+ if (test_bit(MTIP_PF_HOST_CAP_64, &dd->port->flags))
+ hdr->ctbau = cpu_to_le32((command->command_dma >> 16) >> 16);
+ hdr->opts = cpu_to_le32((nents << 16) | 5 | AHCI_CMD_PREFETCH);
+ hdr->byte_count = 0;
command->direction = dma_dir;
@@ -2715,12 +2680,12 @@ static void mtip_softirq_done_fn(struct request *rq)
cmd->direction);
if (unlikely(cmd->unaligned))
- up(&dd->port->cmd_slot_unal);
+ atomic_inc(&dd->port->cmd_slot_unal);
blk_mq_end_request(rq, cmd->status);
}
-static void mtip_abort_cmd(struct request *req, void *data, bool reserved)
+static bool mtip_abort_cmd(struct request *req, void *data, bool reserved)
{
struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req);
struct driver_data *dd = data;
@@ -2730,14 +2695,16 @@ static void mtip_abort_cmd(struct request *req, void *data, bool reserved)
clear_bit(req->tag, dd->port->cmds_to_issue);
cmd->status = BLK_STS_IOERR;
mtip_softirq_done_fn(req);
+ return true;
}
-static void mtip_queue_cmd(struct request *req, void *data, bool reserved)
+static bool mtip_queue_cmd(struct request *req, void *data, bool reserved)
{
struct driver_data *dd = data;
set_bit(req->tag, dd->port->cmds_to_issue);
blk_abort_request(req);
+ return true;
}
/*
@@ -2803,10 +2770,7 @@ restart_eh:
blk_mq_quiesce_queue(dd->queue);
- spin_lock(dd->queue->queue_lock);
- blk_mq_tagset_busy_iter(&dd->tags,
- mtip_queue_cmd, dd);
- spin_unlock(dd->queue->queue_lock);
+ blk_mq_tagset_busy_iter(&dd->tags, mtip_queue_cmd, dd);
set_bit(MTIP_PF_ISSUE_CMDS_BIT, &dd->port->flags);
@@ -3026,7 +2990,7 @@ static int mtip_hw_init(struct driver_data *dd)
else
dd->unal_qdepth = 0;
- sema_init(&dd->port->cmd_slot_unal, dd->unal_qdepth);
+ atomic_set(&dd->port->cmd_slot_unal, dd->unal_qdepth);
/* Spinlock to prevent concurrent issue */
for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++)
@@ -3531,58 +3495,24 @@ static inline bool is_se_active(struct driver_data *dd)
return false;
}
-/*
- * Block layer make request function.
- *
- * This function is called by the kernel to process a BIO for
- * the P320 device.
- *
- * @queue Pointer to the request queue. Unused other than to obtain
- * the driver data structure.
- * @rq Pointer to the request.
- *
- */
-static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
+static inline bool is_stopped(struct driver_data *dd, struct request *rq)
{
- struct driver_data *dd = hctx->queue->queuedata;
- struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
- unsigned int nents;
-
- if (is_se_active(dd))
- return -ENODATA;
-
- if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) {
- if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
- &dd->dd_flag))) {
- return -ENXIO;
- }
- if (unlikely(test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))) {
- return -ENODATA;
- }
- if (unlikely(test_bit(MTIP_DDF_WRITE_PROTECT_BIT,
- &dd->dd_flag) &&
- rq_data_dir(rq))) {
- return -ENODATA;
- }
- if (unlikely(test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag) ||
- test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)))
- return -ENODATA;
- }
-
- if (req_op(rq) == REQ_OP_DISCARD) {
- int err;
-
- err = mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq));
- blk_mq_end_request(rq, err ? BLK_STS_IOERR : BLK_STS_OK);
- return 0;
- }
+ if (likely(!(dd->dd_flag & MTIP_DDF_STOP_IO)))
+ return false;
- /* Create the scatter list for this request. */
- nents = blk_rq_map_sg(hctx->queue, rq, cmd->sg);
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))
+ return true;
+ if (test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))
+ return true;
+ if (test_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag) &&
+ rq_data_dir(rq))
+ return true;
+ if (test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))
+ return true;
+ if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag))
+ return true;
- /* Issue the read/write. */
- mtip_hw_submit_io(dd, rq, cmd, nents, hctx);
- return 0;
+ return false;
}
static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx,
@@ -3603,7 +3533,7 @@ static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx,
cmd->unaligned = 1;
}
- if (cmd->unaligned && down_trylock(&dd->port->cmd_slot_unal))
+ if (cmd->unaligned && atomic_dec_if_positive(&dd->port->cmd_slot_unal) >= 0)
return true;
return false;
@@ -3613,32 +3543,33 @@ static blk_status_t mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
struct driver_data *dd = hctx->queue->queuedata;
- struct mtip_int_cmd *icmd = rq->special;
struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+ struct mtip_int_cmd *icmd = cmd->icmd;
+ struct mtip_cmd_hdr *hdr =
+ dd->port->command_list + sizeof(struct mtip_cmd_hdr) * rq->tag;
struct mtip_cmd_sg *command_sg;
if (mtip_commands_active(dd->port))
- return BLK_STS_RESOURCE;
+ return BLK_STS_DEV_RESOURCE;
+ hdr->ctba = cpu_to_le32(cmd->command_dma & 0xFFFFFFFF);
+ if (test_bit(MTIP_PF_HOST_CAP_64, &dd->port->flags))
+ hdr->ctbau = cpu_to_le32((cmd->command_dma >> 16) >> 16);
/* Populate the SG list */
- cmd->command_header->opts =
- __force_bit2int cpu_to_le32(icmd->opts | icmd->fis_len);
+ hdr->opts = cpu_to_le32(icmd->opts | icmd->fis_len);
if (icmd->buf_len) {
command_sg = cmd->command + AHCI_CMD_TBL_HDR_SZ;
- command_sg->info =
- __force_bit2int cpu_to_le32((icmd->buf_len-1) & 0x3FFFFF);
- command_sg->dba =
- __force_bit2int cpu_to_le32(icmd->buffer & 0xFFFFFFFF);
+ command_sg->info = cpu_to_le32((icmd->buf_len-1) & 0x3FFFFF);
+ command_sg->dba = cpu_to_le32(icmd->buffer & 0xFFFFFFFF);
command_sg->dba_upper =
- __force_bit2int cpu_to_le32((icmd->buffer >> 16) >> 16);
+ cpu_to_le32((icmd->buffer >> 16) >> 16);
- cmd->command_header->opts |=
- __force_bit2int cpu_to_le32((1 << 16));
+ hdr->opts |= cpu_to_le32((1 << 16));
}
/* Populate the command header */
- cmd->command_header->byte_count = 0;
+ hdr->byte_count = 0;
blk_mq_start_request(rq);
mtip_issue_non_ncq_command(dd->port, rq->tag);
@@ -3648,23 +3579,25 @@ static blk_status_t mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx,
static blk_status_t mtip_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
+ struct driver_data *dd = hctx->queue->queuedata;
struct request *rq = bd->rq;
- int ret;
-
- mtip_init_cmd_header(rq);
+ struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
if (blk_rq_is_passthrough(rq))
return mtip_issue_reserved_cmd(hctx, rq);
if (unlikely(mtip_check_unal_depth(hctx, rq)))
- return BLK_STS_RESOURCE;
+ return BLK_STS_DEV_RESOURCE;
+
+ if (is_se_active(dd) || is_stopped(dd, rq))
+ return BLK_STS_IOERR;
blk_mq_start_request(rq);
- ret = mtip_submit_request(hctx, rq);
- if (likely(!ret))
- return BLK_STS_OK;
- return BLK_STS_IOERR;
+ if (req_op(rq) == REQ_OP_DISCARD)
+ return mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq));
+ mtip_hw_submit_io(dd, rq, cmd, hctx);
+ return BLK_STS_OK;
}
static void mtip_free_cmd(struct blk_mq_tag_set *set, struct request *rq,
@@ -3920,12 +3853,13 @@ protocol_init_error:
return rv;
}
-static void mtip_no_dev_cleanup(struct request *rq, void *data, bool reserv)
+static bool mtip_no_dev_cleanup(struct request *rq, void *data, bool reserv)
{
struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
cmd->status = BLK_STS_IOERR;
blk_mq_complete_request(rq);
+ return true;
}
/*
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index e20e55dab443..abce25f27f57 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -126,8 +126,6 @@
#define MTIP_DFS_MAX_BUF_SIZE 1024
-#define __force_bit2int (unsigned int __force)
-
enum {
/* below are bit numbers in 'flags' defined in mtip_port */
MTIP_PF_IC_ACTIVE_BIT = 0, /* pio/ioctl */
@@ -174,10 +172,10 @@ enum {
struct smart_attr {
u8 attr_id;
- u16 flags;
+ __le16 flags;
u8 cur;
u8 worst;
- u32 data;
+ __le32 data;
u8 res[3];
} __packed;
@@ -200,9 +198,9 @@ struct mtip_work {
#define MTIP_MAX_TRIM_ENTRY_LEN 0xfff8
struct mtip_trim_entry {
- u32 lba; /* starting lba of region */
- u16 rsvd; /* unused */
- u16 range; /* # of 512b blocks to trim */
+ __le32 lba; /* starting lba of region */
+ __le16 rsvd; /* unused */
+ __le16 range; /* # of 512b blocks to trim */
} __packed;
struct mtip_trim {
@@ -278,24 +276,24 @@ struct mtip_cmd_hdr {
* - Bit 5 Unused in this implementation.
* - Bits 4:0 Length of the command FIS in DWords (DWord = 4 bytes).
*/
- unsigned int opts;
+ __le32 opts;
/* This field is unsed when using NCQ. */
union {
- unsigned int byte_count;
- unsigned int status;
+ __le32 byte_count;
+ __le32 status;
};
/*
* Lower 32 bits of the command table address associated with this
* header. The command table addresses must be 128 byte aligned.
*/
- unsigned int ctba;
+ __le32 ctba;
/*
* If 64 bit addressing is used this field is the upper 32 bits
* of the command table address associated with this command.
*/
- unsigned int ctbau;
+ __le32 ctbau;
/* Reserved and unused. */
- unsigned int res[4];
+ u32 res[4];
};
/* Command scatter gather structure (PRD). */
@@ -305,31 +303,28 @@ struct mtip_cmd_sg {
* address must be 8 byte aligned signified by bits 2:0 being
* set to 0.
*/
- unsigned int dba;
+ __le32 dba;
/*
* When 64 bit addressing is used this field is the upper
* 32 bits of the data buffer address.
*/
- unsigned int dba_upper;
+ __le32 dba_upper;
/* Unused. */
- unsigned int reserved;
+ __le32 reserved;
/*
* Bit 31: interrupt when this data block has been transferred.
* Bits 30..22: reserved
* Bits 21..0: byte count (minus 1). For P320 the byte count must be
* 8 byte aligned signified by bits 2:0 being set to 1.
*/
- unsigned int info;
+ __le32 info;
};
struct mtip_port;
+struct mtip_int_cmd;
+
/* Structure used to describe a command. */
struct mtip_cmd {
-
- struct mtip_cmd_hdr *command_header; /* ptr to command header entry */
-
- dma_addr_t command_header_dma; /* corresponding physical address */
-
void *command; /* ptr to command table entry */
dma_addr_t command_dma; /* corresponding physical address */
@@ -338,7 +333,10 @@ struct mtip_cmd {
int unaligned; /* command is unaligned on 4k boundary */
- struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */
+ union {
+ struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */
+ struct mtip_int_cmd *icmd;
+ };
int retries; /* The number of retries left for this command. */
@@ -435,8 +433,8 @@ struct mtip_port {
*/
unsigned long ic_pause_timer;
- /* Semaphore to control queue depth of unaligned IOs */
- struct semaphore cmd_slot_unal;
+ /* Counter to control queue depth of unaligned IOs */
+ atomic_t cmd_slot_unal;
/* Spinlock for working around command-issue bug. */
spinlock_t cmd_issue_lock[MTIP_MAX_SLOT_GROUPS];
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 4d4d6129ff66..08696f5f00bb 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -734,12 +734,13 @@ static void recv_work(struct work_struct *work)
kfree(args);
}
-static void nbd_clear_req(struct request *req, void *data, bool reserved)
+static bool nbd_clear_req(struct request *req, void *data, bool reserved)
{
struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
cmd->status = BLK_STS_IOERR;
blk_mq_complete_request(req);
+ return true;
}
static void nbd_clear_que(struct nbd_device *nbd)
diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h
index 7685df43f1ef..b3df2793e7cd 100644
--- a/drivers/block/null_blk.h
+++ b/drivers/block/null_blk.h
@@ -49,6 +49,7 @@ struct nullb_device {
unsigned long completion_nsec; /* time in ns to complete a request */
unsigned long cache_size; /* disk cache size in MB */
unsigned long zone_size; /* zone size in MB if device is zoned */
+ unsigned int zone_nr_conv; /* number of conventional zones */
unsigned int submit_queues; /* number of submission queues */
unsigned int home_node; /* home node for the device */
unsigned int queue_mode; /* block interface */
diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c
index 09339203dfba..62c9654b9ce8 100644
--- a/drivers/block/null_blk_main.c
+++ b/drivers/block/null_blk_main.c
@@ -188,6 +188,10 @@ static unsigned long g_zone_size = 256;
module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256");
+static unsigned int g_zone_nr_conv;
+module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444);
+MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0");
+
static struct nullb_device *null_alloc_dev(void);
static void null_free_dev(struct nullb_device *dev);
static void null_del_dev(struct nullb *nullb);
@@ -293,6 +297,7 @@ NULLB_DEVICE_ATTR(mbps, uint);
NULLB_DEVICE_ATTR(cache_size, ulong);
NULLB_DEVICE_ATTR(zoned, bool);
NULLB_DEVICE_ATTR(zone_size, ulong);
+NULLB_DEVICE_ATTR(zone_nr_conv, uint);
static ssize_t nullb_device_power_show(struct config_item *item, char *page)
{
@@ -407,6 +412,7 @@ static struct configfs_attribute *nullb_device_attrs[] = {
&nullb_device_attr_badblocks,
&nullb_device_attr_zoned,
&nullb_device_attr_zone_size,
+ &nullb_device_attr_zone_nr_conv,
NULL,
};
@@ -520,6 +526,7 @@ static struct nullb_device *null_alloc_dev(void)
dev->use_per_node_hctx = g_use_per_node_hctx;
dev->zoned = g_zoned;
dev->zone_size = g_zone_size;
+ dev->zone_nr_conv = g_zone_nr_conv;
return dev;
}
@@ -635,14 +642,9 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd)
hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL);
}
-static void null_softirq_done_fn(struct request *rq)
+static void null_complete_rq(struct request *rq)
{
- struct nullb *nullb = rq->q->queuedata;
-
- if (nullb->dev->queue_mode == NULL_Q_MQ)
- end_cmd(blk_mq_rq_to_pdu(rq));
- else
- end_cmd(rq->special);
+ end_cmd(blk_mq_rq_to_pdu(rq));
}
static struct nullb_page *null_alloc_page(gfp_t gfp_flags)
@@ -1350,7 +1352,7 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
static const struct blk_mq_ops null_mq_ops = {
.queue_rq = null_queue_rq,
- .complete = null_softirq_done_fn,
+ .complete = null_complete_rq,
.timeout = null_timeout_rq,
};
@@ -1657,8 +1659,7 @@ static int null_add_dev(struct nullb_device *dev)
}
null_init_queues(nullb);
} else if (dev->queue_mode == NULL_Q_BIO) {
- nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node,
- NULL);
+ nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node);
if (!nullb->q) {
rv = -ENOMEM;
goto out_cleanup_queues;
diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c
index c0b0e4a3fa8f..5d1c261a2cfd 100644
--- a/drivers/block/null_blk_zoned.c
+++ b/drivers/block/null_blk_zoned.c
@@ -29,7 +29,25 @@ int null_zone_init(struct nullb_device *dev)
if (!dev->zones)
return -ENOMEM;
- for (i = 0; i < dev->nr_zones; i++) {
+ if (dev->zone_nr_conv >= dev->nr_zones) {
+ dev->zone_nr_conv = dev->nr_zones - 1;
+ pr_info("null_blk: changed the number of conventional zones to %u",
+ dev->zone_nr_conv);
+ }
+
+ for (i = 0; i < dev->zone_nr_conv; i++) {
+ struct blk_zone *zone = &dev->zones[i];
+
+ zone->start = sector;
+ zone->len = dev->zone_size_sects;
+ zone->wp = zone->start + zone->len;
+ zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
+ zone->cond = BLK_ZONE_COND_NOT_WP;
+
+ sector += dev->zone_size_sects;
+ }
+
+ for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
struct blk_zone *zone = &dev->zones[i];
zone->start = zone->wp = sector;
@@ -98,6 +116,8 @@ void null_zone_write(struct nullb_cmd *cmd, sector_t sector,
if (zone->wp == zone->start + zone->len)
zone->cond = BLK_ZONE_COND_FULL;
break;
+ case BLK_ZONE_COND_NOT_WP:
+ break;
default:
/* Invalid zone condition */
cmd->error = BLK_STS_IOERR;
@@ -111,6 +131,11 @@ void null_zone_reset(struct nullb_cmd *cmd, sector_t sector)
unsigned int zno = null_zone_no(dev, sector);
struct blk_zone *zone = &dev->zones[zno];
+ if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) {
+ cmd->error = BLK_STS_IOERR;
+ return;
+ }
+
zone->cond = BLK_ZONE_COND_EMPTY;
zone->wp = zone->start;
}
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index ae4971e5d9a8..0ff9b12d0e35 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -242,6 +242,11 @@ struct pd_unit {
static struct pd_unit pd[PD_UNITS];
+struct pd_req {
+ /* for REQ_OP_DRV_IN: */
+ enum action (*func)(struct pd_unit *disk);
+};
+
static char pd_scratch[512]; /* scratch block buffer */
static char *pd_errs[17] = { "ERR", "INDEX", "ECC", "DRQ", "SEEK", "WRERR",
@@ -502,8 +507,9 @@ static enum action do_pd_io_start(void)
static enum action pd_special(void)
{
- enum action (*func)(struct pd_unit *) = pd_req->special;
- return func(pd_current);
+ struct pd_req *req = blk_mq_rq_to_pdu(pd_req);
+
+ return req->func(pd_current);
}
static int pd_next_buf(void)
@@ -767,12 +773,14 @@ static int pd_special_command(struct pd_unit *disk,
enum action (*func)(struct pd_unit *disk))
{
struct request *rq;
+ struct pd_req *req;
rq = blk_get_request(disk->gd->queue, REQ_OP_DRV_IN, 0);
if (IS_ERR(rq))
return PTR_ERR(rq);
+ req = blk_mq_rq_to_pdu(rq);
- rq->special = func;
+ req->func = func;
blk_execute_rq(disk->gd->queue, disk->gd, rq, 0);
blk_put_request(rq);
return 0;
@@ -892,9 +900,21 @@ static void pd_probe_drive(struct pd_unit *disk)
disk->gd = p;
p->private_data = disk;
- p->queue = blk_mq_init_sq_queue(&disk->tag_set, &pd_mq_ops, 2,
- BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING);
+ memset(&disk->tag_set, 0, sizeof(disk->tag_set));
+ disk->tag_set.ops = &pd_mq_ops;
+ disk->tag_set.cmd_size = sizeof(struct pd_req);
+ disk->tag_set.nr_hw_queues = 1;
+ disk->tag_set.nr_maps = 1;
+ disk->tag_set.queue_depth = 2;
+ disk->tag_set.numa_node = NUMA_NO_NODE;
+ disk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
+
+ if (blk_mq_alloc_tag_set(&disk->tag_set))
+ return;
+
+ p->queue = blk_mq_init_queue(&disk->tag_set);
if (IS_ERR(p->queue)) {
+ blk_mq_free_tag_set(&disk->tag_set);
p->queue = NULL;
return;
}
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 9381f4e3b221..f5a71023f76c 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2203,9 +2203,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
* Some CDRW drives can not handle writes larger than one packet,
* even if the size is a multiple of the packet size.
*/
- spin_lock_irq(q->queue_lock);
blk_queue_max_hw_sectors(q, pd->settings.size);
- spin_unlock_irq(q->queue_lock);
set_bit(PACKET_WRITABLE, &pd->flags);
} else {
pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 2459dcc04b1c..a10d5736d8f7 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -181,6 +181,7 @@ struct skd_request_context {
struct fit_completion_entry_v1 completion;
struct fit_comp_error_info err_info;
+ int retries;
blk_status_t status;
};
@@ -382,11 +383,12 @@ static void skd_log_skreq(struct skd_device *skdev,
* READ/WRITE REQUESTS
*****************************************************************************
*/
-static void skd_inc_in_flight(struct request *rq, void *data, bool reserved)
+static bool skd_inc_in_flight(struct request *rq, void *data, bool reserved)
{
int *count = data;
count++;
+ return true;
}
static int skd_in_flight(struct skd_device *skdev)
@@ -494,6 +496,11 @@ static blk_status_t skd_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
if (unlikely(skdev->state != SKD_DRVR_STATE_ONLINE))
return skd_fail_all(q) ? BLK_STS_IOERR : BLK_STS_RESOURCE;
+ if (!(req->rq_flags & RQF_DONTPREP)) {
+ skreq->retries = 0;
+ req->rq_flags |= RQF_DONTPREP;
+ }
+
blk_mq_start_request(req);
WARN_ONCE(tag >= skd_max_queue_depth, "%#x > %#x (nr_requests = %lu)\n",
@@ -1425,7 +1432,7 @@ static void skd_resolve_req_exception(struct skd_device *skdev,
break;
case SKD_CHECK_STATUS_REQUEUE_REQUEST:
- if ((unsigned long) ++req->special < SKD_MAX_RETRIES) {
+ if (++skreq->retries < SKD_MAX_RETRIES) {
skd_log_skreq(skdev, skreq, "retry");
blk_mq_requeue_request(req, true);
break;
@@ -1887,13 +1894,13 @@ static void skd_isr_fwstate(struct skd_device *skdev)
skd_skdev_state_to_str(skdev->state), skdev->state);
}
-static void skd_recover_request(struct request *req, void *data, bool reserved)
+static bool skd_recover_request(struct request *req, void *data, bool reserved)
{
struct skd_device *const skdev = data;
struct skd_request_context *skreq = blk_mq_rq_to_pdu(req);
if (skreq->state != SKD_REQ_STATE_BUSY)
- return;
+ return true;
skd_log_skreq(skdev, skreq, "recover");
@@ -1904,6 +1911,7 @@ static void skd_recover_request(struct request *req, void *data, bool reserved)
skreq->state = SKD_REQ_STATE_IDLE;
skreq->status = BLK_STS_IOERR;
blk_mq_complete_request(req);
+ return true;
}
static void skd_recover_requests(struct skd_device *skdev)
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index b54fa6726303..9c0553dd13e7 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -6,7 +6,7 @@
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/types.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
#include <linux/hdreg.h>
#include <linux/genhd.h>
#include <linux/cdrom.h>
@@ -45,6 +45,8 @@ MODULE_VERSION(DRV_MODULE_VERSION);
#define WAITING_FOR_GEN_CMD 0x04
#define WAITING_FOR_ANY -1
+#define VDC_MAX_RETRIES 10
+
static struct workqueue_struct *sunvdc_wq;
struct vdc_req_entry {
@@ -66,9 +68,10 @@ struct vdc_port {
u64 max_xfer_size;
u32 vdisk_block_size;
+ u32 drain;
u64 ldc_timeout;
- struct timer_list ldc_reset_timer;
+ struct delayed_work ldc_reset_timer_work;
struct work_struct ldc_reset_work;
/* The server fills these in for us in the disk attribute
@@ -80,12 +83,14 @@ struct vdc_port {
u8 vdisk_mtype;
u32 vdisk_phys_blksz;
+ struct blk_mq_tag_set tag_set;
+
char disk_name[32];
};
static void vdc_ldc_reset(struct vdc_port *port);
static void vdc_ldc_reset_work(struct work_struct *work);
-static void vdc_ldc_reset_timer(struct timer_list *t);
+static void vdc_ldc_reset_timer_work(struct work_struct *work);
static inline struct vdc_port *to_vdc_port(struct vio_driver_state *vio)
{
@@ -175,11 +180,8 @@ static void vdc_blk_queue_start(struct vdc_port *port)
* handshake completes, so check for initial handshake before we've
* allocated a disk.
*/
- if (port->disk && blk_queue_stopped(port->disk->queue) &&
- vdc_tx_dring_avail(dr) * 100 / VDC_TX_RING_SIZE >= 50) {
- blk_start_queue(port->disk->queue);
- }
-
+ if (port->disk && vdc_tx_dring_avail(dr) * 100 / VDC_TX_RING_SIZE >= 50)
+ blk_mq_start_hw_queues(port->disk->queue);
}
static void vdc_finish(struct vio_driver_state *vio, int err, int waiting_for)
@@ -197,7 +199,7 @@ static void vdc_handshake_complete(struct vio_driver_state *vio)
{
struct vdc_port *port = to_vdc_port(vio);
- del_timer(&port->ldc_reset_timer);
+ cancel_delayed_work(&port->ldc_reset_timer_work);
vdc_finish(vio, 0, WAITING_FOR_LINK_UP);
vdc_blk_queue_start(port);
}
@@ -320,7 +322,7 @@ static void vdc_end_one(struct vdc_port *port, struct vio_dring_state *dr,
rqe->req = NULL;
- __blk_end_request(req, (desc->status ? BLK_STS_IOERR : 0), desc->size);
+ blk_mq_end_request(req, desc->status ? BLK_STS_IOERR : 0);
vdc_blk_queue_start(port);
}
@@ -431,6 +433,7 @@ static int __vdc_tx_trigger(struct vdc_port *port)
.end_idx = dr->prod,
};
int err, delay;
+ int retries = 0;
hdr.seq = dr->snd_nxt;
delay = 1;
@@ -443,6 +446,8 @@ static int __vdc_tx_trigger(struct vdc_port *port)
udelay(delay);
if ((delay <<= 1) > 128)
delay = 128;
+ if (retries++ > VDC_MAX_RETRIES)
+ break;
} while (err == -EAGAIN);
if (err == -ENOTCONN)
@@ -525,29 +530,40 @@ static int __send_request(struct request *req)
return err;
}
-static void do_vdc_request(struct request_queue *rq)
+static blk_status_t vdc_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
{
- struct request *req;
+ struct vdc_port *port = hctx->queue->queuedata;
+ struct vio_dring_state *dr;
+ unsigned long flags;
- while ((req = blk_peek_request(rq)) != NULL) {
- struct vdc_port *port;
- struct vio_dring_state *dr;
+ dr = &port->vio.drings[VIO_DRIVER_TX_RING];
- port = req->rq_disk->private_data;
- dr = &port->vio.drings[VIO_DRIVER_TX_RING];
- if (unlikely(vdc_tx_dring_avail(dr) < 1))
- goto wait;
+ blk_mq_start_request(bd->rq);
- blk_start_request(req);
+ spin_lock_irqsave(&port->vio.lock, flags);
- if (__send_request(req) < 0) {
- blk_requeue_request(rq, req);
-wait:
- /* Avoid pointless unplugs. */
- blk_stop_queue(rq);
- break;
- }
+ /*
+ * Doing drain, just end the request in error
+ */
+ if (unlikely(port->drain)) {
+ spin_unlock_irqrestore(&port->vio.lock, flags);
+ return BLK_STS_IOERR;
+ }
+
+ if (unlikely(vdc_tx_dring_avail(dr) < 1)) {
+ spin_unlock_irqrestore(&port->vio.lock, flags);
+ blk_mq_stop_hw_queue(hctx);
+ return BLK_STS_DEV_RESOURCE;
+ }
+
+ if (__send_request(bd->rq) < 0) {
+ spin_unlock_irqrestore(&port->vio.lock, flags);
+ return BLK_STS_IOERR;
}
+
+ spin_unlock_irqrestore(&port->vio.lock, flags);
+ return BLK_STS_OK;
}
static int generic_request(struct vdc_port *port, u8 op, void *buf, int len)
@@ -759,6 +775,31 @@ static void vdc_port_down(struct vdc_port *port)
vio_ldc_free(&port->vio);
}
+static const struct blk_mq_ops vdc_mq_ops = {
+ .queue_rq = vdc_queue_rq,
+};
+
+static void cleanup_queue(struct request_queue *q)
+{
+ struct vdc_port *port = q->queuedata;
+
+ blk_cleanup_queue(q);
+ blk_mq_free_tag_set(&port->tag_set);
+}
+
+static struct request_queue *init_queue(struct vdc_port *port)
+{
+ struct request_queue *q;
+
+ q = blk_mq_init_sq_queue(&port->tag_set, &vdc_mq_ops, VDC_TX_RING_SIZE,
+ BLK_MQ_F_SHOULD_MERGE);
+ if (IS_ERR(q))
+ return q;
+
+ q->queuedata = port;
+ return q;
+}
+
static int probe_disk(struct vdc_port *port)
{
struct request_queue *q;
@@ -796,17 +837,17 @@ static int probe_disk(struct vdc_port *port)
(u64)geom.num_sec);
}
- q = blk_init_queue(do_vdc_request, &port->vio.lock);
- if (!q) {
+ q = init_queue(port);
+ if (IS_ERR(q)) {
printk(KERN_ERR PFX "%s: Could not allocate queue.\n",
port->vio.name);
- return -ENOMEM;
+ return PTR_ERR(q);
}
g = alloc_disk(1 << PARTITION_SHIFT);
if (!g) {
printk(KERN_ERR PFX "%s: Could not allocate gendisk.\n",
port->vio.name);
- blk_cleanup_queue(q);
+ cleanup_queue(q);
return -ENOMEM;
}
@@ -981,7 +1022,7 @@ static int vdc_port_probe(struct vio_dev *vdev, const struct vio_device_id *id)
*/
ldc_timeout = mdesc_get_property(hp, vdev->mp, "vdc-timeout", NULL);
port->ldc_timeout = ldc_timeout ? *ldc_timeout : 0;
- timer_setup(&port->ldc_reset_timer, vdc_ldc_reset_timer, 0);
+ INIT_DELAYED_WORK(&port->ldc_reset_timer_work, vdc_ldc_reset_timer_work);
INIT_WORK(&port->ldc_reset_work, vdc_ldc_reset_work);
err = vio_driver_init(&port->vio, vdev, VDEV_DISK,
@@ -1034,18 +1075,14 @@ static int vdc_port_remove(struct vio_dev *vdev)
struct vdc_port *port = dev_get_drvdata(&vdev->dev);
if (port) {
- unsigned long flags;
-
- spin_lock_irqsave(&port->vio.lock, flags);
- blk_stop_queue(port->disk->queue);
- spin_unlock_irqrestore(&port->vio.lock, flags);
+ blk_mq_stop_hw_queues(port->disk->queue);
flush_work(&port->ldc_reset_work);
- del_timer_sync(&port->ldc_reset_timer);
+ cancel_delayed_work_sync(&port->ldc_reset_timer_work);
del_timer_sync(&port->vio.timer);
del_gendisk(port->disk);
- blk_cleanup_queue(port->disk->queue);
+ cleanup_queue(port->disk->queue);
put_disk(port->disk);
port->disk = NULL;
@@ -1080,32 +1117,46 @@ static void vdc_requeue_inflight(struct vdc_port *port)
}
rqe->req = NULL;
- blk_requeue_request(port->disk->queue, req);
+ blk_mq_requeue_request(req, false);
}
}
static void vdc_queue_drain(struct vdc_port *port)
{
- struct request *req;
+ struct request_queue *q = port->disk->queue;
+
+ /*
+ * Mark the queue as draining, then freeze/quiesce to ensure
+ * that all existing requests are seen in ->queue_rq() and killed
+ */
+ port->drain = 1;
+ spin_unlock_irq(&port->vio.lock);
- while ((req = blk_fetch_request(port->disk->queue)) != NULL)
- __blk_end_request_all(req, BLK_STS_IOERR);
+ blk_mq_freeze_queue(q);
+ blk_mq_quiesce_queue(q);
+
+ spin_lock_irq(&port->vio.lock);
+ port->drain = 0;
+ blk_mq_unquiesce_queue(q);
+ blk_mq_unfreeze_queue(q);
}
-static void vdc_ldc_reset_timer(struct timer_list *t)
+static void vdc_ldc_reset_timer_work(struct work_struct *work)
{
- struct vdc_port *port = from_timer(port, t, ldc_reset_timer);
- struct vio_driver_state *vio = &port->vio;
- unsigned long flags;
+ struct vdc_port *port;
+ struct vio_driver_state *vio;
- spin_lock_irqsave(&vio->lock, flags);
+ port = container_of(work, struct vdc_port, ldc_reset_timer_work.work);
+ vio = &port->vio;
+
+ spin_lock_irq(&vio->lock);
if (!(port->vio.hs_state & VIO_HS_COMPLETE)) {
pr_warn(PFX "%s ldc down %llu seconds, draining queue\n",
port->disk_name, port->ldc_timeout);
vdc_queue_drain(port);
vdc_blk_queue_start(port);
}
- spin_unlock_irqrestore(&vio->lock, flags);
+ spin_unlock_irq(&vio->lock);
}
static void vdc_ldc_reset_work(struct work_struct *work)
@@ -1129,7 +1180,7 @@ static void vdc_ldc_reset(struct vdc_port *port)
assert_spin_locked(&port->vio.lock);
pr_warn(PFX "%s ldc link reset\n", port->disk_name);
- blk_stop_queue(port->disk->queue);
+ blk_mq_stop_hw_queues(port->disk->queue);
vdc_requeue_inflight(port);
vdc_port_down(port);
@@ -1146,7 +1197,7 @@ static void vdc_ldc_reset(struct vdc_port *port)
}
if (port->ldc_timeout)
- mod_timer(&port->ldc_reset_timer,
+ mod_delayed_work(system_wq, &port->ldc_reset_timer_work,
round_jiffies(jiffies + HZ * port->ldc_timeout));
mod_timer(&port->vio.timer, round_jiffies(jiffies + HZ));
return;
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 064b8c5c7a32..4478eb7efee0 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -243,7 +243,6 @@ struct carm_port {
unsigned int port_no;
struct gendisk *disk;
struct carm_host *host;
- struct blk_mq_tag_set tag_set;
/* attached device characteristics */
u64 capacity;
@@ -254,13 +253,10 @@ struct carm_port {
};
struct carm_request {
- unsigned int tag;
int n_elem;
unsigned int msg_type;
unsigned int msg_subtype;
unsigned int msg_bucket;
- struct request *rq;
- struct carm_port *port;
struct scatterlist sg[CARM_MAX_REQ_SG];
};
@@ -291,9 +287,6 @@ struct carm_host {
unsigned int wait_q_cons;
struct request_queue *wait_q[CARM_MAX_WAIT_Q];
- unsigned int n_msgs;
- u64 msg_alloc;
- struct carm_request req[CARM_MAX_REQ];
void *msg_base;
dma_addr_t msg_dma;
@@ -478,10 +471,10 @@ static inline dma_addr_t carm_ref_msg_dma(struct carm_host *host,
}
static int carm_send_msg(struct carm_host *host,
- struct carm_request *crq)
+ struct carm_request *crq, unsigned tag)
{
void __iomem *mmio = host->mmio;
- u32 msg = (u32) carm_ref_msg_dma(host, crq->tag);
+ u32 msg = (u32) carm_ref_msg_dma(host, tag);
u32 cm_bucket = crq->msg_bucket;
u32 tmp;
int rc = 0;
@@ -506,99 +499,24 @@ static int carm_send_msg(struct carm_host *host,
return rc;
}
-static struct carm_request *carm_get_request(struct carm_host *host)
-{
- unsigned int i;
-
- /* obey global hardware limit on S/G entries */
- if (host->hw_sg_used >= (CARM_MAX_HOST_SG - CARM_MAX_REQ_SG))
- return NULL;
-
- for (i = 0; i < max_queue; i++)
- if ((host->msg_alloc & (1ULL << i)) == 0) {
- struct carm_request *crq = &host->req[i];
- crq->port = NULL;
- crq->n_elem = 0;
-
- host->msg_alloc |= (1ULL << i);
- host->n_msgs++;
-
- assert(host->n_msgs <= CARM_MAX_REQ);
- sg_init_table(crq->sg, CARM_MAX_REQ_SG);
- return crq;
- }
-
- DPRINTK("no request available, returning NULL\n");
- return NULL;
-}
-
-static int carm_put_request(struct carm_host *host, struct carm_request *crq)
-{
- assert(crq->tag < max_queue);
-
- if (unlikely((host->msg_alloc & (1ULL << crq->tag)) == 0))
- return -EINVAL; /* tried to clear a tag that was not active */
-
- assert(host->hw_sg_used >= crq->n_elem);
-
- host->msg_alloc &= ~(1ULL << crq->tag);
- host->hw_sg_used -= crq->n_elem;
- host->n_msgs--;
-
- return 0;
-}
-
-static struct carm_request *carm_get_special(struct carm_host *host)
-{
- unsigned long flags;
- struct carm_request *crq = NULL;
- struct request *rq;
- int tries = 5000;
-
- while (tries-- > 0) {
- spin_lock_irqsave(&host->lock, flags);
- crq = carm_get_request(host);
- spin_unlock_irqrestore(&host->lock, flags);
-
- if (crq)
- break;
- msleep(10);
- }
-
- if (!crq)
- return NULL;
-
- rq = blk_get_request(host->oob_q, REQ_OP_DRV_OUT, 0);
- if (IS_ERR(rq)) {
- spin_lock_irqsave(&host->lock, flags);
- carm_put_request(host, crq);
- spin_unlock_irqrestore(&host->lock, flags);
- return NULL;
- }
-
- crq->rq = rq;
- return crq;
-}
-
static int carm_array_info (struct carm_host *host, unsigned int array_idx)
{
struct carm_msg_ioctl *ioc;
- unsigned int idx;
u32 msg_data;
dma_addr_t msg_dma;
struct carm_request *crq;
+ struct request *rq;
int rc;
- crq = carm_get_special(host);
- if (!crq) {
+ rq = blk_mq_alloc_request(host->oob_q, REQ_OP_DRV_OUT, 0);
+ if (IS_ERR(rq)) {
rc = -ENOMEM;
goto err_out;
}
+ crq = blk_mq_rq_to_pdu(rq);
- idx = crq->tag;
-
- ioc = carm_ref_msg(host, idx);
- msg_dma = carm_ref_msg_dma(host, idx);
+ ioc = carm_ref_msg(host, rq->tag);
+ msg_dma = carm_ref_msg_dma(host, rq->tag);
msg_data = (u32) (msg_dma + sizeof(struct carm_array_info));
crq->msg_type = CARM_MSG_ARRAY;
@@ -612,7 +530,7 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx)
ioc->type = CARM_MSG_ARRAY;
ioc->subtype = CARM_ARRAY_INFO;
ioc->array_id = (u8) array_idx;
- ioc->handle = cpu_to_le32(TAG_ENCODE(idx));
+ ioc->handle = cpu_to_le32(TAG_ENCODE(rq->tag));
ioc->data_addr = cpu_to_le32(msg_data);
spin_lock_irq(&host->lock);
@@ -620,9 +538,8 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx)
host->state == HST_DEV_SCAN);
spin_unlock_irq(&host->lock);
- DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
- crq->rq->special = crq;
- blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
+ DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag);
+ blk_execute_rq_nowait(host->oob_q, NULL, rq, true, NULL);
return 0;
@@ -637,21 +554,21 @@ typedef unsigned int (*carm_sspc_t)(struct carm_host *, unsigned int, void *);
static int carm_send_special (struct carm_host *host, carm_sspc_t func)
{
+ struct request *rq;
struct carm_request *crq;
struct carm_msg_ioctl *ioc;
void *mem;
- unsigned int idx, msg_size;
+ unsigned int msg_size;
int rc;
- crq = carm_get_special(host);
- if (!crq)
+ rq = blk_mq_alloc_request(host->oob_q, REQ_OP_DRV_OUT, 0);
+ if (IS_ERR(rq))
return -ENOMEM;
+ crq = blk_mq_rq_to_pdu(rq);
- idx = crq->tag;
+ mem = carm_ref_msg(host, rq->tag);
- mem = carm_ref_msg(host, idx);
-
- msg_size = func(host, idx, mem);
+ msg_size = func(host, rq->tag, mem);
ioc = mem;
crq->msg_type = ioc->type;
@@ -660,9 +577,8 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func)
BUG_ON(rc < 0);
crq->msg_bucket = (u32) rc;
- DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
- crq->rq->special = crq;
- blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
+ DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag);
+ blk_execute_rq_nowait(host->oob_q, NULL, rq, true, NULL);
return 0;
}
@@ -744,19 +660,6 @@ static unsigned int carm_fill_get_fw_ver(struct carm_host *host,
sizeof(struct carm_fw_ver);
}
-static inline void carm_end_request_queued(struct carm_host *host,
- struct carm_request *crq,
- blk_status_t error)
-{
- struct request *req = crq->rq;
- int rc;
-
- blk_mq_end_request(req, error);
-
- rc = carm_put_request(host, crq);
- assert(rc == 0);
-}
-
static inline void carm_push_q (struct carm_host *host, struct request_queue *q)
{
unsigned int idx = host->wait_q_prod % CARM_MAX_WAIT_Q;
@@ -791,101 +694,50 @@ static inline void carm_round_robin(struct carm_host *host)
}
}
-static inline void carm_end_rq(struct carm_host *host, struct carm_request *crq,
- blk_status_t error)
-{
- carm_end_request_queued(host, crq, error);
- if (max_queue == 1)
- carm_round_robin(host);
- else if ((host->n_msgs <= CARM_MSG_LOW_WATER) &&
- (host->hw_sg_used <= CARM_SG_LOW_WATER)) {
- carm_round_robin(host);
- }
-}
-
-static blk_status_t carm_oob_queue_rq(struct blk_mq_hw_ctx *hctx,
- const struct blk_mq_queue_data *bd)
+static inline enum dma_data_direction carm_rq_dir(struct request *rq)
{
- struct request_queue *q = hctx->queue;
- struct carm_host *host = q->queuedata;
- struct carm_request *crq;
- int rc;
-
- blk_mq_start_request(bd->rq);
-
- spin_lock_irq(&host->lock);
-
- crq = bd->rq->special;
- assert(crq != NULL);
- assert(crq->rq == bd->rq);
-
- crq->n_elem = 0;
-
- DPRINTK("send req\n");
- rc = carm_send_msg(host, crq);
- if (rc) {
- carm_push_q(host, q);
- spin_unlock_irq(&host->lock);
- return BLK_STS_DEV_RESOURCE;
- }
-
- spin_unlock_irq(&host->lock);
- return BLK_STS_OK;
+ return op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
}
static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
struct request_queue *q = hctx->queue;
+ struct request *rq = bd->rq;
struct carm_port *port = q->queuedata;
struct carm_host *host = port->host;
+ struct carm_request *crq = blk_mq_rq_to_pdu(rq);
struct carm_msg_rw *msg;
- struct carm_request *crq;
- struct request *rq = bd->rq;
struct scatterlist *sg;
- int writing = 0, pci_dir, i, n_elem, rc;
- u32 tmp;
+ int i, n_elem = 0, rc;
unsigned int msg_size;
+ u32 tmp;
+
+ crq->n_elem = 0;
+ sg_init_table(crq->sg, CARM_MAX_REQ_SG);
blk_mq_start_request(rq);
spin_lock_irq(&host->lock);
-
- crq = carm_get_request(host);
- if (!crq) {
- carm_push_q(host, q);
- spin_unlock_irq(&host->lock);
- return BLK_STS_DEV_RESOURCE;
- }
- crq->rq = rq;
-
- if (rq_data_dir(rq) == WRITE) {
- writing = 1;
- pci_dir = DMA_TO_DEVICE;
- } else {
- pci_dir = DMA_FROM_DEVICE;
- }
+ if (req_op(rq) == REQ_OP_DRV_OUT)
+ goto send_msg;
/* get scatterlist from block layer */
sg = &crq->sg[0];
n_elem = blk_rq_map_sg(q, rq, sg);
- if (n_elem <= 0) {
- /* request with no s/g entries? */
- carm_end_rq(host, crq, BLK_STS_IOERR);
- spin_unlock_irq(&host->lock);
- return BLK_STS_IOERR;
- }
+ if (n_elem <= 0)
+ goto out_ioerr;
/* map scatterlist to PCI bus addresses */
- n_elem = dma_map_sg(&host->pdev->dev, sg, n_elem, pci_dir);
- if (n_elem <= 0) {
- /* request with no s/g entries? */
- carm_end_rq(host, crq, BLK_STS_IOERR);
- spin_unlock_irq(&host->lock);
- return BLK_STS_IOERR;
- }
+ n_elem = dma_map_sg(&host->pdev->dev, sg, n_elem, carm_rq_dir(rq));
+ if (n_elem <= 0)
+ goto out_ioerr;
+
+ /* obey global hardware limit on S/G entries */
+ if (host->hw_sg_used >= CARM_MAX_HOST_SG - n_elem)
+ goto out_resource;
+
crq->n_elem = n_elem;
- crq->port = port;
host->hw_sg_used += n_elem;
/*
@@ -893,9 +745,9 @@ static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx,
*/
VPRINTK("build msg\n");
- msg = (struct carm_msg_rw *) carm_ref_msg(host, crq->tag);
+ msg = (struct carm_msg_rw *) carm_ref_msg(host, rq->tag);
- if (writing) {
+ if (rq_data_dir(rq) == WRITE) {
msg->type = CARM_MSG_WRITE;
crq->msg_type = CARM_MSG_WRITE;
} else {
@@ -906,7 +758,7 @@ static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx,
msg->id = port->port_no;
msg->sg_count = n_elem;
msg->sg_type = SGT_32BIT;
- msg->handle = cpu_to_le32(TAG_ENCODE(crq->tag));
+ msg->handle = cpu_to_le32(TAG_ENCODE(rq->tag));
msg->lba = cpu_to_le32(blk_rq_pos(rq) & 0xffffffff);
tmp = (blk_rq_pos(rq) >> 16) >> 16;
msg->lba_high = cpu_to_le16( (u16) tmp );
@@ -923,22 +775,28 @@ static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx,
rc = carm_lookup_bucket(msg_size);
BUG_ON(rc < 0);
crq->msg_bucket = (u32) rc;
-
+send_msg:
/*
* queue read/write message to hardware
*/
-
- VPRINTK("send msg, tag == %u\n", crq->tag);
- rc = carm_send_msg(host, crq);
+ VPRINTK("send msg, tag == %u\n", rq->tag);
+ rc = carm_send_msg(host, crq, rq->tag);
if (rc) {
- carm_put_request(host, crq);
- carm_push_q(host, q);
- spin_unlock_irq(&host->lock);
- return BLK_STS_DEV_RESOURCE;
+ host->hw_sg_used -= n_elem;
+ goto out_resource;
}
spin_unlock_irq(&host->lock);
return BLK_STS_OK;
+out_resource:
+ dma_unmap_sg(&host->pdev->dev, &crq->sg[0], n_elem, carm_rq_dir(rq));
+ carm_push_q(host, q);
+ spin_unlock_irq(&host->lock);
+ return BLK_STS_DEV_RESOURCE;
+out_ioerr:
+ carm_round_robin(host);
+ spin_unlock_irq(&host->lock);
+ return BLK_STS_IOERR;
}
static void carm_handle_array_info(struct carm_host *host,
@@ -954,8 +812,6 @@ static void carm_handle_array_info(struct carm_host *host,
DPRINTK("ENTER\n");
- carm_end_rq(host, crq, error);
-
if (error)
goto out;
if (le32_to_cpu(desc->array_status) & ARRAY_NO_EXIST)
@@ -1011,8 +867,6 @@ static void carm_handle_scan_chan(struct carm_host *host,
DPRINTK("ENTER\n");
- carm_end_rq(host, crq, error);
-
if (error) {
new_state = HST_ERROR;
goto out;
@@ -1040,8 +894,6 @@ static void carm_handle_generic(struct carm_host *host,
{
DPRINTK("ENTER\n");
- carm_end_rq(host, crq, error);
-
assert(host->state == cur_state);
if (error)
host->state = HST_ERROR;
@@ -1050,28 +902,12 @@ static void carm_handle_generic(struct carm_host *host,
schedule_work(&host->fsm_task);
}
-static inline void carm_handle_rw(struct carm_host *host,
- struct carm_request *crq, blk_status_t error)
-{
- int pci_dir;
-
- VPRINTK("ENTER\n");
-
- if (rq_data_dir(crq->rq) == WRITE)
- pci_dir = DMA_TO_DEVICE;
- else
- pci_dir = DMA_FROM_DEVICE;
-
- dma_unmap_sg(&host->pdev->dev, &crq->sg[0], crq->n_elem, pci_dir);
-
- carm_end_rq(host, crq, error);
-}
-
static inline void carm_handle_resp(struct carm_host *host,
__le32 ret_handle_le, u32 status)
{
u32 handle = le32_to_cpu(ret_handle_le);
unsigned int msg_idx;
+ struct request *rq;
struct carm_request *crq;
blk_status_t error = (status == RMSG_OK) ? 0 : BLK_STS_IOERR;
u8 *mem;
@@ -1087,13 +923,15 @@ static inline void carm_handle_resp(struct carm_host *host,
msg_idx = TAG_DECODE(handle);
VPRINTK("tag == %u\n", msg_idx);
- crq = &host->req[msg_idx];
+ rq = blk_mq_tag_to_rq(host->tag_set.tags[0], msg_idx);
+ crq = blk_mq_rq_to_pdu(rq);
/* fast path */
if (likely(crq->msg_type == CARM_MSG_READ ||
crq->msg_type == CARM_MSG_WRITE)) {
- carm_handle_rw(host, crq, error);
- return;
+ dma_unmap_sg(&host->pdev->dev, &crq->sg[0], crq->n_elem,
+ carm_rq_dir(rq));
+ goto done;
}
mem = carm_ref_msg(host, msg_idx);
@@ -1103,7 +941,7 @@ static inline void carm_handle_resp(struct carm_host *host,
switch (crq->msg_subtype) {
case CARM_IOC_SCAN_CHAN:
carm_handle_scan_chan(host, crq, mem, error);
- break;
+ goto done;
default:
/* unknown / invalid response */
goto err_out;
@@ -1116,11 +954,11 @@ static inline void carm_handle_resp(struct carm_host *host,
case MISC_ALLOC_MEM:
carm_handle_generic(host, crq, error,
HST_ALLOC_BUF, HST_SYNC_TIME);
- break;
+ goto done;
case MISC_SET_TIME:
carm_handle_generic(host, crq, error,
HST_SYNC_TIME, HST_GET_FW_VER);
- break;
+ goto done;
case MISC_GET_FW_VER: {
struct carm_fw_ver *ver = (struct carm_fw_ver *)
(mem + sizeof(struct carm_msg_get_fw_ver));
@@ -1130,7 +968,7 @@ static inline void carm_handle_resp(struct carm_host *host,
}
carm_handle_generic(host, crq, error,
HST_GET_FW_VER, HST_PORT_SCAN);
- break;
+ goto done;
}
default:
/* unknown / invalid response */
@@ -1161,7 +999,13 @@ static inline void carm_handle_resp(struct carm_host *host,
err_out:
printk(KERN_WARNING DRV_NAME "(%s): BUG: unhandled message type %d/%d\n",
pci_name(host->pdev), crq->msg_type, crq->msg_subtype);
- carm_end_rq(host, crq, BLK_STS_IOERR);
+ error = BLK_STS_IOERR;
+done:
+ host->hw_sg_used -= crq->n_elem;
+ blk_mq_end_request(blk_mq_rq_from_pdu(crq), error);
+
+ if (host->hw_sg_used <= CARM_SG_LOW_WATER)
+ carm_round_robin(host);
}
static inline void carm_handle_responses(struct carm_host *host)
@@ -1491,78 +1335,56 @@ static int carm_init_host(struct carm_host *host)
return 0;
}
-static const struct blk_mq_ops carm_oob_mq_ops = {
- .queue_rq = carm_oob_queue_rq,
-};
-
static const struct blk_mq_ops carm_mq_ops = {
.queue_rq = carm_queue_rq,
};
-static int carm_init_disks(struct carm_host *host)
+static int carm_init_disk(struct carm_host *host, unsigned int port_no)
{
- unsigned int i;
- int rc = 0;
+ struct carm_port *port = &host->port[port_no];
+ struct gendisk *disk;
+ struct request_queue *q;
- for (i = 0; i < CARM_MAX_PORTS; i++) {
- struct gendisk *disk;
- struct request_queue *q;
- struct carm_port *port;
+ port->host = host;
+ port->port_no = port_no;
- port = &host->port[i];
- port->host = host;
- port->port_no = i;
+ disk = alloc_disk(CARM_MINORS_PER_MAJOR);
+ if (!disk)
+ return -ENOMEM;
- disk = alloc_disk(CARM_MINORS_PER_MAJOR);
- if (!disk) {
- rc = -ENOMEM;
- break;
- }
+ port->disk = disk;
+ sprintf(disk->disk_name, DRV_NAME "/%u",
+ (unsigned int)host->id * CARM_MAX_PORTS + port_no);
+ disk->major = host->major;
+ disk->first_minor = port_no * CARM_MINORS_PER_MAJOR;
+ disk->fops = &carm_bd_ops;
+ disk->private_data = port;
- port->disk = disk;
- sprintf(disk->disk_name, DRV_NAME "/%u",
- (unsigned int) (host->id * CARM_MAX_PORTS) + i);
- disk->major = host->major;
- disk->first_minor = i * CARM_MINORS_PER_MAJOR;
- disk->fops = &carm_bd_ops;
- disk->private_data = port;
-
- q = blk_mq_init_sq_queue(&port->tag_set, &carm_mq_ops,
- max_queue, BLK_MQ_F_SHOULD_MERGE);
- if (IS_ERR(q)) {
- rc = PTR_ERR(q);
- break;
- }
- disk->queue = q;
- blk_queue_max_segments(q, CARM_MAX_REQ_SG);
- blk_queue_segment_boundary(q, CARM_SG_BOUNDARY);
+ q = blk_mq_init_queue(&host->tag_set);
+ if (IS_ERR(q))
+ return PTR_ERR(q);
- q->queuedata = port;
- }
+ blk_queue_max_segments(q, CARM_MAX_REQ_SG);
+ blk_queue_segment_boundary(q, CARM_SG_BOUNDARY);
- return rc;
+ q->queuedata = port;
+ disk->queue = q;
+ return 0;
}
-static void carm_free_disks(struct carm_host *host)
+static void carm_free_disk(struct carm_host *host, unsigned int port_no)
{
- unsigned int i;
-
- for (i = 0; i < CARM_MAX_PORTS; i++) {
- struct carm_port *port = &host->port[i];
- struct gendisk *disk = port->disk;
+ struct carm_port *port = &host->port[port_no];
+ struct gendisk *disk = port->disk;
- if (disk) {
- struct request_queue *q = disk->queue;
+ if (!disk)
+ return;
- if (disk->flags & GENHD_FL_UP)
- del_gendisk(disk);
- if (q) {
- blk_mq_free_tag_set(&port->tag_set);
- blk_cleanup_queue(q);
- }
- put_disk(disk);
- }
- }
+ if (disk->flags & GENHD_FL_UP)
+ del_gendisk(disk);
+ if (disk->queue)
+ blk_cleanup_queue(disk->queue);
+ put_disk(disk);
}
static int carm_init_shm(struct carm_host *host)
@@ -1618,9 +1440,6 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
INIT_WORK(&host->fsm_task, carm_fsm_task);
init_completion(&host->probe_comp);
- for (i = 0; i < ARRAY_SIZE(host->req); i++)
- host->req[i].tag = i;
-
host->mmio = ioremap(pci_resource_start(pdev, 0),
pci_resource_len(pdev, 0));
if (!host->mmio) {
@@ -1637,14 +1456,26 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
goto err_out_iounmap;
}
- q = blk_mq_init_sq_queue(&host->tag_set, &carm_oob_mq_ops, 1,
- BLK_MQ_F_NO_SCHED);
+ memset(&host->tag_set, 0, sizeof(host->tag_set));
+ host->tag_set.ops = &carm_mq_ops;
+ host->tag_set.cmd_size = sizeof(struct carm_request);
+ host->tag_set.nr_hw_queues = 1;
+ host->tag_set.nr_maps = 1;
+ host->tag_set.queue_depth = max_queue;
+ host->tag_set.numa_node = NUMA_NO_NODE;
+ host->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+
+ rc = blk_mq_alloc_tag_set(&host->tag_set);
+ if (rc)
+ goto err_out_dma_free;
+
+ q = blk_mq_init_queue(&host->tag_set);
if (IS_ERR(q)) {
- printk(KERN_ERR DRV_NAME "(%s): OOB queue alloc failure\n",
- pci_name(pdev));
rc = PTR_ERR(q);
+ blk_mq_free_tag_set(&host->tag_set);
goto err_out_dma_free;
}
+
host->oob_q = q;
q->queuedata = host;
@@ -1667,9 +1498,11 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
if (host->flags & FL_DYN_MAJOR)
host->major = rc;
- rc = carm_init_disks(host);
- if (rc)
- goto err_out_blkdev_disks;
+ for (i = 0; i < CARM_MAX_PORTS; i++) {
+ rc = carm_init_disk(host, i);
+ if (rc)
+ goto err_out_blkdev_disks;
+ }
pci_set_master(pdev);
@@ -1699,7 +1532,8 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
err_out_free_irq:
free_irq(pdev->irq, host);
err_out_blkdev_disks:
- carm_free_disks(host);
+ for (i = 0; i < CARM_MAX_PORTS; i++)
+ carm_free_disk(host, i);
unregister_blkdev(host->major, host->name);
err_out_free_majors:
if (host->major == 160)
@@ -1724,6 +1558,7 @@ err_out:
static void carm_remove_one (struct pci_dev *pdev)
{
struct carm_host *host = pci_get_drvdata(pdev);
+ unsigned int i;
if (!host) {
printk(KERN_ERR PFX "BUG: no host data for PCI(%s)\n",
@@ -1732,7 +1567,8 @@ static void carm_remove_one (struct pci_dev *pdev)
}
free_irq(pdev->irq, host);
- carm_free_disks(host);
+ for (i = 0; i < CARM_MAX_PORTS; i++)
+ carm_free_disk(host, i);
unregister_blkdev(host->major, host->name);
if (host->major == 160)
clear_bit(0, &carm_major_alloc);
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index be3e3ab79950..aa035cf8a51d 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -888,8 +888,7 @@ static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
card->biotail = &card->bio;
spin_lock_init(&card->lock);
- card->queue = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE,
- &card->lock);
+ card->queue = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE);
if (!card->queue)
goto failed_alloc;
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 086c6bb12baa..912c4265e592 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -214,6 +214,20 @@ static void virtblk_done(struct virtqueue *vq)
spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
}
+static void virtio_commit_rqs(struct blk_mq_hw_ctx *hctx)
+{
+ struct virtio_blk *vblk = hctx->queue->queuedata;
+ struct virtio_blk_vq *vq = &vblk->vqs[hctx->queue_num];
+ bool kick;
+
+ spin_lock_irq(&vq->lock);
+ kick = virtqueue_kick_prepare(vq->vq);
+ spin_unlock_irq(&vq->lock);
+
+ if (kick)
+ virtqueue_notify(vq->vq);
+}
+
static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
@@ -624,7 +638,7 @@ static int virtblk_map_queues(struct blk_mq_tag_set *set)
{
struct virtio_blk *vblk = set->driver_data;
- return blk_mq_virtio_map_queues(set, vblk->vdev, 0);
+ return blk_mq_virtio_map_queues(&set->map[0], vblk->vdev, 0);
}
#ifdef CONFIG_VIRTIO_BLK_SCSI
@@ -638,6 +652,7 @@ static void virtblk_initialize_rq(struct request *req)
static const struct blk_mq_ops virtio_mq_ops = {
.queue_rq = virtio_queue_rq,
+ .commit_rqs = virtio_commit_rqs,
.complete = virtblk_request_done,
.init_request = virtblk_init_request,
#ifdef CONFIG_VIRTIO_BLK_SCSI
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 8b2b72b93885..da58020a144e 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -94,7 +94,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
ide_req(rq)->type = ATA_PRIV_MISC;
- rq->special = (char *)pc;
+ ide_req(rq)->special = pc;
if (buf && bufflen) {
error = blk_rq_map_kern(drive->queue, rq, buf, bufflen,
@@ -172,8 +172,8 @@ EXPORT_SYMBOL_GPL(ide_create_request_sense_cmd);
void ide_prep_sense(ide_drive_t *drive, struct request *rq)
{
struct request_sense *sense = &drive->sense_data;
- struct request *sense_rq = drive->sense_rq;
- struct scsi_request *req = scsi_req(sense_rq);
+ struct request *sense_rq;
+ struct scsi_request *req;
unsigned int cmd_len, sense_len;
int err;
@@ -196,9 +196,16 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
if (ata_sense_request(rq) || drive->sense_rq_armed)
return;
+ sense_rq = drive->sense_rq;
+ if (!sense_rq) {
+ sense_rq = blk_mq_alloc_request(drive->queue, REQ_OP_DRV_IN,
+ BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
+ drive->sense_rq = sense_rq;
+ }
+ req = scsi_req(sense_rq);
+
memset(sense, 0, sizeof(*sense));
- blk_rq_init(rq->q, sense_rq);
scsi_req_init(req);
err = blk_rq_map_kern(drive->queue, sense_rq, sense, sense_len,
@@ -207,6 +214,8 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
if (printk_ratelimit())
printk(KERN_WARNING PFX "%s: failed to map sense "
"buffer\n", drive->name);
+ blk_mq_free_request(sense_rq);
+ drive->sense_rq = NULL;
return;
}
@@ -226,6 +235,8 @@ EXPORT_SYMBOL_GPL(ide_prep_sense);
int ide_queue_sense_rq(ide_drive_t *drive, void *special)
{
+ struct request *sense_rq = drive->sense_rq;
+
/* deferred failure from ide_prep_sense() */
if (!drive->sense_rq_armed) {
printk(KERN_WARNING PFX "%s: error queuing a sense request\n",
@@ -233,12 +244,12 @@ int ide_queue_sense_rq(ide_drive_t *drive, void *special)
return -ENOMEM;
}
- drive->sense_rq->special = special;
+ ide_req(sense_rq)->special = special;
drive->sense_rq_armed = false;
drive->hwif->rq = NULL;
- elv_add_request(drive->queue, drive->sense_rq, ELEVATOR_INSERT_FRONT);
+ ide_insert_request_head(drive, sense_rq);
return 0;
}
EXPORT_SYMBOL_GPL(ide_queue_sense_rq);
@@ -270,10 +281,8 @@ void ide_retry_pc(ide_drive_t *drive)
*/
drive->hwif->rq = NULL;
ide_requeue_and_plug(drive, failed_rq);
- if (ide_queue_sense_rq(drive, pc)) {
- blk_start_request(failed_rq);
+ if (ide_queue_sense_rq(drive, pc))
ide_complete_rq(drive, BLK_STS_IOERR, blk_rq_bytes(failed_rq));
- }
}
EXPORT_SYMBOL_GPL(ide_retry_pc);
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index f9b59d41813f..1f03884a6808 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -211,12 +211,12 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive,
static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
{
/*
- * For ATA_PRIV_SENSE, "rq->special" points to the original
+ * For ATA_PRIV_SENSE, "ide_req(rq)->special" points to the original
* failed request. Also, the sense data should be read
* directly from rq which might be different from the original
* sense buffer if it got copied during mapping.
*/
- struct request *failed = (struct request *)rq->special;
+ struct request *failed = ide_req(rq)->special;
void *sense = bio_data(rq->bio);
if (failed) {
@@ -258,11 +258,22 @@ static int ide_cd_breathe(ide_drive_t *drive, struct request *rq)
/*
* take a breather
*/
- blk_delay_queue(drive->queue, 1);
+ blk_mq_requeue_request(rq, false);
+ blk_mq_delay_kick_requeue_list(drive->queue, 1);
return 1;
}
}
+static void ide_cd_free_sense(ide_drive_t *drive)
+{
+ if (!drive->sense_rq)
+ return;
+
+ blk_mq_free_request(drive->sense_rq);
+ drive->sense_rq = NULL;
+ drive->sense_rq_armed = false;
+}
+
/**
* Returns:
* 0: if the request should be continued.
@@ -516,6 +527,82 @@ static bool ide_cd_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd)
return false;
}
+/* standard prep_rq that builds 10 byte cmds */
+static bool ide_cdrom_prep_fs(struct request_queue *q, struct request *rq)
+{
+ int hard_sect = queue_logical_block_size(q);
+ long block = (long)blk_rq_pos(rq) / (hard_sect >> 9);
+ unsigned long blocks = blk_rq_sectors(rq) / (hard_sect >> 9);
+ struct scsi_request *req = scsi_req(rq);
+
+ if (rq_data_dir(rq) == READ)
+ req->cmd[0] = GPCMD_READ_10;
+ else
+ req->cmd[0] = GPCMD_WRITE_10;
+
+ /*
+ * fill in lba
+ */
+ req->cmd[2] = (block >> 24) & 0xff;
+ req->cmd[3] = (block >> 16) & 0xff;
+ req->cmd[4] = (block >> 8) & 0xff;
+ req->cmd[5] = block & 0xff;
+
+ /*
+ * and transfer length
+ */
+ req->cmd[7] = (blocks >> 8) & 0xff;
+ req->cmd[8] = blocks & 0xff;
+ req->cmd_len = 10;
+ return true;
+}
+
+/*
+ * Most of the SCSI commands are supported directly by ATAPI devices.
+ * This transform handles the few exceptions.
+ */
+static bool ide_cdrom_prep_pc(struct request *rq)
+{
+ u8 *c = scsi_req(rq)->cmd;
+
+ /* transform 6-byte read/write commands to the 10-byte version */
+ if (c[0] == READ_6 || c[0] == WRITE_6) {
+ c[8] = c[4];
+ c[5] = c[3];
+ c[4] = c[2];
+ c[3] = c[1] & 0x1f;
+ c[2] = 0;
+ c[1] &= 0xe0;
+ c[0] += (READ_10 - READ_6);
+ scsi_req(rq)->cmd_len = 10;
+ return true;
+ }
+
+ /*
+ * it's silly to pretend we understand 6-byte sense commands, just
+ * reject with ILLEGAL_REQUEST and the caller should take the
+ * appropriate action
+ */
+ if (c[0] == MODE_SENSE || c[0] == MODE_SELECT) {
+ scsi_req(rq)->result = ILLEGAL_REQUEST;
+ return false;
+ }
+
+ return true;
+}
+
+static bool ide_cdrom_prep_rq(ide_drive_t *drive, struct request *rq)
+{
+ if (!blk_rq_is_passthrough(rq)) {
+ scsi_req_init(scsi_req(rq));
+
+ return ide_cdrom_prep_fs(drive->queue, rq);
+ } else if (blk_rq_is_scsi(rq))
+ return ide_cdrom_prep_pc(rq);
+
+ return true;
+}
+
static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
{
ide_hwif_t *hwif = drive->hwif;
@@ -675,7 +762,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
out_end:
if (blk_rq_is_scsi(rq) && rc == 0) {
scsi_req(rq)->resid_len = 0;
- blk_end_request_all(rq, BLK_STS_OK);
+ blk_mq_end_request(rq, BLK_STS_OK);
hwif->rq = NULL;
} else {
if (sense && uptodate)
@@ -705,6 +792,8 @@ out_end:
if (sense && rc == 2)
ide_error(drive, "request sense failure", stat);
}
+
+ ide_cd_free_sense(drive);
return ide_stopped;
}
@@ -729,7 +818,7 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq)
* We may be retrying this request after an error. Fix up any
* weirdness which might be present in the request packet.
*/
- q->prep_rq_fn(q, rq);
+ ide_cdrom_prep_rq(drive, rq);
}
/* fs requests *must* be hardware frame aligned */
@@ -1323,82 +1412,6 @@ static int ide_cdrom_probe_capabilities(ide_drive_t *drive)
return nslots;
}
-/* standard prep_rq_fn that builds 10 byte cmds */
-static int ide_cdrom_prep_fs(struct request_queue *q, struct request *rq)
-{
- int hard_sect = queue_logical_block_size(q);
- long block = (long)blk_rq_pos(rq) / (hard_sect >> 9);
- unsigned long blocks = blk_rq_sectors(rq) / (hard_sect >> 9);
- struct scsi_request *req = scsi_req(rq);
-
- q->initialize_rq_fn(rq);
-
- if (rq_data_dir(rq) == READ)
- req->cmd[0] = GPCMD_READ_10;
- else
- req->cmd[0] = GPCMD_WRITE_10;
-
- /*
- * fill in lba
- */
- req->cmd[2] = (block >> 24) & 0xff;
- req->cmd[3] = (block >> 16) & 0xff;
- req->cmd[4] = (block >> 8) & 0xff;
- req->cmd[5] = block & 0xff;
-
- /*
- * and transfer length
- */
- req->cmd[7] = (blocks >> 8) & 0xff;
- req->cmd[8] = blocks & 0xff;
- req->cmd_len = 10;
- return BLKPREP_OK;
-}
-
-/*
- * Most of the SCSI commands are supported directly by ATAPI devices.
- * This transform handles the few exceptions.
- */
-static int ide_cdrom_prep_pc(struct request *rq)
-{
- u8 *c = scsi_req(rq)->cmd;
-
- /* transform 6-byte read/write commands to the 10-byte version */
- if (c[0] == READ_6 || c[0] == WRITE_6) {
- c[8] = c[4];
- c[5] = c[3];
- c[4] = c[2];
- c[3] = c[1] & 0x1f;
- c[2] = 0;
- c[1] &= 0xe0;
- c[0] += (READ_10 - READ_6);
- scsi_req(rq)->cmd_len = 10;
- return BLKPREP_OK;
- }
-
- /*
- * it's silly to pretend we understand 6-byte sense commands, just
- * reject with ILLEGAL_REQUEST and the caller should take the
- * appropriate action
- */
- if (c[0] == MODE_SENSE || c[0] == MODE_SELECT) {
- scsi_req(rq)->result = ILLEGAL_REQUEST;
- return BLKPREP_KILL;
- }
-
- return BLKPREP_OK;
-}
-
-static int ide_cdrom_prep_fn(struct request_queue *q, struct request *rq)
-{
- if (!blk_rq_is_passthrough(rq))
- return ide_cdrom_prep_fs(q, rq);
- else if (blk_rq_is_scsi(rq))
- return ide_cdrom_prep_pc(rq);
-
- return 0;
-}
-
struct cd_list_entry {
const char *id_model;
const char *id_firmware;
@@ -1508,7 +1521,7 @@ static int ide_cdrom_setup(ide_drive_t *drive)
ide_debug_log(IDE_DBG_PROBE, "enter");
- blk_queue_prep_rq(q, ide_cdrom_prep_fn);
+ drive->prep_rq = ide_cdrom_prep_rq;
blk_queue_dma_alignment(q, 31);
blk_queue_update_dma_pad(q, 15);
@@ -1569,7 +1582,7 @@ static void ide_cd_release(struct device *dev)
if (devinfo->handle == drive)
unregister_cdrom(devinfo);
drive->driver_data = NULL;
- blk_queue_prep_rq(drive->queue, NULL);
+ drive->prep_rq = NULL;
g->private_data = NULL;
put_disk(g);
kfree(info);
diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c
index f4f8afdf8bbe..f2f93ed40356 100644
--- a/drivers/ide/ide-devsets.c
+++ b/drivers/ide/ide-devsets.c
@@ -171,7 +171,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
scsi_req(rq)->cmd_len = 5;
scsi_req(rq)->cmd[0] = REQ_DEVSET_EXEC;
*(int *)&scsi_req(rq)->cmd[1] = arg;
- rq->special = setting->set;
+ ide_req(rq)->special = setting->set;
blk_execute_rq(q, NULL, rq, 0);
ret = scsi_req(rq)->result;
@@ -182,7 +182,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
ide_startstop_t ide_do_devset(ide_drive_t *drive, struct request *rq)
{
- int err, (*setfunc)(ide_drive_t *, int) = rq->special;
+ int err, (*setfunc)(ide_drive_t *, int) = ide_req(rq)->special;
err = setfunc(drive, *(int *)&scsi_req(rq)->cmd[1]);
if (err)
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index e3b4e659082d..197912af5c2f 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -427,16 +427,15 @@ static void ide_disk_unlock_native_capacity(ide_drive_t *drive)
drive->dev_flags |= IDE_DFLAG_NOHPA; /* disable HPA on resume */
}
-static int idedisk_prep_fn(struct request_queue *q, struct request *rq)
+static bool idedisk_prep_rq(ide_drive_t *drive, struct request *rq)
{
- ide_drive_t *drive = q->queuedata;
struct ide_cmd *cmd;
if (req_op(rq) != REQ_OP_FLUSH)
- return BLKPREP_OK;
+ return true;
- if (rq->special) {
- cmd = rq->special;
+ if (ide_req(rq)->special) {
+ cmd = ide_req(rq)->special;
memset(cmd, 0, sizeof(*cmd));
} else {
cmd = kzalloc(sizeof(*cmd), GFP_ATOMIC);
@@ -456,10 +455,10 @@ static int idedisk_prep_fn(struct request_queue *q, struct request *rq)
rq->cmd_flags &= ~REQ_OP_MASK;
rq->cmd_flags |= REQ_OP_DRV_OUT;
ide_req(rq)->type = ATA_PRIV_TASKFILE;
- rq->special = cmd;
+ ide_req(rq)->special = cmd;
cmd->rq = rq;
- return BLKPREP_OK;
+ return true;
}
ide_devset_get(multcount, mult_count);
@@ -548,7 +547,7 @@ static void update_flush(ide_drive_t *drive)
if (barrier) {
wc = true;
- blk_queue_prep_rq(drive->queue, idedisk_prep_fn);
+ drive->prep_rq = idedisk_prep_rq;
}
}
diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c
index 47d5f3379748..e1323e058454 100644
--- a/drivers/ide/ide-eh.c
+++ b/drivers/ide/ide-eh.c
@@ -125,7 +125,7 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat)
/* retry only "normal" I/O: */
if (blk_rq_is_passthrough(rq)) {
if (ata_taskfile_request(rq)) {
- struct ide_cmd *cmd = rq->special;
+ struct ide_cmd *cmd = ide_req(rq)->special;
if (cmd)
ide_complete_cmd(drive, cmd, stat, err);
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index a8df300f949c..780d33ccc5d8 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -276,7 +276,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
switch (ide_req(rq)->type) {
case ATA_PRIV_MISC:
case ATA_PRIV_SENSE:
- pc = (struct ide_atapi_pc *)rq->special;
+ pc = (struct ide_atapi_pc *)ide_req(rq)->special;
break;
default:
BUG();
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 0d93e0cfbeaf..8445b484ae69 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -67,7 +67,15 @@ int ide_end_rq(ide_drive_t *drive, struct request *rq, blk_status_t error,
ide_dma_on(drive);
}
- return blk_end_request(rq, error, nr_bytes);
+ if (!blk_update_request(rq, error, nr_bytes)) {
+ if (rq == drive->sense_rq)
+ drive->sense_rq = NULL;
+
+ __blk_mq_end_request(rq, error);
+ return 0;
+ }
+
+ return 1;
}
EXPORT_SYMBOL_GPL(ide_end_rq);
@@ -103,7 +111,7 @@ void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err)
}
if (rq && ata_taskfile_request(rq)) {
- struct ide_cmd *orig_cmd = rq->special;
+ struct ide_cmd *orig_cmd = ide_req(rq)->special;
if (cmd->tf_flags & IDE_TFLAG_DYN)
kfree(orig_cmd);
@@ -253,7 +261,7 @@ EXPORT_SYMBOL_GPL(ide_init_sg_cmd);
static ide_startstop_t execute_drive_cmd (ide_drive_t *drive,
struct request *rq)
{
- struct ide_cmd *cmd = rq->special;
+ struct ide_cmd *cmd = ide_req(rq)->special;
if (cmd) {
if (cmd->protocol == ATA_PROT_PIO) {
@@ -307,8 +315,6 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
{
ide_startstop_t startstop;
- BUG_ON(!(rq->rq_flags & RQF_STARTED));
-
#ifdef DEBUG
printk("%s: start_request: current=0x%08lx\n",
drive->hwif->name, (unsigned long) rq);
@@ -320,6 +326,9 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
goto kill_rq;
}
+ if (drive->prep_rq && !drive->prep_rq(drive, rq))
+ return ide_stopped;
+
if (ata_pm_request(rq))
ide_check_pm_state(drive, rq);
@@ -343,7 +352,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
if (ata_taskfile_request(rq))
return execute_drive_cmd(drive, rq);
else if (ata_pm_request(rq)) {
- struct ide_pm_state *pm = rq->special;
+ struct ide_pm_state *pm = ide_req(rq)->special;
#ifdef DEBUG_PM
printk("%s: start_power_step(step: %d)\n",
drive->name, pm->pm_step);
@@ -430,44 +439,42 @@ static inline void ide_unlock_host(struct ide_host *host)
}
}
-static void __ide_requeue_and_plug(struct request_queue *q, struct request *rq)
-{
- if (rq)
- blk_requeue_request(q, rq);
- if (rq || blk_peek_request(q)) {
- /* Use 3ms as that was the old plug delay */
- blk_delay_queue(q, 3);
- }
-}
-
void ide_requeue_and_plug(ide_drive_t *drive, struct request *rq)
{
struct request_queue *q = drive->queue;
- unsigned long flags;
- spin_lock_irqsave(q->queue_lock, flags);
- __ide_requeue_and_plug(q, rq);
- spin_unlock_irqrestore(q->queue_lock, flags);
+ /* Use 3ms as that was the old plug delay */
+ if (rq) {
+ blk_mq_requeue_request(rq, false);
+ blk_mq_delay_kick_requeue_list(q, 3);
+ } else
+ blk_mq_delay_run_hw_queue(q->queue_hw_ctx[0], 3);
}
/*
* Issue a new request to a device.
*/
-void do_ide_request(struct request_queue *q)
+blk_status_t ide_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
{
- ide_drive_t *drive = q->queuedata;
+ ide_drive_t *drive = hctx->queue->queuedata;
ide_hwif_t *hwif = drive->hwif;
struct ide_host *host = hwif->host;
- struct request *rq = NULL;
+ struct request *rq = bd->rq;
ide_startstop_t startstop;
- spin_unlock_irq(q->queue_lock);
+ if (!blk_rq_is_passthrough(rq) && !(rq->rq_flags & RQF_DONTPREP)) {
+ rq->rq_flags |= RQF_DONTPREP;
+ ide_req(rq)->special = NULL;
+ }
/* HLD do_request() callback might sleep, make sure it's okay */
might_sleep();
if (ide_lock_host(host, hwif))
- goto plug_device_2;
+ return BLK_STS_DEV_RESOURCE;
+
+ blk_mq_start_request(rq);
spin_lock_irq(&hwif->lock);
@@ -503,21 +510,16 @@ repeat:
hwif->cur_dev = drive;
drive->dev_flags &= ~(IDE_DFLAG_SLEEPING | IDE_DFLAG_PARKED);
- spin_unlock_irq(&hwif->lock);
- spin_lock_irq(q->queue_lock);
/*
* we know that the queue isn't empty, but this can happen
- * if the q->prep_rq_fn() decides to kill a request
+ * if ->prep_rq() decides to kill a request
*/
- if (!rq)
- rq = blk_fetch_request(drive->queue);
-
- spin_unlock_irq(q->queue_lock);
- spin_lock_irq(&hwif->lock);
-
if (!rq) {
- ide_unlock_port(hwif);
- goto out;
+ rq = bd->rq;
+ if (!rq) {
+ ide_unlock_port(hwif);
+ goto out;
+ }
}
/*
@@ -551,23 +553,24 @@ repeat:
if (startstop == ide_stopped) {
rq = hwif->rq;
hwif->rq = NULL;
- goto repeat;
+ if (rq)
+ goto repeat;
+ ide_unlock_port(hwif);
+ goto out;
}
- } else
- goto plug_device;
+ } else {
+plug_device:
+ spin_unlock_irq(&hwif->lock);
+ ide_unlock_host(host);
+ ide_requeue_and_plug(drive, rq);
+ return BLK_STS_OK;
+ }
+
out:
spin_unlock_irq(&hwif->lock);
if (rq == NULL)
ide_unlock_host(host);
- spin_lock_irq(q->queue_lock);
- return;
-
-plug_device:
- spin_unlock_irq(&hwif->lock);
- ide_unlock_host(host);
-plug_device_2:
- spin_lock_irq(q->queue_lock);
- __ide_requeue_and_plug(q, rq);
+ return BLK_STS_OK;
}
static int drive_is_ready(ide_drive_t *drive)
@@ -887,3 +890,16 @@ void ide_pad_transfer(ide_drive_t *drive, int write, int len)
}
}
EXPORT_SYMBOL_GPL(ide_pad_transfer);
+
+void ide_insert_request_head(ide_drive_t *drive, struct request *rq)
+{
+ ide_hwif_t *hwif = drive->hwif;
+ unsigned long flags;
+
+ spin_lock_irqsave(&hwif->lock, flags);
+ list_add_tail(&rq->queuelist, &drive->rq_list);
+ spin_unlock_irqrestore(&hwif->lock, flags);
+
+ kblockd_schedule_work(&drive->rq_work);
+}
+EXPORT_SYMBOL_GPL(ide_insert_request_head);
diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index 622f0edb3945..102aa3bc3e7f 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -27,7 +27,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
spin_unlock_irq(&hwif->lock);
if (start_queue)
- blk_run_queue(q);
+ blk_mq_run_hw_queues(q, true);
return;
}
spin_unlock_irq(&hwif->lock);
@@ -36,7 +36,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
scsi_req(rq)->cmd[0] = REQ_PARK_HEADS;
scsi_req(rq)->cmd_len = 1;
ide_req(rq)->type = ATA_PRIV_MISC;
- rq->special = &timeout;
+ ide_req(rq)->special = &timeout;
blk_execute_rq(q, NULL, rq, 1);
rc = scsi_req(rq)->result ? -EIO : 0;
blk_put_request(rq);
@@ -54,7 +54,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
scsi_req(rq)->cmd[0] = REQ_UNPARK_HEADS;
scsi_req(rq)->cmd_len = 1;
ide_req(rq)->type = ATA_PRIV_MISC;
- elv_add_request(q, rq, ELEVATOR_INSERT_FRONT);
+ ide_insert_request_head(drive, rq);
out:
return;
@@ -67,7 +67,7 @@ ide_startstop_t ide_do_park_unpark(ide_drive_t *drive, struct request *rq)
memset(&cmd, 0, sizeof(cmd));
if (scsi_req(rq)->cmd[0] == REQ_PARK_HEADS) {
- drive->sleep = *(unsigned long *)rq->special;
+ drive->sleep = *(unsigned long *)ide_req(rq)->special;
drive->dev_flags |= IDE_DFLAG_SLEEPING;
tf->command = ATA_CMD_IDLEIMMEDIATE;
tf->feature = 0x44;
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index 59217aa1d1fb..192e6c65d34e 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -21,7 +21,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
memset(&rqpm, 0, sizeof(rqpm));
rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
ide_req(rq)->type = ATA_PRIV_PM_SUSPEND;
- rq->special = &rqpm;
+ ide_req(rq)->special = &rqpm;
rqpm.pm_step = IDE_PM_START_SUSPEND;
if (mesg.event == PM_EVENT_PRETHAW)
mesg.event = PM_EVENT_FREEZE;
@@ -40,32 +40,17 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
return ret;
}
-static void ide_end_sync_rq(struct request *rq, blk_status_t error)
-{
- complete(rq->end_io_data);
-}
-
static int ide_pm_execute_rq(struct request *rq)
{
struct request_queue *q = rq->q;
- DECLARE_COMPLETION_ONSTACK(wait);
- rq->end_io_data = &wait;
- rq->end_io = ide_end_sync_rq;
-
- spin_lock_irq(q->queue_lock);
if (unlikely(blk_queue_dying(q))) {
rq->rq_flags |= RQF_QUIET;
scsi_req(rq)->result = -ENXIO;
- __blk_end_request_all(rq, BLK_STS_OK);
- spin_unlock_irq(q->queue_lock);
+ blk_mq_end_request(rq, BLK_STS_OK);
return -ENXIO;
}
- __elv_add_request(q, rq, ELEVATOR_INSERT_FRONT);
- __blk_run_queue_uncond(q);
- spin_unlock_irq(q->queue_lock);
-
- wait_for_completion_io(&wait);
+ blk_execute_rq(q, NULL, rq, true);
return scsi_req(rq)->result ? -EIO : 0;
}
@@ -79,6 +64,8 @@ int generic_ide_resume(struct device *dev)
struct ide_pm_state rqpm;
int err;
+ blk_mq_start_stopped_hw_queues(drive->queue, true);
+
if (ide_port_acpi(hwif)) {
/* call ACPI _PS0 / _STM only once */
if ((drive->dn & 1) == 0 || pair == NULL) {
@@ -92,7 +79,7 @@ int generic_ide_resume(struct device *dev)
memset(&rqpm, 0, sizeof(rqpm));
rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_PREEMPT);
ide_req(rq)->type = ATA_PRIV_PM_RESUME;
- rq->special = &rqpm;
+ ide_req(rq)->special = &rqpm;
rqpm.pm_step = IDE_PM_START_RESUME;
rqpm.pm_state = PM_EVENT_ON;
@@ -111,7 +98,7 @@ int generic_ide_resume(struct device *dev)
void ide_complete_power_step(ide_drive_t *drive, struct request *rq)
{
- struct ide_pm_state *pm = rq->special;
+ struct ide_pm_state *pm = ide_req(rq)->special;
#ifdef DEBUG_PM
printk(KERN_INFO "%s: complete_power_step(step: %d)\n",
@@ -141,7 +128,7 @@ void ide_complete_power_step(ide_drive_t *drive, struct request *rq)
ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
{
- struct ide_pm_state *pm = rq->special;
+ struct ide_pm_state *pm = ide_req(rq)->special;
struct ide_cmd cmd = { };
switch (pm->pm_step) {
@@ -213,8 +200,7 @@ out_do_tf:
void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
{
struct request_queue *q = drive->queue;
- struct ide_pm_state *pm = rq->special;
- unsigned long flags;
+ struct ide_pm_state *pm = ide_req(rq)->special;
ide_complete_power_step(drive, rq);
if (pm->pm_step != IDE_PM_COMPLETED)
@@ -224,22 +210,19 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
printk("%s: completing PM request, %s\n", drive->name,
(ide_req(rq)->type == ATA_PRIV_PM_SUSPEND) ? "suspend" : "resume");
#endif
- spin_lock_irqsave(q->queue_lock, flags);
if (ide_req(rq)->type == ATA_PRIV_PM_SUSPEND)
- blk_stop_queue(q);
+ blk_mq_stop_hw_queues(q);
else
drive->dev_flags &= ~IDE_DFLAG_BLOCKED;
- spin_unlock_irqrestore(q->queue_lock, flags);
drive->hwif->rq = NULL;
- if (blk_end_request(rq, BLK_STS_OK, 0))
- BUG();
+ blk_mq_end_request(rq, BLK_STS_OK);
}
void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
{
- struct ide_pm_state *pm = rq->special;
+ struct ide_pm_state *pm = ide_req(rq)->special;
if (blk_rq_is_private(rq) &&
ide_req(rq)->type == ATA_PRIV_PM_SUSPEND &&
@@ -260,7 +243,6 @@ void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
ide_hwif_t *hwif = drive->hwif;
const struct ide_tp_ops *tp_ops = hwif->tp_ops;
struct request_queue *q = drive->queue;
- unsigned long flags;
int rc;
#ifdef DEBUG_PM
printk("%s: Wakeup request inited, waiting for !BSY...\n", drive->name);
@@ -274,8 +256,6 @@ void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
if (rc)
printk(KERN_WARNING "%s: drive not ready on wakeup\n", drive->name);
- spin_lock_irqsave(q->queue_lock, flags);
- blk_start_queue(q);
- spin_unlock_irqrestore(q->queue_lock, flags);
+ blk_mq_start_hw_queues(q);
}
}
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 3b75a7b7a284..63627be0811a 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -746,10 +746,16 @@ static void ide_initialize_rq(struct request *rq)
{
struct ide_request *req = blk_mq_rq_to_pdu(rq);
+ req->special = NULL;
scsi_req_init(&req->sreq);
req->sreq.sense = req->sense;
}
+static const struct blk_mq_ops ide_mq_ops = {
+ .queue_rq = ide_queue_rq,
+ .initialize_rq_fn = ide_initialize_rq,
+};
+
/*
* init request queue
*/
@@ -759,6 +765,7 @@ static int ide_init_queue(ide_drive_t *drive)
ide_hwif_t *hwif = drive->hwif;
int max_sectors = 256;
int max_sg_entries = PRD_ENTRIES;
+ struct blk_mq_tag_set *set;
/*
* Our default set up assumes the normal IDE case,
@@ -767,19 +774,26 @@ static int ide_init_queue(ide_drive_t *drive)
* limits and LBA48 we could raise it but as yet
* do not.
*/
- q = blk_alloc_queue_node(GFP_KERNEL, hwif_to_node(hwif), NULL);
- if (!q)
+
+ set = &drive->tag_set;
+ set->ops = &ide_mq_ops;
+ set->nr_hw_queues = 1;
+ set->queue_depth = 32;
+ set->reserved_tags = 1;
+ set->cmd_size = sizeof(struct ide_request);
+ set->numa_node = hwif_to_node(hwif);
+ set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
+ if (blk_mq_alloc_tag_set(set))
return 1;
- q->request_fn = do_ide_request;
- q->initialize_rq_fn = ide_initialize_rq;
- q->cmd_size = sizeof(struct ide_request);
- blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, q);
- if (blk_init_allocated_queue(q) < 0) {
- blk_cleanup_queue(q);
+ q = blk_mq_init_queue(set);
+ if (IS_ERR(q)) {
+ blk_mq_free_tag_set(set);
return 1;
}
+ blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, q);
+
q->queuedata = drive;
blk_queue_segment_boundary(q, 0xffff);
@@ -965,8 +979,12 @@ static void drive_release_dev (struct device *dev)
ide_proc_unregister_device(drive);
+ if (drive->sense_rq)
+ blk_mq_free_request(drive->sense_rq);
+
blk_cleanup_queue(drive->queue);
drive->queue = NULL;
+ blk_mq_free_tag_set(&drive->tag_set);
drive->dev_flags &= ~IDE_DFLAG_PRESENT;
@@ -1133,6 +1151,28 @@ static void ide_port_cable_detect(ide_hwif_t *hwif)
}
}
+/*
+ * Deferred request list insertion handler
+ */
+static void drive_rq_insert_work(struct work_struct *work)
+{
+ ide_drive_t *drive = container_of(work, ide_drive_t, rq_work);
+ ide_hwif_t *hwif = drive->hwif;
+ struct request *rq;
+ LIST_HEAD(list);
+
+ spin_lock_irq(&hwif->lock);
+ if (!list_empty(&drive->rq_list))
+ list_splice_init(&drive->rq_list, &list);
+ spin_unlock_irq(&hwif->lock);
+
+ while (!list_empty(&list)) {
+ rq = list_first_entry(&list, struct request, queuelist);
+ list_del_init(&rq->queuelist);
+ blk_execute_rq_nowait(drive->queue, rq->rq_disk, rq, true, NULL);
+ }
+}
+
static const u8 ide_hwif_to_major[] =
{ IDE0_MAJOR, IDE1_MAJOR, IDE2_MAJOR, IDE3_MAJOR, IDE4_MAJOR,
IDE5_MAJOR, IDE6_MAJOR, IDE7_MAJOR, IDE8_MAJOR, IDE9_MAJOR };
@@ -1145,12 +1185,10 @@ static void ide_port_init_devices_data(ide_hwif_t *hwif)
ide_port_for_each_dev(i, drive, hwif) {
u8 j = (hwif->index * MAX_DRIVES) + i;
u16 *saved_id = drive->id;
- struct request *saved_sense_rq = drive->sense_rq;
memset(drive, 0, sizeof(*drive));
memset(saved_id, 0, SECTOR_SIZE);
drive->id = saved_id;
- drive->sense_rq = saved_sense_rq;
drive->media = ide_disk;
drive->select = (i << 4) | ATA_DEVICE_OBS;
@@ -1166,6 +1204,9 @@ static void ide_port_init_devices_data(ide_hwif_t *hwif)
INIT_LIST_HEAD(&drive->list);
init_completion(&drive->gendev_rel_comp);
+
+ INIT_WORK(&drive->rq_work, drive_rq_insert_work);
+ INIT_LIST_HEAD(&drive->rq_list);
}
}
@@ -1255,7 +1296,6 @@ static void ide_port_free_devices(ide_hwif_t *hwif)
int i;
ide_port_for_each_dev(i, drive, hwif) {
- kfree(drive->sense_rq);
kfree(drive->id);
kfree(drive);
}
@@ -1283,17 +1323,10 @@ static int ide_port_alloc_devices(ide_hwif_t *hwif, int node)
if (drive->id == NULL)
goto out_free_drive;
- drive->sense_rq = kmalloc(sizeof(struct request) +
- sizeof(struct ide_request), GFP_KERNEL);
- if (!drive->sense_rq)
- goto out_free_id;
-
hwif->devices[i] = drive;
}
return 0;
-out_free_id:
- kfree(drive->id);
out_free_drive:
kfree(drive);
out_nomem:
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 34c1165226a4..db1a65f4b490 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -639,7 +639,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
goto out;
}
if (req->cmd[13] & REQ_IDETAPE_PC1) {
- pc = (struct ide_atapi_pc *)rq->special;
+ pc = (struct ide_atapi_pc *)ide_req(rq)->special;
req->cmd[13] &= ~(REQ_IDETAPE_PC1);
req->cmd[13] |= REQ_IDETAPE_PC2;
goto out;
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index c21d5c50ae3a..17b2e379e872 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -440,7 +440,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
goto put_req;
}
- rq->special = cmd;
+ ide_req(rq)->special = cmd;
cmd->rq = rq;
blk_execute_rq(drive->queue, NULL, rq, 0);
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index efb976a863d2..5f82036fe322 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -389,7 +389,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
goto err_dev;
}
- tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node, NULL);
+ tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node);
if (!tqueue) {
ret = -ENOMEM;
goto err_disk;
@@ -974,7 +974,7 @@ static int nvm_get_bb_meta(struct nvm_dev *dev, sector_t slba,
struct ppa_addr ppa;
u8 *blks;
int ch, lun, nr_blks;
- int ret;
+ int ret = 0;
ppa.ppa = slba;
ppa = dev_to_generic_addr(dev, ppa);
@@ -1140,30 +1140,33 @@ EXPORT_SYMBOL(nvm_alloc_dev);
int nvm_register(struct nvm_dev *dev)
{
- int ret;
+ int ret, exp_pool_size;
if (!dev->q || !dev->ops)
return -EINVAL;
- dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist");
+ ret = nvm_init(dev);
+ if (ret)
+ return ret;
+
+ exp_pool_size = max_t(int, PAGE_SIZE,
+ (NVM_MAX_VLBA * (sizeof(u64) + dev->geo.sos)));
+ exp_pool_size = round_up(exp_pool_size, PAGE_SIZE);
+
+ dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist",
+ exp_pool_size);
if (!dev->dma_pool) {
pr_err("nvm: could not create dma pool\n");
+ nvm_free(dev);
return -ENOMEM;
}
- ret = nvm_init(dev);
- if (ret)
- goto err_init;
-
/* register device with a supported media manager */
down_write(&nvm_lock);
list_add(&dev->devices, &nvm_devices);
up_write(&nvm_lock);
return 0;
-err_init:
- dev->ops->destroy_dma_pool(dev->dma_pool);
- return ret;
}
EXPORT_SYMBOL(nvm_register);
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 6944aac43b01..1ff165351180 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -250,8 +250,8 @@ int pblk_alloc_rqd_meta(struct pblk *pblk, struct nvm_rq *rqd)
if (rqd->nr_ppas == 1)
return 0;
- rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size;
- rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size;
+ rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size(pblk);
+ rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size(pblk);
return 0;
}
@@ -376,7 +376,7 @@ void pblk_write_should_kick(struct pblk *pblk)
{
unsigned int secs_avail = pblk_rb_read_count(&pblk->rwb);
- if (secs_avail >= pblk->min_write_pgs)
+ if (secs_avail >= pblk->min_write_pgs_data)
pblk_write_kick(pblk);
}
@@ -407,7 +407,9 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line)
struct pblk_line_meta *lm = &pblk->lm;
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct list_head *move_list = NULL;
- int vsc = le32_to_cpu(*line->vsc);
+ int packed_meta = (le32_to_cpu(*line->vsc) / pblk->min_write_pgs_data)
+ * (pblk->min_write_pgs - pblk->min_write_pgs_data);
+ int vsc = le32_to_cpu(*line->vsc) + packed_meta;
lockdep_assert_held(&line->lock);
@@ -531,7 +533,7 @@ void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd)
if (caddr == 0)
trace_pblk_chunk_state(pblk_disk_name(pblk),
ppa, NVM_CHK_ST_OPEN);
- else if (caddr == chunk->cnlb)
+ else if (caddr == (chunk->cnlb - 1))
trace_pblk_chunk_state(pblk_disk_name(pblk),
ppa, NVM_CHK_ST_CLOSED);
}
@@ -620,12 +622,15 @@ out:
}
int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
- unsigned long secs_to_flush)
+ unsigned long secs_to_flush, bool skip_meta)
{
int max = pblk->sec_per_write;
int min = pblk->min_write_pgs;
int secs_to_sync = 0;
+ if (skip_meta && pblk->min_write_pgs_data != pblk->min_write_pgs)
+ min = max = pblk->min_write_pgs_data;
+
if (secs_avail >= max)
secs_to_sync = max;
else if (secs_avail >= min)
@@ -796,10 +801,11 @@ static int pblk_line_smeta_write(struct pblk *pblk, struct pblk_line *line,
rqd.is_seq = 1;
for (i = 0; i < lm->smeta_sec; i++, paddr++) {
- struct pblk_sec_meta *meta_list = rqd.meta_list;
+ struct pblk_sec_meta *meta = pblk_get_meta(pblk,
+ rqd.meta_list, i);
rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
- meta_list[i].lba = lba_list[paddr] = addr_empty;
+ meta->lba = lba_list[paddr] = addr_empty;
}
ret = pblk_submit_io_sync_sem(pblk, &rqd);
@@ -845,13 +851,13 @@ int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line,
if (!meta_list)
return -ENOMEM;
- ppa_list = meta_list + pblk_dma_meta_size;
- dma_ppa_list = dma_meta_list + pblk_dma_meta_size;
+ ppa_list = meta_list + pblk_dma_meta_size(pblk);
+ dma_ppa_list = dma_meta_list + pblk_dma_meta_size(pblk);
next_rq:
memset(&rqd, 0, sizeof(struct nvm_rq));
- rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+ rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
rq_len = rq_ppas * geo->csecs;
bio = pblk_bio_map_addr(pblk, emeta_buf, rq_ppas, rq_len,
@@ -1276,6 +1282,7 @@ static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line)
return 0;
}
+/* Line allocations in the recovery path are always single threaded */
int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line)
{
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
@@ -1295,15 +1302,22 @@ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line)
ret = pblk_line_alloc_bitmaps(pblk, line);
if (ret)
- return ret;
+ goto fail;
if (!pblk_line_init_bb(pblk, line, 0)) {
- list_add(&line->list, &l_mg->free_list);
- return -EINTR;
+ ret = -EINTR;
+ goto fail;
}
pblk_rl_free_lines_dec(&pblk->rl, line, true);
return 0;
+
+fail:
+ spin_lock(&l_mg->free_lock);
+ list_add(&line->list, &l_mg->free_list);
+ spin_unlock(&l_mg->free_lock);
+
+ return ret;
}
void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line)
@@ -2160,3 +2174,38 @@ void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
}
spin_unlock(&pblk->trans_lock);
}
+
+void *pblk_get_meta_for_writes(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ void *buffer;
+
+ if (pblk_is_oob_meta_supported(pblk)) {
+ /* Just use OOB metadata buffer as always */
+ buffer = rqd->meta_list;
+ } else {
+ /* We need to reuse last page of request (packed metadata)
+ * in similar way as traditional oob metadata
+ */
+ buffer = page_to_virt(
+ rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 1].bv_page);
+ }
+
+ return buffer;
+}
+
+void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ void *meta_list = rqd->meta_list;
+ void *page;
+ int i = 0;
+
+ if (pblk_is_oob_meta_supported(pblk))
+ return;
+
+ page = page_to_virt(rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 1].bv_page);
+ /* We need to fill oob meta buffer with data from packed metadata */
+ for (; i < rqd->nr_ppas; i++)
+ memcpy(pblk_get_meta(pblk, meta_list, i),
+ page + (i * sizeof(struct pblk_sec_meta)),
+ sizeof(struct pblk_sec_meta));
+}
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 13822594647c..f9a3e47b6a93 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -207,9 +207,6 @@ static int pblk_rwb_init(struct pblk *pblk)
return pblk_rb_init(&pblk->rwb, buffer_size, threshold, geo->csecs);
}
-/* Minimum pages needed within a lun */
-#define ADDR_POOL_SIZE 64
-
static int pblk_set_addrf_12(struct pblk *pblk, struct nvm_geo *geo,
struct nvm_addrf_12 *dst)
{
@@ -350,23 +347,19 @@ fail_destroy_ws:
static int pblk_get_global_caches(void)
{
- int ret;
+ int ret = 0;
mutex_lock(&pblk_caches.mutex);
- if (kref_read(&pblk_caches.kref) > 0) {
- kref_get(&pblk_caches.kref);
- mutex_unlock(&pblk_caches.mutex);
- return 0;
- }
+ if (kref_get_unless_zero(&pblk_caches.kref))
+ goto out;
ret = pblk_create_global_caches();
-
if (!ret)
- kref_get(&pblk_caches.kref);
+ kref_init(&pblk_caches.kref);
+out:
mutex_unlock(&pblk_caches.mutex);
-
return ret;
}
@@ -406,12 +399,45 @@ static int pblk_core_init(struct pblk *pblk)
pblk->nr_flush_rst = 0;
pblk->min_write_pgs = geo->ws_opt;
+ pblk->min_write_pgs_data = pblk->min_write_pgs;
max_write_ppas = pblk->min_write_pgs * geo->all_luns;
pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA);
pblk->max_write_pgs = min_t(int, pblk->max_write_pgs,
queue_max_hw_sectors(dev->q) / (geo->csecs >> SECTOR_SHIFT));
pblk_set_sec_per_write(pblk, pblk->min_write_pgs);
+ pblk->oob_meta_size = geo->sos;
+ if (!pblk_is_oob_meta_supported(pblk)) {
+ /* For drives which does not have OOB metadata feature
+ * in order to support recovery feature we need to use
+ * so called packed metadata. Packed metada will store
+ * the same information as OOB metadata (l2p table mapping,
+ * but in the form of the single page at the end of
+ * every write request.
+ */
+ if (pblk->min_write_pgs
+ * sizeof(struct pblk_sec_meta) > PAGE_SIZE) {
+ /* We want to keep all the packed metadata on single
+ * page per write requests. So we need to ensure that
+ * it will fit.
+ *
+ * This is more like sanity check, since there is
+ * no device with such a big minimal write size
+ * (above 1 metabytes).
+ */
+ pblk_err(pblk, "Not supported min write size\n");
+ return -EINVAL;
+ }
+ /* For packed meta approach we do some simplification.
+ * On read path we always issue requests which size
+ * equal to max_write_pgs, with all pages filled with
+ * user payload except of last one page which will be
+ * filled with packed metadata.
+ */
+ pblk->max_write_pgs = pblk->min_write_pgs;
+ pblk->min_write_pgs_data = pblk->min_write_pgs - 1;
+ }
+
pblk->pad_dist = kcalloc(pblk->min_write_pgs - 1, sizeof(atomic64_t),
GFP_KERNEL);
if (!pblk->pad_dist)
@@ -635,40 +661,61 @@ static unsigned int calc_emeta_len(struct pblk *pblk)
return (lm->emeta_len[1] + lm->emeta_len[2] + lm->emeta_len[3]);
}
-static void pblk_set_provision(struct pblk *pblk, long nr_free_blks)
+static int pblk_set_provision(struct pblk *pblk, int nr_free_chks)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line_meta *lm = &pblk->lm;
struct nvm_geo *geo = &dev->geo;
sector_t provisioned;
- int sec_meta, blk_meta;
+ int sec_meta, blk_meta, clba;
+ int minimum;
if (geo->op == NVM_TARGET_DEFAULT_OP)
pblk->op = PBLK_DEFAULT_OP;
else
pblk->op = geo->op;
- provisioned = nr_free_blks;
+ minimum = pblk_get_min_chks(pblk);
+ provisioned = nr_free_chks;
provisioned *= (100 - pblk->op);
sector_div(provisioned, 100);
- pblk->op_blks = nr_free_blks - provisioned;
+ if ((nr_free_chks - provisioned) < minimum) {
+ if (geo->op != NVM_TARGET_DEFAULT_OP) {
+ pblk_err(pblk, "OP too small to create a sane instance\n");
+ return -EINTR;
+ }
+
+ /* If the user did not specify an OP value, and PBLK_DEFAULT_OP
+ * is not enough, calculate and set sane value
+ */
+
+ provisioned = nr_free_chks - minimum;
+ pblk->op = (100 * minimum) / nr_free_chks;
+ pblk_info(pblk, "Default OP insufficient, adjusting OP to %d\n",
+ pblk->op);
+ }
+
+ pblk->op_blks = nr_free_chks - provisioned;
/* Internally pblk manages all free blocks, but all calculations based
* on user capacity consider only provisioned blocks
*/
- pblk->rl.total_blocks = nr_free_blks;
- pblk->rl.nr_secs = nr_free_blks * geo->clba;
+ pblk->rl.total_blocks = nr_free_chks;
+ pblk->rl.nr_secs = nr_free_chks * geo->clba;
/* Consider sectors used for metadata */
sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
blk_meta = DIV_ROUND_UP(sec_meta, geo->clba);
- pblk->capacity = (provisioned - blk_meta) * geo->clba;
+ clba = (geo->clba / pblk->min_write_pgs) * pblk->min_write_pgs_data;
+ pblk->capacity = (provisioned - blk_meta) * clba;
- atomic_set(&pblk->rl.free_blocks, nr_free_blks);
- atomic_set(&pblk->rl.free_user_blocks, nr_free_blks);
+ atomic_set(&pblk->rl.free_blocks, nr_free_chks);
+ atomic_set(&pblk->rl.free_user_blocks, nr_free_chks);
+
+ return 0;
}
static int pblk_setup_line_meta_chk(struct pblk *pblk, struct pblk_line *line,
@@ -984,7 +1031,7 @@ static int pblk_lines_init(struct pblk *pblk)
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line *line;
void *chunk_meta;
- long nr_free_chks = 0;
+ int nr_free_chks = 0;
int i, ret;
ret = pblk_line_meta_init(pblk);
@@ -1031,7 +1078,9 @@ static int pblk_lines_init(struct pblk *pblk)
goto fail_free_lines;
}
- pblk_set_provision(pblk, nr_free_chks);
+ ret = pblk_set_provision(pblk, nr_free_chks);
+ if (ret)
+ goto fail_free_lines;
vfree(chunk_meta);
return 0;
@@ -1041,7 +1090,7 @@ fail_free_lines:
pblk_line_meta_free(l_mg, &pblk->lines[i]);
kfree(pblk->lines);
fail_free_chunk_meta:
- kfree(chunk_meta);
+ vfree(chunk_meta);
fail_free_luns:
kfree(pblk->luns);
fail_free_meta:
@@ -1154,6 +1203,12 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
return ERR_PTR(-EINVAL);
}
+ if (geo->ext) {
+ pblk_err(pblk, "extended metadata not supported\n");
+ kfree(pblk);
+ return ERR_PTR(-EINVAL);
+ }
+
spin_lock_init(&pblk->resubmit_lock);
spin_lock_init(&pblk->trans_lock);
spin_lock_init(&pblk->lock);
diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c
index 6dcbd44e3acb..79df583ea709 100644
--- a/drivers/lightnvm/pblk-map.c
+++ b/drivers/lightnvm/pblk-map.c
@@ -22,7 +22,7 @@
static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
struct ppa_addr *ppa_list,
unsigned long *lun_bitmap,
- struct pblk_sec_meta *meta_list,
+ void *meta_list,
unsigned int valid_secs)
{
struct pblk_line *line = pblk_line_get_data(pblk);
@@ -33,6 +33,9 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
int nr_secs = pblk->min_write_pgs;
int i;
+ if (!line)
+ return -ENOSPC;
+
if (pblk_line_is_full(line)) {
struct pblk_line *prev_line = line;
@@ -42,8 +45,11 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
line = pblk_line_replace_data(pblk);
pblk_line_close_meta(pblk, prev_line);
- if (!line)
- return -EINTR;
+ if (!line) {
+ pblk_pipeline_stop(pblk);
+ return -ENOSPC;
+ }
+
}
emeta = line->emeta;
@@ -52,6 +58,7 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
paddr = pblk_alloc_page(pblk, line, nr_secs);
for (i = 0; i < nr_secs; i++, paddr++) {
+ struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
__le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
/* ppa to be sent to the device */
@@ -68,14 +75,15 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
kref_get(&line->ref);
w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i);
w_ctx->ppa = ppa_list[i];
- meta_list[i].lba = cpu_to_le64(w_ctx->lba);
+ meta->lba = cpu_to_le64(w_ctx->lba);
lba_list[paddr] = cpu_to_le64(w_ctx->lba);
if (lba_list[paddr] != addr_empty)
line->nr_valid_lbas++;
else
atomic64_inc(&pblk->pad_wa);
} else {
- lba_list[paddr] = meta_list[i].lba = addr_empty;
+ lba_list[paddr] = addr_empty;
+ meta->lba = addr_empty;
__pblk_map_invalidate(pblk, line, paddr);
}
}
@@ -84,50 +92,57 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
return 0;
}
-void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
+int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
unsigned long *lun_bitmap, unsigned int valid_secs,
unsigned int off)
{
- struct pblk_sec_meta *meta_list = rqd->meta_list;
+ void *meta_list = pblk_get_meta_for_writes(pblk, rqd);
+ void *meta_buffer;
struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd);
unsigned int map_secs;
int min = pblk->min_write_pgs;
int i;
+ int ret;
for (i = off; i < rqd->nr_ppas; i += min) {
map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
- if (pblk_map_page_data(pblk, sentry + i, &ppa_list[i],
- lun_bitmap, &meta_list[i], map_secs)) {
- bio_put(rqd->bio);
- pblk_free_rqd(pblk, rqd, PBLK_WRITE);
- pblk_pipeline_stop(pblk);
- }
+ meta_buffer = pblk_get_meta(pblk, meta_list, i);
+
+ ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i],
+ lun_bitmap, meta_buffer, map_secs);
+ if (ret)
+ return ret;
}
+
+ return 0;
}
/* only if erase_ppa is set, acquire erase semaphore */
-void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
+int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
unsigned int sentry, unsigned long *lun_bitmap,
unsigned int valid_secs, struct ppa_addr *erase_ppa)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct pblk_line_meta *lm = &pblk->lm;
- struct pblk_sec_meta *meta_list = rqd->meta_list;
+ void *meta_list = pblk_get_meta_for_writes(pblk, rqd);
+ void *meta_buffer;
struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd);
struct pblk_line *e_line, *d_line;
unsigned int map_secs;
int min = pblk->min_write_pgs;
int i, erase_lun;
+ int ret;
+
for (i = 0; i < rqd->nr_ppas; i += min) {
map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
- if (pblk_map_page_data(pblk, sentry + i, &ppa_list[i],
- lun_bitmap, &meta_list[i], map_secs)) {
- bio_put(rqd->bio);
- pblk_free_rqd(pblk, rqd, PBLK_WRITE);
- pblk_pipeline_stop(pblk);
- }
+ meta_buffer = pblk_get_meta(pblk, meta_list, i);
+
+ ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i],
+ lun_bitmap, meta_buffer, map_secs);
+ if (ret)
+ return ret;
erase_lun = pblk_ppa_to_pos(geo, ppa_list[i]);
@@ -163,7 +178,7 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
*/
e_line = pblk_line_get_erase(pblk);
if (!e_line)
- return;
+ return -ENOSPC;
/* Erase blocks that are bad in this line but might not be in next */
if (unlikely(pblk_ppa_empty(*erase_ppa)) &&
@@ -174,7 +189,7 @@ retry:
bit = find_next_bit(d_line->blk_bitmap,
lm->blk_per_line, bit + 1);
if (bit >= lm->blk_per_line)
- return;
+ return 0;
spin_lock(&e_line->lock);
if (test_bit(bit, e_line->erase_bitmap)) {
@@ -188,4 +203,6 @@ retry:
*erase_ppa = pblk->luns[bit].bppa; /* set ch and lun */
erase_ppa->a.blk = e_line->id;
}
+
+ return 0;
}
diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
index b1f4b51783f4..d4ca8c64ee0f 100644
--- a/drivers/lightnvm/pblk-rb.c
+++ b/drivers/lightnvm/pblk-rb.c
@@ -147,7 +147,7 @@ int pblk_rb_init(struct pblk_rb *rb, unsigned int size, unsigned int threshold,
/*
* Initialize rate-limiter, which controls access to the write buffer
- * but user and GC I/O
+ * by user and GC I/O
*/
pblk_rl_init(&pblk->rl, rb->nr_entries);
@@ -552,6 +552,9 @@ unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd,
to_read = count;
}
+ /* Add space for packed metadata if in use*/
+ pad += (pblk->min_write_pgs - pblk->min_write_pgs_data);
+
c_ctx->sentry = pos;
c_ctx->nr_valid = to_read;
c_ctx->nr_padded = pad;
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index 9fba614adeeb..3789185144da 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -43,7 +43,7 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
struct bio *bio, sector_t blba,
unsigned long *read_bitmap)
{
- struct pblk_sec_meta *meta_list = rqd->meta_list;
+ void *meta_list = rqd->meta_list;
struct ppa_addr ppas[NVM_MAX_VLBA];
int nr_secs = rqd->nr_ppas;
bool advanced_bio = false;
@@ -53,12 +53,15 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
for (i = 0; i < nr_secs; i++) {
struct ppa_addr p = ppas[i];
+ struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
sector_t lba = blba + i;
retry:
if (pblk_ppa_empty(p)) {
+ __le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
+
WARN_ON(test_and_set_bit(i, read_bitmap));
- meta_list[i].lba = cpu_to_le64(ADDR_EMPTY);
+ meta->lba = addr_empty;
if (unlikely(!advanced_bio)) {
bio_advance(bio, (i) * PBLK_EXPOSED_PAGE_SIZE);
@@ -78,7 +81,7 @@ retry:
goto retry;
}
WARN_ON(test_and_set_bit(i, read_bitmap));
- meta_list[i].lba = cpu_to_le64(lba);
+ meta->lba = cpu_to_le64(lba);
advanced_bio = true;
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_inc(&pblk->cache_reads);
@@ -105,12 +108,16 @@ next:
static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd,
sector_t blba)
{
- struct pblk_sec_meta *meta_lba_list = rqd->meta_list;
+ void *meta_list = rqd->meta_list;
int nr_lbas = rqd->nr_ppas;
int i;
+ if (!pblk_is_oob_meta_supported(pblk))
+ return;
+
for (i = 0; i < nr_lbas; i++) {
- u64 lba = le64_to_cpu(meta_lba_list[i].lba);
+ struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
+ u64 lba = le64_to_cpu(meta->lba);
if (lba == ADDR_EMPTY)
continue;
@@ -134,17 +141,22 @@ static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd,
static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd,
u64 *lba_list, int nr_lbas)
{
- struct pblk_sec_meta *meta_lba_list = rqd->meta_list;
+ void *meta_lba_list = rqd->meta_list;
int i, j;
+ if (!pblk_is_oob_meta_supported(pblk))
+ return;
+
for (i = 0, j = 0; i < nr_lbas; i++) {
+ struct pblk_sec_meta *meta = pblk_get_meta(pblk,
+ meta_lba_list, j);
u64 lba = lba_list[i];
u64 meta_lba;
if (lba == ADDR_EMPTY)
continue;
- meta_lba = le64_to_cpu(meta_lba_list[j].lba);
+ meta_lba = le64_to_cpu(meta->lba);
if (lba != meta_lba) {
#ifdef CONFIG_NVM_PBLK_DEBUG
@@ -216,15 +228,15 @@ static void pblk_end_partial_read(struct nvm_rq *rqd)
struct pblk *pblk = rqd->private;
struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
struct pblk_pr_ctx *pr_ctx = r_ctx->private;
+ struct pblk_sec_meta *meta;
struct bio *new_bio = rqd->bio;
struct bio *bio = pr_ctx->orig_bio;
struct bio_vec src_bv, dst_bv;
- struct pblk_sec_meta *meta_list = rqd->meta_list;
+ void *meta_list = rqd->meta_list;
int bio_init_idx = pr_ctx->bio_init_idx;
unsigned long *read_bitmap = pr_ctx->bitmap;
int nr_secs = pr_ctx->orig_nr_secs;
int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
- __le64 *lba_list_mem, *lba_list_media;
void *src_p, *dst_p;
int hole, i;
@@ -237,13 +249,10 @@ static void pblk_end_partial_read(struct nvm_rq *rqd)
rqd->ppa_list[0] = ppa;
}
- /* Re-use allocated memory for intermediate lbas */
- lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size);
- lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size);
-
for (i = 0; i < nr_secs; i++) {
- lba_list_media[i] = meta_list[i].lba;
- meta_list[i].lba = lba_list_mem[i];
+ meta = pblk_get_meta(pblk, meta_list, i);
+ pr_ctx->lba_list_media[i] = le64_to_cpu(meta->lba);
+ meta->lba = cpu_to_le64(pr_ctx->lba_list_mem[i]);
}
/* Fill the holes in the original bio */
@@ -255,7 +264,8 @@ static void pblk_end_partial_read(struct nvm_rq *rqd)
line = pblk_ppa_to_line(pblk, rqd->ppa_list[i]);
kref_put(&line->ref, pblk_line_put);
- meta_list[hole].lba = lba_list_media[i];
+ meta = pblk_get_meta(pblk, meta_list, hole);
+ meta->lba = cpu_to_le64(pr_ctx->lba_list_media[i]);
src_bv = new_bio->bi_io_vec[i++];
dst_bv = bio->bi_io_vec[bio_init_idx + hole];
@@ -291,17 +301,13 @@ static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
unsigned long *read_bitmap,
int nr_holes)
{
- struct pblk_sec_meta *meta_list = rqd->meta_list;
+ void *meta_list = rqd->meta_list;
struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
struct pblk_pr_ctx *pr_ctx;
struct bio *new_bio, *bio = r_ctx->private;
- __le64 *lba_list_mem;
int nr_secs = rqd->nr_ppas;
int i;
- /* Re-use allocated memory for intermediate lbas */
- lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size);
-
new_bio = bio_alloc(GFP_KERNEL, nr_holes);
if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
@@ -312,12 +318,15 @@ static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
goto fail_free_pages;
}
- pr_ctx = kmalloc(sizeof(struct pblk_pr_ctx), GFP_KERNEL);
+ pr_ctx = kzalloc(sizeof(struct pblk_pr_ctx), GFP_KERNEL);
if (!pr_ctx)
goto fail_free_pages;
- for (i = 0; i < nr_secs; i++)
- lba_list_mem[i] = meta_list[i].lba;
+ for (i = 0; i < nr_secs; i++) {
+ struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
+
+ pr_ctx->lba_list_mem[i] = le64_to_cpu(meta->lba);
+ }
new_bio->bi_iter.bi_sector = 0; /* internal bio */
bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
@@ -325,7 +334,6 @@ static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
rqd->bio = new_bio;
rqd->nr_ppas = nr_holes;
- pr_ctx->ppa_ptr = NULL;
pr_ctx->orig_bio = bio;
bitmap_copy(pr_ctx->bitmap, read_bitmap, NVM_MAX_VLBA);
pr_ctx->bio_init_idx = bio_init_idx;
@@ -383,7 +391,7 @@ err:
static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio,
sector_t lba, unsigned long *read_bitmap)
{
- struct pblk_sec_meta *meta_list = rqd->meta_list;
+ struct pblk_sec_meta *meta = pblk_get_meta(pblk, rqd->meta_list, 0);
struct ppa_addr ppa;
pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
@@ -394,8 +402,10 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio,
retry:
if (pblk_ppa_empty(ppa)) {
+ __le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
+
WARN_ON(test_and_set_bit(0, read_bitmap));
- meta_list[0].lba = cpu_to_le64(ADDR_EMPTY);
+ meta->lba = addr_empty;
return;
}
@@ -409,7 +419,7 @@ retry:
}
WARN_ON(test_and_set_bit(0, read_bitmap));
- meta_list[0].lba = cpu_to_le64(lba);
+ meta->lba = cpu_to_le64(lba);
#ifdef CONFIG_NVM_PBLK_DEBUG
atomic_long_inc(&pblk->cache_reads);
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index 5740b7509bd8..3fcf062d752c 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -13,6 +13,9 @@
* General Public License for more details.
*
* pblk-recovery.c - pblk's recovery path
+ *
+ * The L2P recovery path is single threaded as the L2P table is updated in order
+ * following the line sequence ID.
*/
#include "pblk.h"
@@ -124,7 +127,7 @@ static u64 pblk_sec_in_open_line(struct pblk *pblk, struct pblk_line *line)
struct pblk_recov_alloc {
struct ppa_addr *ppa_list;
- struct pblk_sec_meta *meta_list;
+ void *meta_list;
struct nvm_rq *rqd;
void *data;
dma_addr_t dma_ppa_list;
@@ -158,7 +161,7 @@ static int pblk_recov_pad_line(struct pblk *pblk, struct pblk_line *line,
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
- struct pblk_sec_meta *meta_list;
+ void *meta_list;
struct pblk_pad_rq *pad_rq;
struct nvm_rq *rqd;
struct bio *bio;
@@ -188,7 +191,7 @@ static int pblk_recov_pad_line(struct pblk *pblk, struct pblk_line *line,
kref_init(&pad_rq->ref);
next_pad_rq:
- rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+ rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
if (rq_ppas < pblk->min_write_pgs) {
pblk_err(pblk, "corrupted pad line %d\n", line->id);
goto fail_free_pad;
@@ -237,12 +240,15 @@ next_pad_rq:
for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) {
struct ppa_addr dev_ppa;
+ struct pblk_sec_meta *meta;
__le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
dev_ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
pblk_map_invalidate(pblk, dev_ppa);
- lba_list[w_ptr] = meta_list[i].lba = addr_empty;
+ lba_list[w_ptr] = addr_empty;
+ meta = pblk_get_meta(pblk, meta_list, i);
+ meta->lba = addr_empty;
rqd->ppa_list[i] = dev_ppa;
}
}
@@ -334,20 +340,21 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
struct pblk_recov_alloc p)
{
struct nvm_tgt_dev *dev = pblk->dev;
+ struct pblk_line_meta *lm = &pblk->lm;
struct nvm_geo *geo = &dev->geo;
struct ppa_addr *ppa_list;
- struct pblk_sec_meta *meta_list;
+ void *meta_list;
struct nvm_rq *rqd;
struct bio *bio;
void *data;
dma_addr_t dma_ppa_list, dma_meta_list;
__le64 *lba_list;
- u64 paddr = 0;
+ u64 paddr = pblk_line_smeta_start(pblk, line) + lm->smeta_sec;
bool padded = false;
int rq_ppas, rq_len;
int i, j;
int ret;
- u64 left_ppas = pblk_sec_in_open_line(pblk, line);
+ u64 left_ppas = pblk_sec_in_open_line(pblk, line) - lm->smeta_sec;
if (pblk_line_wp_is_unbalanced(pblk, line))
pblk_warn(pblk, "recovering unbalanced line (%d)\n", line->id);
@@ -364,17 +371,19 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
next_rq:
memset(rqd, 0, pblk_g_rq_size);
- rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+ rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
if (!rq_ppas)
rq_ppas = pblk->min_write_pgs;
rq_len = rq_ppas * geo->csecs;
+retry_rq:
bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
if (IS_ERR(bio))
return PTR_ERR(bio);
bio->bi_iter.bi_sector = 0; /* internal bio */
bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ bio_get(bio);
rqd->bio = bio;
rqd->opcode = NVM_OP_PREAD;
@@ -387,7 +396,6 @@ next_rq:
if (pblk_io_aligned(pblk, rq_ppas))
rqd->is_seq = 1;
-retry_rq:
for (i = 0; i < rqd->nr_ppas; ) {
struct ppa_addr ppa;
int pos;
@@ -410,6 +418,7 @@ retry_rq:
if (ret) {
pblk_err(pblk, "I/O submission failed: %d\n", ret);
bio_put(bio);
+ bio_put(bio);
return ret;
}
@@ -421,20 +430,28 @@ retry_rq:
if (padded) {
pblk_log_read_err(pblk, rqd);
+ bio_put(bio);
return -EINTR;
}
pad_distance = pblk_pad_distance(pblk, line);
ret = pblk_recov_pad_line(pblk, line, pad_distance);
- if (ret)
+ if (ret) {
+ bio_put(bio);
return ret;
+ }
padded = true;
+ bio_put(bio);
goto retry_rq;
}
+ pblk_get_packed_meta(pblk, rqd);
+ bio_put(bio);
+
for (i = 0; i < rqd->nr_ppas; i++) {
- u64 lba = le64_to_cpu(meta_list[i].lba);
+ struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
+ u64 lba = le64_to_cpu(meta->lba);
lba_list[paddr++] = cpu_to_le64(lba);
@@ -463,7 +480,7 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line)
struct nvm_geo *geo = &dev->geo;
struct nvm_rq *rqd;
struct ppa_addr *ppa_list;
- struct pblk_sec_meta *meta_list;
+ void *meta_list;
struct pblk_recov_alloc p;
void *data;
dma_addr_t dma_ppa_list, dma_meta_list;
@@ -473,8 +490,8 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line)
if (!meta_list)
return -ENOMEM;
- ppa_list = (void *)(meta_list) + pblk_dma_meta_size;
- dma_ppa_list = dma_meta_list + pblk_dma_meta_size;
+ ppa_list = (void *)(meta_list) + pblk_dma_meta_size(pblk);
+ dma_ppa_list = dma_meta_list + pblk_dma_meta_size(pblk);
data = kcalloc(pblk->max_write_pgs, geo->csecs, GFP_KERNEL);
if (!data) {
@@ -804,7 +821,6 @@ next:
WARN_ON_ONCE(!test_and_clear_bit(meta_line,
&l_mg->meta_bitmap));
spin_unlock(&l_mg->free_lock);
- pblk_line_replace_data(pblk);
} else {
spin_lock(&l_mg->free_lock);
/* Allocate next line for preparation */
diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c
index db55a1c89997..76116d5f78e4 100644
--- a/drivers/lightnvm/pblk-rl.c
+++ b/drivers/lightnvm/pblk-rl.c
@@ -214,11 +214,10 @@ void pblk_rl_init(struct pblk_rl *rl, int budget)
struct nvm_geo *geo = &dev->geo;
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line_meta *lm = &pblk->lm;
- int min_blocks = lm->blk_per_line * PBLK_GC_RSV_LINE;
int sec_meta, blk_meta;
-
unsigned int rb_windows;
+
/* Consider sectors used for metadata */
sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
blk_meta = DIV_ROUND_UP(sec_meta, geo->clba);
@@ -226,7 +225,7 @@ void pblk_rl_init(struct pblk_rl *rl, int budget)
rl->high = pblk->op_blks - blk_meta - lm->blk_per_line;
rl->high_pw = get_count_order(rl->high);
- rl->rsv_blocks = min_blocks;
+ rl->rsv_blocks = pblk_get_min_chks(pblk);
/* This will always be a power-of-2 */
rb_windows = budget / NVM_MAX_VLBA;
diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c
index 2d2818155aa8..7d8958df9472 100644
--- a/drivers/lightnvm/pblk-sysfs.c
+++ b/drivers/lightnvm/pblk-sysfs.c
@@ -479,6 +479,13 @@ static ssize_t pblk_sysfs_set_sec_per_write(struct pblk *pblk,
if (kstrtouint(page, 0, &sec_per_write))
return -EINVAL;
+ if (!pblk_is_oob_meta_supported(pblk)) {
+ /* For packed metadata case it is
+ * not allowed to change sec_per_write.
+ */
+ return -EINVAL;
+ }
+
if (sec_per_write < pblk->min_write_pgs
|| sec_per_write > pblk->max_write_pgs
|| sec_per_write % pblk->min_write_pgs != 0)
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index fa8726493b39..06d56deb645d 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -105,14 +105,20 @@ retry:
}
/* Map remaining sectors in chunk, starting from ppa */
-static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa)
+static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa,
+ int rqd_ppas)
{
struct pblk_line *line;
struct ppa_addr map_ppa = *ppa;
+ __le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
+ __le64 *lba_list;
u64 paddr;
int done = 0;
+ int n = 0;
line = pblk_ppa_to_line(pblk, *ppa);
+ lba_list = emeta_to_lbas(pblk, line->emeta->buf);
+
spin_lock(&line->lock);
while (!done) {
@@ -121,10 +127,17 @@ static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa)
if (!test_and_set_bit(paddr, line->map_bitmap))
line->left_msecs--;
+ if (n < rqd_ppas && lba_list[paddr] != addr_empty)
+ line->nr_valid_lbas--;
+
+ lba_list[paddr] = addr_empty;
+
if (!test_and_set_bit(paddr, line->invalid_bitmap))
le32_add_cpu(line->vsc, -1);
done = nvm_next_ppa_in_chk(pblk->dev, &map_ppa);
+
+ n++;
}
line->w_err_gc->has_write_err = 1;
@@ -148,9 +161,11 @@ static void pblk_prepare_resubmit(struct pblk *pblk, unsigned int sentry,
w_ctx = &entry->w_ctx;
/* Check if the lba has been overwritten */
- ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba);
- if (!pblk_ppa_comp(ppa_l2p, entry->cacheline))
- w_ctx->lba = ADDR_EMPTY;
+ if (w_ctx->lba != ADDR_EMPTY) {
+ ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba);
+ if (!pblk_ppa_comp(ppa_l2p, entry->cacheline))
+ w_ctx->lba = ADDR_EMPTY;
+ }
/* Mark up the entry as submittable again */
flags = READ_ONCE(w_ctx->flags);
@@ -200,7 +215,7 @@ static void pblk_submit_rec(struct work_struct *work)
pblk_log_write_err(pblk, rqd);
- pblk_map_remaining(pblk, ppa_list);
+ pblk_map_remaining(pblk, ppa_list, rqd->nr_ppas);
pblk_queue_resubmit(pblk, c_ctx);
pblk_up_rq(pblk, c_ctx->lun_bitmap);
@@ -319,12 +334,13 @@ static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
}
if (likely(!e_line || !atomic_read(&e_line->left_eblks)))
- pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, valid, 0);
+ ret = pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap,
+ valid, 0);
else
- pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap,
+ ret = pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap,
valid, erase_ppa);
- return 0;
+ return ret;
}
static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
@@ -332,7 +348,7 @@ static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
{
int secs_to_sync;
- secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush);
+ secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush, true);
#ifdef CONFIG_NVM_PBLK_DEBUG
if ((!secs_to_sync && secs_to_flush)
@@ -548,15 +564,17 @@ static void pblk_free_write_rqd(struct pblk *pblk, struct nvm_rq *rqd)
c_ctx->nr_padded);
}
-static int pblk_submit_write(struct pblk *pblk)
+static int pblk_submit_write(struct pblk *pblk, int *secs_left)
{
struct bio *bio;
struct nvm_rq *rqd;
unsigned int secs_avail, secs_to_sync, secs_to_com;
- unsigned int secs_to_flush;
+ unsigned int secs_to_flush, packed_meta_pgs;
unsigned long pos;
unsigned int resubmit;
+ *secs_left = 0;
+
spin_lock(&pblk->resubmit_lock);
resubmit = !list_empty(&pblk->resubmit_list);
spin_unlock(&pblk->resubmit_lock);
@@ -586,17 +604,17 @@ static int pblk_submit_write(struct pblk *pblk)
*/
secs_avail = pblk_rb_read_count(&pblk->rwb);
if (!secs_avail)
- return 1;
+ return 0;
secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb);
- if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
- return 1;
+ if (!secs_to_flush && secs_avail < pblk->min_write_pgs_data)
+ return 0;
secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail,
secs_to_flush);
if (secs_to_sync > pblk->max_write_pgs) {
pblk_err(pblk, "bad buffer sync calculation\n");
- return 1;
+ return 0;
}
secs_to_com = (secs_to_sync > secs_avail) ?
@@ -604,7 +622,8 @@ static int pblk_submit_write(struct pblk *pblk)
pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
}
- bio = bio_alloc(GFP_KERNEL, secs_to_sync);
+ packed_meta_pgs = (pblk->min_write_pgs - pblk->min_write_pgs_data);
+ bio = bio_alloc(GFP_KERNEL, secs_to_sync + packed_meta_pgs);
bio->bi_iter.bi_sector = 0; /* internal bio */
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
@@ -625,6 +644,7 @@ static int pblk_submit_write(struct pblk *pblk)
atomic_long_add(secs_to_sync, &pblk->sub_writes);
#endif
+ *secs_left = 1;
return 0;
fail_free_bio:
@@ -633,16 +653,22 @@ fail_put_bio:
bio_put(bio);
pblk_free_rqd(pblk, rqd, PBLK_WRITE);
- return 1;
+ return -EINTR;
}
int pblk_write_ts(void *data)
{
struct pblk *pblk = data;
+ int secs_left;
+ int write_failure = 0;
while (!kthread_should_stop()) {
- if (!pblk_submit_write(pblk))
- continue;
+ if (!write_failure) {
+ write_failure = pblk_submit_write(pblk, &secs_left);
+
+ if (secs_left)
+ continue;
+ }
set_current_state(TASK_INTERRUPTIBLE);
io_schedule();
}
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 02bb2e98f8a9..85e38ed62f85 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -104,7 +104,6 @@ enum {
PBLK_RL_LOW = 4
};
-#define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * NVM_MAX_VLBA)
#define pblk_dma_ppa_size (sizeof(u64) * NVM_MAX_VLBA)
/* write buffer completion context */
@@ -132,6 +131,8 @@ struct pblk_pr_ctx {
unsigned int bio_init_idx;
void *ppa_ptr;
dma_addr_t dma_ppa_list;
+ __le64 lba_list_mem[NVM_MAX_VLBA];
+ __le64 lba_list_media[NVM_MAX_VLBA];
};
/* Pad context */
@@ -631,7 +632,9 @@ struct pblk {
int state; /* pblk line state */
int min_write_pgs; /* Minimum amount of pages required by controller */
+ int min_write_pgs_data; /* Minimum amount of payload pages */
int max_write_pgs; /* Maximum amount of pages supported by controller */
+ int oob_meta_size; /* Size of OOB sector metadata */
sector_t capacity; /* Device capacity when bad blocks are subtracted */
@@ -836,7 +839,7 @@ void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs);
u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs);
u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs);
int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
- unsigned long secs_to_flush);
+ unsigned long secs_to_flush, bool skip_meta);
void pblk_down_rq(struct pblk *pblk, struct ppa_addr ppa,
unsigned long *lun_bitmap);
void pblk_down_chunk(struct pblk *pblk, struct ppa_addr ppa);
@@ -860,6 +863,8 @@ void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
u64 *lba_list, int nr_secs);
void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
sector_t blba, int nr_secs);
+void *pblk_get_meta_for_writes(struct pblk *pblk, struct nvm_rq *rqd);
+void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd);
/*
* pblk user I/O write path
@@ -871,10 +876,10 @@ int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
/*
* pblk map
*/
-void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
+int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
unsigned int sentry, unsigned long *lun_bitmap,
unsigned int valid_secs, struct ppa_addr *erase_ppa);
-void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
+int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
unsigned long *lun_bitmap, unsigned int valid_secs,
unsigned int off);
@@ -905,7 +910,6 @@ int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta);
#define PBLK_GC_MAX_READERS 8 /* Max number of outstanding GC reader jobs */
#define PBLK_GC_RQ_QD 128 /* Queue depth for inflight GC requests */
#define PBLK_GC_L_QD 4 /* Queue depth for inflight GC lines */
-#define PBLK_GC_RSV_LINE 1 /* Reserved lines for GC */
int pblk_gc_init(struct pblk *pblk);
void pblk_gc_exit(struct pblk *pblk, bool graceful);
@@ -1370,4 +1374,33 @@ static inline char *pblk_disk_name(struct pblk *pblk)
return disk->disk_name;
}
+
+static inline unsigned int pblk_get_min_chks(struct pblk *pblk)
+{
+ struct pblk_line_meta *lm = &pblk->lm;
+ /* In a worst-case scenario every line will have OP invalid sectors.
+ * We will then need a minimum of 1/OP lines to free up a single line
+ */
+
+ return DIV_ROUND_UP(100, pblk->op) * lm->blk_per_line;
+}
+
+static inline struct pblk_sec_meta *pblk_get_meta(struct pblk *pblk,
+ void *meta, int index)
+{
+ return meta +
+ max_t(int, sizeof(struct pblk_sec_meta), pblk->oob_meta_size)
+ * index;
+}
+
+static inline int pblk_dma_meta_size(struct pblk *pblk)
+{
+ return max_t(int, sizeof(struct pblk_sec_meta), pblk->oob_meta_size)
+ * NVM_MAX_VLBA;
+}
+
+static inline int pblk_is_oob_meta_supported(struct pblk *pblk)
+{
+ return pblk->oob_meta_size >= sizeof(struct pblk_sec_meta);
+}
#endif /* PBLK_H_ */
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index b61b83bbcfff..fdf75352e16a 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -627,6 +627,20 @@ struct cache_set {
struct bkey gc_done;
/*
+ * For automatical garbage collection after writeback completed, this
+ * varialbe is used as bit fields,
+ * - 0000 0001b (BCH_ENABLE_AUTO_GC): enable gc after writeback
+ * - 0000 0010b (BCH_DO_AUTO_GC): do gc after writeback
+ * This is an optimization for following write request after writeback
+ * finished, but read hit rate dropped due to clean data on cache is
+ * discarded. Unless user explicitly sets it via sysfs, it won't be
+ * enabled.
+ */
+#define BCH_ENABLE_AUTO_GC 1
+#define BCH_DO_AUTO_GC 2
+ uint8_t gc_after_writeback;
+
+ /*
* The allocation code needs gc_mark in struct bucket to be correct, but
* it's not while a gc is in progress. Protected by bucket_lock.
*/
@@ -658,7 +672,11 @@ struct cache_set {
/*
* A btree node on disk could have too many bsets for an iterator to fit
- * on the stack - have to dynamically allocate them
+ * on the stack - have to dynamically allocate them.
+ * bch_cache_set_alloc() will make sure the pool can allocate iterators
+ * equipped with enough room that can host
+ * (sb.bucket_size / sb.block_size)
+ * btree_iter_sets, which is more than static MAX_BSETS.
*/
mempool_t fill_iter;
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 3f4211b5cd33..23cb1dc7296b 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -207,6 +207,11 @@ void bch_btree_node_read_done(struct btree *b)
struct bset *i = btree_bset_first(b);
struct btree_iter *iter;
+ /*
+ * c->fill_iter can allocate an iterator with more memory space
+ * than static MAX_BSETS.
+ * See the comment arount cache_set->fill_iter.
+ */
iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
iter->used = 0;
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index a68d6c55783b..d1c72ef64edf 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -266,6 +266,24 @@ static inline void wake_up_gc(struct cache_set *c)
wake_up(&c->gc_wait);
}
+static inline void force_wake_up_gc(struct cache_set *c)
+{
+ /*
+ * Garbage collection thread only works when sectors_to_gc < 0,
+ * calling wake_up_gc() won't start gc thread if sectors_to_gc is
+ * not a nagetive value.
+ * Therefore sectors_to_gc is set to -1 here, before waking up
+ * gc thread by calling wake_up_gc(). Then gc_should_run() will
+ * give a chance to permit gc thread to run. "Give a chance" means
+ * before going into gc_should_run(), there is still possibility
+ * that c->sectors_to_gc being set to other positive value. So
+ * this routine won't 100% make sure gc thread will be woken up
+ * to run.
+ */
+ atomic_set(&c->sectors_to_gc, -1);
+ wake_up_gc(c);
+}
+
#define MAP_DONE 0
#define MAP_CONTINUE 1
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 8f448b9c96a1..8b123be05254 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -249,8 +249,7 @@ void bch_debug_init_cache_set(struct cache_set *c)
void bch_debug_exit(void)
{
- if (!IS_ERR_OR_NULL(bcache_debug))
- debugfs_remove_recursive(bcache_debug);
+ debugfs_remove_recursive(bcache_debug);
}
void __init bch_debug_init(void)
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 522c7426f3a0..b2fd412715b1 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -663,7 +663,7 @@ static void journal_write_unlocked(struct closure *cl)
REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
bch_bio_map(bio, w->data);
- trace_bcache_journal_write(bio);
+ trace_bcache_journal_write(bio, w->data->keys);
bio_list_add(&list, bio);
SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 3bf35914bb57..15070412a32e 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -311,11 +311,11 @@ err:
* data is written it calls bch_journal, and after the keys have been added to
* the next journal write they're inserted into the btree.
*
- * It inserts the data in s->cache_bio; bi_sector is used for the key offset,
+ * It inserts the data in op->bio; bi_sector is used for the key offset,
* and op->inode is used for the key inode.
*
- * If s->bypass is true, instead of inserting the data it invalidates the
- * region of the cache represented by s->cache_bio and op->inode.
+ * If op->bypass is true, instead of inserting the data it invalidates the
+ * region of the cache represented by op->bio and op->inode.
*/
void bch_data_insert(struct closure *cl)
{
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 7bbd670a5a84..4dee119c3664 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -25,8 +25,8 @@
#include <linux/reboot.h>
#include <linux/sysfs.h>
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+unsigned int bch_cutoff_writeback;
+unsigned int bch_cutoff_writeback_sync;
static const char bcache_magic[] = {
0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
@@ -1510,8 +1510,7 @@ static void cache_set_free(struct closure *cl)
struct cache *ca;
unsigned int i;
- if (!IS_ERR_OR_NULL(c->debug))
- debugfs_remove(c->debug);
+ debugfs_remove(c->debug);
bch_open_buckets_free(c);
bch_btree_cache_free(c);
@@ -2424,6 +2423,32 @@ static void bcache_exit(void)
mutex_destroy(&bch_register_lock);
}
+/* Check and fixup module parameters */
+static void check_module_parameters(void)
+{
+ if (bch_cutoff_writeback_sync == 0)
+ bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC;
+ else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) {
+ pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u",
+ bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX);
+ bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX;
+ }
+
+ if (bch_cutoff_writeback == 0)
+ bch_cutoff_writeback = CUTOFF_WRITEBACK;
+ else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) {
+ pr_warn("set bch_cutoff_writeback (%u) to max value %u",
+ bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX);
+ bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX;
+ }
+
+ if (bch_cutoff_writeback > bch_cutoff_writeback_sync) {
+ pr_warn("set bch_cutoff_writeback (%u) to %u",
+ bch_cutoff_writeback, bch_cutoff_writeback_sync);
+ bch_cutoff_writeback = bch_cutoff_writeback_sync;
+ }
+}
+
static int __init bcache_init(void)
{
static const struct attribute *files[] = {
@@ -2432,6 +2457,8 @@ static int __init bcache_init(void)
NULL
};
+ check_module_parameters();
+
mutex_init(&bch_register_lock);
init_waitqueue_head(&unregister_wait);
register_reboot_notifier(&reboot);
@@ -2468,5 +2495,18 @@ err:
return -ENOMEM;
}
+/*
+ * Module hooks
+ */
module_exit(bcache_exit);
module_init(bcache_init);
+
+module_param(bch_cutoff_writeback, uint, 0);
+MODULE_PARM_DESC(bch_cutoff_writeback, "threshold to cutoff writeback");
+
+module_param(bch_cutoff_writeback_sync, uint, 0);
+MODULE_PARM_DESC(bch_cutoff_writeback_sync, "hard threshold to cutoff writeback");
+
+MODULE_DESCRIPTION("Bcache: a Linux block layer cache");
+MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 26f035a0c5b9..557a8a3270a1 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -16,7 +16,7 @@
#include <linux/sort.h>
#include <linux/sched/clock.h>
-/* Default is -1; we skip past it for struct cached_dev's cache mode */
+/* Default is 0 ("writethrough") */
static const char * const bch_cache_modes[] = {
"writethrough",
"writeback",
@@ -25,7 +25,7 @@ static const char * const bch_cache_modes[] = {
NULL
};
-/* Default is -1; we skip past it for stop_when_cache_set_failed */
+/* Default is 0 ("auto") */
static const char * const bch_stop_on_failure_modes[] = {
"auto",
"always",
@@ -88,6 +88,8 @@ read_attribute(writeback_keys_done);
read_attribute(writeback_keys_failed);
read_attribute(io_errors);
read_attribute(congested);
+read_attribute(cutoff_writeback);
+read_attribute(cutoff_writeback_sync);
rw_attribute(congested_read_threshold_us);
rw_attribute(congested_write_threshold_us);
@@ -128,6 +130,7 @@ rw_attribute(expensive_debug_checks);
rw_attribute(cache_replacement_policy);
rw_attribute(btree_shrinker_disabled);
rw_attribute(copy_gc_enabled);
+rw_attribute(gc_after_writeback);
rw_attribute(size);
static ssize_t bch_snprint_string_list(char *buf,
@@ -264,7 +267,8 @@ STORE(__cached_dev)
d_strtoul(writeback_running);
d_strtoul(writeback_delay);
- sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
+ sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent,
+ 0, bch_cutoff_writeback);
if (attr == &sysfs_writeback_rate) {
ssize_t ret;
@@ -384,8 +388,25 @@ STORE(bch_cached_dev)
mutex_lock(&bch_register_lock);
size = __cached_dev_store(kobj, attr, buf, size);
- if (attr == &sysfs_writeback_running)
- bch_writeback_queue(dc);
+ if (attr == &sysfs_writeback_running) {
+ /* dc->writeback_running changed in __cached_dev_store() */
+ if (IS_ERR_OR_NULL(dc->writeback_thread)) {
+ /*
+ * reject setting it to 1 via sysfs if writeback
+ * kthread is not created yet.
+ */
+ if (dc->writeback_running) {
+ dc->writeback_running = false;
+ pr_err("%s: failed to run non-existent writeback thread",
+ dc->disk.disk->disk_name);
+ }
+ } else
+ /*
+ * writeback kthread will check if dc->writeback_running
+ * is true or false.
+ */
+ bch_writeback_queue(dc);
+ }
if (attr == &sysfs_writeback_percent)
if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
@@ -668,6 +689,9 @@ SHOW(__bch_cache_set)
sysfs_print(congested_write_threshold_us,
c->congested_write_threshold_us);
+ sysfs_print(cutoff_writeback, bch_cutoff_writeback);
+ sysfs_print(cutoff_writeback_sync, bch_cutoff_writeback_sync);
+
sysfs_print(active_journal_entries, fifo_used(&c->journal.pin));
sysfs_printf(verify, "%i", c->verify);
sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled);
@@ -676,6 +700,7 @@ SHOW(__bch_cache_set)
sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite);
sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled);
sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
+ sysfs_printf(gc_after_writeback, "%i", c->gc_after_writeback);
sysfs_printf(io_disable, "%i",
test_bit(CACHE_SET_IO_DISABLE, &c->flags));
@@ -725,21 +750,8 @@ STORE(__bch_cache_set)
bch_cache_accounting_clear(&c->accounting);
}
- if (attr == &sysfs_trigger_gc) {
- /*
- * Garbage collection thread only works when sectors_to_gc < 0,
- * when users write to sysfs entry trigger_gc, most of time
- * they want to forcibly triger gargage collection. Here -1 is
- * set to c->sectors_to_gc, to make gc_should_run() give a
- * chance to permit gc thread to run. "give a chance" means
- * before going into gc_should_run(), there is still chance
- * that c->sectors_to_gc being set to other positive value. So
- * writing sysfs entry trigger_gc won't always make sure gc
- * thread takes effect.
- */
- atomic_set(&c->sectors_to_gc, -1);
- wake_up_gc(c);
- }
+ if (attr == &sysfs_trigger_gc)
+ force_wake_up_gc(c);
if (attr == &sysfs_prune_cache) {
struct shrink_control sc;
@@ -789,6 +801,12 @@ STORE(__bch_cache_set)
sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite);
sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled);
sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled);
+ /*
+ * write gc_after_writeback here may overwrite an already set
+ * BCH_DO_AUTO_GC, it doesn't matter because this flag will be
+ * set in next chance.
+ */
+ sysfs_strtoul_clamp(gc_after_writeback, c->gc_after_writeback, 0, 1);
return size;
}
@@ -869,7 +887,10 @@ static struct attribute *bch_cache_set_internal_files[] = {
&sysfs_gc_always_rewrite,
&sysfs_btree_shrinker_disabled,
&sysfs_copy_gc_enabled,
+ &sysfs_gc_after_writeback,
&sysfs_io_disable,
+ &sysfs_cutoff_writeback,
+ &sysfs_cutoff_writeback_sync,
NULL
};
KTYPE(bch_cache_set_internal);
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 08c3a9f9676c..73f0efac2b9f 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -17,6 +17,15 @@
#include <linux/sched/clock.h>
#include <trace/events/bcache.h>
+static void update_gc_after_writeback(struct cache_set *c)
+{
+ if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) ||
+ c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD)
+ return;
+
+ c->gc_after_writeback |= BCH_DO_AUTO_GC;
+}
+
/* Rate limiting */
static uint64_t __calc_target_rate(struct cached_dev *dc)
{
@@ -191,6 +200,7 @@ static void update_writeback_rate(struct work_struct *work)
if (!set_at_max_writeback_rate(c, dc)) {
down_read(&dc->writeback_lock);
__update_writeback_rate(dc);
+ update_gc_after_writeback(c);
up_read(&dc->writeback_lock);
}
}
@@ -689,6 +699,23 @@ static int bch_writeback_thread(void *arg)
up_write(&dc->writeback_lock);
break;
}
+
+ /*
+ * When dirty data rate is high (e.g. 50%+), there might
+ * be heavy buckets fragmentation after writeback
+ * finished, which hurts following write performance.
+ * If users really care about write performance they
+ * may set BCH_ENABLE_AUTO_GC via sysfs, then when
+ * BCH_DO_AUTO_GC is set, garbage collection thread
+ * will be wake up here. After moving gc, the shrunk
+ * btree and discarded free buckets SSD space may be
+ * helpful for following write requests.
+ */
+ if (c->gc_after_writeback ==
+ (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) {
+ c->gc_after_writeback &= ~BCH_DO_AUTO_GC;
+ force_wake_up_gc(c);
+ }
}
up_write(&dc->writeback_lock);
@@ -777,7 +804,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
bch_keybuf_init(&dc->writeback_keys);
dc->writeback_metadata = true;
- dc->writeback_running = true;
+ dc->writeback_running = false;
dc->writeback_percent = 10;
dc->writeback_delay = 30;
atomic_long_set(&dc->writeback_rate.rate, 1024);
@@ -805,6 +832,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc)
cached_dev_put(dc);
return PTR_ERR(dc->writeback_thread);
}
+ dc->writeback_running = true;
WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
schedule_delayed_work(&dc->writeback_rate_update,
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index d2b9fdbc8994..6a743d3bb338 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -5,12 +5,17 @@
#define CUTOFF_WRITEBACK 40
#define CUTOFF_WRITEBACK_SYNC 70
+#define CUTOFF_WRITEBACK_MAX 70
+#define CUTOFF_WRITEBACK_SYNC_MAX 90
+
#define MAX_WRITEBACKS_IN_PASS 5
#define MAX_WRITESIZE_IN_PASS 5000 /* *512b */
#define WRITEBACK_RATE_UPDATE_SECS_MAX 60
#define WRITEBACK_RATE_UPDATE_SECS_DEFAULT 5
+#define BCH_AUTO_GC_DIRTY_THRESHOLD 50
+
/*
* 14 (16384ths) is chosen here as something that each backing device
* should be a reasonable fraction of the share, and not to blow up
@@ -53,6 +58,9 @@ static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc,
}
}
+extern unsigned int bch_cutoff_writeback;
+extern unsigned int bch_cutoff_writeback_sync;
+
static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
unsigned int cache_mode, bool would_skip)
{
@@ -60,7 +68,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
if (cache_mode != CACHE_MODE_WRITEBACK ||
test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
- in_use > CUTOFF_WRITEBACK_SYNC)
+ in_use > bch_cutoff_writeback_sync)
return false;
if (dc->partial_stripes_expensive &&
@@ -73,7 +81,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
return (op_is_sync(bio->bi_opf) ||
bio->bi_opf & (REQ_META|REQ_PRIO) ||
- in_use <= CUTOFF_WRITEBACK);
+ in_use <= bch_cutoff_writeback);
}
static inline void bch_writeback_queue(struct cached_dev *dc)
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 224d44503a06..95c6d86ab5e8 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -65,7 +65,6 @@ struct mapped_device {
*/
struct work_struct work;
wait_queue_head_t wait;
- atomic_t pending[2];
spinlock_t deferred_lock;
struct bio_list deferred;
@@ -107,9 +106,6 @@ struct mapped_device {
struct block_device *bdev;
- /* zero-length flush that will be cloned and submitted to targets */
- struct bio flush_bio;
-
struct dm_stats stats;
/* for blk-mq request-based DM support */
@@ -119,7 +115,6 @@ struct mapped_device {
struct srcu_struct io_barrier;
};
-int md_in_flight(struct mapped_device *md);
void disable_write_same(struct mapped_device *md);
void disable_write_zeroes(struct mapped_device *md);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 7cd36e4d1310..4e06be4f0a62 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -43,7 +43,7 @@ static unsigned dm_get_blk_mq_queue_depth(void)
int dm_request_based(struct mapped_device *md)
{
- return queue_is_rq_based(md->queue);
+ return queue_is_mq(md->queue);
}
void dm_start_queue(struct request_queue *q)
@@ -130,10 +130,8 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig)
*/
static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
{
- atomic_dec(&md->pending[rw]);
-
/* nudge anyone waiting on suspend queue */
- if (!md_in_flight(md))
+ if (unlikely(waitqueue_active(&md->wait)))
wake_up(&md->wait);
/*
@@ -436,7 +434,6 @@ ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
static void dm_start_request(struct mapped_device *md, struct request *orig)
{
blk_mq_start_request(orig);
- atomic_inc(&md->pending[rq_data_dir(orig)]);
if (unlikely(dm_stats_used(&md->stats))) {
struct dm_rq_target_io *tio = tio_from_request(orig);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 9038c302d5c2..844f7d0f2ef8 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -919,12 +919,12 @@ static int device_is_rq_based(struct dm_target *ti, struct dm_dev *dev,
struct request_queue *q = bdev_get_queue(dev->bdev);
struct verify_rq_based_data *v = data;
- if (q->mq_ops)
+ if (queue_is_mq(q))
v->mq_count++;
else
v->sq_count++;
- return queue_is_rq_based(q);
+ return queue_is_mq(q);
}
static int dm_table_determine_type(struct dm_table *t)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 63a7c416b224..a4a06982ed91 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -646,26 +646,38 @@ static void free_tio(struct dm_target_io *tio)
bio_put(&tio->clone);
}
-int md_in_flight(struct mapped_device *md)
+static bool md_in_flight_bios(struct mapped_device *md)
{
- return atomic_read(&md->pending[READ]) +
- atomic_read(&md->pending[WRITE]);
+ int cpu;
+ struct hd_struct *part = &dm_disk(md)->part0;
+ long sum = 0;
+
+ for_each_possible_cpu(cpu) {
+ sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
+ sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
+ }
+
+ return sum != 0;
+}
+
+static bool md_in_flight(struct mapped_device *md)
+{
+ if (queue_is_mq(md->queue))
+ return blk_mq_queue_inflight(md->queue);
+ else
+ return md_in_flight_bios(md);
}
static void start_io_acct(struct dm_io *io)
{
struct mapped_device *md = io->md;
struct bio *bio = io->orig_bio;
- int rw = bio_data_dir(bio);
io->start_time = jiffies;
generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
&dm_disk(md)->part0);
- atomic_set(&dm_disk(md)->part0.in_flight[rw],
- atomic_inc_return(&md->pending[rw]));
-
if (unlikely(dm_stats_used(&md->stats)))
dm_stats_account_io(&md->stats, bio_data_dir(bio),
bio->bi_iter.bi_sector, bio_sectors(bio),
@@ -677,8 +689,6 @@ static void end_io_acct(struct dm_io *io)
struct mapped_device *md = io->md;
struct bio *bio = io->orig_bio;
unsigned long duration = jiffies - io->start_time;
- int pending;
- int rw = bio_data_dir(bio);
generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
io->start_time);
@@ -688,16 +698,8 @@ static void end_io_acct(struct dm_io *io)
bio->bi_iter.bi_sector, bio_sectors(bio),
true, duration, &io->stats_aux);
- /*
- * After this is decremented the bio must not be touched if it is
- * a flush.
- */
- pending = atomic_dec_return(&md->pending[rw]);
- atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
- pending += atomic_read(&md->pending[rw^0x1]);
-
/* nudge anyone waiting on suspend queue */
- if (!pending)
+ if (unlikely(waitqueue_active(&md->wait)))
wake_up(&md->wait);
}
@@ -1417,10 +1419,21 @@ static int __send_empty_flush(struct clone_info *ci)
unsigned target_nr = 0;
struct dm_target *ti;
+ /*
+ * Empty flush uses a statically initialized bio, as the base for
+ * cloning. However, blkg association requires that a bdev is
+ * associated with a gendisk, which doesn't happen until the bdev is
+ * opened. So, blkg association is done at issue time of the flush
+ * rather than when the device is created in alloc_dev().
+ */
+ bio_set_dev(ci->bio, ci->io->md->bdev);
+
BUG_ON(bio_has_data(ci->bio));
while ((ti = dm_table_get_target(ci->map, target_nr++)))
__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
+ bio_disassociate_blkg(ci->bio);
+
return 0;
}
@@ -1598,7 +1611,16 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
init_clone_info(&ci, md, map, bio);
if (bio->bi_opf & REQ_PREFLUSH) {
- ci.bio = &ci.io->md->flush_bio;
+ struct bio flush_bio;
+
+ /*
+ * Use an on-stack bio for this, it's safe since we don't
+ * need to reference it after submit. It's just used as
+ * the basis for the clone(s).
+ */
+ bio_init(&flush_bio, NULL, 0);
+ flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+ ci.bio = &flush_bio;
ci.sector_count = 0;
error = __send_empty_flush(&ci);
/* dec_pending submits any data associated with flush */
@@ -1654,7 +1676,16 @@ static blk_qc_t __process_bio(struct mapped_device *md,
init_clone_info(&ci, md, map, bio);
if (bio->bi_opf & REQ_PREFLUSH) {
- ci.bio = &ci.io->md->flush_bio;
+ struct bio flush_bio;
+
+ /*
+ * Use an on-stack bio for this, it's safe since we don't
+ * need to reference it after submit. It's just used as
+ * the basis for the clone(s).
+ */
+ bio_init(&flush_bio, NULL, 0);
+ flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+ ci.bio = &flush_bio;
ci.sector_count = 0;
error = __send_empty_flush(&ci);
/* dec_pending submits any data associated with flush */
@@ -1898,7 +1929,7 @@ static struct mapped_device *alloc_dev(int minor)
INIT_LIST_HEAD(&md->table_devices);
spin_lock_init(&md->uevent_lock);
- md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, NULL);
+ md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
if (!md->queue)
goto bad;
md->queue->queuedata = md;
@@ -1908,8 +1939,6 @@ static struct mapped_device *alloc_dev(int minor)
if (!md->disk)
goto bad;
- atomic_set(&md->pending[0], 0);
- atomic_set(&md->pending[1], 0);
init_waitqueue_head(&md->wait);
INIT_WORK(&md->work, dm_wq_work);
init_waitqueue_head(&md->eventq);
@@ -1940,10 +1969,6 @@ static struct mapped_device *alloc_dev(int minor)
if (!md->bdev)
goto bad;
- bio_init(&md->flush_bio, NULL, 0);
- bio_set_dev(&md->flush_bio, md->bdev);
- md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
-
dm_stats_init(&md->stats);
/* Populate the mapping, nobody knows we exist yet */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index fc488cb30a94..9a0a1e0934d5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -334,7 +334,6 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
const int sgrp = op_stat_group(bio_op(bio));
struct mddev *mddev = q->queuedata;
unsigned int sectors;
- int cpu;
blk_queue_split(q, &bio);
@@ -359,9 +358,9 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
md_handle_request(mddev, bio);
- cpu = part_stat_lock();
- part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]);
- part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors);
+ part_stat_lock();
+ part_stat_inc(&mddev->gendisk->part0, ios[sgrp]);
+ part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors);
part_stat_unlock();
return BLK_QC_T_NONE;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index ac1cffd2a09b..f3fb5bb8c82a 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -542,7 +542,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
!discard_bio)
continue;
bio_chain(discard_bio, bio);
- bio_clone_blkcg_association(discard_bio, bio);
+ bio_clone_blkg_association(discard_bio, bio);
if (mddev->gendisk)
trace_block_bio_remap(bdev_get_queue(rdev->bdev),
discard_bio, disk_devt(mddev->gendisk),
diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c
index 8a02f11076f9..82daccc9ea62 100644
--- a/drivers/memstick/core/ms_block.c
+++ b/drivers/memstick/core/ms_block.c
@@ -15,7 +15,7 @@
#define pr_fmt(fmt) DRIVER_NAME ": " fmt
#include <linux/module.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
#include <linux/memstick.h>
#include <linux/idr.h>
#include <linux/hdreg.h>
@@ -1873,69 +1873,65 @@ static void msb_io_work(struct work_struct *work)
struct msb_data *msb = container_of(work, struct msb_data, io_work);
int page, error, len;
sector_t lba;
- unsigned long flags;
struct scatterlist *sg = msb->prealloc_sg;
+ struct request *req;
dbg_verbose("IO: work started");
while (1) {
- spin_lock_irqsave(&msb->q_lock, flags);
+ spin_lock_irq(&msb->q_lock);
if (msb->need_flush_cache) {
msb->need_flush_cache = false;
- spin_unlock_irqrestore(&msb->q_lock, flags);
+ spin_unlock_irq(&msb->q_lock);
msb_cache_flush(msb);
continue;
}
- if (!msb->req) {
- msb->req = blk_fetch_request(msb->queue);
- if (!msb->req) {
- dbg_verbose("IO: no more requests exiting");
- spin_unlock_irqrestore(&msb->q_lock, flags);
- return;
- }
+ req = msb->req;
+ if (!req) {
+ dbg_verbose("IO: no more requests exiting");
+ spin_unlock_irq(&msb->q_lock);
+ return;
}
- spin_unlock_irqrestore(&msb->q_lock, flags);
-
- /* If card was removed meanwhile */
- if (!msb->req)
- return;
+ spin_unlock_irq(&msb->q_lock);
/* process the request */
dbg_verbose("IO: processing new request");
- blk_rq_map_sg(msb->queue, msb->req, sg);
+ blk_rq_map_sg(msb->queue, req, sg);
- lba = blk_rq_pos(msb->req);
+ lba = blk_rq_pos(req);
sector_div(lba, msb->page_size / 512);
page = sector_div(lba, msb->pages_in_block);
if (rq_data_dir(msb->req) == READ)
error = msb_do_read_request(msb, lba, page, sg,
- blk_rq_bytes(msb->req), &len);
+ blk_rq_bytes(req), &len);
else
error = msb_do_write_request(msb, lba, page, sg,
- blk_rq_bytes(msb->req), &len);
-
- spin_lock_irqsave(&msb->q_lock, flags);
+ blk_rq_bytes(req), &len);
- if (len)
- if (!__blk_end_request(msb->req, BLK_STS_OK, len))
- msb->req = NULL;
+ if (len && !blk_update_request(req, BLK_STS_OK, len)) {
+ __blk_mq_end_request(req, BLK_STS_OK);
+ spin_lock_irq(&msb->q_lock);
+ msb->req = NULL;
+ spin_unlock_irq(&msb->q_lock);
+ }
if (error && msb->req) {
blk_status_t ret = errno_to_blk_status(error);
+
dbg_verbose("IO: ending one sector of the request with error");
- if (!__blk_end_request(msb->req, ret, msb->page_size))
- msb->req = NULL;
+ blk_mq_end_request(req, ret);
+ spin_lock_irq(&msb->q_lock);
+ msb->req = NULL;
+ spin_unlock_irq(&msb->q_lock);
}
if (msb->req)
dbg_verbose("IO: request still pending");
-
- spin_unlock_irqrestore(&msb->q_lock, flags);
}
}
@@ -2002,29 +1998,40 @@ static int msb_bd_getgeo(struct block_device *bdev,
return 0;
}
-static void msb_submit_req(struct request_queue *q)
+static blk_status_t msb_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
{
- struct memstick_dev *card = q->queuedata;
+ struct memstick_dev *card = hctx->queue->queuedata;
struct msb_data *msb = memstick_get_drvdata(card);
- struct request *req = NULL;
+ struct request *req = bd->rq;
dbg_verbose("Submit request");
+ spin_lock_irq(&msb->q_lock);
+
if (msb->card_dead) {
dbg("Refusing requests on removed card");
WARN_ON(!msb->io_queue_stopped);
- while ((req = blk_fetch_request(q)) != NULL)
- __blk_end_request_all(req, BLK_STS_IOERR);
- return;
+ spin_unlock_irq(&msb->q_lock);
+ blk_mq_start_request(req);
+ return BLK_STS_IOERR;
}
- if (msb->req)
- return;
+ if (msb->req) {
+ spin_unlock_irq(&msb->q_lock);
+ return BLK_STS_DEV_RESOURCE;
+ }
+
+ blk_mq_start_request(req);
+ msb->req = req;
if (!msb->io_queue_stopped)
queue_work(msb->io_queue, &msb->io_work);
+
+ spin_unlock_irq(&msb->q_lock);
+ return BLK_STS_OK;
}
static int msb_check_card(struct memstick_dev *card)
@@ -2040,21 +2047,20 @@ static void msb_stop(struct memstick_dev *card)
dbg("Stopping all msblock IO");
+ blk_mq_stop_hw_queues(msb->queue);
spin_lock_irqsave(&msb->q_lock, flags);
- blk_stop_queue(msb->queue);
msb->io_queue_stopped = true;
spin_unlock_irqrestore(&msb->q_lock, flags);
del_timer_sync(&msb->cache_flush_timer);
flush_workqueue(msb->io_queue);
+ spin_lock_irqsave(&msb->q_lock, flags);
if (msb->req) {
- spin_lock_irqsave(&msb->q_lock, flags);
- blk_requeue_request(msb->queue, msb->req);
+ blk_mq_requeue_request(msb->req, false);
msb->req = NULL;
- spin_unlock_irqrestore(&msb->q_lock, flags);
}
-
+ spin_unlock_irqrestore(&msb->q_lock, flags);
}
static void msb_start(struct memstick_dev *card)
@@ -2077,9 +2083,7 @@ static void msb_start(struct memstick_dev *card)
msb->need_flush_cache = true;
msb->io_queue_stopped = false;
- spin_lock_irqsave(&msb->q_lock, flags);
- blk_start_queue(msb->queue);
- spin_unlock_irqrestore(&msb->q_lock, flags);
+ blk_mq_start_hw_queues(msb->queue);
queue_work(msb->io_queue, &msb->io_work);
@@ -2092,6 +2096,10 @@ static const struct block_device_operations msb_bdops = {
.owner = THIS_MODULE
};
+static const struct blk_mq_ops msb_mq_ops = {
+ .queue_rq = msb_queue_rq,
+};
+
/* Registers the block device */
static int msb_init_disk(struct memstick_dev *card)
{
@@ -2112,9 +2120,11 @@ static int msb_init_disk(struct memstick_dev *card)
goto out_release_id;
}
- msb->queue = blk_init_queue(msb_submit_req, &msb->q_lock);
- if (!msb->queue) {
- rc = -ENOMEM;
+ msb->queue = blk_mq_init_sq_queue(&msb->tag_set, &msb_mq_ops, 2,
+ BLK_MQ_F_SHOULD_MERGE);
+ if (IS_ERR(msb->queue)) {
+ rc = PTR_ERR(msb->queue);
+ msb->queue = NULL;
goto out_put_disk;
}
@@ -2202,12 +2212,13 @@ static void msb_remove(struct memstick_dev *card)
/* Take care of unhandled + new requests from now on */
spin_lock_irqsave(&msb->q_lock, flags);
msb->card_dead = true;
- blk_start_queue(msb->queue);
spin_unlock_irqrestore(&msb->q_lock, flags);
+ blk_mq_start_hw_queues(msb->queue);
/* Remove the disk */
del_gendisk(msb->disk);
blk_cleanup_queue(msb->queue);
+ blk_mq_free_tag_set(&msb->tag_set);
msb->queue = NULL;
mutex_lock(&msb_disk_lock);
diff --git a/drivers/memstick/core/ms_block.h b/drivers/memstick/core/ms_block.h
index 53962c3b21df..9ba84e0ced63 100644
--- a/drivers/memstick/core/ms_block.h
+++ b/drivers/memstick/core/ms_block.h
@@ -152,6 +152,7 @@ struct msb_data {
struct gendisk *disk;
struct request_queue *queue;
spinlock_t q_lock;
+ struct blk_mq_tag_set tag_set;
struct hd_geometry geometry;
struct attribute_group attr_group;
struct request *req;
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 0cd30dcb6801..aba50ec98b4d 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -12,7 +12,7 @@
*
*/
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
#include <linux/idr.h>
#include <linux/hdreg.h>
#include <linux/kthread.h>
@@ -142,6 +142,7 @@ struct mspro_block_data {
struct gendisk *disk;
struct request_queue *queue;
struct request *block_req;
+ struct blk_mq_tag_set tag_set;
spinlock_t q_lock;
unsigned short page_size;
@@ -152,7 +153,6 @@ struct mspro_block_data {
unsigned char system;
unsigned char read_only:1,
eject:1,
- has_request:1,
data_dir:1,
active:1;
unsigned char transfer_cmd;
@@ -694,13 +694,12 @@ static void h_mspro_block_setup_cmd(struct memstick_dev *card, u64 offset,
/*** Data transfer ***/
-static int mspro_block_issue_req(struct memstick_dev *card, int chunk)
+static int mspro_block_issue_req(struct memstick_dev *card, bool chunk)
{
struct mspro_block_data *msb = memstick_get_drvdata(card);
u64 t_off;
unsigned int count;
-try_again:
while (chunk) {
msb->current_page = 0;
msb->current_seg = 0;
@@ -709,9 +708,17 @@ try_again:
msb->req_sg);
if (!msb->seg_count) {
- chunk = __blk_end_request_cur(msb->block_req,
- BLK_STS_RESOURCE);
- continue;
+ unsigned int bytes = blk_rq_cur_bytes(msb->block_req);
+
+ chunk = blk_update_request(msb->block_req,
+ BLK_STS_RESOURCE,
+ bytes);
+ if (chunk)
+ continue;
+ __blk_mq_end_request(msb->block_req,
+ BLK_STS_RESOURCE);
+ msb->block_req = NULL;
+ break;
}
t_off = blk_rq_pos(msb->block_req);
@@ -729,30 +736,22 @@ try_again:
return 0;
}
- dev_dbg(&card->dev, "blk_fetch\n");
- msb->block_req = blk_fetch_request(msb->queue);
- if (!msb->block_req) {
- dev_dbg(&card->dev, "issue end\n");
- return -EAGAIN;
- }
-
- dev_dbg(&card->dev, "trying again\n");
- chunk = 1;
- goto try_again;
+ return 1;
}
static int mspro_block_complete_req(struct memstick_dev *card, int error)
{
struct mspro_block_data *msb = memstick_get_drvdata(card);
- int chunk, cnt;
+ int cnt;
+ bool chunk;
unsigned int t_len = 0;
unsigned long flags;
spin_lock_irqsave(&msb->q_lock, flags);
- dev_dbg(&card->dev, "complete %d, %d\n", msb->has_request ? 1 : 0,
+ dev_dbg(&card->dev, "complete %d, %d\n", msb->block_req ? 1 : 0,
error);
- if (msb->has_request) {
+ if (msb->block_req) {
/* Nothing to do - not really an error */
if (error == -EAGAIN)
error = 0;
@@ -777,15 +776,17 @@ static int mspro_block_complete_req(struct memstick_dev *card, int error)
if (error && !t_len)
t_len = blk_rq_cur_bytes(msb->block_req);
- chunk = __blk_end_request(msb->block_req,
+ chunk = blk_update_request(msb->block_req,
errno_to_blk_status(error), t_len);
-
- error = mspro_block_issue_req(card, chunk);
-
- if (!error)
- goto out;
- else
- msb->has_request = 0;
+ if (chunk) {
+ error = mspro_block_issue_req(card, chunk);
+ if (!error)
+ goto out;
+ } else {
+ __blk_mq_end_request(msb->block_req,
+ errno_to_blk_status(error));
+ msb->block_req = NULL;
+ }
} else {
if (!error)
error = -EAGAIN;
@@ -806,8 +807,8 @@ static void mspro_block_stop(struct memstick_dev *card)
while (1) {
spin_lock_irqsave(&msb->q_lock, flags);
- if (!msb->has_request) {
- blk_stop_queue(msb->queue);
+ if (!msb->block_req) {
+ blk_mq_stop_hw_queues(msb->queue);
rc = 1;
}
spin_unlock_irqrestore(&msb->q_lock, flags);
@@ -822,32 +823,37 @@ static void mspro_block_stop(struct memstick_dev *card)
static void mspro_block_start(struct memstick_dev *card)
{
struct mspro_block_data *msb = memstick_get_drvdata(card);
- unsigned long flags;
- spin_lock_irqsave(&msb->q_lock, flags);
- blk_start_queue(msb->queue);
- spin_unlock_irqrestore(&msb->q_lock, flags);
+ blk_mq_start_hw_queues(msb->queue);
}
-static void mspro_block_submit_req(struct request_queue *q)
+static blk_status_t mspro_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
{
- struct memstick_dev *card = q->queuedata;
+ struct memstick_dev *card = hctx->queue->queuedata;
struct mspro_block_data *msb = memstick_get_drvdata(card);
- struct request *req = NULL;
- if (msb->has_request)
- return;
+ spin_lock_irq(&msb->q_lock);
- if (msb->eject) {
- while ((req = blk_fetch_request(q)) != NULL)
- __blk_end_request_all(req, BLK_STS_IOERR);
+ if (msb->block_req) {
+ spin_unlock_irq(&msb->q_lock);
+ return BLK_STS_DEV_RESOURCE;
+ }
- return;
+ if (msb->eject) {
+ spin_unlock_irq(&msb->q_lock);
+ blk_mq_start_request(bd->rq);
+ return BLK_STS_IOERR;
}
- msb->has_request = 1;
- if (mspro_block_issue_req(card, 0))
- msb->has_request = 0;
+ msb->block_req = bd->rq;
+ blk_mq_start_request(bd->rq);
+
+ if (mspro_block_issue_req(card, true))
+ msb->block_req = NULL;
+
+ spin_unlock_irq(&msb->q_lock);
+ return BLK_STS_OK;
}
/*** Initialization ***/
@@ -1167,6 +1173,10 @@ static int mspro_block_init_card(struct memstick_dev *card)
}
+static const struct blk_mq_ops mspro_mq_ops = {
+ .queue_rq = mspro_queue_rq,
+};
+
static int mspro_block_init_disk(struct memstick_dev *card)
{
struct mspro_block_data *msb = memstick_get_drvdata(card);
@@ -1206,9 +1216,11 @@ static int mspro_block_init_disk(struct memstick_dev *card)
goto out_release_id;
}
- msb->queue = blk_init_queue(mspro_block_submit_req, &msb->q_lock);
- if (!msb->queue) {
- rc = -ENOMEM;
+ msb->queue = blk_mq_init_sq_queue(&msb->tag_set, &mspro_mq_ops, 2,
+ BLK_MQ_F_SHOULD_MERGE);
+ if (IS_ERR(msb->queue)) {
+ rc = PTR_ERR(msb->queue);
+ msb->queue = NULL;
goto out_put_disk;
}
@@ -1318,13 +1330,14 @@ static void mspro_block_remove(struct memstick_dev *card)
spin_lock_irqsave(&msb->q_lock, flags);
msb->eject = 1;
- blk_start_queue(msb->queue);
spin_unlock_irqrestore(&msb->q_lock, flags);
+ blk_mq_start_hw_queues(msb->queue);
del_gendisk(msb->disk);
dev_dbg(&card->dev, "mspro block remove\n");
blk_cleanup_queue(msb->queue);
+ blk_mq_free_tag_set(&msb->tag_set);
msb->queue = NULL;
sysfs_remove_group(&card->dev.kobj, &msb->attr_group);
@@ -1344,8 +1357,9 @@ static int mspro_block_suspend(struct memstick_dev *card, pm_message_t state)
struct mspro_block_data *msb = memstick_get_drvdata(card);
unsigned long flags;
+ blk_mq_stop_hw_queues(msb->queue);
+
spin_lock_irqsave(&msb->q_lock, flags);
- blk_stop_queue(msb->queue);
msb->active = 0;
spin_unlock_irqrestore(&msb->q_lock, flags);
@@ -1355,7 +1369,6 @@ static int mspro_block_suspend(struct memstick_dev *card, pm_message_t state)
static int mspro_block_resume(struct memstick_dev *card)
{
struct mspro_block_data *msb = memstick_get_drvdata(card);
- unsigned long flags;
int rc = 0;
#ifdef CONFIG_MEMSTICK_UNSAFE_RESUME
@@ -1401,9 +1414,7 @@ out_unlock:
#endif /* CONFIG_MEMSTICK_UNSAFE_RESUME */
- spin_lock_irqsave(&msb->q_lock, flags);
- blk_start_queue(msb->queue);
- spin_unlock_irqrestore(&msb->q_lock, flags);
+ blk_mq_start_hw_queues(msb->queue);
return rc;
}
diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index 111934838da2..62e7619d5a4d 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -100,7 +100,6 @@ static DEFINE_IDA(mmc_rpmb_ida);
* There is one mmc_blk_data per slot.
*/
struct mmc_blk_data {
- spinlock_t lock;
struct device *parent;
struct gendisk *disk;
struct mmc_queue queue;
@@ -1488,7 +1487,7 @@ static void mmc_blk_cqe_complete_rq(struct mmc_queue *mq, struct request *req)
blk_mq_end_request(req, BLK_STS_OK);
}
- spin_lock_irqsave(q->queue_lock, flags);
+ spin_lock_irqsave(&mq->lock, flags);
mq->in_flight[mmc_issue_type(mq, req)] -= 1;
@@ -1496,7 +1495,7 @@ static void mmc_blk_cqe_complete_rq(struct mmc_queue *mq, struct request *req)
mmc_cqe_check_busy(mq);
- spin_unlock_irqrestore(q->queue_lock, flags);
+ spin_unlock_irqrestore(&mq->lock, flags);
if (!mq->cqe_busy)
blk_mq_run_hw_queues(q, true);
@@ -1993,17 +1992,16 @@ static void mmc_blk_mq_poll_completion(struct mmc_queue *mq,
static void mmc_blk_mq_dec_in_flight(struct mmc_queue *mq, struct request *req)
{
- struct request_queue *q = req->q;
unsigned long flags;
bool put_card;
- spin_lock_irqsave(q->queue_lock, flags);
+ spin_lock_irqsave(&mq->lock, flags);
mq->in_flight[mmc_issue_type(mq, req)] -= 1;
put_card = (mmc_tot_in_flight(mq) == 0);
- spin_unlock_irqrestore(q->queue_lock, flags);
+ spin_unlock_irqrestore(&mq->lock, flags);
if (put_card)
mmc_put_card(mq->card, &mq->ctx);
@@ -2099,11 +2097,11 @@ static void mmc_blk_mq_req_done(struct mmc_request *mrq)
* request does not need to wait (although it does need to
* complete complete_req first).
*/
- spin_lock_irqsave(q->queue_lock, flags);
+ spin_lock_irqsave(&mq->lock, flags);
mq->complete_req = req;
mq->rw_wait = false;
waiting = mq->waiting;
- spin_unlock_irqrestore(q->queue_lock, flags);
+ spin_unlock_irqrestore(&mq->lock, flags);
/*
* If 'waiting' then the waiting task will complete this
@@ -2122,10 +2120,10 @@ static void mmc_blk_mq_req_done(struct mmc_request *mrq)
/* Take the recovery path for errors or urgent background operations */
if (mmc_blk_rq_error(&mqrq->brq) ||
mmc_blk_urgent_bkops_needed(mq, mqrq)) {
- spin_lock_irqsave(q->queue_lock, flags);
+ spin_lock_irqsave(&mq->lock, flags);
mq->recovery_needed = true;
mq->recovery_req = req;
- spin_unlock_irqrestore(q->queue_lock, flags);
+ spin_unlock_irqrestore(&mq->lock, flags);
wake_up(&mq->wait);
schedule_work(&mq->recovery_work);
return;
@@ -2141,7 +2139,6 @@ static void mmc_blk_mq_req_done(struct mmc_request *mrq)
static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err)
{
- struct request_queue *q = mq->queue;
unsigned long flags;
bool done;
@@ -2149,7 +2146,7 @@ static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err)
* Wait while there is another request in progress, but not if recovery
* is needed. Also indicate whether there is a request waiting to start.
*/
- spin_lock_irqsave(q->queue_lock, flags);
+ spin_lock_irqsave(&mq->lock, flags);
if (mq->recovery_needed) {
*err = -EBUSY;
done = true;
@@ -2157,7 +2154,7 @@ static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err)
done = !mq->rw_wait;
}
mq->waiting = !done;
- spin_unlock_irqrestore(q->queue_lock, flags);
+ spin_unlock_irqrestore(&mq->lock, flags);
return done;
}
@@ -2334,12 +2331,11 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card,
goto err_kfree;
}
- spin_lock_init(&md->lock);
INIT_LIST_HEAD(&md->part);
INIT_LIST_HEAD(&md->rpmbs);
md->usage = 1;
- ret = mmc_init_queue(&md->queue, card, &md->lock, subname);
+ ret = mmc_init_queue(&md->queue, card);
if (ret)
goto err_putdisk;
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index 6edffeed9953..35cc138b096d 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -89,9 +89,9 @@ void mmc_cqe_recovery_notifier(struct mmc_request *mrq)
struct mmc_queue *mq = q->queuedata;
unsigned long flags;
- spin_lock_irqsave(q->queue_lock, flags);
+ spin_lock_irqsave(&mq->lock, flags);
__mmc_cqe_recovery_notifier(mq);
- spin_unlock_irqrestore(q->queue_lock, flags);
+ spin_unlock_irqrestore(&mq->lock, flags);
}
static enum blk_eh_timer_return mmc_cqe_timed_out(struct request *req)
@@ -128,14 +128,14 @@ static enum blk_eh_timer_return mmc_mq_timed_out(struct request *req,
unsigned long flags;
int ret;
- spin_lock_irqsave(q->queue_lock, flags);
+ spin_lock_irqsave(&mq->lock, flags);
if (mq->recovery_needed || !mq->use_cqe)
ret = BLK_EH_RESET_TIMER;
else
ret = mmc_cqe_timed_out(req);
- spin_unlock_irqrestore(q->queue_lock, flags);
+ spin_unlock_irqrestore(&mq->lock, flags);
return ret;
}
@@ -157,9 +157,9 @@ static void mmc_mq_recovery_handler(struct work_struct *work)
mq->in_recovery = false;
- spin_lock_irq(q->queue_lock);
+ spin_lock_irq(&mq->lock);
mq->recovery_needed = false;
- spin_unlock_irq(q->queue_lock);
+ spin_unlock_irq(&mq->lock);
mmc_put_card(mq->card, &mq->ctx);
@@ -258,10 +258,10 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
issue_type = mmc_issue_type(mq, req);
- spin_lock_irq(q->queue_lock);
+ spin_lock_irq(&mq->lock);
if (mq->recovery_needed || mq->busy) {
- spin_unlock_irq(q->queue_lock);
+ spin_unlock_irq(&mq->lock);
return BLK_STS_RESOURCE;
}
@@ -269,7 +269,7 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
case MMC_ISSUE_DCMD:
if (mmc_cqe_dcmd_busy(mq)) {
mq->cqe_busy |= MMC_CQE_DCMD_BUSY;
- spin_unlock_irq(q->queue_lock);
+ spin_unlock_irq(&mq->lock);
return BLK_STS_RESOURCE;
}
break;
@@ -294,7 +294,7 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
get_card = (mmc_tot_in_flight(mq) == 1);
cqe_retune_ok = (mmc_cqe_qcnt(mq) == 1);
- spin_unlock_irq(q->queue_lock);
+ spin_unlock_irq(&mq->lock);
if (!(req->rq_flags & RQF_DONTPREP)) {
req_to_mmc_queue_req(req)->retries = 0;
@@ -328,12 +328,12 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
if (issued != MMC_REQ_STARTED) {
bool put_card = false;
- spin_lock_irq(q->queue_lock);
+ spin_lock_irq(&mq->lock);
mq->in_flight[issue_type] -= 1;
if (mmc_tot_in_flight(mq) == 0)
put_card = true;
mq->busy = false;
- spin_unlock_irq(q->queue_lock);
+ spin_unlock_irq(&mq->lock);
if (put_card)
mmc_put_card(card, &mq->ctx);
} else {
@@ -378,14 +378,37 @@ static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card)
init_waitqueue_head(&mq->wait);
}
-static int mmc_mq_init_queue(struct mmc_queue *mq, int q_depth,
- const struct blk_mq_ops *mq_ops, spinlock_t *lock)
+/* Set queue depth to get a reasonable value for q->nr_requests */
+#define MMC_QUEUE_DEPTH 64
+
+/**
+ * mmc_init_queue - initialise a queue structure.
+ * @mq: mmc queue
+ * @card: mmc card to attach this queue
+ *
+ * Initialise a MMC card request queue.
+ */
+int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card)
{
+ struct mmc_host *host = card->host;
int ret;
+ mq->card = card;
+ mq->use_cqe = host->cqe_enabled;
+
+ spin_lock_init(&mq->lock);
+
memset(&mq->tag_set, 0, sizeof(mq->tag_set));
- mq->tag_set.ops = mq_ops;
- mq->tag_set.queue_depth = q_depth;
+ mq->tag_set.ops = &mmc_mq_ops;
+ /*
+ * The queue depth for CQE must match the hardware because the request
+ * tag is used to index the hardware queue.
+ */
+ if (mq->use_cqe)
+ mq->tag_set.queue_depth =
+ min_t(int, card->ext_csd.cmdq_depth, host->cqe_qdepth);
+ else
+ mq->tag_set.queue_depth = MMC_QUEUE_DEPTH;
mq->tag_set.numa_node = NUMA_NO_NODE;
mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE |
BLK_MQ_F_BLOCKING;
@@ -403,68 +426,17 @@ static int mmc_mq_init_queue(struct mmc_queue *mq, int q_depth,
goto free_tag_set;
}
- mq->queue->queue_lock = lock;
mq->queue->queuedata = mq;
+ blk_queue_rq_timeout(mq->queue, 60 * HZ);
+ mmc_setup_queue(mq, card);
return 0;
free_tag_set:
blk_mq_free_tag_set(&mq->tag_set);
-
return ret;
}
-/* Set queue depth to get a reasonable value for q->nr_requests */
-#define MMC_QUEUE_DEPTH 64
-
-static int mmc_mq_init(struct mmc_queue *mq, struct mmc_card *card,
- spinlock_t *lock)
-{
- struct mmc_host *host = card->host;
- int q_depth;
- int ret;
-
- /*
- * The queue depth for CQE must match the hardware because the request
- * tag is used to index the hardware queue.
- */
- if (mq->use_cqe)
- q_depth = min_t(int, card->ext_csd.cmdq_depth, host->cqe_qdepth);
- else
- q_depth = MMC_QUEUE_DEPTH;
-
- ret = mmc_mq_init_queue(mq, q_depth, &mmc_mq_ops, lock);
- if (ret)
- return ret;
-
- blk_queue_rq_timeout(mq->queue, 60 * HZ);
-
- mmc_setup_queue(mq, card);
-
- return 0;
-}
-
-/**
- * mmc_init_queue - initialise a queue structure.
- * @mq: mmc queue
- * @card: mmc card to attach this queue
- * @lock: queue lock
- * @subname: partition subname
- *
- * Initialise a MMC card request queue.
- */
-int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
- spinlock_t *lock, const char *subname)
-{
- struct mmc_host *host = card->host;
-
- mq->card = card;
-
- mq->use_cqe = host->cqe_enabled;
-
- return mmc_mq_init(mq, card, lock);
-}
-
void mmc_queue_suspend(struct mmc_queue *mq)
{
blk_mq_quiesce_queue(mq->queue);
diff --git a/drivers/mmc/core/queue.h b/drivers/mmc/core/queue.h
index 9bf3c9245075..fd11491ced9f 100644
--- a/drivers/mmc/core/queue.h
+++ b/drivers/mmc/core/queue.h
@@ -77,6 +77,7 @@ struct mmc_queue {
struct blk_mq_tag_set tag_set;
struct mmc_blk_data *blkdata;
struct request_queue *queue;
+ spinlock_t lock;
int in_flight[MMC_ISSUE_MAX];
unsigned int cqe_busy;
#define MMC_CQE_DCMD_BUSY BIT(0)
@@ -95,8 +96,7 @@ struct mmc_queue {
struct work_struct complete_work;
};
-extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *,
- const char *);
+extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *);
extern void mmc_cleanup_queue(struct mmc_queue *);
extern void mmc_queue_suspend(struct mmc_queue *);
extern void mmc_queue_resume(struct mmc_queue *);
diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c
index 59dd50866932..5477a014e1fb 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
@@ -1322,7 +1322,7 @@ static int ath6kl_cfg80211_set_default_key(struct wiphy *wiphy,
struct ath6kl_vif *vif = netdev_priv(ndev);
struct ath6kl_key *key = NULL;
u8 key_usage;
- enum crypto_type key_type = NONE_CRYPT;
+ enum ath6kl_crypto_type key_type = NONE_CRYPT;
ath6kl_dbg(ATH6KL_DBG_WLAN_CFG, "%s: index %d\n", __func__, key_index);
diff --git a/drivers/net/wireless/ath/ath6kl/common.h b/drivers/net/wireless/ath/ath6kl/common.h
index 4f82e8632d37..d6e5234f67a1 100644
--- a/drivers/net/wireless/ath/ath6kl/common.h
+++ b/drivers/net/wireless/ath/ath6kl/common.h
@@ -67,7 +67,7 @@ struct ath6kl_llc_snap_hdr {
__be16 eth_type;
} __packed;
-enum crypto_type {
+enum ath6kl_crypto_type {
NONE_CRYPT = 0x01,
WEP_CRYPT = 0x02,
TKIP_CRYPT = 0x04,
diff --git a/drivers/net/wireless/ath/ath6kl/wmi.c b/drivers/net/wireless/ath/ath6kl/wmi.c
index 777acc564ac9..9d7ac1ab2d02 100644
--- a/drivers/net/wireless/ath/ath6kl/wmi.c
+++ b/drivers/net/wireless/ath/ath6kl/wmi.c
@@ -1849,9 +1849,9 @@ int ath6kl_wmi_connect_cmd(struct wmi *wmi, u8 if_idx,
enum network_type nw_type,
enum dot11_auth_mode dot11_auth_mode,
enum auth_mode auth_mode,
- enum crypto_type pairwise_crypto,
+ enum ath6kl_crypto_type pairwise_crypto,
u8 pairwise_crypto_len,
- enum crypto_type group_crypto,
+ enum ath6kl_crypto_type group_crypto,
u8 group_crypto_len, int ssid_len, u8 *ssid,
u8 *bssid, u16 channel, u32 ctrl_flags,
u8 nw_subtype)
@@ -2301,7 +2301,7 @@ int ath6kl_wmi_disctimeout_cmd(struct wmi *wmi, u8 if_idx, u8 timeout)
}
int ath6kl_wmi_addkey_cmd(struct wmi *wmi, u8 if_idx, u8 key_index,
- enum crypto_type key_type,
+ enum ath6kl_crypto_type key_type,
u8 key_usage, u8 key_len,
u8 *key_rsc, unsigned int key_rsc_len,
u8 *key_material,
diff --git a/drivers/net/wireless/ath/ath6kl/wmi.h b/drivers/net/wireless/ath/ath6kl/wmi.h
index a60bb49fe920..784940ba4c90 100644
--- a/drivers/net/wireless/ath/ath6kl/wmi.h
+++ b/drivers/net/wireless/ath/ath6kl/wmi.h
@@ -2556,9 +2556,9 @@ int ath6kl_wmi_connect_cmd(struct wmi *wmi, u8 if_idx,
enum network_type nw_type,
enum dot11_auth_mode dot11_auth_mode,
enum auth_mode auth_mode,
- enum crypto_type pairwise_crypto,
+ enum ath6kl_crypto_type pairwise_crypto,
u8 pairwise_crypto_len,
- enum crypto_type group_crypto,
+ enum ath6kl_crypto_type group_crypto,
u8 group_crypto_len, int ssid_len, u8 *ssid,
u8 *bssid, u16 channel, u32 ctrl_flags,
u8 nw_subtype);
@@ -2610,7 +2610,7 @@ int ath6kl_wmi_config_debug_module_cmd(struct wmi *wmi, u32 valid, u32 config);
int ath6kl_wmi_get_stats_cmd(struct wmi *wmi, u8 if_idx);
int ath6kl_wmi_addkey_cmd(struct wmi *wmi, u8 if_idx, u8 key_index,
- enum crypto_type key_type,
+ enum ath6kl_crypto_type key_type,
u8 key_usage, u8 key_len,
u8 *key_rsc, unsigned int key_rsc_len,
u8 *key_material,
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 0e39e3d1846f..f7019294740c 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -393,7 +393,7 @@ static int pmem_attach_disk(struct device *dev,
return -EBUSY;
}
- q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev), NULL);
+ q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
if (!q)
return -ENOMEM;
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 88a8b5916624..0f345e207675 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -57,3 +57,18 @@ config NVME_FC
from https://github.com/linux-nvme/nvme-cli.
If unsure, say N.
+
+config NVME_TCP
+ tristate "NVM Express over Fabrics TCP host driver"
+ depends on INET
+ depends on BLK_DEV_NVME
+ select NVME_FABRICS
+ help
+ This provides support for the NVMe over Fabrics protocol using
+ the TCP transport. This allows you to use remote block devices
+ exported using the NVMe protocol set.
+
+ To configure a NVMe over Fabrics controller use the nvme-cli tool
+ from https://github.com/linux-nvme/nvme-cli.
+
+ If unsure, say N.
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index aea459c65ae1..8a4b671c5f0c 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_BLK_DEV_NVME) += nvme.o
obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o
obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o
obj-$(CONFIG_NVME_FC) += nvme-fc.o
+obj-$(CONFIG_NVME_TCP) += nvme-tcp.o
nvme-core-y := core.o
nvme-core-$(CONFIG_TRACING) += trace.o
@@ -21,3 +22,5 @@ nvme-fabrics-y += fabrics.o
nvme-rdma-y += rdma.o
nvme-fc-y += fc.o
+
+nvme-tcp-y += tcp.o
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 962012135b62..08f2c92602f4 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -97,7 +97,6 @@ static dev_t nvme_chr_devt;
static struct class *nvme_class;
static struct class *nvme_subsys_class;
-static void nvme_ns_remove(struct nvme_ns *ns);
static int nvme_revalidate_disk(struct gendisk *disk);
static void nvme_put_subsystem(struct nvme_subsystem *subsys);
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
@@ -245,12 +244,31 @@ static inline bool nvme_req_needs_retry(struct request *req)
return true;
}
+static void nvme_retry_req(struct request *req)
+{
+ struct nvme_ns *ns = req->q->queuedata;
+ unsigned long delay = 0;
+ u16 crd;
+
+ /* The mask and shift result must be <= 3 */
+ crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
+ if (ns && crd)
+ delay = ns->ctrl->crdt[crd - 1] * 100;
+
+ nvme_req(req)->retries++;
+ blk_mq_requeue_request(req, false);
+ blk_mq_delay_kick_requeue_list(req->q, delay);
+}
+
void nvme_complete_rq(struct request *req)
{
blk_status_t status = nvme_error_status(req);
trace_nvme_complete_rq(req);
+ if (nvme_req(req)->ctrl->kas)
+ nvme_req(req)->ctrl->comp_seen = true;
+
if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
if ((req->cmd_flags & REQ_NVME_MPATH) &&
blk_path_error(status)) {
@@ -259,8 +277,7 @@ void nvme_complete_rq(struct request *req)
}
if (!blk_queue_dying(req->q)) {
- nvme_req(req)->retries++;
- blk_mq_requeue_request(req, true);
+ nvme_retry_req(req);
return;
}
}
@@ -268,14 +285,14 @@ void nvme_complete_rq(struct request *req)
}
EXPORT_SYMBOL_GPL(nvme_complete_rq);
-void nvme_cancel_request(struct request *req, void *data, bool reserved)
+bool nvme_cancel_request(struct request *req, void *data, bool reserved)
{
dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
"Cancelling I/O %d", req->tag);
nvme_req(req)->status = NVME_SC_ABORT_REQ;
blk_mq_complete_request(req);
-
+ return true;
}
EXPORT_SYMBOL_GPL(nvme_cancel_request);
@@ -536,7 +553,6 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
static inline void nvme_setup_flush(struct nvme_ns *ns,
struct nvme_command *cmnd)
{
- memset(cmnd, 0, sizeof(*cmnd));
cmnd->common.opcode = nvme_cmd_flush;
cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
}
@@ -548,9 +564,19 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
struct nvme_dsm_range *range;
struct bio *bio;
- range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
- if (!range)
- return BLK_STS_RESOURCE;
+ range = kmalloc_array(segments, sizeof(*range),
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (!range) {
+ /*
+ * If we fail allocation our range, fallback to the controller
+ * discard page. If that's also busy, it's safe to return
+ * busy, as we know we can make progress once that's freed.
+ */
+ if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy))
+ return BLK_STS_RESOURCE;
+
+ range = page_address(ns->ctrl->discard_page);
+ }
__rq_for_each_bio(bio, req) {
u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector);
@@ -565,11 +591,13 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
}
if (WARN_ON_ONCE(n != segments)) {
- kfree(range);
+ if (virt_to_page(range) == ns->ctrl->discard_page)
+ clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
+ else
+ kfree(range);
return BLK_STS_IOERR;
}
- memset(cmnd, 0, sizeof(*cmnd));
cmnd->dsm.opcode = nvme_cmd_dsm;
cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
cmnd->dsm.nr = cpu_to_le32(segments - 1);
@@ -598,7 +626,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
if (req->cmd_flags & REQ_RAHEAD)
dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
- memset(cmnd, 0, sizeof(*cmnd));
cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
@@ -650,8 +677,13 @@ void nvme_cleanup_cmd(struct request *req)
blk_rq_bytes(req) >> ns->lba_shift);
}
if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
- kfree(page_address(req->special_vec.bv_page) +
- req->special_vec.bv_offset);
+ struct nvme_ns *ns = req->rq_disk->private_data;
+ struct page *page = req->special_vec.bv_page;
+
+ if (page == ns->ctrl->discard_page)
+ clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
+ else
+ kfree(page_address(page) + req->special_vec.bv_offset);
}
}
EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
@@ -663,6 +695,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
nvme_clear_nvme_request(req);
+ memset(cmd, 0, sizeof(*cmd));
switch (req_op(req)) {
case REQ_OP_DRV_IN:
case REQ_OP_DRV_OUT:
@@ -691,6 +724,31 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
}
EXPORT_SYMBOL_GPL(nvme_setup_cmd);
+static void nvme_end_sync_rq(struct request *rq, blk_status_t error)
+{
+ struct completion *waiting = rq->end_io_data;
+
+ rq->end_io_data = NULL;
+ complete(waiting);
+}
+
+static void nvme_execute_rq_polled(struct request_queue *q,
+ struct gendisk *bd_disk, struct request *rq, int at_head)
+{
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ WARN_ON_ONCE(!test_bit(QUEUE_FLAG_POLL, &q->queue_flags));
+
+ rq->cmd_flags |= REQ_HIPRI;
+ rq->end_io_data = &wait;
+ blk_execute_rq_nowait(q, bd_disk, rq, at_head, nvme_end_sync_rq);
+
+ while (!completion_done(&wait)) {
+ blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true);
+ cond_resched();
+ }
+}
+
/*
* Returns 0 on success. If the result is negative, it's a Linux error code;
* if the result is positive, it's an NVM Express status code
@@ -698,7 +756,7 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd);
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
union nvme_result *result, void *buffer, unsigned bufflen,
unsigned timeout, int qid, int at_head,
- blk_mq_req_flags_t flags)
+ blk_mq_req_flags_t flags, bool poll)
{
struct request *req;
int ret;
@@ -715,7 +773,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
goto out;
}
- blk_execute_rq(req->q, NULL, req, at_head);
+ if (poll)
+ nvme_execute_rq_polled(req->q, NULL, req, at_head);
+ else
+ blk_execute_rq(req->q, NULL, req, at_head);
if (result)
*result = nvme_req(req)->result;
if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
@@ -732,7 +793,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
void *buffer, unsigned bufflen)
{
return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
- NVME_QID_ANY, 0, 0);
+ NVME_QID_ANY, 0, 0, false);
}
EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
@@ -843,6 +904,7 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
return;
}
+ ctrl->comp_seen = false;
spin_lock_irqsave(&ctrl->lock, flags);
if (ctrl->state == NVME_CTRL_LIVE ||
ctrl->state == NVME_CTRL_CONNECTING)
@@ -873,6 +935,15 @@ static void nvme_keep_alive_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
struct nvme_ctrl, ka_work);
+ bool comp_seen = ctrl->comp_seen;
+
+ if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
+ dev_dbg(ctrl->device,
+ "reschedule traffic based keep-alive timer\n");
+ ctrl->comp_seen = false;
+ schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
+ return;
+ }
if (nvme_keep_alive(ctrl)) {
/* allocation failure, reset the controller */
@@ -1041,7 +1112,7 @@ static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword
c.features.dword11 = cpu_to_le32(dword11);
ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
- buffer, buflen, 0, NVME_QID_ANY, 0, 0);
+ buffer, buflen, 0, NVME_QID_ANY, 0, 0, false);
if (ret >= 0 && result)
*result = le32_to_cpu(res.u32);
return ret;
@@ -1240,12 +1311,12 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
c.common.nsid = cpu_to_le32(cmd.nsid);
c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
- c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
- c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
- c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
- c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
- c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
- c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
+ c.common.cdw10 = cpu_to_le32(cmd.cdw10);
+ c.common.cdw11 = cpu_to_le32(cmd.cdw11);
+ c.common.cdw12 = cpu_to_le32(cmd.cdw12);
+ c.common.cdw13 = cpu_to_le32(cmd.cdw13);
+ c.common.cdw14 = cpu_to_le32(cmd.cdw14);
+ c.common.cdw15 = cpu_to_le32(cmd.cdw15);
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
@@ -1524,8 +1595,6 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
if (ns->noiob)
nvme_set_chunk_size(ns);
nvme_update_disk_info(disk, ns, id);
- if (ns->ndev)
- nvme_nvm_update_nvm_info(ns);
#ifdef CONFIG_NVME_MULTIPATH
if (ns->head->disk) {
nvme_update_disk_info(ns->head->disk, ns, id);
@@ -1608,7 +1677,7 @@ static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
memset(&c, 0, sizeof(c));
c.common.opcode = op;
c.common.nsid = cpu_to_le32(ns->head->ns_id);
- c.common.cdw10[0] = cpu_to_le32(cdw10);
+ c.common.cdw10 = cpu_to_le32(cdw10);
ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16);
nvme_put_ns_from_disk(head, srcu_idx);
@@ -1682,11 +1751,11 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
else
cmd.common.opcode = nvme_admin_security_recv;
cmd.common.nsid = 0;
- cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
- cmd.common.cdw10[1] = cpu_to_le32(len);
+ cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
+ cmd.common.cdw11 = cpu_to_le32(len);
return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
- ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0);
+ ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0, false);
}
EXPORT_SYMBOL_GPL(nvme_sec_submit);
#endif /* CONFIG_BLK_SED_OPAL */
@@ -1881,6 +1950,26 @@ static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
return ret;
}
+static int nvme_configure_acre(struct nvme_ctrl *ctrl)
+{
+ struct nvme_feat_host_behavior *host;
+ int ret;
+
+ /* Don't bother enabling the feature if retry delay is not reported */
+ if (!ctrl->crdt[0])
+ return 0;
+
+ host = kzalloc(sizeof(*host), GFP_KERNEL);
+ if (!host)
+ return 0;
+
+ host->acre = NVME_ENABLE_ACRE;
+ ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
+ host, sizeof(*host), NULL);
+ kfree(host);
+ return ret;
+}
+
static int nvme_configure_apst(struct nvme_ctrl *ctrl)
{
/*
@@ -2402,6 +2491,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
}
+ ctrl->crdt[0] = le16_to_cpu(id->crdt1);
+ ctrl->crdt[1] = le16_to_cpu(id->crdt2);
+ ctrl->crdt[2] = le16_to_cpu(id->crdt3);
+
ctrl->oacs = le16_to_cpu(id->oacs);
ctrl->oncs = le16_to_cpup(&id->oncs);
ctrl->oaes = le32_to_cpu(id->oaes);
@@ -2419,6 +2512,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
ctrl->sgls = le32_to_cpu(id->sgls);
ctrl->kas = le16_to_cpu(id->kas);
ctrl->max_namespaces = le32_to_cpu(id->mnan);
+ ctrl->ctratt = le32_to_cpu(id->ctratt);
if (id->rtd3e) {
/* us -> s */
@@ -2501,6 +2595,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
if (ret < 0)
return ret;
+ ret = nvme_configure_acre(ctrl);
+ if (ret < 0)
+ return ret;
+
ctrl->identified = true;
return 0;
@@ -2776,6 +2874,7 @@ static ssize_t field##_show(struct device *dev, \
static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
nvme_show_int_function(cntlid);
+nvme_show_int_function(numa_node);
static ssize_t nvme_sysfs_delete(struct device *dev,
struct device_attribute *attr, const char *buf,
@@ -2855,6 +2954,7 @@ static struct attribute *nvme_dev_attrs[] = {
&dev_attr_subsysnqn.attr,
&dev_attr_address.attr,
&dev_attr_state.attr,
+ &dev_attr_numa_node.attr,
NULL
};
@@ -3065,7 +3165,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
struct gendisk *disk;
struct nvme_id_ns *id;
char disk_name[DISK_NAME_LEN];
- int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT;
+ int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT;
ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
if (!ns)
@@ -3100,13 +3200,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
nvme_setup_streams_ns(ctrl, ns);
nvme_set_disk_name(disk_name, ns, ctrl, &flags);
- if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
- if (nvme_nvm_register(ns, disk_name, node)) {
- dev_warn(ctrl->device, "LightNVM init failure\n");
- goto out_unlink_ns;
- }
- }
-
disk = alloc_disk_node(0, node);
if (!disk)
goto out_unlink_ns;
@@ -3120,6 +3213,13 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
__nvme_revalidate_disk(disk, id);
+ if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
+ if (nvme_nvm_register(ns, disk_name, node)) {
+ dev_warn(ctrl->device, "LightNVM init failure\n");
+ goto out_put_disk;
+ }
+ }
+
down_write(&ctrl->namespaces_rwsem);
list_add_tail(&ns->list, &ctrl->namespaces);
up_write(&ctrl->namespaces_rwsem);
@@ -3133,6 +3233,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
kfree(id);
return;
+ out_put_disk:
+ put_disk(ns->disk);
out_unlink_ns:
mutex_lock(&ctrl->subsys->lock);
list_del_rcu(&ns->siblings);
@@ -3522,6 +3624,7 @@ static void nvme_free_ctrl(struct device *dev)
ida_simple_remove(&nvme_instance_ida, ctrl->instance);
kfree(ctrl->effects);
nvme_mpath_uninit(ctrl);
+ __free_page(ctrl->discard_page);
if (subsys) {
mutex_lock(&subsys->lock);
@@ -3562,6 +3665,14 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
+ BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
+ PAGE_SIZE);
+ ctrl->discard_page = alloc_page(GFP_KERNEL);
+ if (!ctrl->discard_page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
if (ret < 0)
goto out;
@@ -3599,6 +3710,8 @@ out_free_name:
out_release_instance:
ida_simple_remove(&nvme_instance_ida, ctrl->instance);
out:
+ if (ctrl->discard_page)
+ __free_page(ctrl->discard_page);
return ret;
}
EXPORT_SYMBOL_GPL(nvme_init_ctrl);
@@ -3746,7 +3859,7 @@ out:
return result;
}
-void nvme_core_exit(void)
+void __exit nvme_core_exit(void)
{
ida_destroy(&nvme_subsystems_ida);
class_destroy(nvme_subsys_class);
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index bd0969db6225..b2ab213f43de 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -159,7 +159,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
cmd.prop_get.offset = cpu_to_le32(off);
ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0,
- NVME_QID_ANY, 0, 0);
+ NVME_QID_ANY, 0, 0, false);
if (ret >= 0)
*val = le64_to_cpu(res.u64);
@@ -206,7 +206,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
cmd.prop_get.offset = cpu_to_le32(off);
ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0,
- NVME_QID_ANY, 0, 0);
+ NVME_QID_ANY, 0, 0, false);
if (ret >= 0)
*val = le64_to_cpu(res.u64);
@@ -252,7 +252,7 @@ int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
cmd.prop_set.value = cpu_to_le64(val);
ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, NULL, 0, 0,
- NVME_QID_ANY, 0, 0);
+ NVME_QID_ANY, 0, 0, false);
if (unlikely(ret))
dev_err(ctrl->device,
"Property Set error: %d, offset %#x\n",
@@ -392,6 +392,9 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
cmd.connect.kato = ctrl->opts->discovery_nqn ? 0 :
cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000);
+ if (ctrl->opts->disable_sqflow)
+ cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW;
+
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (!data)
return -ENOMEM;
@@ -403,7 +406,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res,
data, sizeof(*data), 0, NVME_QID_ANY, 1,
- BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
+ BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT, false);
if (ret) {
nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32),
&cmd, data);
@@ -438,7 +441,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_admin_queue);
* > 0: NVMe error status code
* < 0: Linux errno error code
*/
-int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
+int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid, bool poll)
{
struct nvme_command cmd;
struct nvmf_connect_data *data;
@@ -451,6 +454,9 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
cmd.connect.qid = cpu_to_le16(qid);
cmd.connect.sqsize = cpu_to_le16(ctrl->sqsize);
+ if (ctrl->opts->disable_sqflow)
+ cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW;
+
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (!data)
return -ENOMEM;
@@ -462,7 +468,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &res,
data, sizeof(*data), 0, qid, 1,
- BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
+ BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT, poll);
if (ret) {
nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32),
&cmd, data);
@@ -607,6 +613,11 @@ static const match_table_t opt_tokens = {
{ NVMF_OPT_HOST_TRADDR, "host_traddr=%s" },
{ NVMF_OPT_HOST_ID, "hostid=%s" },
{ NVMF_OPT_DUP_CONNECT, "duplicate_connect" },
+ { NVMF_OPT_DISABLE_SQFLOW, "disable_sqflow" },
+ { NVMF_OPT_HDR_DIGEST, "hdr_digest" },
+ { NVMF_OPT_DATA_DIGEST, "data_digest" },
+ { NVMF_OPT_NR_WRITE_QUEUES, "nr_write_queues=%d" },
+ { NVMF_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" },
{ NVMF_OPT_ERR, NULL }
};
@@ -626,6 +637,8 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY;
opts->kato = NVME_DEFAULT_KATO;
opts->duplicate_connect = false;
+ opts->hdr_digest = false;
+ opts->data_digest = false;
options = o = kstrdup(buf, GFP_KERNEL);
if (!options)
@@ -817,6 +830,39 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
case NVMF_OPT_DUP_CONNECT:
opts->duplicate_connect = true;
break;
+ case NVMF_OPT_DISABLE_SQFLOW:
+ opts->disable_sqflow = true;
+ break;
+ case NVMF_OPT_HDR_DIGEST:
+ opts->hdr_digest = true;
+ break;
+ case NVMF_OPT_DATA_DIGEST:
+ opts->data_digest = true;
+ break;
+ case NVMF_OPT_NR_WRITE_QUEUES:
+ if (match_int(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (token <= 0) {
+ pr_err("Invalid nr_write_queues %d\n", token);
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->nr_write_queues = token;
+ break;
+ case NVMF_OPT_NR_POLL_QUEUES:
+ if (match_int(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (token <= 0) {
+ pr_err("Invalid nr_poll_queues %d\n", token);
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->nr_poll_queues = token;
+ break;
default:
pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
p);
@@ -933,7 +979,8 @@ EXPORT_SYMBOL_GPL(nvmf_free_options);
#define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN)
#define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \
NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \
- NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT)
+ NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\
+ NVMF_OPT_DISABLE_SQFLOW)
static struct nvme_ctrl *
nvmf_create_ctrl(struct device *dev, const char *buf, size_t count)
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 6ea6275f332a..478343b73e38 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -58,6 +58,11 @@ enum {
NVMF_OPT_CTRL_LOSS_TMO = 1 << 11,
NVMF_OPT_HOST_ID = 1 << 12,
NVMF_OPT_DUP_CONNECT = 1 << 13,
+ NVMF_OPT_DISABLE_SQFLOW = 1 << 14,
+ NVMF_OPT_HDR_DIGEST = 1 << 15,
+ NVMF_OPT_DATA_DIGEST = 1 << 16,
+ NVMF_OPT_NR_WRITE_QUEUES = 1 << 17,
+ NVMF_OPT_NR_POLL_QUEUES = 1 << 18,
};
/**
@@ -85,6 +90,11 @@ enum {
* @max_reconnects: maximum number of allowed reconnect attempts before removing
* the controller, (-1) means reconnect forever, zero means remove
* immediately;
+ * @disable_sqflow: disable controller sq flow control
+ * @hdr_digest: generate/verify header digest (TCP)
+ * @data_digest: generate/verify data digest (TCP)
+ * @nr_write_queues: number of queues for write I/O
+ * @nr_poll_queues: number of queues for polling I/O
*/
struct nvmf_ctrl_options {
unsigned mask;
@@ -101,6 +111,11 @@ struct nvmf_ctrl_options {
unsigned int kato;
struct nvmf_host *host;
int max_reconnects;
+ bool disable_sqflow;
+ bool hdr_digest;
+ bool data_digest;
+ unsigned int nr_write_queues;
+ unsigned int nr_poll_queues;
};
/*
@@ -156,7 +171,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val);
int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val);
int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val);
int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl);
-int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid);
+int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid, bool poll);
int nvmf_register_transport(struct nvmf_transport_ops *ops);
void nvmf_unregister_transport(struct nvmf_transport_ops *ops);
void nvmf_free_options(struct nvmf_ctrl_options *opts);
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index feb86b59170e..89accc76d71c 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1975,7 +1975,7 @@ nvme_fc_connect_io_queues(struct nvme_fc_ctrl *ctrl, u16 qsize)
(qsize / 5));
if (ret)
break;
- ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
+ ret = nvmf_connect_io_queue(&ctrl->ctrl, i, false);
if (ret)
break;
@@ -2326,38 +2326,6 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx,
return nvme_fc_start_fcp_op(ctrl, queue, op, data_len, io_dir);
}
-static struct blk_mq_tags *
-nvme_fc_tagset(struct nvme_fc_queue *queue)
-{
- if (queue->qnum == 0)
- return queue->ctrl->admin_tag_set.tags[queue->qnum];
-
- return queue->ctrl->tag_set.tags[queue->qnum - 1];
-}
-
-static int
-nvme_fc_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
-
-{
- struct nvme_fc_queue *queue = hctx->driver_data;
- struct nvme_fc_ctrl *ctrl = queue->ctrl;
- struct request *req;
- struct nvme_fc_fcp_op *op;
-
- req = blk_mq_tag_to_rq(nvme_fc_tagset(queue), tag);
- if (!req)
- return 0;
-
- op = blk_mq_rq_to_pdu(req);
-
- if ((atomic_read(&op->state) == FCPOP_STATE_ACTIVE) &&
- (ctrl->lport->ops->poll_queue))
- ctrl->lport->ops->poll_queue(&ctrl->lport->localport,
- queue->lldd_handle);
-
- return ((atomic_read(&op->state) != FCPOP_STATE_ACTIVE));
-}
-
static void
nvme_fc_submit_async_event(struct nvme_ctrl *arg)
{
@@ -2410,7 +2378,7 @@ nvme_fc_complete_rq(struct request *rq)
* status. The done path will return the io request back to the block
* layer with an error status.
*/
-static void
+static bool
nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved)
{
struct nvme_ctrl *nctrl = data;
@@ -2418,6 +2386,7 @@ nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved)
struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req);
__nvme_fc_abort_op(ctrl, op);
+ return true;
}
@@ -2427,7 +2396,6 @@ static const struct blk_mq_ops nvme_fc_mq_ops = {
.init_request = nvme_fc_init_request,
.exit_request = nvme_fc_exit_request,
.init_hctx = nvme_fc_init_hctx,
- .poll = nvme_fc_poll,
.timeout = nvme_fc_timeout,
};
@@ -2457,7 +2425,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
ctrl->tag_set.ops = &nvme_fc_mq_ops;
ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
ctrl->tag_set.reserved_tags = 1; /* fabric connect */
- ctrl->tag_set.numa_node = NUMA_NO_NODE;
+ ctrl->tag_set.numa_node = ctrl->ctrl.numa_node;
ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
ctrl->tag_set.cmd_size =
struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
@@ -3050,6 +3018,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
ctrl->ctrl.opts = opts;
ctrl->ctrl.nr_reconnects = 0;
+ ctrl->ctrl.numa_node = dev_to_node(lport->dev);
INIT_LIST_HEAD(&ctrl->ctrl_list);
ctrl->lport = lport;
ctrl->rport = rport;
@@ -3090,7 +3059,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops;
ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */
- ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
+ ctrl->admin_tag_set.numa_node = ctrl->ctrl.numa_node;
ctrl->admin_tag_set.cmd_size =
struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
ctrl->lport->ops->fcprqst_priv_sz);
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index a4f3b263cd6c..b759c25c89c8 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -577,7 +577,8 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev,
struct ppa_addr ppa;
size_t left = nchks * sizeof(struct nvme_nvm_chk_meta);
size_t log_pos, offset, len;
- int ret, i, max_len;
+ int i, max_len;
+ int ret = 0;
/*
* limit requests to maximum 256K to avoid issuing arbitrary large
@@ -731,11 +732,12 @@ static int nvme_nvm_submit_io_sync(struct nvm_dev *dev, struct nvm_rq *rqd)
return ret;
}
-static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name)
+static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name,
+ int size)
{
struct nvme_ns *ns = nvmdev->q->queuedata;
- return dma_pool_create(name, ns->ctrl->dev, PAGE_SIZE, PAGE_SIZE, 0);
+ return dma_pool_create(name, ns->ctrl->dev, size, PAGE_SIZE, 0);
}
static void nvme_nvm_destroy_dma_pool(void *pool)
@@ -935,9 +937,9 @@ static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin,
/* cdw11-12 */
c.ph_rw.length = cpu_to_le16(vcmd.nppas);
c.ph_rw.control = cpu_to_le16(vcmd.control);
- c.common.cdw10[3] = cpu_to_le32(vcmd.cdw13);
- c.common.cdw10[4] = cpu_to_le32(vcmd.cdw14);
- c.common.cdw10[5] = cpu_to_le32(vcmd.cdw15);
+ c.common.cdw13 = cpu_to_le32(vcmd.cdw13);
+ c.common.cdw14 = cpu_to_le32(vcmd.cdw14);
+ c.common.cdw15 = cpu_to_le32(vcmd.cdw15);
if (vcmd.timeout_ms)
timeout = msecs_to_jiffies(vcmd.timeout_ms);
@@ -972,22 +974,11 @@ int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg)
}
}
-void nvme_nvm_update_nvm_info(struct nvme_ns *ns)
-{
- struct nvm_dev *ndev = ns->ndev;
- struct nvm_geo *geo = &ndev->geo;
-
- if (geo->version == NVM_OCSSD_SPEC_12)
- return;
-
- geo->csecs = 1 << ns->lba_shift;
- geo->sos = ns->ms;
-}
-
int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
{
struct request_queue *q = ns->queue;
struct nvm_dev *dev;
+ struct nvm_geo *geo;
_nvme_nvm_check_size();
@@ -995,6 +986,12 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
if (!dev)
return -ENOMEM;
+ /* Note that csecs and sos will be overridden if it is a 1.2 drive. */
+ geo = &dev->geo;
+ geo->csecs = 1 << ns->lba_shift;
+ geo->sos = ns->ms;
+ geo->ext = ns->ext;
+
dev->q = q;
memcpy(dev->name, disk_name, DISK_NAME_LEN);
dev->ops = &nvme_nvm_dev_ops;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 9901afd804ce..183ec17ba067 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -141,7 +141,7 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
test_bit(NVME_NS_ANA_PENDING, &ns->flags))
continue;
- distance = node_distance(node, dev_to_node(ns->ctrl->dev));
+ distance = node_distance(node, ns->ctrl->numa_node);
switch (ns->ana_state) {
case NVME_ANA_OPTIMIZED:
@@ -220,21 +220,6 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
return ret;
}
-static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
-{
- struct nvme_ns_head *head = q->queuedata;
- struct nvme_ns *ns;
- bool found = false;
- int srcu_idx;
-
- srcu_idx = srcu_read_lock(&head->srcu);
- ns = srcu_dereference(head->current_path[numa_node_id()], &head->srcu);
- if (likely(ns && nvme_path_is_optimized(ns)))
- found = ns->queue->poll_fn(q, qc);
- srcu_read_unlock(&head->srcu, srcu_idx);
- return found;
-}
-
static void nvme_requeue_work(struct work_struct *work)
{
struct nvme_ns_head *head =
@@ -276,12 +261,11 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
return 0;
- q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
+ q = blk_alloc_queue_node(GFP_KERNEL, ctrl->numa_node);
if (!q)
goto out;
q->queuedata = head;
blk_queue_make_request(q, nvme_ns_head_make_request);
- q->poll_fn = nvme_ns_head_poll;
blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
/* set to a default value for 512 until disk is validated */
blk_queue_logical_block_size(q, 512);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 081cbdcce880..2b36ac922596 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -145,6 +145,7 @@ enum nvme_ctrl_state {
};
struct nvme_ctrl {
+ bool comp_seen;
enum nvme_ctrl_state state;
bool identified;
spinlock_t lock;
@@ -153,6 +154,7 @@ struct nvme_ctrl {
struct request_queue *connect_q;
struct device *dev;
int instance;
+ int numa_node;
struct blk_mq_tag_set *tagset;
struct blk_mq_tag_set *admin_tagset;
struct list_head namespaces;
@@ -179,6 +181,7 @@ struct nvme_ctrl {
u32 page_size;
u32 max_hw_sectors;
u32 max_segments;
+ u16 crdt[3];
u16 oncs;
u16 oacs;
u16 nssa;
@@ -193,6 +196,7 @@ struct nvme_ctrl {
u8 apsta;
u32 oaes;
u32 aen_result;
+ u32 ctratt;
unsigned int shutdown_timeout;
unsigned int kato;
bool subsystem;
@@ -237,6 +241,9 @@ struct nvme_ctrl {
u16 maxcmd;
int nr_reconnects;
struct nvmf_ctrl_options *opts;
+
+ struct page *discard_page;
+ unsigned long discard_page_busy;
};
struct nvme_subsystem {
@@ -364,15 +371,6 @@ static inline void nvme_fault_inject_fini(struct nvme_ns *ns) {}
static inline void nvme_should_fail(struct request *req) {}
#endif
-static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl)
-{
- u32 val = 0;
-
- if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val))
- return false;
- return val & NVME_CSTS_RDY;
-}
-
static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl)
{
if (!ctrl->subsystem)
@@ -408,7 +406,7 @@ static inline void nvme_put_ctrl(struct nvme_ctrl *ctrl)
}
void nvme_complete_rq(struct request *req);
-void nvme_cancel_request(struct request *req, void *data, bool reserved);
+bool nvme_cancel_request(struct request *req, void *data, bool reserved);
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
enum nvme_ctrl_state new_state);
int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap);
@@ -449,7 +447,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
union nvme_result *result, void *buffer, unsigned bufflen,
unsigned timeout, int qid, int at_head,
- blk_mq_req_flags_t flags);
+ blk_mq_req_flags_t flags, bool poll);
int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
@@ -545,13 +543,11 @@ static inline void nvme_mpath_stop(struct nvme_ctrl *ctrl)
#endif /* CONFIG_NVME_MULTIPATH */
#ifdef CONFIG_NVM
-void nvme_nvm_update_nvm_info(struct nvme_ns *ns);
int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
void nvme_nvm_unregister(struct nvme_ns *ns);
extern const struct attribute_group nvme_nvm_attr_group;
int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg);
#else
-static inline void nvme_nvm_update_nvm_info(struct nvme_ns *ns) {};
static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name,
int node)
{
@@ -572,6 +568,6 @@ static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
}
int __init nvme_core_init(void);
-void nvme_core_exit(void);
+void __exit nvme_core_exit(void);
#endif /* _NVME_H */
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index c33bb201b884..5a0bf6a24d50 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -32,6 +32,7 @@
#include <linux/sed-opal.h>
#include <linux/pci-p2pdma.h>
+#include "trace.h"
#include "nvme.h"
#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
@@ -74,6 +75,22 @@ static int io_queue_depth = 1024;
module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");
+static int queue_count_set(const char *val, const struct kernel_param *kp);
+static const struct kernel_param_ops queue_count_ops = {
+ .set = queue_count_set,
+ .get = param_get_int,
+};
+
+static int write_queues;
+module_param_cb(write_queues, &queue_count_ops, &write_queues, 0644);
+MODULE_PARM_DESC(write_queues,
+ "Number of queues to use for writes. If not set, reads and writes "
+ "will share a queue set.");
+
+static int poll_queues = 0;
+module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644);
+MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
+
struct nvme_dev;
struct nvme_queue;
@@ -92,6 +109,7 @@ struct nvme_dev {
struct dma_pool *prp_small_pool;
unsigned online_queues;
unsigned max_qid;
+ unsigned io_queues[HCTX_MAX_TYPES];
unsigned int num_vecs;
int q_depth;
u32 db_stride;
@@ -105,7 +123,6 @@ struct nvme_dev {
u32 cmbsz;
u32 cmbloc;
struct nvme_ctrl ctrl;
- struct completion ioq_wait;
mempool_t *iod_mempool;
@@ -134,6 +151,17 @@ static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
return param_set_int(val, kp);
}
+static int queue_count_set(const char *val, const struct kernel_param *kp)
+{
+ int n = 0, ret;
+
+ ret = kstrtoint(val, 10, &n);
+ if (n > num_possible_cpus())
+ n = num_possible_cpus();
+
+ return param_set_int(val, kp);
+}
+
static inline unsigned int sq_idx(unsigned int qid, u32 stride)
{
return qid * 2 * stride;
@@ -158,8 +186,8 @@ struct nvme_queue {
struct nvme_dev *dev;
spinlock_t sq_lock;
struct nvme_command *sq_cmds;
- bool sq_cmds_is_io;
- spinlock_t cq_lock ____cacheline_aligned_in_smp;
+ /* only used for poll queues: */
+ spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
volatile struct nvme_completion *cqes;
struct blk_mq_tags **tags;
dma_addr_t sq_dma_addr;
@@ -168,14 +196,20 @@ struct nvme_queue {
u16 q_depth;
s16 cq_vector;
u16 sq_tail;
+ u16 last_sq_tail;
u16 cq_head;
u16 last_cq_head;
u16 qid;
u8 cq_phase;
+ unsigned long flags;
+#define NVMEQ_ENABLED 0
+#define NVMEQ_SQ_CMB 1
+#define NVMEQ_DELETE_ERROR 2
u32 *dbbuf_sq_db;
u32 *dbbuf_cq_db;
u32 *dbbuf_sq_ei;
u32 *dbbuf_cq_ei;
+ struct completion delete_done;
};
/*
@@ -218,9 +252,20 @@ static inline void _nvme_check_size(void)
BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
}
+static unsigned int max_io_queues(void)
+{
+ return num_possible_cpus() + write_queues + poll_queues;
+}
+
+static unsigned int max_queue_count(void)
+{
+ /* IO queues + admin queue */
+ return 1 + max_io_queues();
+}
+
static inline unsigned int nvme_dbbuf_size(u32 stride)
{
- return ((num_possible_cpus() + 1) * 8 * stride);
+ return (max_queue_count() * 8 * stride);
}
static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
@@ -431,30 +476,90 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
return 0;
}
+static int queue_irq_offset(struct nvme_dev *dev)
+{
+ /* if we have more than 1 vec, admin queue offsets us by 1 */
+ if (dev->num_vecs > 1)
+ return 1;
+
+ return 0;
+}
+
static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
{
struct nvme_dev *dev = set->driver_data;
+ int i, qoff, offset;
+
+ offset = queue_irq_offset(dev);
+ for (i = 0, qoff = 0; i < set->nr_maps; i++) {
+ struct blk_mq_queue_map *map = &set->map[i];
+
+ map->nr_queues = dev->io_queues[i];
+ if (!map->nr_queues) {
+ BUG_ON(i == HCTX_TYPE_DEFAULT);
+ continue;
+ }
- return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev),
- dev->num_vecs > 1 ? 1 /* admin queue */ : 0);
+ /*
+ * The poll queue(s) doesn't have an IRQ (and hence IRQ
+ * affinity), so use the regular blk-mq cpu mapping
+ */
+ map->queue_offset = qoff;
+ if (i != HCTX_TYPE_POLL)
+ blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
+ else
+ blk_mq_map_queues(map);
+ qoff += map->nr_queues;
+ offset += map->nr_queues;
+ }
+
+ return 0;
+}
+
+/*
+ * Write sq tail if we are asked to, or if the next command would wrap.
+ */
+static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq)
+{
+ if (!write_sq) {
+ u16 next_tail = nvmeq->sq_tail + 1;
+
+ if (next_tail == nvmeq->q_depth)
+ next_tail = 0;
+ if (next_tail != nvmeq->last_sq_tail)
+ return;
+ }
+
+ if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
+ nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
+ writel(nvmeq->sq_tail, nvmeq->q_db);
+ nvmeq->last_sq_tail = nvmeq->sq_tail;
}
/**
* nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
* @nvmeq: The queue to use
* @cmd: The command to send
+ * @write_sq: whether to write to the SQ doorbell
*/
-static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
+static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
+ bool write_sq)
{
spin_lock(&nvmeq->sq_lock);
-
memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd));
-
if (++nvmeq->sq_tail == nvmeq->q_depth)
nvmeq->sq_tail = 0;
- if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
- nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
- writel(nvmeq->sq_tail, nvmeq->q_db);
+ nvme_write_sq_db(nvmeq, write_sq);
+ spin_unlock(&nvmeq->sq_lock);
+}
+
+static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
+{
+ struct nvme_queue *nvmeq = hctx->driver_data;
+
+ spin_lock(&nvmeq->sq_lock);
+ if (nvmeq->sq_tail != nvmeq->last_sq_tail)
+ nvme_write_sq_db(nvmeq, true);
spin_unlock(&nvmeq->sq_lock);
}
@@ -822,7 +927,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
* We should not need to do this, but we're still using this to
* ensure we can drain requests on a dying queue.
*/
- if (unlikely(nvmeq->cq_vector < 0))
+ if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
return BLK_STS_IOERR;
ret = nvme_setup_cmd(ns, req, &cmnd);
@@ -840,7 +945,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
}
blk_mq_start_request(req);
- nvme_submit_cmd(nvmeq, &cmnd);
+ nvme_submit_cmd(nvmeq, &cmnd, bd->last);
return BLK_STS_OK;
out_cleanup_iod:
nvme_free_iod(dev, req);
@@ -899,6 +1004,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
}
req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id);
+ trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
nvme_end_request(req, cqe->status, cqe->result);
}
@@ -919,15 +1025,15 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
}
}
-static inline bool nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
- u16 *end, int tag)
+static inline int nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
+ u16 *end, unsigned int tag)
{
- bool found = false;
+ int found = 0;
*start = nvmeq->cq_head;
- while (!found && nvme_cqe_pending(nvmeq)) {
- if (nvmeq->cqes[nvmeq->cq_head].command_id == tag)
- found = true;
+ while (nvme_cqe_pending(nvmeq)) {
+ if (tag == -1U || nvmeq->cqes[nvmeq->cq_head].command_id == tag)
+ found++;
nvme_update_cq_head(nvmeq);
}
*end = nvmeq->cq_head;
@@ -943,12 +1049,16 @@ static irqreturn_t nvme_irq(int irq, void *data)
irqreturn_t ret = IRQ_NONE;
u16 start, end;
- spin_lock(&nvmeq->cq_lock);
+ /*
+ * The rmb/wmb pair ensures we see all updates from a previous run of
+ * the irq handler, even if that was on another CPU.
+ */
+ rmb();
if (nvmeq->cq_head != nvmeq->last_cq_head)
ret = IRQ_HANDLED;
nvme_process_cq(nvmeq, &start, &end, -1);
nvmeq->last_cq_head = nvmeq->cq_head;
- spin_unlock(&nvmeq->cq_lock);
+ wmb();
if (start != end) {
nvme_complete_cqes(nvmeq, start, end);
@@ -966,27 +1076,50 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
return IRQ_NONE;
}
-static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
+/*
+ * Poll for completions any queue, including those not dedicated to polling.
+ * Can be called from any context.
+ */
+static int nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag)
{
+ struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
u16 start, end;
- bool found;
+ int found;
- if (!nvme_cqe_pending(nvmeq))
- return 0;
-
- spin_lock_irq(&nvmeq->cq_lock);
- found = nvme_process_cq(nvmeq, &start, &end, tag);
- spin_unlock_irq(&nvmeq->cq_lock);
+ /*
+ * For a poll queue we need to protect against the polling thread
+ * using the CQ lock. For normal interrupt driven threads we have
+ * to disable the interrupt to avoid racing with it.
+ */
+ if (nvmeq->cq_vector == -1) {
+ spin_lock(&nvmeq->cq_poll_lock);
+ found = nvme_process_cq(nvmeq, &start, &end, tag);
+ spin_unlock(&nvmeq->cq_poll_lock);
+ } else {
+ disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
+ found = nvme_process_cq(nvmeq, &start, &end, tag);
+ enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
+ }
nvme_complete_cqes(nvmeq, start, end);
return found;
}
-static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+static int nvme_poll(struct blk_mq_hw_ctx *hctx)
{
struct nvme_queue *nvmeq = hctx->driver_data;
+ u16 start, end;
+ bool found;
+
+ if (!nvme_cqe_pending(nvmeq))
+ return 0;
+
+ spin_lock(&nvmeq->cq_poll_lock);
+ found = nvme_process_cq(nvmeq, &start, &end, -1);
+ spin_unlock(&nvmeq->cq_poll_lock);
- return __nvme_poll(nvmeq, tag);
+ nvme_complete_cqes(nvmeq, start, end);
+ return found;
}
static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
@@ -998,7 +1131,7 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
memset(&c, 0, sizeof(c));
c.common.opcode = nvme_admin_async_event;
c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
- nvme_submit_cmd(nvmeq, &c);
+ nvme_submit_cmd(nvmeq, &c, true);
}
static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
@@ -1016,7 +1149,10 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
struct nvme_queue *nvmeq, s16 vector)
{
struct nvme_command c;
- int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
+ int flags = NVME_QUEUE_PHYS_CONTIG;
+
+ if (vector != -1)
+ flags |= NVME_CQ_IRQ_ENABLED;
/*
* Note: we (ab)use the fact that the prp fields survive if no data
@@ -1028,7 +1164,10 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
c.create_cq.cqid = cpu_to_le16(qid);
c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
c.create_cq.cq_flags = cpu_to_le16(flags);
- c.create_cq.irq_vector = cpu_to_le16(vector);
+ if (vector != -1)
+ c.create_cq.irq_vector = cpu_to_le16(vector);
+ else
+ c.create_cq.irq_vector = 0;
return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}
@@ -1157,7 +1296,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
/*
* Did we miss an interrupt?
*/
- if (__nvme_poll(nvmeq, req->tag)) {
+ if (nvme_poll_irqdisable(nvmeq, req->tag)) {
dev_warn(dev->ctrl.device,
"I/O %d QID %d timeout, completion polled\n",
req->tag, nvmeq->qid);
@@ -1237,17 +1376,15 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
{
dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
+ if (!nvmeq->sq_cmds)
+ return;
- if (nvmeq->sq_cmds) {
- if (nvmeq->sq_cmds_is_io)
- pci_free_p2pmem(to_pci_dev(nvmeq->q_dmadev),
- nvmeq->sq_cmds,
- SQ_SIZE(nvmeq->q_depth));
- else
- dma_free_coherent(nvmeq->q_dmadev,
- SQ_SIZE(nvmeq->q_depth),
- nvmeq->sq_cmds,
- nvmeq->sq_dma_addr);
+ if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
+ pci_free_p2pmem(to_pci_dev(nvmeq->q_dmadev),
+ nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth));
+ } else {
+ dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
+ nvmeq->sq_cmds, nvmeq->sq_dma_addr);
}
}
@@ -1267,47 +1404,32 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
*/
static int nvme_suspend_queue(struct nvme_queue *nvmeq)
{
- int vector;
-
- spin_lock_irq(&nvmeq->cq_lock);
- if (nvmeq->cq_vector == -1) {
- spin_unlock_irq(&nvmeq->cq_lock);
+ if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags))
return 1;
- }
- vector = nvmeq->cq_vector;
- nvmeq->dev->online_queues--;
- nvmeq->cq_vector = -1;
- spin_unlock_irq(&nvmeq->cq_lock);
- /*
- * Ensure that nvme_queue_rq() sees it ->cq_vector == -1 without
- * having to grab the lock.
- */
+ /* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
mb();
+ nvmeq->dev->online_queues--;
if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
-
- pci_free_irq(to_pci_dev(nvmeq->dev->dev), vector, nvmeq);
-
+ if (nvmeq->cq_vector == -1)
+ return 0;
+ pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
+ nvmeq->cq_vector = -1;
return 0;
}
static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
{
struct nvme_queue *nvmeq = &dev->queues[0];
- u16 start, end;
if (shutdown)
nvme_shutdown_ctrl(&dev->ctrl);
else
nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
- spin_lock_irq(&nvmeq->cq_lock);
- nvme_process_cq(nvmeq, &start, &end, -1);
- spin_unlock_irq(&nvmeq->cq_lock);
-
- nvme_complete_cqes(nvmeq, start, end);
+ nvme_poll_irqdisable(nvmeq, -1);
}
static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
@@ -1343,15 +1465,14 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth));
nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
nvmeq->sq_cmds);
- nvmeq->sq_cmds_is_io = true;
- }
-
- if (!nvmeq->sq_cmds) {
- nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
- &nvmeq->sq_dma_addr, GFP_KERNEL);
- nvmeq->sq_cmds_is_io = false;
+ if (nvmeq->sq_dma_addr) {
+ set_bit(NVMEQ_SQ_CMB, &nvmeq->flags);
+ return 0;
+ }
}
+ nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
+ &nvmeq->sq_dma_addr, GFP_KERNEL);
if (!nvmeq->sq_cmds)
return -ENOMEM;
return 0;
@@ -1375,7 +1496,7 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
nvmeq->q_dmadev = dev->dev;
nvmeq->dev = dev;
spin_lock_init(&nvmeq->sq_lock);
- spin_lock_init(&nvmeq->cq_lock);
+ spin_lock_init(&nvmeq->cq_poll_lock);
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
@@ -1411,28 +1532,34 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
struct nvme_dev *dev = nvmeq->dev;
- spin_lock_irq(&nvmeq->cq_lock);
nvmeq->sq_tail = 0;
+ nvmeq->last_sq_tail = 0;
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
nvme_dbbuf_init(dev, nvmeq, qid);
dev->online_queues++;
- spin_unlock_irq(&nvmeq->cq_lock);
+ wmb(); /* ensure the first interrupt sees the initialization */
}
-static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
+static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
{
struct nvme_dev *dev = nvmeq->dev;
int result;
s16 vector;
+ clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
+
/*
* A queue's vector matches the queue identifier unless the controller
* has only one vector available.
*/
- vector = dev->num_vecs == 1 ? 0 : qid;
+ if (!polled)
+ vector = dev->num_vecs == 1 ? 0 : qid;
+ else
+ vector = -1;
+
result = adapter_alloc_cq(dev, qid, nvmeq, vector);
if (result)
return result;
@@ -1443,17 +1570,16 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
else if (result)
goto release_cq;
- /*
- * Set cq_vector after alloc cq/sq, otherwise nvme_suspend_queue will
- * invoke free_irq for it and cause a 'Trying to free already-free IRQ
- * xxx' warning if the create CQ/SQ command times out.
- */
nvmeq->cq_vector = vector;
nvme_init_queue(nvmeq, qid);
- result = queue_request_irq(nvmeq);
- if (result < 0)
- goto release_sq;
+ if (vector != -1) {
+ result = queue_request_irq(nvmeq);
+ if (result < 0)
+ goto release_sq;
+ }
+
+ set_bit(NVMEQ_ENABLED, &nvmeq->flags);
return result;
release_sq:
@@ -1477,6 +1603,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
static const struct blk_mq_ops nvme_mq_ops = {
.queue_rq = nvme_queue_rq,
.complete = nvme_pci_complete_rq,
+ .commit_rqs = nvme_commit_rqs,
.init_hctx = nvme_init_hctx,
.init_request = nvme_init_request,
.map_queues = nvme_pci_map_queues,
@@ -1602,12 +1729,13 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
return result;
}
+ set_bit(NVMEQ_ENABLED, &nvmeq->flags);
return result;
}
static int nvme_create_io_queues(struct nvme_dev *dev)
{
- unsigned i, max;
+ unsigned i, max, rw_queues;
int ret = 0;
for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
@@ -1618,8 +1746,17 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
}
max = min(dev->max_qid, dev->ctrl.queue_count - 1);
+ if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
+ rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
+ dev->io_queues[HCTX_TYPE_READ];
+ } else {
+ rw_queues = max;
+ }
+
for (i = dev->online_queues; i <= max; i++) {
- ret = nvme_create_queue(&dev->queues[i], i);
+ bool polled = i > rw_queues;
+
+ ret = nvme_create_queue(&dev->queues[i], i, polled);
if (ret)
break;
}
@@ -1891,6 +2028,110 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
return ret;
}
+static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int irq_queues)
+{
+ unsigned int this_w_queues = write_queues;
+
+ /*
+ * Setup read/write queue split
+ */
+ if (irq_queues == 1) {
+ dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
+ dev->io_queues[HCTX_TYPE_READ] = 0;
+ return;
+ }
+
+ /*
+ * If 'write_queues' is set, ensure it leaves room for at least
+ * one read queue
+ */
+ if (this_w_queues >= irq_queues)
+ this_w_queues = irq_queues - 1;
+
+ /*
+ * If 'write_queues' is set to zero, reads and writes will share
+ * a queue set.
+ */
+ if (!this_w_queues) {
+ dev->io_queues[HCTX_TYPE_DEFAULT] = irq_queues;
+ dev->io_queues[HCTX_TYPE_READ] = 0;
+ } else {
+ dev->io_queues[HCTX_TYPE_DEFAULT] = this_w_queues;
+ dev->io_queues[HCTX_TYPE_READ] = irq_queues - this_w_queues;
+ }
+}
+
+static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
+{
+ struct pci_dev *pdev = to_pci_dev(dev->dev);
+ int irq_sets[2];
+ struct irq_affinity affd = {
+ .pre_vectors = 1,
+ .nr_sets = ARRAY_SIZE(irq_sets),
+ .sets = irq_sets,
+ };
+ int result = 0;
+ unsigned int irq_queues, this_p_queues;
+
+ /*
+ * Poll queues don't need interrupts, but we need at least one IO
+ * queue left over for non-polled IO.
+ */
+ this_p_queues = poll_queues;
+ if (this_p_queues >= nr_io_queues) {
+ this_p_queues = nr_io_queues - 1;
+ irq_queues = 1;
+ } else {
+ irq_queues = nr_io_queues - this_p_queues;
+ }
+ dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;
+
+ /*
+ * For irq sets, we have to ask for minvec == maxvec. This passes
+ * any reduction back to us, so we can adjust our queue counts and
+ * IRQ vector needs.
+ */
+ do {
+ nvme_calc_io_queues(dev, irq_queues);
+ irq_sets[0] = dev->io_queues[HCTX_TYPE_DEFAULT];
+ irq_sets[1] = dev->io_queues[HCTX_TYPE_READ];
+ if (!irq_sets[1])
+ affd.nr_sets = 1;
+
+ /*
+ * If we got a failure and we're down to asking for just
+ * 1 + 1 queues, just ask for a single vector. We'll share
+ * that between the single IO queue and the admin queue.
+ */
+ if (result >= 0 && irq_queues > 1)
+ irq_queues = irq_sets[0] + irq_sets[1] + 1;
+
+ result = pci_alloc_irq_vectors_affinity(pdev, irq_queues,
+ irq_queues,
+ PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
+
+ /*
+ * Need to reduce our vec counts. If we get ENOSPC, the
+ * platform should support mulitple vecs, we just need
+ * to decrease our ask. If we get EINVAL, the platform
+ * likely does not. Back down to ask for just one vector.
+ */
+ if (result == -ENOSPC) {
+ irq_queues--;
+ if (!irq_queues)
+ return result;
+ continue;
+ } else if (result == -EINVAL) {
+ irq_queues = 1;
+ continue;
+ } else if (result <= 0)
+ return -EIO;
+ break;
+ } while (1);
+
+ return result;
+}
+
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
struct nvme_queue *adminq = &dev->queues[0];
@@ -1898,17 +2139,15 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
int result, nr_io_queues;
unsigned long size;
- struct irq_affinity affd = {
- .pre_vectors = 1
- };
-
- nr_io_queues = num_possible_cpus();
+ nr_io_queues = max_io_queues();
result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
if (result < 0)
return result;
if (nr_io_queues == 0)
return 0;
+
+ clear_bit(NVMEQ_ENABLED, &adminq->flags);
if (dev->cmb_use_sqes) {
result = nvme_cmb_qdepth(dev, nr_io_queues,
@@ -1937,12 +2176,19 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
* setting up the full range we need.
*/
pci_free_irq_vectors(pdev);
- result = pci_alloc_irq_vectors_affinity(pdev, 1, nr_io_queues + 1,
- PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
+
+ result = nvme_setup_irqs(dev, nr_io_queues);
if (result <= 0)
return -EIO;
+
dev->num_vecs = result;
- dev->max_qid = max(result - 1, 1);
+ result = max(result - 1, 1);
+ dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];
+
+ dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
+ dev->io_queues[HCTX_TYPE_DEFAULT],
+ dev->io_queues[HCTX_TYPE_READ],
+ dev->io_queues[HCTX_TYPE_POLL]);
/*
* Should investigate if there's a performance win from allocating
@@ -1956,6 +2202,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
adminq->cq_vector = -1;
return result;
}
+ set_bit(NVMEQ_ENABLED, &adminq->flags);
return nvme_create_io_queues(dev);
}
@@ -1964,23 +2211,15 @@ static void nvme_del_queue_end(struct request *req, blk_status_t error)
struct nvme_queue *nvmeq = req->end_io_data;
blk_mq_free_request(req);
- complete(&nvmeq->dev->ioq_wait);
+ complete(&nvmeq->delete_done);
}
static void nvme_del_cq_end(struct request *req, blk_status_t error)
{
struct nvme_queue *nvmeq = req->end_io_data;
- u16 start, end;
-
- if (!error) {
- unsigned long flags;
- spin_lock_irqsave(&nvmeq->cq_lock, flags);
- nvme_process_cq(nvmeq, &start, &end, -1);
- spin_unlock_irqrestore(&nvmeq->cq_lock, flags);
-
- nvme_complete_cqes(nvmeq, start, end);
- }
+ if (error)
+ set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
nvme_del_queue_end(req, error);
}
@@ -2002,37 +2241,44 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
req->timeout = ADMIN_TIMEOUT;
req->end_io_data = nvmeq;
+ init_completion(&nvmeq->delete_done);
blk_execute_rq_nowait(q, NULL, req, false,
opcode == nvme_admin_delete_cq ?
nvme_del_cq_end : nvme_del_queue_end);
return 0;
}
-static void nvme_disable_io_queues(struct nvme_dev *dev)
+static bool nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
{
- int pass, queues = dev->online_queues - 1;
+ int nr_queues = dev->online_queues - 1, sent = 0;
unsigned long timeout;
- u8 opcode = nvme_admin_delete_sq;
-
- for (pass = 0; pass < 2; pass++) {
- int sent = 0, i = queues;
- reinit_completion(&dev->ioq_wait);
retry:
- timeout = ADMIN_TIMEOUT;
- for (; i > 0; i--, sent++)
- if (nvme_delete_queue(&dev->queues[i], opcode))
- break;
-
- while (sent--) {
- timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout);
- if (timeout == 0)
- return;
- if (i)
- goto retry;
- }
- opcode = nvme_admin_delete_cq;
+ timeout = ADMIN_TIMEOUT;
+ while (nr_queues > 0) {
+ if (nvme_delete_queue(&dev->queues[nr_queues], opcode))
+ break;
+ nr_queues--;
+ sent++;
}
+ while (sent) {
+ struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent];
+
+ timeout = wait_for_completion_io_timeout(&nvmeq->delete_done,
+ timeout);
+ if (timeout == 0)
+ return false;
+
+ /* handle any remaining CQEs */
+ if (opcode == nvme_admin_delete_cq &&
+ !test_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags))
+ nvme_poll_irqdisable(nvmeq, -1);
+
+ sent--;
+ if (nr_queues)
+ goto retry;
+ }
+ return true;
}
/*
@@ -2045,6 +2291,10 @@ static int nvme_dev_add(struct nvme_dev *dev)
if (!dev->ctrl.tagset) {
dev->tagset.ops = &nvme_mq_ops;
dev->tagset.nr_hw_queues = dev->online_queues - 1;
+ dev->tagset.nr_maps = 2; /* default + read */
+ if (dev->io_queues[HCTX_TYPE_POLL])
+ dev->tagset.nr_maps++;
+ dev->tagset.nr_maps = HCTX_MAX_TYPES;
dev->tagset.timeout = NVME_IO_TIMEOUT;
dev->tagset.numa_node = dev_to_node(dev->dev);
dev->tagset.queue_depth =
@@ -2187,7 +2437,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
nvme_stop_queues(&dev->ctrl);
if (!dead && dev->ctrl.queue_count > 0) {
- nvme_disable_io_queues(dev);
+ if (nvme_disable_io_queues(dev, nvme_admin_delete_sq))
+ nvme_disable_io_queues(dev, nvme_admin_delete_cq);
nvme_disable_admin_queue(dev, shutdown);
}
for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
@@ -2491,8 +2742,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (!dev)
return -ENOMEM;
- dev->queues = kcalloc_node(num_possible_cpus() + 1,
- sizeof(struct nvme_queue), GFP_KERNEL, node);
+ dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue),
+ GFP_KERNEL, node);
if (!dev->queues)
goto free;
@@ -2506,7 +2757,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
mutex_init(&dev->shutdown_lock);
- init_completion(&dev->ioq_wait);
result = nvme_setup_prp_pools(dev);
if (result)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index ab6ec7295bf9..0a2fd2949ad7 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -162,6 +162,13 @@ static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue)
return queue - queue->ctrl->queues;
}
+static bool nvme_rdma_poll_queue(struct nvme_rdma_queue *queue)
+{
+ return nvme_rdma_queue_idx(queue) >
+ queue->ctrl->ctrl.opts->nr_io_queues +
+ queue->ctrl->ctrl.opts->nr_write_queues;
+}
+
static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue)
{
return queue->cmnd_capsule_len - sizeof(struct nvme_command);
@@ -440,6 +447,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
const int send_wr_factor = 3; /* MR, SEND, INV */
const int cq_factor = send_wr_factor + 1; /* + RECV */
int comp_vector, idx = nvme_rdma_queue_idx(queue);
+ enum ib_poll_context poll_ctx;
int ret;
queue->device = nvme_rdma_find_get_device(queue->cm_id);
@@ -456,10 +464,16 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
*/
comp_vector = idx == 0 ? idx : idx - 1;
+ /* Polling queues need direct cq polling context */
+ if (nvme_rdma_poll_queue(queue))
+ poll_ctx = IB_POLL_DIRECT;
+ else
+ poll_ctx = IB_POLL_SOFTIRQ;
+
/* +1 for ib_stop_cq */
queue->ib_cq = ib_alloc_cq(ibdev, queue,
cq_factor * queue->queue_size + 1,
- comp_vector, IB_POLL_SOFTIRQ);
+ comp_vector, poll_ctx);
if (IS_ERR(queue->ib_cq)) {
ret = PTR_ERR(queue->ib_cq);
goto out_put_dev;
@@ -595,15 +609,17 @@ static void nvme_rdma_stop_io_queues(struct nvme_rdma_ctrl *ctrl)
static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx)
{
+ struct nvme_rdma_queue *queue = &ctrl->queues[idx];
+ bool poll = nvme_rdma_poll_queue(queue);
int ret;
if (idx)
- ret = nvmf_connect_io_queue(&ctrl->ctrl, idx);
+ ret = nvmf_connect_io_queue(&ctrl->ctrl, idx, poll);
else
ret = nvmf_connect_admin_queue(&ctrl->ctrl);
if (!ret)
- set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[idx].flags);
+ set_bit(NVME_RDMA_Q_LIVE, &queue->flags);
else
dev_info(ctrl->ctrl.device,
"failed to connect queue: %d ret=%d\n", idx, ret);
@@ -645,6 +661,9 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
nr_io_queues = min_t(unsigned int, nr_io_queues,
ibdev->num_comp_vectors);
+ nr_io_queues += min(opts->nr_write_queues, num_online_cpus());
+ nr_io_queues += min(opts->nr_poll_queues, num_online_cpus());
+
ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
if (ret)
return ret;
@@ -694,7 +713,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
set->ops = &nvme_rdma_admin_mq_ops;
set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
set->reserved_tags = 2; /* connect + keep-alive */
- set->numa_node = NUMA_NO_NODE;
+ set->numa_node = nctrl->numa_node;
set->cmd_size = sizeof(struct nvme_rdma_request) +
SG_CHUNK_SIZE * sizeof(struct scatterlist);
set->driver_data = ctrl;
@@ -707,13 +726,14 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
set->ops = &nvme_rdma_mq_ops;
set->queue_depth = nctrl->sqsize + 1;
set->reserved_tags = 1; /* fabric connect */
- set->numa_node = NUMA_NO_NODE;
+ set->numa_node = nctrl->numa_node;
set->flags = BLK_MQ_F_SHOULD_MERGE;
set->cmd_size = sizeof(struct nvme_rdma_request) +
SG_CHUNK_SIZE * sizeof(struct scatterlist);
set->driver_data = ctrl;
set->nr_hw_queues = nctrl->queue_count - 1;
set->timeout = NVME_IO_TIMEOUT;
+ set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
}
ret = blk_mq_alloc_tag_set(set);
@@ -763,6 +783,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
return error;
ctrl->device = ctrl->queues[0].device;
+ ctrl->ctrl.numa_node = dev_to_node(ctrl->device->dev->dma_device);
ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
@@ -1411,12 +1432,11 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
WARN_ON_ONCE(ret);
}
-static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
- struct nvme_completion *cqe, struct ib_wc *wc, int tag)
+static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
+ struct nvme_completion *cqe, struct ib_wc *wc)
{
struct request *rq;
struct nvme_rdma_request *req;
- int ret = 0;
rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id);
if (!rq) {
@@ -1424,7 +1444,7 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
"tag 0x%x on QP %#x not found\n",
cqe->command_id, queue->qp->qp_num);
nvme_rdma_error_recovery(queue->ctrl);
- return ret;
+ return;
}
req = blk_mq_rq_to_pdu(rq);
@@ -1439,6 +1459,8 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
nvme_rdma_error_recovery(queue->ctrl);
}
} else if (req->mr) {
+ int ret;
+
ret = nvme_rdma_inv_rkey(queue, req);
if (unlikely(ret < 0)) {
dev_err(queue->ctrl->ctrl.device,
@@ -1447,19 +1469,14 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
nvme_rdma_error_recovery(queue->ctrl);
}
/* the local invalidation completion will end the request */
- return 0;
+ return;
}
- if (refcount_dec_and_test(&req->ref)) {
- if (rq->tag == tag)
- ret = 1;
+ if (refcount_dec_and_test(&req->ref))
nvme_end_request(rq, req->status, req->result);
- }
-
- return ret;
}
-static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag)
+static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct nvme_rdma_qe *qe =
container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
@@ -1467,11 +1484,10 @@ static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag)
struct ib_device *ibdev = queue->device->dev;
struct nvme_completion *cqe = qe->data;
const size_t len = sizeof(struct nvme_completion);
- int ret = 0;
if (unlikely(wc->status != IB_WC_SUCCESS)) {
nvme_rdma_wr_error(cq, wc, "RECV");
- return 0;
+ return;
}
ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE);
@@ -1486,16 +1502,10 @@ static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag)
nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
&cqe->result);
else
- ret = nvme_rdma_process_nvme_rsp(queue, cqe, wc, tag);
+ nvme_rdma_process_nvme_rsp(queue, cqe, wc);
ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE);
nvme_rdma_post_recv(queue, qe);
- return ret;
-}
-
-static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
-{
- __nvme_rdma_recv_done(cq, wc, -1);
}
static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue)
@@ -1749,25 +1759,11 @@ err:
return BLK_STS_IOERR;
}
-static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
{
struct nvme_rdma_queue *queue = hctx->driver_data;
- struct ib_cq *cq = queue->ib_cq;
- struct ib_wc wc;
- int found = 0;
-
- while (ib_poll_cq(cq, 1, &wc) > 0) {
- struct ib_cqe *cqe = wc.wr_cqe;
-
- if (cqe) {
- if (cqe->done == nvme_rdma_recv_done)
- found |= __nvme_rdma_recv_done(cq, &wc, tag);
- else
- cqe->done(cq, &wc);
- }
- }
- return found;
+ return ib_process_cq_direct(queue->ib_cq, -1);
}
static void nvme_rdma_complete_rq(struct request *rq)
@@ -1782,7 +1778,36 @@ static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
{
struct nvme_rdma_ctrl *ctrl = set->driver_data;
- return blk_mq_rdma_map_queues(set, ctrl->device->dev, 0);
+ set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+ set->map[HCTX_TYPE_READ].nr_queues = ctrl->ctrl.opts->nr_io_queues;
+ if (ctrl->ctrl.opts->nr_write_queues) {
+ /* separate read/write queues */
+ set->map[HCTX_TYPE_DEFAULT].nr_queues =
+ ctrl->ctrl.opts->nr_write_queues;
+ set->map[HCTX_TYPE_READ].queue_offset =
+ ctrl->ctrl.opts->nr_write_queues;
+ } else {
+ /* mixed read/write queues */
+ set->map[HCTX_TYPE_DEFAULT].nr_queues =
+ ctrl->ctrl.opts->nr_io_queues;
+ set->map[HCTX_TYPE_READ].queue_offset = 0;
+ }
+ blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT],
+ ctrl->device->dev, 0);
+ blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ],
+ ctrl->device->dev, 0);
+
+ if (ctrl->ctrl.opts->nr_poll_queues) {
+ set->map[HCTX_TYPE_POLL].nr_queues =
+ ctrl->ctrl.opts->nr_poll_queues;
+ set->map[HCTX_TYPE_POLL].queue_offset =
+ ctrl->ctrl.opts->nr_io_queues;
+ if (ctrl->ctrl.opts->nr_write_queues)
+ set->map[HCTX_TYPE_POLL].queue_offset +=
+ ctrl->ctrl.opts->nr_write_queues;
+ blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
+ }
+ return 0;
}
static const struct blk_mq_ops nvme_rdma_mq_ops = {
@@ -1791,9 +1816,9 @@ static const struct blk_mq_ops nvme_rdma_mq_ops = {
.init_request = nvme_rdma_init_request,
.exit_request = nvme_rdma_exit_request,
.init_hctx = nvme_rdma_init_hctx,
- .poll = nvme_rdma_poll,
.timeout = nvme_rdma_timeout,
.map_queues = nvme_rdma_map_queues,
+ .poll = nvme_rdma_poll,
};
static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
@@ -1938,7 +1963,8 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work);
- ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */
+ ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
+ opts->nr_poll_queues + 1;
ctrl->ctrl.sqsize = opts->queue_size - 1;
ctrl->ctrl.kato = opts->kato;
@@ -1989,7 +2015,8 @@ static struct nvmf_transport_ops nvme_rdma_transport = {
.module = THIS_MODULE,
.required_opts = NVMF_OPT_TRADDR,
.allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
- NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO,
+ NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
+ NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES,
.create_ctrl = nvme_rdma_create_ctrl,
};
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
new file mode 100644
index 000000000000..de174912445e
--- /dev/null
+++ b/drivers/nvme/host/tcp.c
@@ -0,0 +1,2278 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe over Fabrics TCP host.
+ * Copyright (c) 2018 Lightbits Labs. All rights reserved.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/nvme-tcp.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/blk-mq.h>
+#include <crypto/hash.h>
+
+#include "nvme.h"
+#include "fabrics.h"
+
+struct nvme_tcp_queue;
+
+enum nvme_tcp_send_state {
+ NVME_TCP_SEND_CMD_PDU = 0,
+ NVME_TCP_SEND_H2C_PDU,
+ NVME_TCP_SEND_DATA,
+ NVME_TCP_SEND_DDGST,
+};
+
+struct nvme_tcp_request {
+ struct nvme_request req;
+ void *pdu;
+ struct nvme_tcp_queue *queue;
+ u32 data_len;
+ u32 pdu_len;
+ u32 pdu_sent;
+ u16 ttag;
+ struct list_head entry;
+ __le32 ddgst;
+
+ struct bio *curr_bio;
+ struct iov_iter iter;
+
+ /* send state */
+ size_t offset;
+ size_t data_sent;
+ enum nvme_tcp_send_state state;
+};
+
+enum nvme_tcp_queue_flags {
+ NVME_TCP_Q_ALLOCATED = 0,
+ NVME_TCP_Q_LIVE = 1,
+};
+
+enum nvme_tcp_recv_state {
+ NVME_TCP_RECV_PDU = 0,
+ NVME_TCP_RECV_DATA,
+ NVME_TCP_RECV_DDGST,
+};
+
+struct nvme_tcp_ctrl;
+struct nvme_tcp_queue {
+ struct socket *sock;
+ struct work_struct io_work;
+ int io_cpu;
+
+ spinlock_t lock;
+ struct list_head send_list;
+
+ /* recv state */
+ void *pdu;
+ int pdu_remaining;
+ int pdu_offset;
+ size_t data_remaining;
+ size_t ddgst_remaining;
+
+ /* send state */
+ struct nvme_tcp_request *request;
+
+ int queue_size;
+ size_t cmnd_capsule_len;
+ struct nvme_tcp_ctrl *ctrl;
+ unsigned long flags;
+ bool rd_enabled;
+
+ bool hdr_digest;
+ bool data_digest;
+ struct ahash_request *rcv_hash;
+ struct ahash_request *snd_hash;
+ __le32 exp_ddgst;
+ __le32 recv_ddgst;
+
+ struct page_frag_cache pf_cache;
+
+ void (*state_change)(struct sock *);
+ void (*data_ready)(struct sock *);
+ void (*write_space)(struct sock *);
+};
+
+struct nvme_tcp_ctrl {
+ /* read only in the hot path */
+ struct nvme_tcp_queue *queues;
+ struct blk_mq_tag_set tag_set;
+
+ /* other member variables */
+ struct list_head list;
+ struct blk_mq_tag_set admin_tag_set;
+ struct sockaddr_storage addr;
+ struct sockaddr_storage src_addr;
+ struct nvme_ctrl ctrl;
+
+ struct work_struct err_work;
+ struct delayed_work connect_work;
+ struct nvme_tcp_request async_req;
+};
+
+static LIST_HEAD(nvme_tcp_ctrl_list);
+static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
+static struct workqueue_struct *nvme_tcp_wq;
+static struct blk_mq_ops nvme_tcp_mq_ops;
+static struct blk_mq_ops nvme_tcp_admin_mq_ops;
+
+static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
+{
+ return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
+}
+
+static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
+{
+ return queue - queue->ctrl->queues;
+}
+
+static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
+{
+ u32 queue_idx = nvme_tcp_queue_id(queue);
+
+ if (queue_idx == 0)
+ return queue->ctrl->admin_tag_set.tags[queue_idx];
+ return queue->ctrl->tag_set.tags[queue_idx - 1];
+}
+
+static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
+{
+ return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
+}
+
+static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
+{
+ return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
+}
+
+static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue)
+{
+ return queue->cmnd_capsule_len - sizeof(struct nvme_command);
+}
+
+static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
+{
+ return req == &req->queue->ctrl->async_req;
+}
+
+static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
+{
+ struct request *rq;
+ unsigned int bytes;
+
+ if (unlikely(nvme_tcp_async_req(req)))
+ return false; /* async events don't have a request */
+
+ rq = blk_mq_rq_from_pdu(req);
+ bytes = blk_rq_payload_bytes(rq);
+
+ return rq_data_dir(rq) == WRITE && bytes &&
+ bytes <= nvme_tcp_inline_data_size(req->queue);
+}
+
+static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
+{
+ return req->iter.bvec->bv_page;
+}
+
+static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
+{
+ return req->iter.bvec->bv_offset + req->iter.iov_offset;
+}
+
+static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
+{
+ return min_t(size_t, req->iter.bvec->bv_len - req->iter.iov_offset,
+ req->pdu_len - req->pdu_sent);
+}
+
+static inline size_t nvme_tcp_req_offset(struct nvme_tcp_request *req)
+{
+ return req->iter.iov_offset;
+}
+
+static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
+{
+ return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
+ req->pdu_len - req->pdu_sent : 0;
+}
+
+static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
+ int len)
+{
+ return nvme_tcp_pdu_data_left(req) <= len;
+}
+
+static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
+ unsigned int dir)
+{
+ struct request *rq = blk_mq_rq_from_pdu(req);
+ struct bio_vec *vec;
+ unsigned int size;
+ int nsegs;
+ size_t offset;
+
+ if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
+ vec = &rq->special_vec;
+ nsegs = 1;
+ size = blk_rq_payload_bytes(rq);
+ offset = 0;
+ } else {
+ struct bio *bio = req->curr_bio;
+
+ vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+ nsegs = bio_segments(bio);
+ size = bio->bi_iter.bi_size;
+ offset = bio->bi_iter.bi_bvec_done;
+ }
+
+ iov_iter_bvec(&req->iter, dir, vec, nsegs, size);
+ req->iter.iov_offset = offset;
+}
+
+static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
+ int len)
+{
+ req->data_sent += len;
+ req->pdu_sent += len;
+ iov_iter_advance(&req->iter, len);
+ if (!iov_iter_count(&req->iter) &&
+ req->data_sent < req->data_len) {
+ req->curr_bio = req->curr_bio->bi_next;
+ nvme_tcp_init_iter(req, WRITE);
+ }
+}
+
+static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req)
+{
+ struct nvme_tcp_queue *queue = req->queue;
+
+ spin_lock(&queue->lock);
+ list_add_tail(&req->entry, &queue->send_list);
+ spin_unlock(&queue->lock);
+
+ queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+}
+
+static inline struct nvme_tcp_request *
+nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
+{
+ struct nvme_tcp_request *req;
+
+ spin_lock(&queue->lock);
+ req = list_first_entry_or_null(&queue->send_list,
+ struct nvme_tcp_request, entry);
+ if (req)
+ list_del(&req->entry);
+ spin_unlock(&queue->lock);
+
+ return req;
+}
+
+static inline void nvme_tcp_ddgst_final(struct ahash_request *hash,
+ __le32 *dgst)
+{
+ ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
+ crypto_ahash_final(hash);
+}
+
+static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
+ struct page *page, off_t off, size_t len)
+{
+ struct scatterlist sg;
+
+ sg_init_marker(&sg, 1);
+ sg_set_page(&sg, page, len, off);
+ ahash_request_set_crypt(hash, &sg, NULL, len);
+ crypto_ahash_update(hash);
+}
+
+static inline void nvme_tcp_hdgst(struct ahash_request *hash,
+ void *pdu, size_t len)
+{
+ struct scatterlist sg;
+
+ sg_init_one(&sg, pdu, len);
+ ahash_request_set_crypt(hash, &sg, pdu + len, len);
+ crypto_ahash_digest(hash);
+}
+
+static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
+ void *pdu, size_t pdu_len)
+{
+ struct nvme_tcp_hdr *hdr = pdu;
+ __le32 recv_digest;
+ __le32 exp_digest;
+
+ if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
+ dev_err(queue->ctrl->ctrl.device,
+ "queue %d: header digest flag is cleared\n",
+ nvme_tcp_queue_id(queue));
+ return -EPROTO;
+ }
+
+ recv_digest = *(__le32 *)(pdu + hdr->hlen);
+ nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
+ exp_digest = *(__le32 *)(pdu + hdr->hlen);
+ if (recv_digest != exp_digest) {
+ dev_err(queue->ctrl->ctrl.device,
+ "header digest error: recv %#x expected %#x\n",
+ le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
+{
+ struct nvme_tcp_hdr *hdr = pdu;
+ u8 digest_len = nvme_tcp_hdgst_len(queue);
+ u32 len;
+
+ len = le32_to_cpu(hdr->plen) - hdr->hlen -
+ ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);
+
+ if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
+ dev_err(queue->ctrl->ctrl.device,
+ "queue %d: data digest flag is cleared\n",
+ nvme_tcp_queue_id(queue));
+ return -EPROTO;
+ }
+ crypto_ahash_init(queue->rcv_hash);
+
+ return 0;
+}
+
+static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
+ struct request *rq, unsigned int hctx_idx)
+{
+ struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+
+ page_frag_free(req->pdu);
+}
+
+static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
+ struct request *rq, unsigned int hctx_idx,
+ unsigned int numa_node)
+{
+ struct nvme_tcp_ctrl *ctrl = set->driver_data;
+ struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+ int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
+ struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
+ u8 hdgst = nvme_tcp_hdgst_len(queue);
+
+ req->pdu = page_frag_alloc(&queue->pf_cache,
+ sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!req->pdu)
+ return -ENOMEM;
+
+ req->queue = queue;
+ nvme_req(rq)->ctrl = &ctrl->ctrl;
+
+ return 0;
+}
+
+static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned int hctx_idx)
+{
+ struct nvme_tcp_ctrl *ctrl = data;
+ struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
+
+ hctx->driver_data = queue;
+ return 0;
+}
+
+static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned int hctx_idx)
+{
+ struct nvme_tcp_ctrl *ctrl = data;
+ struct nvme_tcp_queue *queue = &ctrl->queues[0];
+
+ hctx->driver_data = queue;
+ return 0;
+}
+
+static enum nvme_tcp_recv_state
+nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
+{
+ return (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
+ (queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
+ NVME_TCP_RECV_DATA;
+}
+
+static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
+{
+ queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
+ nvme_tcp_hdgst_len(queue);
+ queue->pdu_offset = 0;
+ queue->data_remaining = -1;
+ queue->ddgst_remaining = 0;
+}
+
+static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
+{
+ if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
+ return;
+
+ queue_work(nvme_wq, &to_tcp_ctrl(ctrl)->err_work);
+}
+
+static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
+ struct nvme_completion *cqe)
+{
+ struct request *rq;
+
+ rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
+ if (!rq) {
+ dev_err(queue->ctrl->ctrl.device,
+ "queue %d tag 0x%x not found\n",
+ nvme_tcp_queue_id(queue), cqe->command_id);
+ nvme_tcp_error_recovery(&queue->ctrl->ctrl);
+ return -EINVAL;
+ }
+
+ nvme_end_request(rq, cqe->status, cqe->result);
+
+ return 0;
+}
+
+static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
+ struct nvme_tcp_data_pdu *pdu)
+{
+ struct request *rq;
+
+ rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
+ if (!rq) {
+ dev_err(queue->ctrl->ctrl.device,
+ "queue %d tag %#x not found\n",
+ nvme_tcp_queue_id(queue), pdu->command_id);
+ return -ENOENT;
+ }
+
+ if (!blk_rq_payload_bytes(rq)) {
+ dev_err(queue->ctrl->ctrl.device,
+ "queue %d tag %#x unexpected data\n",
+ nvme_tcp_queue_id(queue), rq->tag);
+ return -EIO;
+ }
+
+ queue->data_remaining = le32_to_cpu(pdu->data_length);
+
+ return 0;
+
+}
+
+static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
+ struct nvme_tcp_rsp_pdu *pdu)
+{
+ struct nvme_completion *cqe = &pdu->cqe;
+ int ret = 0;
+
+ /*
+ * AEN requests are special as they don't time out and can
+ * survive any kind of queue freeze and often don't respond to
+ * aborts. We don't even bother to allocate a struct request
+ * for them but rather special case them here.
+ */
+ if (unlikely(nvme_tcp_queue_id(queue) == 0 &&
+ cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH))
+ nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
+ &cqe->result);
+ else
+ ret = nvme_tcp_process_nvme_cqe(queue, cqe);
+
+ return ret;
+}
+
+static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
+ struct nvme_tcp_r2t_pdu *pdu)
+{
+ struct nvme_tcp_data_pdu *data = req->pdu;
+ struct nvme_tcp_queue *queue = req->queue;
+ struct request *rq = blk_mq_rq_from_pdu(req);
+ u8 hdgst = nvme_tcp_hdgst_len(queue);
+ u8 ddgst = nvme_tcp_ddgst_len(queue);
+
+ req->pdu_len = le32_to_cpu(pdu->r2t_length);
+ req->pdu_sent = 0;
+
+ if (unlikely(req->data_sent + req->pdu_len > req->data_len)) {
+ dev_err(queue->ctrl->ctrl.device,
+ "req %d r2t len %u exceeded data len %u (%zu sent)\n",
+ rq->tag, req->pdu_len, req->data_len,
+ req->data_sent);
+ return -EPROTO;
+ }
+
+ if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) {
+ dev_err(queue->ctrl->ctrl.device,
+ "req %d unexpected r2t offset %u (expected %zu)\n",
+ rq->tag, le32_to_cpu(pdu->r2t_offset),
+ req->data_sent);
+ return -EPROTO;
+ }
+
+ memset(data, 0, sizeof(*data));
+ data->hdr.type = nvme_tcp_h2c_data;
+ data->hdr.flags = NVME_TCP_F_DATA_LAST;
+ if (queue->hdr_digest)
+ data->hdr.flags |= NVME_TCP_F_HDGST;
+ if (queue->data_digest)
+ data->hdr.flags |= NVME_TCP_F_DDGST;
+ data->hdr.hlen = sizeof(*data);
+ data->hdr.pdo = data->hdr.hlen + hdgst;
+ data->hdr.plen =
+ cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
+ data->ttag = pdu->ttag;
+ data->command_id = rq->tag;
+ data->data_offset = cpu_to_le32(req->data_sent);
+ data->data_length = cpu_to_le32(req->pdu_len);
+ return 0;
+}
+
+static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
+ struct nvme_tcp_r2t_pdu *pdu)
+{
+ struct nvme_tcp_request *req;
+ struct request *rq;
+ int ret;
+
+ rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
+ if (!rq) {
+ dev_err(queue->ctrl->ctrl.device,
+ "queue %d tag %#x not found\n",
+ nvme_tcp_queue_id(queue), pdu->command_id);
+ return -ENOENT;
+ }
+ req = blk_mq_rq_to_pdu(rq);
+
+ ret = nvme_tcp_setup_h2c_data_pdu(req, pdu);
+ if (unlikely(ret))
+ return ret;
+
+ req->state = NVME_TCP_SEND_H2C_PDU;
+ req->offset = 0;
+
+ nvme_tcp_queue_request(req);
+
+ return 0;
+}
+
+static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
+ unsigned int *offset, size_t *len)
+{
+ struct nvme_tcp_hdr *hdr;
+ char *pdu = queue->pdu;
+ size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
+ int ret;
+
+ ret = skb_copy_bits(skb, *offset,
+ &pdu[queue->pdu_offset], rcv_len);
+ if (unlikely(ret))
+ return ret;
+
+ queue->pdu_remaining -= rcv_len;
+ queue->pdu_offset += rcv_len;
+ *offset += rcv_len;
+ *len -= rcv_len;
+ if (queue->pdu_remaining)
+ return 0;
+
+ hdr = queue->pdu;
+ if (queue->hdr_digest) {
+ ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
+ if (unlikely(ret))
+ return ret;
+ }
+
+
+ if (queue->data_digest) {
+ ret = nvme_tcp_check_ddgst(queue, queue->pdu);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ switch (hdr->type) {
+ case nvme_tcp_c2h_data:
+ ret = nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
+ break;
+ case nvme_tcp_rsp:
+ nvme_tcp_init_recv_ctx(queue);
+ ret = nvme_tcp_handle_comp(queue, (void *)queue->pdu);
+ break;
+ case nvme_tcp_r2t:
+ nvme_tcp_init_recv_ctx(queue);
+ ret = nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
+ break;
+ default:
+ dev_err(queue->ctrl->ctrl.device,
+ "unsupported pdu type (%d)\n", hdr->type);
+ return -EINVAL;
+ }
+
+ return ret;
+}
+
+static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
+ unsigned int *offset, size_t *len)
+{
+ struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
+ struct nvme_tcp_request *req;
+ struct request *rq;
+
+ rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
+ if (!rq) {
+ dev_err(queue->ctrl->ctrl.device,
+ "queue %d tag %#x not found\n",
+ nvme_tcp_queue_id(queue), pdu->command_id);
+ return -ENOENT;
+ }
+ req = blk_mq_rq_to_pdu(rq);
+
+ while (true) {
+ int recv_len, ret;
+
+ recv_len = min_t(size_t, *len, queue->data_remaining);
+ if (!recv_len)
+ break;
+
+ if (!iov_iter_count(&req->iter)) {
+ req->curr_bio = req->curr_bio->bi_next;
+
+ /*
+ * If we don`t have any bios it means that controller
+ * sent more data than we requested, hence error
+ */
+ if (!req->curr_bio) {
+ dev_err(queue->ctrl->ctrl.device,
+ "queue %d no space in request %#x",
+ nvme_tcp_queue_id(queue), rq->tag);
+ nvme_tcp_init_recv_ctx(queue);
+ return -EIO;
+ }
+ nvme_tcp_init_iter(req, READ);
+ }
+
+ /* we can read only from what is left in this bio */
+ recv_len = min_t(size_t, recv_len,
+ iov_iter_count(&req->iter));
+
+ if (queue->data_digest)
+ ret = skb_copy_and_hash_datagram_iter(skb, *offset,
+ &req->iter, recv_len, queue->rcv_hash);
+ else
+ ret = skb_copy_datagram_iter(skb, *offset,
+ &req->iter, recv_len);
+ if (ret) {
+ dev_err(queue->ctrl->ctrl.device,
+ "queue %d failed to copy request %#x data",
+ nvme_tcp_queue_id(queue), rq->tag);
+ return ret;
+ }
+
+ *len -= recv_len;
+ *offset += recv_len;
+ queue->data_remaining -= recv_len;
+ }
+
+ if (!queue->data_remaining) {
+ if (queue->data_digest) {
+ nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
+ queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
+ } else {
+ nvme_tcp_init_recv_ctx(queue);
+ }
+ }
+
+ return 0;
+}
+
+static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
+ struct sk_buff *skb, unsigned int *offset, size_t *len)
+{
+ char *ddgst = (char *)&queue->recv_ddgst;
+ size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
+ off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
+ int ret;
+
+ ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
+ if (unlikely(ret))
+ return ret;
+
+ queue->ddgst_remaining -= recv_len;
+ *offset += recv_len;
+ *len -= recv_len;
+ if (queue->ddgst_remaining)
+ return 0;
+
+ if (queue->recv_ddgst != queue->exp_ddgst) {
+ dev_err(queue->ctrl->ctrl.device,
+ "data digest error: recv %#x expected %#x\n",
+ le32_to_cpu(queue->recv_ddgst),
+ le32_to_cpu(queue->exp_ddgst));
+ return -EIO;
+ }
+
+ nvme_tcp_init_recv_ctx(queue);
+ return 0;
+}
+
+static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
+ unsigned int offset, size_t len)
+{
+ struct nvme_tcp_queue *queue = desc->arg.data;
+ size_t consumed = len;
+ int result;
+
+ while (len) {
+ switch (nvme_tcp_recv_state(queue)) {
+ case NVME_TCP_RECV_PDU:
+ result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
+ break;
+ case NVME_TCP_RECV_DATA:
+ result = nvme_tcp_recv_data(queue, skb, &offset, &len);
+ break;
+ case NVME_TCP_RECV_DDGST:
+ result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
+ break;
+ default:
+ result = -EFAULT;
+ }
+ if (result) {
+ dev_err(queue->ctrl->ctrl.device,
+ "receive failed: %d\n", result);
+ queue->rd_enabled = false;
+ nvme_tcp_error_recovery(&queue->ctrl->ctrl);
+ return result;
+ }
+ }
+
+ return consumed;
+}
+
+static void nvme_tcp_data_ready(struct sock *sk)
+{
+ struct nvme_tcp_queue *queue;
+
+ read_lock(&sk->sk_callback_lock);
+ queue = sk->sk_user_data;
+ if (likely(queue && queue->rd_enabled))
+ queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+ read_unlock(&sk->sk_callback_lock);
+}
+
+static void nvme_tcp_write_space(struct sock *sk)
+{
+ struct nvme_tcp_queue *queue;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ queue = sk->sk_user_data;
+ if (likely(queue && sk_stream_is_writeable(sk))) {
+ clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+ }
+ read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void nvme_tcp_state_change(struct sock *sk)
+{
+ struct nvme_tcp_queue *queue;
+
+ read_lock(&sk->sk_callback_lock);
+ queue = sk->sk_user_data;
+ if (!queue)
+ goto done;
+
+ switch (sk->sk_state) {
+ case TCP_CLOSE:
+ case TCP_CLOSE_WAIT:
+ case TCP_LAST_ACK:
+ case TCP_FIN_WAIT1:
+ case TCP_FIN_WAIT2:
+ /* fallthrough */
+ nvme_tcp_error_recovery(&queue->ctrl->ctrl);
+ break;
+ default:
+ dev_info(queue->ctrl->ctrl.device,
+ "queue %d socket state %d\n",
+ nvme_tcp_queue_id(queue), sk->sk_state);
+ }
+
+ queue->state_change(sk);
+done:
+ read_unlock(&sk->sk_callback_lock);
+}
+
+static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
+{
+ queue->request = NULL;
+}
+
+static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
+{
+ union nvme_result res = {};
+
+ nvme_end_request(blk_mq_rq_from_pdu(req),
+ cpu_to_le16(NVME_SC_DATA_XFER_ERROR), res);
+}
+
+static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
+{
+ struct nvme_tcp_queue *queue = req->queue;
+
+ while (true) {
+ struct page *page = nvme_tcp_req_cur_page(req);
+ size_t offset = nvme_tcp_req_cur_offset(req);
+ size_t len = nvme_tcp_req_cur_length(req);
+ bool last = nvme_tcp_pdu_last_send(req, len);
+ int ret, flags = MSG_DONTWAIT;
+
+ if (last && !queue->data_digest)
+ flags |= MSG_EOR;
+ else
+ flags |= MSG_MORE;
+
+ ret = kernel_sendpage(queue->sock, page, offset, len, flags);
+ if (ret <= 0)
+ return ret;
+
+ nvme_tcp_advance_req(req, ret);
+ if (queue->data_digest)
+ nvme_tcp_ddgst_update(queue->snd_hash, page,
+ offset, ret);
+
+ /* fully successful last write*/
+ if (last && ret == len) {
+ if (queue->data_digest) {
+ nvme_tcp_ddgst_final(queue->snd_hash,
+ &req->ddgst);
+ req->state = NVME_TCP_SEND_DDGST;
+ req->offset = 0;
+ } else {
+ nvme_tcp_done_send_req(queue);
+ }
+ return 1;
+ }
+ }
+ return -EAGAIN;
+}
+
+static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
+{
+ struct nvme_tcp_queue *queue = req->queue;
+ struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+ bool inline_data = nvme_tcp_has_inline_data(req);
+ int flags = MSG_DONTWAIT | (inline_data ? MSG_MORE : MSG_EOR);
+ u8 hdgst = nvme_tcp_hdgst_len(queue);
+ int len = sizeof(*pdu) + hdgst - req->offset;
+ int ret;
+
+ if (queue->hdr_digest && !req->offset)
+ nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+
+ ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
+ offset_in_page(pdu) + req->offset, len, flags);
+ if (unlikely(ret <= 0))
+ return ret;
+
+ len -= ret;
+ if (!len) {
+ if (inline_data) {
+ req->state = NVME_TCP_SEND_DATA;
+ if (queue->data_digest)
+ crypto_ahash_init(queue->snd_hash);
+ nvme_tcp_init_iter(req, WRITE);
+ } else {
+ nvme_tcp_done_send_req(queue);
+ }
+ return 1;
+ }
+ req->offset += ret;
+
+ return -EAGAIN;
+}
+
+static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
+{
+ struct nvme_tcp_queue *queue = req->queue;
+ struct nvme_tcp_data_pdu *pdu = req->pdu;
+ u8 hdgst = nvme_tcp_hdgst_len(queue);
+ int len = sizeof(*pdu) - req->offset + hdgst;
+ int ret;
+
+ if (queue->hdr_digest && !req->offset)
+ nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+
+ ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
+ offset_in_page(pdu) + req->offset, len,
+ MSG_DONTWAIT | MSG_MORE);
+ if (unlikely(ret <= 0))
+ return ret;
+
+ len -= ret;
+ if (!len) {
+ req->state = NVME_TCP_SEND_DATA;
+ if (queue->data_digest)
+ crypto_ahash_init(queue->snd_hash);
+ if (!req->data_sent)
+ nvme_tcp_init_iter(req, WRITE);
+ return 1;
+ }
+ req->offset += ret;
+
+ return -EAGAIN;
+}
+
+static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
+{
+ struct nvme_tcp_queue *queue = req->queue;
+ int ret;
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
+ struct kvec iov = {
+ .iov_base = &req->ddgst + req->offset,
+ .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
+ };
+
+ ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
+ if (unlikely(ret <= 0))
+ return ret;
+
+ if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) {
+ nvme_tcp_done_send_req(queue);
+ return 1;
+ }
+
+ req->offset += ret;
+ return -EAGAIN;
+}
+
+static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
+{
+ struct nvme_tcp_request *req;
+ int ret = 1;
+
+ if (!queue->request) {
+ queue->request = nvme_tcp_fetch_request(queue);
+ if (!queue->request)
+ return 0;
+ }
+ req = queue->request;
+
+ if (req->state == NVME_TCP_SEND_CMD_PDU) {
+ ret = nvme_tcp_try_send_cmd_pdu(req);
+ if (ret <= 0)
+ goto done;
+ if (!nvme_tcp_has_inline_data(req))
+ return ret;
+ }
+
+ if (req->state == NVME_TCP_SEND_H2C_PDU) {
+ ret = nvme_tcp_try_send_data_pdu(req);
+ if (ret <= 0)
+ goto done;
+ }
+
+ if (req->state == NVME_TCP_SEND_DATA) {
+ ret = nvme_tcp_try_send_data(req);
+ if (ret <= 0)
+ goto done;
+ }
+
+ if (req->state == NVME_TCP_SEND_DDGST)
+ ret = nvme_tcp_try_send_ddgst(req);
+done:
+ if (ret == -EAGAIN)
+ ret = 0;
+ return ret;
+}
+
+static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
+{
+ struct sock *sk = queue->sock->sk;
+ read_descriptor_t rd_desc;
+ int consumed;
+
+ rd_desc.arg.data = queue;
+ rd_desc.count = 1;
+ lock_sock(sk);
+ consumed = tcp_read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
+ release_sock(sk);
+ return consumed;
+}
+
+static void nvme_tcp_io_work(struct work_struct *w)
+{
+ struct nvme_tcp_queue *queue =
+ container_of(w, struct nvme_tcp_queue, io_work);
+ unsigned long start = jiffies + msecs_to_jiffies(1);
+
+ do {
+ bool pending = false;
+ int result;
+
+ result = nvme_tcp_try_send(queue);
+ if (result > 0) {
+ pending = true;
+ } else if (unlikely(result < 0)) {
+ dev_err(queue->ctrl->ctrl.device,
+ "failed to send request %d\n", result);
+ if (result != -EPIPE)
+ nvme_tcp_fail_request(queue->request);
+ nvme_tcp_done_send_req(queue);
+ return;
+ }
+
+ result = nvme_tcp_try_recv(queue);
+ if (result > 0)
+ pending = true;
+
+ if (!pending)
+ return;
+
+ } while (time_after(jiffies, start)); /* quota is exhausted */
+
+ queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+}
+
+static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
+{
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
+
+ ahash_request_free(queue->rcv_hash);
+ ahash_request_free(queue->snd_hash);
+ crypto_free_ahash(tfm);
+}
+
+static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
+{
+ struct crypto_ahash *tfm;
+
+ tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
+ if (!queue->snd_hash)
+ goto free_tfm;
+ ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
+
+ queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
+ if (!queue->rcv_hash)
+ goto free_snd_hash;
+ ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
+
+ return 0;
+free_snd_hash:
+ ahash_request_free(queue->snd_hash);
+free_tfm:
+ crypto_free_ahash(tfm);
+ return -ENOMEM;
+}
+
+static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
+{
+ struct nvme_tcp_request *async = &ctrl->async_req;
+
+ page_frag_free(async->pdu);
+}
+
+static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
+{
+ struct nvme_tcp_queue *queue = &ctrl->queues[0];
+ struct nvme_tcp_request *async = &ctrl->async_req;
+ u8 hdgst = nvme_tcp_hdgst_len(queue);
+
+ async->pdu = page_frag_alloc(&queue->pf_cache,
+ sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!async->pdu)
+ return -ENOMEM;
+
+ async->queue = &ctrl->queues[0];
+ return 0;
+}
+
+static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
+{
+ struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+ struct nvme_tcp_queue *queue = &ctrl->queues[qid];
+
+ if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
+ return;
+
+ if (queue->hdr_digest || queue->data_digest)
+ nvme_tcp_free_crypto(queue);
+
+ sock_release(queue->sock);
+ kfree(queue->pdu);
+}
+
+static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
+{
+ struct nvme_tcp_icreq_pdu *icreq;
+ struct nvme_tcp_icresp_pdu *icresp;
+ struct msghdr msg = {};
+ struct kvec iov;
+ bool ctrl_hdgst, ctrl_ddgst;
+ int ret;
+
+ icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
+ if (!icreq)
+ return -ENOMEM;
+
+ icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
+ if (!icresp) {
+ ret = -ENOMEM;
+ goto free_icreq;
+ }
+
+ icreq->hdr.type = nvme_tcp_icreq;
+ icreq->hdr.hlen = sizeof(*icreq);
+ icreq->hdr.pdo = 0;
+ icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
+ icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
+ icreq->maxr2t = 0; /* single inflight r2t supported */
+ icreq->hpda = 0; /* no alignment constraint */
+ if (queue->hdr_digest)
+ icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
+ if (queue->data_digest)
+ icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
+
+ iov.iov_base = icreq;
+ iov.iov_len = sizeof(*icreq);
+ ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
+ if (ret < 0)
+ goto free_icresp;
+
+ memset(&msg, 0, sizeof(msg));
+ iov.iov_base = icresp;
+ iov.iov_len = sizeof(*icresp);
+ ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
+ iov.iov_len, msg.msg_flags);
+ if (ret < 0)
+ goto free_icresp;
+
+ ret = -EINVAL;
+ if (icresp->hdr.type != nvme_tcp_icresp) {
+ pr_err("queue %d: bad type returned %d\n",
+ nvme_tcp_queue_id(queue), icresp->hdr.type);
+ goto free_icresp;
+ }
+
+ if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
+ pr_err("queue %d: bad pdu length returned %d\n",
+ nvme_tcp_queue_id(queue), icresp->hdr.plen);
+ goto free_icresp;
+ }
+
+ if (icresp->pfv != NVME_TCP_PFV_1_0) {
+ pr_err("queue %d: bad pfv returned %d\n",
+ nvme_tcp_queue_id(queue), icresp->pfv);
+ goto free_icresp;
+ }
+
+ ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
+ if ((queue->data_digest && !ctrl_ddgst) ||
+ (!queue->data_digest && ctrl_ddgst)) {
+ pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
+ nvme_tcp_queue_id(queue),
+ queue->data_digest ? "enabled" : "disabled",
+ ctrl_ddgst ? "enabled" : "disabled");
+ goto free_icresp;
+ }
+
+ ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
+ if ((queue->hdr_digest && !ctrl_hdgst) ||
+ (!queue->hdr_digest && ctrl_hdgst)) {
+ pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
+ nvme_tcp_queue_id(queue),
+ queue->hdr_digest ? "enabled" : "disabled",
+ ctrl_hdgst ? "enabled" : "disabled");
+ goto free_icresp;
+ }
+
+ if (icresp->cpda != 0) {
+ pr_err("queue %d: unsupported cpda returned %d\n",
+ nvme_tcp_queue_id(queue), icresp->cpda);
+ goto free_icresp;
+ }
+
+ ret = 0;
+free_icresp:
+ kfree(icresp);
+free_icreq:
+ kfree(icreq);
+ return ret;
+}
+
+static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
+ int qid, size_t queue_size)
+{
+ struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+ struct nvme_tcp_queue *queue = &ctrl->queues[qid];
+ struct linger sol = { .l_onoff = 1, .l_linger = 0 };
+ int ret, opt, rcv_pdu_size, n;
+
+ queue->ctrl = ctrl;
+ INIT_LIST_HEAD(&queue->send_list);
+ spin_lock_init(&queue->lock);
+ INIT_WORK(&queue->io_work, nvme_tcp_io_work);
+ queue->queue_size = queue_size;
+
+ if (qid > 0)
+ queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
+ else
+ queue->cmnd_capsule_len = sizeof(struct nvme_command) +
+ NVME_TCP_ADMIN_CCSZ;
+
+ ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
+ IPPROTO_TCP, &queue->sock);
+ if (ret) {
+ dev_err(ctrl->ctrl.device,
+ "failed to create socket: %d\n", ret);
+ return ret;
+ }
+
+ /* Single syn retry */
+ opt = 1;
+ ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT,
+ (char *)&opt, sizeof(opt));
+ if (ret) {
+ dev_err(ctrl->ctrl.device,
+ "failed to set TCP_SYNCNT sock opt %d\n", ret);
+ goto err_sock;
+ }
+
+ /* Set TCP no delay */
+ opt = 1;
+ ret = kernel_setsockopt(queue->sock, IPPROTO_TCP,
+ TCP_NODELAY, (char *)&opt, sizeof(opt));
+ if (ret) {
+ dev_err(ctrl->ctrl.device,
+ "failed to set TCP_NODELAY sock opt %d\n", ret);
+ goto err_sock;
+ }
+
+ /*
+ * Cleanup whatever is sitting in the TCP transmit queue on socket
+ * close. This is done to prevent stale data from being sent should
+ * the network connection be restored before TCP times out.
+ */
+ ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER,
+ (char *)&sol, sizeof(sol));
+ if (ret) {
+ dev_err(ctrl->ctrl.device,
+ "failed to set SO_LINGER sock opt %d\n", ret);
+ goto err_sock;
+ }
+
+ queue->sock->sk->sk_allocation = GFP_ATOMIC;
+ if (!qid)
+ n = 0;
+ else
+ n = (qid - 1) % num_online_cpus();
+ queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
+ queue->request = NULL;
+ queue->data_remaining = 0;
+ queue->ddgst_remaining = 0;
+ queue->pdu_remaining = 0;
+ queue->pdu_offset = 0;
+ sk_set_memalloc(queue->sock->sk);
+
+ if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) {
+ ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
+ sizeof(ctrl->src_addr));
+ if (ret) {
+ dev_err(ctrl->ctrl.device,
+ "failed to bind queue %d socket %d\n",
+ qid, ret);
+ goto err_sock;
+ }
+ }
+
+ queue->hdr_digest = nctrl->opts->hdr_digest;
+ queue->data_digest = nctrl->opts->data_digest;
+ if (queue->hdr_digest || queue->data_digest) {
+ ret = nvme_tcp_alloc_crypto(queue);
+ if (ret) {
+ dev_err(ctrl->ctrl.device,
+ "failed to allocate queue %d crypto\n", qid);
+ goto err_sock;
+ }
+ }
+
+ rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
+ nvme_tcp_hdgst_len(queue);
+ queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
+ if (!queue->pdu) {
+ ret = -ENOMEM;
+ goto err_crypto;
+ }
+
+ dev_dbg(ctrl->ctrl.device, "connecting queue %d\n",
+ nvme_tcp_queue_id(queue));
+
+ ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
+ sizeof(ctrl->addr), 0);
+ if (ret) {
+ dev_err(ctrl->ctrl.device,
+ "failed to connect socket: %d\n", ret);
+ goto err_rcv_pdu;
+ }
+
+ ret = nvme_tcp_init_connection(queue);
+ if (ret)
+ goto err_init_connect;
+
+ queue->rd_enabled = true;
+ set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
+ nvme_tcp_init_recv_ctx(queue);
+
+ write_lock_bh(&queue->sock->sk->sk_callback_lock);
+ queue->sock->sk->sk_user_data = queue;
+ queue->state_change = queue->sock->sk->sk_state_change;
+ queue->data_ready = queue->sock->sk->sk_data_ready;
+ queue->write_space = queue->sock->sk->sk_write_space;
+ queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
+ queue->sock->sk->sk_state_change = nvme_tcp_state_change;
+ queue->sock->sk->sk_write_space = nvme_tcp_write_space;
+ write_unlock_bh(&queue->sock->sk->sk_callback_lock);
+
+ return 0;
+
+err_init_connect:
+ kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+err_rcv_pdu:
+ kfree(queue->pdu);
+err_crypto:
+ if (queue->hdr_digest || queue->data_digest)
+ nvme_tcp_free_crypto(queue);
+err_sock:
+ sock_release(queue->sock);
+ queue->sock = NULL;
+ return ret;
+}
+
+static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
+{
+ struct socket *sock = queue->sock;
+
+ write_lock_bh(&sock->sk->sk_callback_lock);
+ sock->sk->sk_user_data = NULL;
+ sock->sk->sk_data_ready = queue->data_ready;
+ sock->sk->sk_state_change = queue->state_change;
+ sock->sk->sk_write_space = queue->write_space;
+ write_unlock_bh(&sock->sk->sk_callback_lock);
+}
+
+static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
+{
+ kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+ nvme_tcp_restore_sock_calls(queue);
+ cancel_work_sync(&queue->io_work);
+}
+
+static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
+{
+ struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+ struct nvme_tcp_queue *queue = &ctrl->queues[qid];
+
+ if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
+ return;
+
+ __nvme_tcp_stop_queue(queue);
+}
+
+static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
+{
+ struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+ int ret;
+
+ if (idx)
+ ret = nvmf_connect_io_queue(nctrl, idx, false);
+ else
+ ret = nvmf_connect_admin_queue(nctrl);
+
+ if (!ret) {
+ set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
+ } else {
+ __nvme_tcp_stop_queue(&ctrl->queues[idx]);
+ dev_err(nctrl->device,
+ "failed to connect queue: %d ret=%d\n", idx, ret);
+ }
+ return ret;
+}
+
+static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
+ bool admin)
+{
+ struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+ struct blk_mq_tag_set *set;
+ int ret;
+
+ if (admin) {
+ set = &ctrl->admin_tag_set;
+ memset(set, 0, sizeof(*set));
+ set->ops = &nvme_tcp_admin_mq_ops;
+ set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
+ set->reserved_tags = 2; /* connect + keep-alive */
+ set->numa_node = NUMA_NO_NODE;
+ set->cmd_size = sizeof(struct nvme_tcp_request);
+ set->driver_data = ctrl;
+ set->nr_hw_queues = 1;
+ set->timeout = ADMIN_TIMEOUT;
+ } else {
+ set = &ctrl->tag_set;
+ memset(set, 0, sizeof(*set));
+ set->ops = &nvme_tcp_mq_ops;
+ set->queue_depth = nctrl->sqsize + 1;
+ set->reserved_tags = 1; /* fabric connect */
+ set->numa_node = NUMA_NO_NODE;
+ set->flags = BLK_MQ_F_SHOULD_MERGE;
+ set->cmd_size = sizeof(struct nvme_tcp_request);
+ set->driver_data = ctrl;
+ set->nr_hw_queues = nctrl->queue_count - 1;
+ set->timeout = NVME_IO_TIMEOUT;
+ set->nr_maps = 2 /* default + read */;
+ }
+
+ ret = blk_mq_alloc_tag_set(set);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return set;
+}
+
+static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
+{
+ if (to_tcp_ctrl(ctrl)->async_req.pdu) {
+ nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
+ to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
+ }
+
+ nvme_tcp_free_queue(ctrl, 0);
+}
+
+static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
+{
+ int i;
+
+ for (i = 1; i < ctrl->queue_count; i++)
+ nvme_tcp_free_queue(ctrl, i);
+}
+
+static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
+{
+ int i;
+
+ for (i = 1; i < ctrl->queue_count; i++)
+ nvme_tcp_stop_queue(ctrl, i);
+}
+
+static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl)
+{
+ int i, ret = 0;
+
+ for (i = 1; i < ctrl->queue_count; i++) {
+ ret = nvme_tcp_start_queue(ctrl, i);
+ if (ret)
+ goto out_stop_queues;
+ }
+
+ return 0;
+
+out_stop_queues:
+ for (i--; i >= 1; i--)
+ nvme_tcp_stop_queue(ctrl, i);
+ return ret;
+}
+
+static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
+{
+ int ret;
+
+ ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
+ if (ret)
+ return ret;
+
+ ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
+ if (ret)
+ goto out_free_queue;
+
+ return 0;
+
+out_free_queue:
+ nvme_tcp_free_queue(ctrl, 0);
+ return ret;
+}
+
+static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
+{
+ int i, ret;
+
+ for (i = 1; i < ctrl->queue_count; i++) {
+ ret = nvme_tcp_alloc_queue(ctrl, i,
+ ctrl->sqsize + 1);
+ if (ret)
+ goto out_free_queues;
+ }
+
+ return 0;
+
+out_free_queues:
+ for (i--; i >= 1; i--)
+ nvme_tcp_free_queue(ctrl, i);
+
+ return ret;
+}
+
+static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
+{
+ unsigned int nr_io_queues;
+
+ nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
+ nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
+
+ return nr_io_queues;
+}
+
+static int nvme_alloc_io_queues(struct nvme_ctrl *ctrl)
+{
+ unsigned int nr_io_queues;
+ int ret;
+
+ nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
+ ret = nvme_set_queue_count(ctrl, &nr_io_queues);
+ if (ret)
+ return ret;
+
+ ctrl->queue_count = nr_io_queues + 1;
+ if (ctrl->queue_count < 2)
+ return 0;
+
+ dev_info(ctrl->device,
+ "creating %d I/O queues.\n", nr_io_queues);
+
+ return nvme_tcp_alloc_io_queues(ctrl);
+}
+
+static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
+{
+ nvme_tcp_stop_io_queues(ctrl);
+ if (remove) {
+ if (ctrl->ops->flags & NVME_F_FABRICS)
+ blk_cleanup_queue(ctrl->connect_q);
+ blk_mq_free_tag_set(ctrl->tagset);
+ }
+ nvme_tcp_free_io_queues(ctrl);
+}
+
+static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
+{
+ int ret;
+
+ ret = nvme_alloc_io_queues(ctrl);
+ if (ret)
+ return ret;
+
+ if (new) {
+ ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false);
+ if (IS_ERR(ctrl->tagset)) {
+ ret = PTR_ERR(ctrl->tagset);
+ goto out_free_io_queues;
+ }
+
+ if (ctrl->ops->flags & NVME_F_FABRICS) {
+ ctrl->connect_q = blk_mq_init_queue(ctrl->tagset);
+ if (IS_ERR(ctrl->connect_q)) {
+ ret = PTR_ERR(ctrl->connect_q);
+ goto out_free_tag_set;
+ }
+ }
+ } else {
+ blk_mq_update_nr_hw_queues(ctrl->tagset,
+ ctrl->queue_count - 1);
+ }
+
+ ret = nvme_tcp_start_io_queues(ctrl);
+ if (ret)
+ goto out_cleanup_connect_q;
+
+ return 0;
+
+out_cleanup_connect_q:
+ if (new && (ctrl->ops->flags & NVME_F_FABRICS))
+ blk_cleanup_queue(ctrl->connect_q);
+out_free_tag_set:
+ if (new)
+ blk_mq_free_tag_set(ctrl->tagset);
+out_free_io_queues:
+ nvme_tcp_free_io_queues(ctrl);
+ return ret;
+}
+
+static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
+{
+ nvme_tcp_stop_queue(ctrl, 0);
+ if (remove) {
+ free_opal_dev(ctrl->opal_dev);
+ blk_cleanup_queue(ctrl->admin_q);
+ blk_mq_free_tag_set(ctrl->admin_tagset);
+ }
+ nvme_tcp_free_admin_queue(ctrl);
+}
+
+static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
+{
+ int error;
+
+ error = nvme_tcp_alloc_admin_queue(ctrl);
+ if (error)
+ return error;
+
+ if (new) {
+ ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true);
+ if (IS_ERR(ctrl->admin_tagset)) {
+ error = PTR_ERR(ctrl->admin_tagset);
+ goto out_free_queue;
+ }
+
+ ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
+ if (IS_ERR(ctrl->admin_q)) {
+ error = PTR_ERR(ctrl->admin_q);
+ goto out_free_tagset;
+ }
+ }
+
+ error = nvme_tcp_start_queue(ctrl, 0);
+ if (error)
+ goto out_cleanup_queue;
+
+ error = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
+ if (error) {
+ dev_err(ctrl->device,
+ "prop_get NVME_REG_CAP failed\n");
+ goto out_stop_queue;
+ }
+
+ ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
+
+ error = nvme_enable_ctrl(ctrl, ctrl->cap);
+ if (error)
+ goto out_stop_queue;
+
+ error = nvme_init_identify(ctrl);
+ if (error)
+ goto out_stop_queue;
+
+ return 0;
+
+out_stop_queue:
+ nvme_tcp_stop_queue(ctrl, 0);
+out_cleanup_queue:
+ if (new)
+ blk_cleanup_queue(ctrl->admin_q);
+out_free_tagset:
+ if (new)
+ blk_mq_free_tag_set(ctrl->admin_tagset);
+out_free_queue:
+ nvme_tcp_free_admin_queue(ctrl);
+ return error;
+}
+
+static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
+ bool remove)
+{
+ blk_mq_quiesce_queue(ctrl->admin_q);
+ nvme_tcp_stop_queue(ctrl, 0);
+ blk_mq_tagset_busy_iter(ctrl->admin_tagset, nvme_cancel_request, ctrl);
+ blk_mq_unquiesce_queue(ctrl->admin_q);
+ nvme_tcp_destroy_admin_queue(ctrl, remove);
+}
+
+static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
+ bool remove)
+{
+ if (ctrl->queue_count <= 1)
+ return;
+ nvme_stop_queues(ctrl);
+ nvme_tcp_stop_io_queues(ctrl);
+ blk_mq_tagset_busy_iter(ctrl->tagset, nvme_cancel_request, ctrl);
+ if (remove)
+ nvme_start_queues(ctrl);
+ nvme_tcp_destroy_io_queues(ctrl, remove);
+}
+
+static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
+{
+ /* If we are resetting/deleting then do nothing */
+ if (ctrl->state != NVME_CTRL_CONNECTING) {
+ WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
+ ctrl->state == NVME_CTRL_LIVE);
+ return;
+ }
+
+ if (nvmf_should_reconnect(ctrl)) {
+ dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
+ ctrl->opts->reconnect_delay);
+ queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
+ ctrl->opts->reconnect_delay * HZ);
+ } else {
+ dev_info(ctrl->device, "Removing controller...\n");
+ nvme_delete_ctrl(ctrl);
+ }
+}
+
+static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
+{
+ struct nvmf_ctrl_options *opts = ctrl->opts;
+ int ret = -EINVAL;
+
+ ret = nvme_tcp_configure_admin_queue(ctrl, new);
+ if (ret)
+ return ret;
+
+ if (ctrl->icdoff) {
+ dev_err(ctrl->device, "icdoff is not supported!\n");
+ goto destroy_admin;
+ }
+
+ if (opts->queue_size > ctrl->sqsize + 1)
+ dev_warn(ctrl->device,
+ "queue_size %zu > ctrl sqsize %u, clamping down\n",
+ opts->queue_size, ctrl->sqsize + 1);
+
+ if (ctrl->sqsize + 1 > ctrl->maxcmd) {
+ dev_warn(ctrl->device,
+ "sqsize %u > ctrl maxcmd %u, clamping down\n",
+ ctrl->sqsize + 1, ctrl->maxcmd);
+ ctrl->sqsize = ctrl->maxcmd - 1;
+ }
+
+ if (ctrl->queue_count > 1) {
+ ret = nvme_tcp_configure_io_queues(ctrl, new);
+ if (ret)
+ goto destroy_admin;
+ }
+
+ if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
+ /* state change failure is ok if we're in DELETING state */
+ WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
+ ret = -EINVAL;
+ goto destroy_io;
+ }
+
+ nvme_start_ctrl(ctrl);
+ return 0;
+
+destroy_io:
+ if (ctrl->queue_count > 1)
+ nvme_tcp_destroy_io_queues(ctrl, new);
+destroy_admin:
+ nvme_tcp_stop_queue(ctrl, 0);
+ nvme_tcp_destroy_admin_queue(ctrl, new);
+ return ret;
+}
+
+static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
+{
+ struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
+ struct nvme_tcp_ctrl, connect_work);
+ struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
+
+ ++ctrl->nr_reconnects;
+
+ if (nvme_tcp_setup_ctrl(ctrl, false))
+ goto requeue;
+
+ dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
+ ctrl->nr_reconnects);
+
+ ctrl->nr_reconnects = 0;
+
+ return;
+
+requeue:
+ dev_info(ctrl->device, "Failed reconnect attempt %d\n",
+ ctrl->nr_reconnects);
+ nvme_tcp_reconnect_or_remove(ctrl);
+}
+
+static void nvme_tcp_error_recovery_work(struct work_struct *work)
+{
+ struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
+ struct nvme_tcp_ctrl, err_work);
+ struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
+
+ nvme_stop_keep_alive(ctrl);
+ nvme_tcp_teardown_io_queues(ctrl, false);
+ /* unquiesce to fail fast pending requests */
+ nvme_start_queues(ctrl);
+ nvme_tcp_teardown_admin_queue(ctrl, false);
+
+ if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
+ /* state change failure is ok if we're in DELETING state */
+ WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
+ return;
+ }
+
+ nvme_tcp_reconnect_or_remove(ctrl);
+}
+
+static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
+{
+ nvme_tcp_teardown_io_queues(ctrl, shutdown);
+ if (shutdown)
+ nvme_shutdown_ctrl(ctrl);
+ else
+ nvme_disable_ctrl(ctrl, ctrl->cap);
+ nvme_tcp_teardown_admin_queue(ctrl, shutdown);
+}
+
+static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
+{
+ nvme_tcp_teardown_ctrl(ctrl, true);
+}
+
+static void nvme_reset_ctrl_work(struct work_struct *work)
+{
+ struct nvme_ctrl *ctrl =
+ container_of(work, struct nvme_ctrl, reset_work);
+
+ nvme_stop_ctrl(ctrl);
+ nvme_tcp_teardown_ctrl(ctrl, false);
+
+ if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
+ /* state change failure is ok if we're in DELETING state */
+ WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
+ return;
+ }
+
+ if (nvme_tcp_setup_ctrl(ctrl, false))
+ goto out_fail;
+
+ return;
+
+out_fail:
+ ++ctrl->nr_reconnects;
+ nvme_tcp_reconnect_or_remove(ctrl);
+}
+
+static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
+{
+ cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
+ cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
+}
+
+static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
+{
+ struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+
+ if (list_empty(&ctrl->list))
+ goto free_ctrl;
+
+ mutex_lock(&nvme_tcp_ctrl_mutex);
+ list_del(&ctrl->list);
+ mutex_unlock(&nvme_tcp_ctrl_mutex);
+
+ nvmf_free_options(nctrl->opts);
+free_ctrl:
+ kfree(ctrl->queues);
+ kfree(ctrl);
+}
+
+static void nvme_tcp_set_sg_null(struct nvme_command *c)
+{
+ struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
+
+ sg->addr = 0;
+ sg->length = 0;
+ sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
+ NVME_SGL_FMT_TRANSPORT_A;
+}
+
+static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
+ struct nvme_command *c, u32 data_len)
+{
+ struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
+
+ sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
+ sg->length = cpu_to_le32(data_len);
+ sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
+}
+
+static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
+ u32 data_len)
+{
+ struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
+
+ sg->addr = 0;
+ sg->length = cpu_to_le32(data_len);
+ sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
+ NVME_SGL_FMT_TRANSPORT_A;
+}
+
+static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
+{
+ struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
+ struct nvme_tcp_queue *queue = &ctrl->queues[0];
+ struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
+ struct nvme_command *cmd = &pdu->cmd;
+ u8 hdgst = nvme_tcp_hdgst_len(queue);
+
+ memset(pdu, 0, sizeof(*pdu));
+ pdu->hdr.type = nvme_tcp_cmd;
+ if (queue->hdr_digest)
+ pdu->hdr.flags |= NVME_TCP_F_HDGST;
+ pdu->hdr.hlen = sizeof(*pdu);
+ pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
+
+ cmd->common.opcode = nvme_admin_async_event;
+ cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
+ cmd->common.flags |= NVME_CMD_SGL_METABUF;
+ nvme_tcp_set_sg_null(cmd);
+
+ ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
+ ctrl->async_req.offset = 0;
+ ctrl->async_req.curr_bio = NULL;
+ ctrl->async_req.data_len = 0;
+
+ nvme_tcp_queue_request(&ctrl->async_req);
+}
+
+static enum blk_eh_timer_return
+nvme_tcp_timeout(struct request *rq, bool reserved)
+{
+ struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+ struct nvme_tcp_ctrl *ctrl = req->queue->ctrl;
+ struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+
+ dev_dbg(ctrl->ctrl.device,
+ "queue %d: timeout request %#x type %d\n",
+ nvme_tcp_queue_id(req->queue), rq->tag,
+ pdu->hdr.type);
+
+ if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
+ union nvme_result res = {};
+
+ nvme_req(rq)->flags |= NVME_REQ_CANCELLED;
+ nvme_end_request(rq, cpu_to_le16(NVME_SC_ABORT_REQ), res);
+ return BLK_EH_DONE;
+ }
+
+ /* queue error recovery */
+ nvme_tcp_error_recovery(&ctrl->ctrl);
+
+ return BLK_EH_RESET_TIMER;
+}
+
+static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
+ struct request *rq)
+{
+ struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+ struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+ struct nvme_command *c = &pdu->cmd;
+
+ c->common.flags |= NVME_CMD_SGL_METABUF;
+
+ if (rq_data_dir(rq) == WRITE && req->data_len &&
+ req->data_len <= nvme_tcp_inline_data_size(queue))
+ nvme_tcp_set_sg_inline(queue, c, req->data_len);
+ else
+ nvme_tcp_set_sg_host_data(c, req->data_len);
+
+ return 0;
+}
+
+static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
+ struct request *rq)
+{
+ struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+ struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+ struct nvme_tcp_queue *queue = req->queue;
+ u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
+ blk_status_t ret;
+
+ ret = nvme_setup_cmd(ns, rq, &pdu->cmd);
+ if (ret)
+ return ret;
+
+ req->state = NVME_TCP_SEND_CMD_PDU;
+ req->offset = 0;
+ req->data_sent = 0;
+ req->pdu_len = 0;
+ req->pdu_sent = 0;
+ req->data_len = blk_rq_payload_bytes(rq);
+ req->curr_bio = rq->bio;
+
+ if (rq_data_dir(rq) == WRITE &&
+ req->data_len <= nvme_tcp_inline_data_size(queue))
+ req->pdu_len = req->data_len;
+ else if (req->curr_bio)
+ nvme_tcp_init_iter(req, READ);
+
+ pdu->hdr.type = nvme_tcp_cmd;
+ pdu->hdr.flags = 0;
+ if (queue->hdr_digest)
+ pdu->hdr.flags |= NVME_TCP_F_HDGST;
+ if (queue->data_digest && req->pdu_len) {
+ pdu->hdr.flags |= NVME_TCP_F_DDGST;
+ ddgst = nvme_tcp_ddgst_len(queue);
+ }
+ pdu->hdr.hlen = sizeof(*pdu);
+ pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
+ pdu->hdr.plen =
+ cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);
+
+ ret = nvme_tcp_map_data(queue, rq);
+ if (unlikely(ret)) {
+ dev_err(queue->ctrl->ctrl.device,
+ "Failed to map data (%d)\n", ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
+{
+ struct nvme_ns *ns = hctx->queue->queuedata;
+ struct nvme_tcp_queue *queue = hctx->driver_data;
+ struct request *rq = bd->rq;
+ struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+ bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
+ blk_status_t ret;
+
+ if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
+ return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
+
+ ret = nvme_tcp_setup_cmd_pdu(ns, rq);
+ if (unlikely(ret))
+ return ret;
+
+ blk_mq_start_request(rq);
+
+ nvme_tcp_queue_request(req);
+
+ return BLK_STS_OK;
+}
+
+static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
+{
+ struct nvme_tcp_ctrl *ctrl = set->driver_data;
+
+ set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+ set->map[HCTX_TYPE_READ].nr_queues = ctrl->ctrl.opts->nr_io_queues;
+ if (ctrl->ctrl.opts->nr_write_queues) {
+ /* separate read/write queues */
+ set->map[HCTX_TYPE_DEFAULT].nr_queues =
+ ctrl->ctrl.opts->nr_write_queues;
+ set->map[HCTX_TYPE_READ].queue_offset =
+ ctrl->ctrl.opts->nr_write_queues;
+ } else {
+ /* mixed read/write queues */
+ set->map[HCTX_TYPE_DEFAULT].nr_queues =
+ ctrl->ctrl.opts->nr_io_queues;
+ set->map[HCTX_TYPE_READ].queue_offset = 0;
+ }
+ blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
+ blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
+ return 0;
+}
+
+static struct blk_mq_ops nvme_tcp_mq_ops = {
+ .queue_rq = nvme_tcp_queue_rq,
+ .complete = nvme_complete_rq,
+ .init_request = nvme_tcp_init_request,
+ .exit_request = nvme_tcp_exit_request,
+ .init_hctx = nvme_tcp_init_hctx,
+ .timeout = nvme_tcp_timeout,
+ .map_queues = nvme_tcp_map_queues,
+};
+
+static struct blk_mq_ops nvme_tcp_admin_mq_ops = {
+ .queue_rq = nvme_tcp_queue_rq,
+ .complete = nvme_complete_rq,
+ .init_request = nvme_tcp_init_request,
+ .exit_request = nvme_tcp_exit_request,
+ .init_hctx = nvme_tcp_init_admin_hctx,
+ .timeout = nvme_tcp_timeout,
+};
+
+static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
+ .name = "tcp",
+ .module = THIS_MODULE,
+ .flags = NVME_F_FABRICS,
+ .reg_read32 = nvmf_reg_read32,
+ .reg_read64 = nvmf_reg_read64,
+ .reg_write32 = nvmf_reg_write32,
+ .free_ctrl = nvme_tcp_free_ctrl,
+ .submit_async_event = nvme_tcp_submit_async_event,
+ .delete_ctrl = nvme_tcp_delete_ctrl,
+ .get_address = nvmf_get_address,
+ .stop_ctrl = nvme_tcp_stop_ctrl,
+};
+
+static bool
+nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
+{
+ struct nvme_tcp_ctrl *ctrl;
+ bool found = false;
+
+ mutex_lock(&nvme_tcp_ctrl_mutex);
+ list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
+ found = nvmf_ip_options_match(&ctrl->ctrl, opts);
+ if (found)
+ break;
+ }
+ mutex_unlock(&nvme_tcp_ctrl_mutex);
+
+ return found;
+}
+
+static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
+ struct nvmf_ctrl_options *opts)
+{
+ struct nvme_tcp_ctrl *ctrl;
+ int ret;
+
+ ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
+ if (!ctrl)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&ctrl->list);
+ ctrl->ctrl.opts = opts;
+ ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1;
+ ctrl->ctrl.sqsize = opts->queue_size - 1;
+ ctrl->ctrl.kato = opts->kato;
+
+ INIT_DELAYED_WORK(&ctrl->connect_work,
+ nvme_tcp_reconnect_ctrl_work);
+ INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
+ INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
+
+ if (!(opts->mask & NVMF_OPT_TRSVCID)) {
+ opts->trsvcid =
+ kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
+ if (!opts->trsvcid) {
+ ret = -ENOMEM;
+ goto out_free_ctrl;
+ }
+ opts->mask |= NVMF_OPT_TRSVCID;
+ }
+
+ ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
+ opts->traddr, opts->trsvcid, &ctrl->addr);
+ if (ret) {
+ pr_err("malformed address passed: %s:%s\n",
+ opts->traddr, opts->trsvcid);
+ goto out_free_ctrl;
+ }
+
+ if (opts->mask & NVMF_OPT_HOST_TRADDR) {
+ ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
+ opts->host_traddr, NULL, &ctrl->src_addr);
+ if (ret) {
+ pr_err("malformed src address passed: %s\n",
+ opts->host_traddr);
+ goto out_free_ctrl;
+ }
+ }
+
+ if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
+ ret = -EALREADY;
+ goto out_free_ctrl;
+ }
+
+ ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
+ GFP_KERNEL);
+ if (!ctrl->queues) {
+ ret = -ENOMEM;
+ goto out_free_ctrl;
+ }
+
+ ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
+ if (ret)
+ goto out_kfree_queues;
+
+ if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
+ WARN_ON_ONCE(1);
+ ret = -EINTR;
+ goto out_uninit_ctrl;
+ }
+
+ ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
+ if (ret)
+ goto out_uninit_ctrl;
+
+ dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
+ ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
+
+ nvme_get_ctrl(&ctrl->ctrl);
+
+ mutex_lock(&nvme_tcp_ctrl_mutex);
+ list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
+ mutex_unlock(&nvme_tcp_ctrl_mutex);
+
+ return &ctrl->ctrl;
+
+out_uninit_ctrl:
+ nvme_uninit_ctrl(&ctrl->ctrl);
+ nvme_put_ctrl(&ctrl->ctrl);
+ if (ret > 0)
+ ret = -EIO;
+ return ERR_PTR(ret);
+out_kfree_queues:
+ kfree(ctrl->queues);
+out_free_ctrl:
+ kfree(ctrl);
+ return ERR_PTR(ret);
+}
+
+static struct nvmf_transport_ops nvme_tcp_transport = {
+ .name = "tcp",
+ .module = THIS_MODULE,
+ .required_opts = NVMF_OPT_TRADDR,
+ .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
+ NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
+ NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
+ NVMF_OPT_NR_WRITE_QUEUES,
+ .create_ctrl = nvme_tcp_create_ctrl,
+};
+
+static int __init nvme_tcp_init_module(void)
+{
+ nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
+ WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+ if (!nvme_tcp_wq)
+ return -ENOMEM;
+
+ nvmf_register_transport(&nvme_tcp_transport);
+ return 0;
+}
+
+static void __exit nvme_tcp_cleanup_module(void)
+{
+ struct nvme_tcp_ctrl *ctrl;
+
+ nvmf_unregister_transport(&nvme_tcp_transport);
+
+ mutex_lock(&nvme_tcp_ctrl_mutex);
+ list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
+ nvme_delete_ctrl(&ctrl->ctrl);
+ mutex_unlock(&nvme_tcp_ctrl_mutex);
+ flush_workqueue(nvme_delete_wq);
+
+ destroy_workqueue(nvme_tcp_wq);
+}
+
+module_init(nvme_tcp_init_module);
+module_exit(nvme_tcp_cleanup_module);
+
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index 25b0e310f4a8..5566dda3237a 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -139,3 +139,6 @@ const char *nvme_trace_disk_name(struct trace_seq *p, char *name)
return ret;
}
+EXPORT_SYMBOL_GPL(nvme_trace_disk_name);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_sq);
diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h
index 196d5bd56718..3564120aa7b3 100644
--- a/drivers/nvme/host/trace.h
+++ b/drivers/nvme/host/trace.h
@@ -115,8 +115,8 @@ TRACE_EVENT(nvme_setup_cmd,
__entry->nsid = le32_to_cpu(cmd->common.nsid);
__entry->metadata = le64_to_cpu(cmd->common.metadata);
__assign_disk_name(__entry->disk, req->rq_disk);
- memcpy(__entry->cdw10, cmd->common.cdw10,
- sizeof(__entry->cdw10));
+ memcpy(__entry->cdw10, &cmd->common.cdw10,
+ 6 * sizeof(__entry->cdw10));
),
TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
__entry->ctrl_id, __print_disk_name(__entry->disk),
@@ -184,6 +184,29 @@ TRACE_EVENT(nvme_async_event,
#undef aer_name
+TRACE_EVENT(nvme_sq,
+ TP_PROTO(struct request *req, __le16 sq_head, int sq_tail),
+ TP_ARGS(req, sq_head, sq_tail),
+ TP_STRUCT__entry(
+ __field(int, ctrl_id)
+ __array(char, disk, DISK_NAME_LEN)
+ __field(int, qid)
+ __field(u16, sq_head)
+ __field(u16, sq_tail)
+ ),
+ TP_fast_assign(
+ __entry->ctrl_id = nvme_req(req)->ctrl->instance;
+ __assign_disk_name(__entry->disk, req->rq_disk);
+ __entry->qid = nvme_req_qid(req);
+ __entry->sq_head = le16_to_cpu(sq_head);
+ __entry->sq_tail = sq_tail;
+ ),
+ TP_printk("nvme%d: %sqid=%d, head=%u, tail=%u",
+ __entry->ctrl_id, __print_disk_name(__entry->disk),
+ __entry->qid, __entry->sq_head, __entry->sq_tail
+ )
+);
+
#endif /* _TRACE_NVME_H */
#undef TRACE_INCLUDE_PATH
diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig
index 3c7b61ddb0d1..d94f25cde019 100644
--- a/drivers/nvme/target/Kconfig
+++ b/drivers/nvme/target/Kconfig
@@ -60,3 +60,13 @@ config NVME_TARGET_FCLOOP
to test NVMe-FC transport interfaces.
If unsure, say N.
+
+config NVME_TARGET_TCP
+ tristate "NVMe over Fabrics TCP target support"
+ depends on INET
+ depends on NVME_TARGET
+ help
+ This enables the NVMe TCP target support, which allows exporting NVMe
+ devices over TCP.
+
+ If unsure, say N.
diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
index 8118c93391c6..8c3ad0fb6860 100644
--- a/drivers/nvme/target/Makefile
+++ b/drivers/nvme/target/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_NVME_TARGET_LOOP) += nvme-loop.o
obj-$(CONFIG_NVME_TARGET_RDMA) += nvmet-rdma.o
obj-$(CONFIG_NVME_TARGET_FC) += nvmet-fc.o
obj-$(CONFIG_NVME_TARGET_FCLOOP) += nvme-fcloop.o
+obj-$(CONFIG_NVME_TARGET_TCP) += nvmet-tcp.o
nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \
discovery.o io-cmd-file.o io-cmd-bdev.o
@@ -12,3 +13,4 @@ nvme-loop-y += loop.o
nvmet-rdma-y += rdma.o
nvmet-fc-y += fc.o
nvme-fcloop-y += fcloop.o
+nvmet-tcp-y += tcp.o
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 1179f6314323..11baeb14c388 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -19,19 +19,6 @@
#include <asm/unaligned.h>
#include "nvmet.h"
-/*
- * This helper allows us to clear the AEN based on the RAE bit,
- * Please use this helper when processing the log pages which are
- * associated with the AEN.
- */
-static inline void nvmet_clear_aen(struct nvmet_req *req, u32 aen_bit)
-{
- int rae = le32_to_cpu(req->cmd->common.cdw10[0]) & 1 << 15;
-
- if (!rae)
- clear_bit(aen_bit, &req->sq->ctrl->aen_masked);
-}
-
u32 nvmet_get_log_page_len(struct nvme_command *cmd)
{
u32 len = le16_to_cpu(cmd->get_log_page.numdu);
@@ -50,6 +37,34 @@ static void nvmet_execute_get_log_page_noop(struct nvmet_req *req)
nvmet_req_complete(req, nvmet_zero_sgl(req, 0, req->data_len));
}
+static void nvmet_execute_get_log_page_error(struct nvmet_req *req)
+{
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ u16 status = NVME_SC_SUCCESS;
+ unsigned long flags;
+ off_t offset = 0;
+ u64 slot;
+ u64 i;
+
+ spin_lock_irqsave(&ctrl->error_lock, flags);
+ slot = ctrl->err_counter % NVMET_ERROR_LOG_SLOTS;
+
+ for (i = 0; i < NVMET_ERROR_LOG_SLOTS; i++) {
+ status = nvmet_copy_to_sgl(req, offset, &ctrl->slots[slot],
+ sizeof(struct nvme_error_slot));
+ if (status)
+ break;
+
+ if (slot == 0)
+ slot = NVMET_ERROR_LOG_SLOTS - 1;
+ else
+ slot--;
+ offset += sizeof(struct nvme_error_slot);
+ }
+ spin_unlock_irqrestore(&ctrl->error_lock, flags);
+ nvmet_req_complete(req, status);
+}
+
static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
struct nvme_smart_log *slog)
{
@@ -60,6 +75,7 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
if (!ns) {
pr_err("Could not find namespace id : %d\n",
le32_to_cpu(req->cmd->get_log_page.nsid));
+ req->error_loc = offsetof(struct nvme_rw_command, nsid);
return NVME_SC_INVALID_NS;
}
@@ -119,6 +135,7 @@ static void nvmet_execute_get_log_page_smart(struct nvmet_req *req)
{
struct nvme_smart_log *log;
u16 status = NVME_SC_INTERNAL;
+ unsigned long flags;
if (req->data_len != sizeof(*log))
goto out;
@@ -134,6 +151,11 @@ static void nvmet_execute_get_log_page_smart(struct nvmet_req *req)
if (status)
goto out_free_log;
+ spin_lock_irqsave(&req->sq->ctrl->error_lock, flags);
+ put_unaligned_le64(req->sq->ctrl->err_counter,
+ &log->num_err_log_entries);
+ spin_unlock_irqrestore(&req->sq->ctrl->error_lock, flags);
+
status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log));
out_free_log:
kfree(log);
@@ -189,7 +211,7 @@ static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req)
if (!status)
status = nvmet_zero_sgl(req, len, req->data_len - len);
ctrl->nr_changed_ns = 0;
- nvmet_clear_aen(req, NVME_AEN_CFG_NS_ATTR);
+ nvmet_clear_aen_bit(req, NVME_AEN_BIT_NS_ATTR);
mutex_unlock(&ctrl->lock);
out:
nvmet_req_complete(req, status);
@@ -252,7 +274,7 @@ static void nvmet_execute_get_log_page_ana(struct nvmet_req *req)
hdr.chgcnt = cpu_to_le64(nvmet_ana_chgcnt);
hdr.ngrps = cpu_to_le16(ngrps);
- nvmet_clear_aen(req, NVME_AEN_CFG_ANA_CHANGE);
+ nvmet_clear_aen_bit(req, NVME_AEN_BIT_ANA_CHANGE);
up_read(&nvmet_ana_sem);
kfree(desc);
@@ -304,7 +326,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
/* XXX: figure out what to do about RTD3R/RTD3 */
id->oaes = cpu_to_le32(NVMET_AEN_CFG_OPTIONAL);
- id->ctratt = cpu_to_le32(1 << 0);
+ id->ctratt = cpu_to_le32(NVME_CTRL_ATTR_HID_128_BIT |
+ NVME_CTRL_ATTR_TBKAS);
id->oacs = 0;
@@ -392,6 +415,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
u16 status = 0;
if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) {
+ req->error_loc = offsetof(struct nvme_identify, nsid);
status = NVME_SC_INVALID_NS | NVME_SC_DNR;
goto out;
}
@@ -512,6 +536,7 @@ static void nvmet_execute_identify_desclist(struct nvmet_req *req)
ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid);
if (!ns) {
+ req->error_loc = offsetof(struct nvme_identify, nsid);
status = NVME_SC_INVALID_NS | NVME_SC_DNR;
goto out;
}
@@ -569,13 +594,15 @@ static u16 nvmet_write_protect_flush_sync(struct nvmet_req *req)
static u16 nvmet_set_feat_write_protect(struct nvmet_req *req)
{
- u32 write_protect = le32_to_cpu(req->cmd->common.cdw10[1]);
+ u32 write_protect = le32_to_cpu(req->cmd->common.cdw11);
struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
u16 status = NVME_SC_FEATURE_NOT_CHANGEABLE;
req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->rw.nsid);
- if (unlikely(!req->ns))
+ if (unlikely(!req->ns)) {
+ req->error_loc = offsetof(struct nvme_common_command, nsid);
return status;
+ }
mutex_lock(&subsys->lock);
switch (write_protect) {
@@ -599,11 +626,36 @@ static u16 nvmet_set_feat_write_protect(struct nvmet_req *req)
return status;
}
+u16 nvmet_set_feat_kato(struct nvmet_req *req)
+{
+ u32 val32 = le32_to_cpu(req->cmd->common.cdw11);
+
+ req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000);
+
+ nvmet_set_result(req, req->sq->ctrl->kato);
+
+ return 0;
+}
+
+u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask)
+{
+ u32 val32 = le32_to_cpu(req->cmd->common.cdw11);
+
+ if (val32 & ~mask) {
+ req->error_loc = offsetof(struct nvme_common_command, cdw11);
+ return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+ }
+
+ WRITE_ONCE(req->sq->ctrl->aen_enabled, val32);
+ nvmet_set_result(req, val32);
+
+ return 0;
+}
+
static void nvmet_execute_set_features(struct nvmet_req *req)
{
struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
- u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]);
- u32 val32;
+ u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
u16 status = 0;
switch (cdw10 & 0xff) {
@@ -612,19 +664,10 @@ static void nvmet_execute_set_features(struct nvmet_req *req)
(subsys->max_qid - 1) | ((subsys->max_qid - 1) << 16));
break;
case NVME_FEAT_KATO:
- val32 = le32_to_cpu(req->cmd->common.cdw10[1]);
- req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000);
- nvmet_set_result(req, req->sq->ctrl->kato);
+ status = nvmet_set_feat_kato(req);
break;
case NVME_FEAT_ASYNC_EVENT:
- val32 = le32_to_cpu(req->cmd->common.cdw10[1]);
- if (val32 & ~NVMET_AEN_CFG_ALL) {
- status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
- break;
- }
-
- WRITE_ONCE(req->sq->ctrl->aen_enabled, val32);
- nvmet_set_result(req, val32);
+ status = nvmet_set_feat_async_event(req, NVMET_AEN_CFG_ALL);
break;
case NVME_FEAT_HOST_ID:
status = NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
@@ -633,6 +676,7 @@ static void nvmet_execute_set_features(struct nvmet_req *req)
status = nvmet_set_feat_write_protect(req);
break;
default:
+ req->error_loc = offsetof(struct nvme_common_command, cdw10);
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
break;
}
@@ -646,9 +690,10 @@ static u16 nvmet_get_feat_write_protect(struct nvmet_req *req)
u32 result;
req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->common.nsid);
- if (!req->ns)
+ if (!req->ns) {
+ req->error_loc = offsetof(struct nvme_common_command, nsid);
return NVME_SC_INVALID_NS | NVME_SC_DNR;
-
+ }
mutex_lock(&subsys->lock);
if (req->ns->readonly == true)
result = NVME_NS_WRITE_PROTECT;
@@ -660,10 +705,20 @@ static u16 nvmet_get_feat_write_protect(struct nvmet_req *req)
return 0;
}
+void nvmet_get_feat_kato(struct nvmet_req *req)
+{
+ nvmet_set_result(req, req->sq->ctrl->kato * 1000);
+}
+
+void nvmet_get_feat_async_event(struct nvmet_req *req)
+{
+ nvmet_set_result(req, READ_ONCE(req->sq->ctrl->aen_enabled));
+}
+
static void nvmet_execute_get_features(struct nvmet_req *req)
{
struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
- u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]);
+ u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
u16 status = 0;
switch (cdw10 & 0xff) {
@@ -689,7 +744,7 @@ static void nvmet_execute_get_features(struct nvmet_req *req)
break;
#endif
case NVME_FEAT_ASYNC_EVENT:
- nvmet_set_result(req, READ_ONCE(req->sq->ctrl->aen_enabled));
+ nvmet_get_feat_async_event(req);
break;
case NVME_FEAT_VOLATILE_WC:
nvmet_set_result(req, 1);
@@ -699,11 +754,13 @@ static void nvmet_execute_get_features(struct nvmet_req *req)
(subsys->max_qid-1) | ((subsys->max_qid-1) << 16));
break;
case NVME_FEAT_KATO:
- nvmet_set_result(req, req->sq->ctrl->kato * 1000);
+ nvmet_get_feat_kato(req);
break;
case NVME_FEAT_HOST_ID:
/* need 128-bit host identifier flag */
- if (!(req->cmd->common.cdw10[1] & cpu_to_le32(1 << 0))) {
+ if (!(req->cmd->common.cdw11 & cpu_to_le32(1 << 0))) {
+ req->error_loc =
+ offsetof(struct nvme_common_command, cdw11);
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
break;
}
@@ -715,6 +772,8 @@ static void nvmet_execute_get_features(struct nvmet_req *req)
status = nvmet_get_feat_write_protect(req);
break;
default:
+ req->error_loc =
+ offsetof(struct nvme_common_command, cdw10);
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
break;
}
@@ -722,7 +781,7 @@ static void nvmet_execute_get_features(struct nvmet_req *req)
nvmet_req_complete(req, status);
}
-static void nvmet_execute_async_event(struct nvmet_req *req)
+void nvmet_execute_async_event(struct nvmet_req *req)
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
@@ -738,7 +797,7 @@ static void nvmet_execute_async_event(struct nvmet_req *req)
schedule_work(&ctrl->async_event_work);
}
-static void nvmet_execute_keep_alive(struct nvmet_req *req)
+void nvmet_execute_keep_alive(struct nvmet_req *req)
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
@@ -764,13 +823,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
switch (cmd->get_log_page.lid) {
case NVME_LOG_ERROR:
- /*
- * We currently never set the More bit in the status
- * field, so all error log entries are invalid and can
- * be zeroed out. This is called a minum viable
- * implementation (TM) of this mandatory log page.
- */
- req->execute = nvmet_execute_get_log_page_noop;
+ req->execute = nvmet_execute_get_log_page_error;
return 0;
case NVME_LOG_SMART:
req->execute = nvmet_execute_get_log_page_smart;
@@ -836,5 +889,6 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode,
req->sq->qid);
+ req->error_loc = offsetof(struct nvme_common_command, opcode);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index d895579b6c5d..618bbd006544 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -25,12 +25,16 @@
static const struct config_item_type nvmet_host_type;
static const struct config_item_type nvmet_subsys_type;
+static LIST_HEAD(nvmet_ports_list);
+struct list_head *nvmet_ports = &nvmet_ports_list;
+
static const struct nvmet_transport_name {
u8 type;
const char *name;
} nvmet_transport_names[] = {
{ NVMF_TRTYPE_RDMA, "rdma" },
{ NVMF_TRTYPE_FC, "fc" },
+ { NVMF_TRTYPE_TCP, "tcp" },
{ NVMF_TRTYPE_LOOP, "loop" },
};
@@ -150,7 +154,8 @@ CONFIGFS_ATTR(nvmet_, addr_traddr);
static ssize_t nvmet_addr_treq_show(struct config_item *item,
char *page)
{
- switch (to_nvmet_port(item)->disc_addr.treq) {
+ switch (to_nvmet_port(item)->disc_addr.treq &
+ NVME_TREQ_SECURE_CHANNEL_MASK) {
case NVMF_TREQ_NOT_SPECIFIED:
return sprintf(page, "not specified\n");
case NVMF_TREQ_REQUIRED:
@@ -166,6 +171,7 @@ static ssize_t nvmet_addr_treq_store(struct config_item *item,
const char *page, size_t count)
{
struct nvmet_port *port = to_nvmet_port(item);
+ u8 treq = port->disc_addr.treq & ~NVME_TREQ_SECURE_CHANNEL_MASK;
if (port->enabled) {
pr_err("Cannot modify address while enabled\n");
@@ -174,15 +180,16 @@ static ssize_t nvmet_addr_treq_store(struct config_item *item,
}
if (sysfs_streq(page, "not specified")) {
- port->disc_addr.treq = NVMF_TREQ_NOT_SPECIFIED;
+ treq |= NVMF_TREQ_NOT_SPECIFIED;
} else if (sysfs_streq(page, "required")) {
- port->disc_addr.treq = NVMF_TREQ_REQUIRED;
+ treq |= NVMF_TREQ_REQUIRED;
} else if (sysfs_streq(page, "not required")) {
- port->disc_addr.treq = NVMF_TREQ_NOT_REQUIRED;
+ treq |= NVMF_TREQ_NOT_REQUIRED;
} else {
pr_err("Invalid value '%s' for treq\n", page);
return -EINVAL;
}
+ port->disc_addr.treq = treq;
return count;
}
@@ -646,7 +653,8 @@ static int nvmet_port_subsys_allow_link(struct config_item *parent,
}
list_add_tail(&link->entry, &port->subsystems);
- nvmet_genctr++;
+ nvmet_port_disc_changed(port, subsys);
+
up_write(&nvmet_config_sem);
return 0;
@@ -673,7 +681,8 @@ static void nvmet_port_subsys_drop_link(struct config_item *parent,
found:
list_del(&p->entry);
- nvmet_genctr++;
+ nvmet_port_disc_changed(port, subsys);
+
if (list_empty(&port->subsystems))
nvmet_disable_port(port);
up_write(&nvmet_config_sem);
@@ -722,7 +731,8 @@ static int nvmet_allowed_hosts_allow_link(struct config_item *parent,
goto out_free_link;
}
list_add_tail(&link->entry, &subsys->hosts);
- nvmet_genctr++;
+ nvmet_subsys_disc_changed(subsys, host);
+
up_write(&nvmet_config_sem);
return 0;
out_free_link:
@@ -748,7 +758,8 @@ static void nvmet_allowed_hosts_drop_link(struct config_item *parent,
found:
list_del(&p->entry);
- nvmet_genctr++;
+ nvmet_subsys_disc_changed(subsys, host);
+
up_write(&nvmet_config_sem);
kfree(p);
}
@@ -787,7 +798,11 @@ static ssize_t nvmet_subsys_attr_allow_any_host_store(struct config_item *item,
goto out_unlock;
}
- subsys->allow_any_host = allow_any_host;
+ if (subsys->allow_any_host != allow_any_host) {
+ subsys->allow_any_host = allow_any_host;
+ nvmet_subsys_disc_changed(subsys, NULL);
+ }
+
out_unlock:
up_write(&nvmet_config_sem);
return ret ? ret : count;
@@ -936,7 +951,7 @@ static ssize_t nvmet_referral_enable_store(struct config_item *item,
if (enable)
nvmet_referral_enable(parent, port);
else
- nvmet_referral_disable(port);
+ nvmet_referral_disable(parent, port);
return count;
inval:
@@ -962,9 +977,10 @@ static struct configfs_attribute *nvmet_referral_attrs[] = {
static void nvmet_referral_release(struct config_item *item)
{
+ struct nvmet_port *parent = to_nvmet_port(item->ci_parent->ci_parent);
struct nvmet_port *port = to_nvmet_port(item);
- nvmet_referral_disable(port);
+ nvmet_referral_disable(parent, port);
kfree(port);
}
@@ -1137,6 +1153,8 @@ static void nvmet_port_release(struct config_item *item)
{
struct nvmet_port *port = to_nvmet_port(item);
+ list_del(&port->global_entry);
+
kfree(port->ana_state);
kfree(port);
}
@@ -1189,12 +1207,15 @@ static struct config_group *nvmet_ports_make(struct config_group *group,
port->ana_state[i] = NVME_ANA_INACCESSIBLE;
}
+ list_add(&port->global_entry, &nvmet_ports_list);
+
INIT_LIST_HEAD(&port->entry);
INIT_LIST_HEAD(&port->subsystems);
INIT_LIST_HEAD(&port->referrals);
port->inline_data_size = -1; /* < 0 == let the transport choose */
port->disc_addr.portid = cpu_to_le16(portid);
+ port->disc_addr.treq = NVMF_TREQ_DISABLE_SQFLOW;
config_group_init_type_name(&port->group, name, &nvmet_port_type);
config_group_init_type_name(&port->subsys_group,
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index a5f9bbce863f..88d260f31835 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -45,28 +45,72 @@ u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1];
u64 nvmet_ana_chgcnt;
DECLARE_RWSEM(nvmet_ana_sem);
+inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno)
+{
+ u16 status;
+
+ switch (errno) {
+ case -ENOSPC:
+ req->error_loc = offsetof(struct nvme_rw_command, length);
+ status = NVME_SC_CAP_EXCEEDED | NVME_SC_DNR;
+ break;
+ case -EREMOTEIO:
+ req->error_loc = offsetof(struct nvme_rw_command, slba);
+ status = NVME_SC_LBA_RANGE | NVME_SC_DNR;
+ break;
+ case -EOPNOTSUPP:
+ req->error_loc = offsetof(struct nvme_common_command, opcode);
+ switch (req->cmd->common.opcode) {
+ case nvme_cmd_dsm:
+ case nvme_cmd_write_zeroes:
+ status = NVME_SC_ONCS_NOT_SUPPORTED | NVME_SC_DNR;
+ break;
+ default:
+ status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
+ }
+ break;
+ case -ENODATA:
+ req->error_loc = offsetof(struct nvme_rw_command, nsid);
+ status = NVME_SC_ACCESS_DENIED;
+ break;
+ case -EIO:
+ /* FALLTHRU */
+ default:
+ req->error_loc = offsetof(struct nvme_common_command, opcode);
+ status = NVME_SC_INTERNAL | NVME_SC_DNR;
+ }
+
+ return status;
+}
+
static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
const char *subsysnqn);
u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf,
size_t len)
{
- if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len)
+ if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) {
+ req->error_loc = offsetof(struct nvme_common_command, dptr);
return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
+ }
return 0;
}
u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len)
{
- if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len)
+ if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) {
+ req->error_loc = offsetof(struct nvme_common_command, dptr);
return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
+ }
return 0;
}
u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len)
{
- if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len)
+ if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len) {
+ req->error_loc = offsetof(struct nvme_common_command, dptr);
return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
+ }
return 0;
}
@@ -130,7 +174,7 @@ static void nvmet_async_event_work(struct work_struct *work)
}
}
-static void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
+void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
u8 event_info, u8 log_page)
{
struct nvmet_async_event *aen;
@@ -150,13 +194,6 @@ static void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
schedule_work(&ctrl->async_event_work);
}
-static bool nvmet_aen_disabled(struct nvmet_ctrl *ctrl, u32 aen)
-{
- if (!(READ_ONCE(ctrl->aen_enabled) & aen))
- return true;
- return test_and_set_bit(aen, &ctrl->aen_masked);
-}
-
static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid)
{
u32 i;
@@ -187,7 +224,7 @@ void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
nvmet_add_to_changed_ns_log(ctrl, cpu_to_le32(nsid));
- if (nvmet_aen_disabled(ctrl, NVME_AEN_CFG_NS_ATTR))
+ if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_NS_ATTR))
continue;
nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
NVME_AER_NOTICE_NS_CHANGED,
@@ -204,7 +241,7 @@ void nvmet_send_ana_event(struct nvmet_subsys *subsys,
list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
if (port && ctrl->port != port)
continue;
- if (nvmet_aen_disabled(ctrl, NVME_AEN_CFG_ANA_CHANGE))
+ if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_ANA_CHANGE))
continue;
nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
NVME_AER_NOTICE_ANA, NVME_LOG_ANA);
@@ -299,6 +336,15 @@ static void nvmet_keep_alive_timer(struct work_struct *work)
{
struct nvmet_ctrl *ctrl = container_of(to_delayed_work(work),
struct nvmet_ctrl, ka_work);
+ bool cmd_seen = ctrl->cmd_seen;
+
+ ctrl->cmd_seen = false;
+ if (cmd_seen) {
+ pr_debug("ctrl %d reschedule traffic based keep-alive timer\n",
+ ctrl->cntlid);
+ schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
+ return;
+ }
pr_err("ctrl %d keep-alive timer (%d seconds) expired!\n",
ctrl->cntlid, ctrl->kato);
@@ -595,26 +641,58 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
return ns;
}
-static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
+static void nvmet_update_sq_head(struct nvmet_req *req)
{
- u32 old_sqhd, new_sqhd;
- u16 sqhd;
-
- if (status)
- nvmet_set_status(req, status);
-
if (req->sq->size) {
+ u32 old_sqhd, new_sqhd;
+
do {
old_sqhd = req->sq->sqhd;
new_sqhd = (old_sqhd + 1) % req->sq->size;
} while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) !=
old_sqhd);
}
- sqhd = req->sq->sqhd & 0x0000FFFF;
- req->rsp->sq_head = cpu_to_le16(sqhd);
+ req->rsp->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF);
+}
+
+static void nvmet_set_error(struct nvmet_req *req, u16 status)
+{
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ struct nvme_error_slot *new_error_slot;
+ unsigned long flags;
+
+ req->rsp->status = cpu_to_le16(status << 1);
+
+ if (!ctrl || req->error_loc == NVMET_NO_ERROR_LOC)
+ return;
+
+ spin_lock_irqsave(&ctrl->error_lock, flags);
+ ctrl->err_counter++;
+ new_error_slot =
+ &ctrl->slots[ctrl->err_counter % NVMET_ERROR_LOG_SLOTS];
+
+ new_error_slot->error_count = cpu_to_le64(ctrl->err_counter);
+ new_error_slot->sqid = cpu_to_le16(req->sq->qid);
+ new_error_slot->cmdid = cpu_to_le16(req->cmd->common.command_id);
+ new_error_slot->status_field = cpu_to_le16(status << 1);
+ new_error_slot->param_error_location = cpu_to_le16(req->error_loc);
+ new_error_slot->lba = cpu_to_le64(req->error_slba);
+ new_error_slot->nsid = req->cmd->common.nsid;
+ spin_unlock_irqrestore(&ctrl->error_lock, flags);
+
+ /* set the more bit for this request */
+ req->rsp->status |= cpu_to_le16(1 << 14);
+}
+
+static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
+{
+ if (!req->sq->sqhd_disabled)
+ nvmet_update_sq_head(req);
req->rsp->sq_id = cpu_to_le16(req->sq->qid);
req->rsp->command_id = req->cmd->common.command_id;
+ if (unlikely(status))
+ nvmet_set_error(req, status);
if (req->ns)
nvmet_put_namespace(req->ns);
req->ops->queue_response(req);
@@ -735,14 +813,20 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
return ret;
req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
- if (unlikely(!req->ns))
+ if (unlikely(!req->ns)) {
+ req->error_loc = offsetof(struct nvme_common_command, nsid);
return NVME_SC_INVALID_NS | NVME_SC_DNR;
+ }
ret = nvmet_check_ana_state(req->port, req->ns);
- if (unlikely(ret))
+ if (unlikely(ret)) {
+ req->error_loc = offsetof(struct nvme_common_command, nsid);
return ret;
+ }
ret = nvmet_io_cmd_check_access(req);
- if (unlikely(ret))
+ if (unlikely(ret)) {
+ req->error_loc = offsetof(struct nvme_common_command, nsid);
return ret;
+ }
if (req->ns->file)
return nvmet_file_parse_io_cmd(req);
@@ -763,10 +847,14 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
req->sg_cnt = 0;
req->transfer_len = 0;
req->rsp->status = 0;
+ req->rsp->sq_head = 0;
req->ns = NULL;
+ req->error_loc = NVMET_NO_ERROR_LOC;
+ req->error_slba = 0;
/* no support for fused commands yet */
if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
+ req->error_loc = offsetof(struct nvme_common_command, flags);
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
goto fail;
}
@@ -777,6 +865,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
* byte aligned.
*/
if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF)) {
+ req->error_loc = offsetof(struct nvme_common_command, flags);
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
goto fail;
}
@@ -801,6 +890,9 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
goto fail;
}
+ if (sq->ctrl)
+ sq->ctrl->cmd_seen = true;
+
return true;
fail:
@@ -819,9 +911,10 @@ EXPORT_SYMBOL_GPL(nvmet_req_uninit);
void nvmet_req_execute(struct nvmet_req *req)
{
- if (unlikely(req->data_len != req->transfer_len))
+ if (unlikely(req->data_len != req->transfer_len)) {
+ req->error_loc = offsetof(struct nvme_common_command, dptr);
nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR);
- else
+ } else
req->execute(req);
}
EXPORT_SYMBOL_GPL(nvmet_req_execute);
@@ -1027,14 +1120,18 @@ u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd)
return 0;
}
-static bool __nvmet_host_allowed(struct nvmet_subsys *subsys,
- const char *hostnqn)
+bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn)
{
struct nvmet_host_link *p;
+ lockdep_assert_held(&nvmet_config_sem);
+
if (subsys->allow_any_host)
return true;
+ if (subsys->type == NVME_NQN_DISC) /* allow all access to disc subsys */
+ return true;
+
list_for_each_entry(p, &subsys->hosts, entry) {
if (!strcmp(nvmet_host_name(p->host), hostnqn))
return true;
@@ -1043,30 +1140,6 @@ static bool __nvmet_host_allowed(struct nvmet_subsys *subsys,
return false;
}
-static bool nvmet_host_discovery_allowed(struct nvmet_req *req,
- const char *hostnqn)
-{
- struct nvmet_subsys_link *s;
-
- list_for_each_entry(s, &req->port->subsystems, entry) {
- if (__nvmet_host_allowed(s->subsys, hostnqn))
- return true;
- }
-
- return false;
-}
-
-bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys,
- const char *hostnqn)
-{
- lockdep_assert_held(&nvmet_config_sem);
-
- if (subsys->type == NVME_NQN_DISC)
- return nvmet_host_discovery_allowed(req, hostnqn);
- else
- return __nvmet_host_allowed(subsys, hostnqn);
-}
-
/*
* Note: ctrl->subsys->lock should be held when calling this function
*/
@@ -1117,7 +1190,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
down_read(&nvmet_config_sem);
- if (!nvmet_host_allowed(req, subsys, hostnqn)) {
+ if (!nvmet_host_allowed(subsys, hostnqn)) {
pr_info("connect by host %s for subsystem %s not allowed\n",
hostnqn, subsysnqn);
req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn);
@@ -1175,31 +1248,20 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
ctrl->cntlid = ret;
ctrl->ops = req->ops;
- if (ctrl->subsys->type == NVME_NQN_DISC) {
- /* Don't accept keep-alive timeout for discovery controllers */
- if (kato) {
- status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
- goto out_remove_ida;
- }
- /*
- * Discovery controllers use some arbitrary high value in order
- * to cleanup stale discovery sessions
- *
- * From the latest base diff RC:
- * "The Keep Alive command is not supported by
- * Discovery controllers. A transport may specify a
- * fixed Discovery controller activity timeout value
- * (e.g., 2 minutes). If no commands are received
- * by a Discovery controller within that time
- * period, the controller may perform the
- * actions for Keep Alive Timer expiration".
- */
- ctrl->kato = NVMET_DISC_KATO;
- } else {
- /* keep-alive timeout in seconds */
- ctrl->kato = DIV_ROUND_UP(kato, 1000);
- }
+ /*
+ * Discovery controllers may use some arbitrary high value
+ * in order to cleanup stale discovery sessions
+ */
+ if ((ctrl->subsys->type == NVME_NQN_DISC) && !kato)
+ kato = NVMET_DISC_KATO_MS;
+
+ /* keep-alive timeout in seconds */
+ ctrl->kato = DIV_ROUND_UP(kato, 1000);
+
+ ctrl->err_counter = 0;
+ spin_lock_init(&ctrl->error_lock);
+
nvmet_start_keep_alive_timer(ctrl);
mutex_lock(&subsys->lock);
@@ -1210,8 +1272,6 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
*ctrlp = ctrl;
return 0;
-out_remove_ida:
- ida_simple_remove(&cntlid_ida, ctrl->cntlid);
out_free_sqs:
kfree(ctrl->sqs);
out_free_cqs:
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index bc0aa0bf1543..d2cb71a0b419 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -18,7 +18,65 @@
struct nvmet_subsys *nvmet_disc_subsys;
-u64 nvmet_genctr;
+static u64 nvmet_genctr;
+
+static void __nvmet_disc_changed(struct nvmet_port *port,
+ struct nvmet_ctrl *ctrl)
+{
+ if (ctrl->port != port)
+ return;
+
+ if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_DISC_CHANGE))
+ return;
+
+ nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
+ NVME_AER_NOTICE_DISC_CHANGED, NVME_LOG_DISC);
+}
+
+void nvmet_port_disc_changed(struct nvmet_port *port,
+ struct nvmet_subsys *subsys)
+{
+ struct nvmet_ctrl *ctrl;
+
+ nvmet_genctr++;
+
+ list_for_each_entry(ctrl, &nvmet_disc_subsys->ctrls, subsys_entry) {
+ if (subsys && !nvmet_host_allowed(subsys, ctrl->hostnqn))
+ continue;
+
+ __nvmet_disc_changed(port, ctrl);
+ }
+}
+
+static void __nvmet_subsys_disc_changed(struct nvmet_port *port,
+ struct nvmet_subsys *subsys,
+ struct nvmet_host *host)
+{
+ struct nvmet_ctrl *ctrl;
+
+ list_for_each_entry(ctrl, &nvmet_disc_subsys->ctrls, subsys_entry) {
+ if (host && strcmp(nvmet_host_name(host), ctrl->hostnqn))
+ continue;
+
+ __nvmet_disc_changed(port, ctrl);
+ }
+}
+
+void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys,
+ struct nvmet_host *host)
+{
+ struct nvmet_port *port;
+ struct nvmet_subsys_link *s;
+
+ nvmet_genctr++;
+
+ list_for_each_entry(port, nvmet_ports, global_entry)
+ list_for_each_entry(s, &port->subsystems, entry) {
+ if (s->subsys != subsys)
+ continue;
+ __nvmet_subsys_disc_changed(port, subsys, host);
+ }
+}
void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port)
{
@@ -26,18 +84,18 @@ void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port)
if (list_empty(&port->entry)) {
list_add_tail(&port->entry, &parent->referrals);
port->enabled = true;
- nvmet_genctr++;
+ nvmet_port_disc_changed(parent, NULL);
}
up_write(&nvmet_config_sem);
}
-void nvmet_referral_disable(struct nvmet_port *port)
+void nvmet_referral_disable(struct nvmet_port *parent, struct nvmet_port *port)
{
down_write(&nvmet_config_sem);
if (!list_empty(&port->entry)) {
port->enabled = false;
list_del_init(&port->entry);
- nvmet_genctr++;
+ nvmet_port_disc_changed(parent, NULL);
}
up_write(&nvmet_config_sem);
}
@@ -107,7 +165,7 @@ static void nvmet_execute_get_disc_log_page(struct nvmet_req *req)
down_read(&nvmet_config_sem);
list_for_each_entry(p, &req->port->subsystems, entry) {
- if (!nvmet_host_allowed(req, p->subsys, ctrl->hostnqn))
+ if (!nvmet_host_allowed(p->subsys, ctrl->hostnqn))
continue;
if (residual_len >= entry_size) {
char traddr[NVMF_TRADDR_SIZE];
@@ -136,6 +194,8 @@ static void nvmet_execute_get_disc_log_page(struct nvmet_req *req)
hdr->numrec = cpu_to_le64(numrec);
hdr->recfmt = cpu_to_le16(0);
+ nvmet_clear_aen_bit(req, NVME_AEN_BIT_DISC_CHANGE);
+
up_read(&nvmet_config_sem);
status = nvmet_copy_to_sgl(req, 0, hdr, data_len);
@@ -174,6 +234,8 @@ static void nvmet_execute_identify_disc_ctrl(struct nvmet_req *req)
if (req->port->inline_data_size)
id->sgls |= cpu_to_le32(1 << 20);
+ id->oaes = cpu_to_le32(NVMET_DISC_AEN_CFG_OPTIONAL);
+
strlcpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn));
status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
@@ -183,6 +245,51 @@ out:
nvmet_req_complete(req, status);
}
+static void nvmet_execute_disc_set_features(struct nvmet_req *req)
+{
+ u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
+ u16 stat;
+
+ switch (cdw10 & 0xff) {
+ case NVME_FEAT_KATO:
+ stat = nvmet_set_feat_kato(req);
+ break;
+ case NVME_FEAT_ASYNC_EVENT:
+ stat = nvmet_set_feat_async_event(req,
+ NVMET_DISC_AEN_CFG_OPTIONAL);
+ break;
+ default:
+ req->error_loc =
+ offsetof(struct nvme_common_command, cdw10);
+ stat = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+ break;
+ }
+
+ nvmet_req_complete(req, stat);
+}
+
+static void nvmet_execute_disc_get_features(struct nvmet_req *req)
+{
+ u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
+ u16 stat = 0;
+
+ switch (cdw10 & 0xff) {
+ case NVME_FEAT_KATO:
+ nvmet_get_feat_kato(req);
+ break;
+ case NVME_FEAT_ASYNC_EVENT:
+ nvmet_get_feat_async_event(req);
+ break;
+ default:
+ req->error_loc =
+ offsetof(struct nvme_common_command, cdw10);
+ stat = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+ break;
+ }
+
+ nvmet_req_complete(req, stat);
+}
+
u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
@@ -190,10 +297,28 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
pr_err("got cmd %d while not ready\n",
cmd->common.opcode);
+ req->error_loc =
+ offsetof(struct nvme_common_command, opcode);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
switch (cmd->common.opcode) {
+ case nvme_admin_set_features:
+ req->execute = nvmet_execute_disc_set_features;
+ req->data_len = 0;
+ return 0;
+ case nvme_admin_get_features:
+ req->execute = nvmet_execute_disc_get_features;
+ req->data_len = 0;
+ return 0;
+ case nvme_admin_async_event:
+ req->execute = nvmet_execute_async_event;
+ req->data_len = 0;
+ return 0;
+ case nvme_admin_keep_alive:
+ req->execute = nvmet_execute_keep_alive;
+ req->data_len = 0;
+ return 0;
case nvme_admin_get_log_page:
req->data_len = nvmet_get_log_page_len(cmd);
@@ -204,6 +329,8 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
default:
pr_err("unsupported get_log_page lid %d\n",
cmd->get_log_page.lid);
+ req->error_loc =
+ offsetof(struct nvme_get_log_page_command, lid);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
case nvme_admin_identify:
@@ -216,10 +343,12 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
default:
pr_err("unsupported identify cns %d\n",
cmd->identify.cns);
+ req->error_loc = offsetof(struct nvme_identify, cns);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
default:
pr_err("unhandled cmd %d\n", cmd->common.opcode);
+ req->error_loc = offsetof(struct nvme_common_command, opcode);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index d84ae004cb85..6cf1fd9eb32e 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -17,23 +17,26 @@
static void nvmet_execute_prop_set(struct nvmet_req *req)
{
+ u64 val = le64_to_cpu(req->cmd->prop_set.value);
u16 status = 0;
- if (!(req->cmd->prop_set.attrib & 1)) {
- u64 val = le64_to_cpu(req->cmd->prop_set.value);
-
- switch (le32_to_cpu(req->cmd->prop_set.offset)) {
- case NVME_REG_CC:
- nvmet_update_cc(req->sq->ctrl, val);
- break;
- default:
- status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
- break;
- }
- } else {
+ if (req->cmd->prop_set.attrib & 1) {
+ req->error_loc =
+ offsetof(struct nvmf_property_set_command, attrib);
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+ goto out;
}
+ switch (le32_to_cpu(req->cmd->prop_set.offset)) {
+ case NVME_REG_CC:
+ nvmet_update_cc(req->sq->ctrl, val);
+ break;
+ default:
+ req->error_loc =
+ offsetof(struct nvmf_property_set_command, offset);
+ status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+ }
+out:
nvmet_req_complete(req, status);
}
@@ -69,6 +72,14 @@ static void nvmet_execute_prop_get(struct nvmet_req *req)
}
}
+ if (status && req->cmd->prop_get.attrib & 1) {
+ req->error_loc =
+ offsetof(struct nvmf_property_get_command, offset);
+ } else {
+ req->error_loc =
+ offsetof(struct nvmf_property_get_command, attrib);
+ }
+
req->rsp->result.u64 = cpu_to_le64(val);
nvmet_req_complete(req, status);
}
@@ -89,6 +100,7 @@ u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req)
default:
pr_err("received unknown capsule type 0x%x\n",
cmd->fabrics.fctype);
+ req->error_loc = offsetof(struct nvmf_common_command, fctype);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
@@ -105,16 +117,34 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
old = cmpxchg(&req->sq->ctrl, NULL, ctrl);
if (old) {
pr_warn("queue already connected!\n");
+ req->error_loc = offsetof(struct nvmf_connect_command, opcode);
return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR;
}
if (!sqsize) {
pr_warn("queue size zero!\n");
+ req->error_loc = offsetof(struct nvmf_connect_command, sqsize);
return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
}
/* note: convert queue size from 0's-based value to 1's-based value */
nvmet_cq_setup(ctrl, req->cq, qid, sqsize + 1);
nvmet_sq_setup(ctrl, req->sq, qid, sqsize + 1);
+
+ if (c->cattr & NVME_CONNECT_DISABLE_SQFLOW) {
+ req->sq->sqhd_disabled = true;
+ req->rsp->sq_head = cpu_to_le16(0xffff);
+ }
+
+ if (ctrl->ops->install_queue) {
+ u16 ret = ctrl->ops->install_queue(req->sq);
+
+ if (ret) {
+ pr_err("failed to install queue %d cntlid %d ret %x\n",
+ qid, ret, ctrl->cntlid);
+ return ret;
+ }
+ }
+
return 0;
}
@@ -141,6 +171,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
if (c->recfmt != 0) {
pr_warn("invalid connect version (%d).\n",
le16_to_cpu(c->recfmt));
+ req->error_loc = offsetof(struct nvmf_connect_command, recfmt);
status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR;
goto out;
}
@@ -155,8 +186,13 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req,
le32_to_cpu(c->kato), &ctrl);
- if (status)
+ if (status) {
+ if (status == (NVME_SC_INVALID_FIELD | NVME_SC_DNR))
+ req->error_loc =
+ offsetof(struct nvme_common_command, opcode);
goto out;
+ }
+
uuid_copy(&ctrl->hostid, &d->hostid);
status = nvmet_install_queue(ctrl, req);
@@ -243,11 +279,13 @@ u16 nvmet_parse_connect_cmd(struct nvmet_req *req)
if (cmd->common.opcode != nvme_fabrics_command) {
pr_err("invalid command 0x%x on unconnected queue.\n",
cmd->fabrics.opcode);
+ req->error_loc = offsetof(struct nvme_common_command, opcode);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
if (cmd->fabrics.fctype != nvme_fabrics_type_connect) {
pr_err("invalid capsule type 0x%x on unconnected queue.\n",
cmd->fabrics.fctype);
+ req->error_loc = offsetof(struct nvmf_common_command, fctype);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 409081a03b24..f98f5c5bea26 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -86,8 +86,6 @@ struct nvmet_fc_fcp_iod {
spinlock_t flock;
struct nvmet_req req;
- struct work_struct work;
- struct work_struct done_work;
struct work_struct defer_work;
struct nvmet_fc_tgtport *tgtport;
@@ -134,7 +132,6 @@ struct nvmet_fc_tgt_queue {
u16 sqsize;
u16 ersp_ratio;
__le16 sqhd;
- int cpu;
atomic_t connected;
atomic_t sqtail;
atomic_t zrspcnt;
@@ -232,8 +229,6 @@ static LIST_HEAD(nvmet_fc_portentry_list);
static void nvmet_fc_handle_ls_rqst_work(struct work_struct *work);
-static void nvmet_fc_handle_fcp_rqst_work(struct work_struct *work);
-static void nvmet_fc_fcp_rqst_op_done_work(struct work_struct *work);
static void nvmet_fc_fcp_rqst_op_defer_work(struct work_struct *work);
static void nvmet_fc_tgt_a_put(struct nvmet_fc_tgt_assoc *assoc);
static int nvmet_fc_tgt_a_get(struct nvmet_fc_tgt_assoc *assoc);
@@ -438,8 +433,6 @@ nvmet_fc_prep_fcp_iodlist(struct nvmet_fc_tgtport *tgtport,
int i;
for (i = 0; i < queue->sqsize; fod++, i++) {
- INIT_WORK(&fod->work, nvmet_fc_handle_fcp_rqst_work);
- INIT_WORK(&fod->done_work, nvmet_fc_fcp_rqst_op_done_work);
INIT_WORK(&fod->defer_work, nvmet_fc_fcp_rqst_op_defer_work);
fod->tgtport = tgtport;
fod->queue = queue;
@@ -517,10 +510,7 @@ nvmet_fc_queue_fcp_req(struct nvmet_fc_tgtport *tgtport,
fcpreq->hwqid = queue->qid ?
((queue->qid - 1) % tgtport->ops->max_hw_queues) : 0;
- if (tgtport->ops->target_features & NVMET_FCTGTFEAT_CMD_IN_ISR)
- queue_work_on(queue->cpu, queue->work_q, &fod->work);
- else
- nvmet_fc_handle_fcp_rqst(tgtport, fod);
+ nvmet_fc_handle_fcp_rqst(tgtport, fod);
}
static void
@@ -599,30 +589,6 @@ nvmet_fc_free_fcp_iod(struct nvmet_fc_tgt_queue *queue,
queue_work(queue->work_q, &fod->defer_work);
}
-static int
-nvmet_fc_queue_to_cpu(struct nvmet_fc_tgtport *tgtport, int qid)
-{
- int cpu, idx, cnt;
-
- if (tgtport->ops->max_hw_queues == 1)
- return WORK_CPU_UNBOUND;
-
- /* Simple cpu selection based on qid modulo active cpu count */
- idx = !qid ? 0 : (qid - 1) % num_active_cpus();
-
- /* find the n'th active cpu */
- for (cpu = 0, cnt = 0; ; ) {
- if (cpu_active(cpu)) {
- if (cnt == idx)
- break;
- cnt++;
- }
- cpu = (cpu + 1) % num_possible_cpus();
- }
-
- return cpu;
-}
-
static struct nvmet_fc_tgt_queue *
nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc,
u16 qid, u16 sqsize)
@@ -653,7 +619,6 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc,
queue->qid = qid;
queue->sqsize = sqsize;
queue->assoc = assoc;
- queue->cpu = nvmet_fc_queue_to_cpu(assoc->tgtport, qid);
INIT_LIST_HEAD(&queue->fod_list);
INIT_LIST_HEAD(&queue->avail_defer_list);
INIT_LIST_HEAD(&queue->pending_cmd_list);
@@ -2146,25 +2111,11 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod)
}
static void
-nvmet_fc_fcp_rqst_op_done_work(struct work_struct *work)
-{
- struct nvmet_fc_fcp_iod *fod =
- container_of(work, struct nvmet_fc_fcp_iod, done_work);
-
- nvmet_fc_fod_op_done(fod);
-}
-
-static void
nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
{
struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private;
- struct nvmet_fc_tgt_queue *queue = fod->queue;
- if (fod->tgtport->ops->target_features & NVMET_FCTGTFEAT_OPDONE_IN_ISR)
- /* context switch so completion is not in ISR context */
- queue_work_on(queue->cpu, queue->work_q, &fod->done_work);
- else
- nvmet_fc_fod_op_done(fod);
+ nvmet_fc_fod_op_done(fod);
}
/*
@@ -2332,19 +2283,6 @@ transport_error:
nvmet_fc_abort_op(tgtport, fod);
}
-/*
- * Actual processing routine for received FC-NVME LS Requests from the LLD
- */
-static void
-nvmet_fc_handle_fcp_rqst_work(struct work_struct *work)
-{
- struct nvmet_fc_fcp_iod *fod =
- container_of(work, struct nvmet_fc_fcp_iod, work);
- struct nvmet_fc_tgtport *tgtport = fod->tgtport;
-
- nvmet_fc_handle_fcp_rqst(tgtport, fod);
-}
-
/**
* nvmet_fc_rcv_fcp_req - transport entry point called by an LLDD
* upon the reception of a NVME FCP CMD IU.
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index c1ec3475a140..b6d030d3259f 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -44,13 +44,69 @@ void nvmet_bdev_ns_disable(struct nvmet_ns *ns)
}
}
+static u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts)
+{
+ u16 status = NVME_SC_SUCCESS;
+
+ if (likely(blk_sts == BLK_STS_OK))
+ return status;
+ /*
+ * Right now there exists M : 1 mapping between block layer error
+ * to the NVMe status code (see nvme_error_status()). For consistency,
+ * when we reverse map we use most appropriate NVMe Status code from
+ * the group of the NVMe staus codes used in the nvme_error_status().
+ */
+ switch (blk_sts) {
+ case BLK_STS_NOSPC:
+ status = NVME_SC_CAP_EXCEEDED | NVME_SC_DNR;
+ req->error_loc = offsetof(struct nvme_rw_command, length);
+ break;
+ case BLK_STS_TARGET:
+ status = NVME_SC_LBA_RANGE | NVME_SC_DNR;
+ req->error_loc = offsetof(struct nvme_rw_command, slba);
+ break;
+ case BLK_STS_NOTSUPP:
+ req->error_loc = offsetof(struct nvme_common_command, opcode);
+ switch (req->cmd->common.opcode) {
+ case nvme_cmd_dsm:
+ case nvme_cmd_write_zeroes:
+ status = NVME_SC_ONCS_NOT_SUPPORTED | NVME_SC_DNR;
+ break;
+ default:
+ status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
+ }
+ break;
+ case BLK_STS_MEDIUM:
+ status = NVME_SC_ACCESS_DENIED;
+ req->error_loc = offsetof(struct nvme_rw_command, nsid);
+ break;
+ case BLK_STS_IOERR:
+ /* fallthru */
+ default:
+ status = NVME_SC_INTERNAL | NVME_SC_DNR;
+ req->error_loc = offsetof(struct nvme_common_command, opcode);
+ }
+
+ switch (req->cmd->common.opcode) {
+ case nvme_cmd_read:
+ case nvme_cmd_write:
+ req->error_slba = le64_to_cpu(req->cmd->rw.slba);
+ break;
+ case nvme_cmd_write_zeroes:
+ req->error_slba =
+ le64_to_cpu(req->cmd->write_zeroes.slba);
+ break;
+ default:
+ req->error_slba = 0;
+ }
+ return status;
+}
+
static void nvmet_bio_done(struct bio *bio)
{
struct nvmet_req *req = bio->bi_private;
- nvmet_req_complete(req,
- bio->bi_status ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
-
+ nvmet_req_complete(req, blk_to_nvme_status(req, bio->bi_status));
if (bio != &req->b.inline_bio)
bio_put(bio);
}
@@ -61,7 +117,6 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
struct bio *bio;
struct scatterlist *sg;
sector_t sector;
- blk_qc_t cookie;
int op, op_flags = 0, i;
if (!req->sg_cnt) {
@@ -114,9 +169,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
sg_cnt--;
}
- cookie = submit_bio(bio);
-
- blk_poll(bdev_get_queue(req->ns->bdev), cookie);
+ submit_bio(bio);
}
static void nvmet_bdev_execute_flush(struct nvmet_req *req)
@@ -139,18 +192,21 @@ u16 nvmet_bdev_flush(struct nvmet_req *req)
return 0;
}
-static u16 nvmet_bdev_discard_range(struct nvmet_ns *ns,
+static u16 nvmet_bdev_discard_range(struct nvmet_req *req,
struct nvme_dsm_range *range, struct bio **bio)
{
+ struct nvmet_ns *ns = req->ns;
int ret;
ret = __blkdev_issue_discard(ns->bdev,
le64_to_cpu(range->slba) << (ns->blksize_shift - 9),
le32_to_cpu(range->nlb) << (ns->blksize_shift - 9),
GFP_KERNEL, 0, bio);
- if (ret && ret != -EOPNOTSUPP)
- return NVME_SC_INTERNAL | NVME_SC_DNR;
- return 0;
+
+ if (ret)
+ req->error_slba = le64_to_cpu(range->slba);
+
+ return blk_to_nvme_status(req, errno_to_blk_status(ret));
}
static void nvmet_bdev_execute_discard(struct nvmet_req *req)
@@ -166,7 +222,7 @@ static void nvmet_bdev_execute_discard(struct nvmet_req *req)
if (status)
break;
- status = nvmet_bdev_discard_range(req->ns, &range, &bio);
+ status = nvmet_bdev_discard_range(req, &range, &bio);
if (status)
break;
}
@@ -207,16 +263,16 @@ static void nvmet_bdev_execute_write_zeroes(struct nvmet_req *req)
u16 status = NVME_SC_SUCCESS;
sector_t sector;
sector_t nr_sector;
+ int ret;
sector = le64_to_cpu(write_zeroes->slba) <<
(req->ns->blksize_shift - 9);
nr_sector = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) <<
(req->ns->blksize_shift - 9));
- if (__blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector,
- GFP_KERNEL, &bio, 0))
- status = NVME_SC_INTERNAL | NVME_SC_DNR;
-
+ ret = __blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector,
+ GFP_KERNEL, &bio, 0);
+ status = blk_to_nvme_status(req, errno_to_blk_status(ret));
if (bio) {
bio->bi_private = req;
bio->bi_end_io = nvmet_bio_done;
@@ -251,6 +307,7 @@ u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req)
default:
pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode,
req->sq->qid);
+ req->error_loc = offsetof(struct nvme_common_command, opcode);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
}
diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
index 01feebec29ea..517522305e5c 100644
--- a/drivers/nvme/target/io-cmd-file.c
+++ b/drivers/nvme/target/io-cmd-file.c
@@ -83,17 +83,16 @@ static void nvmet_file_init_bvec(struct bio_vec *bv, struct sg_page_iter *iter)
}
static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos,
- unsigned long nr_segs, size_t count)
+ unsigned long nr_segs, size_t count, int ki_flags)
{
struct kiocb *iocb = &req->f.iocb;
ssize_t (*call_iter)(struct kiocb *iocb, struct iov_iter *iter);
struct iov_iter iter;
- int ki_flags = 0, rw;
- ssize_t ret;
+ int rw;
if (req->cmd->rw.opcode == nvme_cmd_write) {
if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA))
- ki_flags = IOCB_DSYNC;
+ ki_flags |= IOCB_DSYNC;
call_iter = req->ns->file->f_op->write_iter;
rw = WRITE;
} else {
@@ -107,17 +106,13 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos,
iocb->ki_filp = req->ns->file;
iocb->ki_flags = ki_flags | iocb_flags(req->ns->file);
- ret = call_iter(iocb, &iter);
-
- if (ret != -EIOCBQUEUED && iocb->ki_complete)
- iocb->ki_complete(iocb, ret, 0);
-
- return ret;
+ return call_iter(iocb, &iter);
}
static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2)
{
struct nvmet_req *req = container_of(iocb, struct nvmet_req, f.iocb);
+ u16 status = NVME_SC_SUCCESS;
if (req->f.bvec != req->inline_bvec) {
if (likely(req->f.mpool_alloc == false))
@@ -126,11 +121,12 @@ static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2)
mempool_free(req->f.bvec, req->ns->bvec_pool);
}
- nvmet_req_complete(req, ret != req->data_len ?
- NVME_SC_INTERNAL | NVME_SC_DNR : 0);
+ if (unlikely(ret != req->data_len))
+ status = errno_to_nvme_status(req, ret);
+ nvmet_req_complete(req, status);
}
-static void nvmet_file_execute_rw(struct nvmet_req *req)
+static bool nvmet_file_execute_io(struct nvmet_req *req, int ki_flags)
{
ssize_t nr_bvec = DIV_ROUND_UP(req->data_len, PAGE_SIZE);
struct sg_page_iter sg_pg_iter;
@@ -140,30 +136,14 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)
ssize_t ret = 0;
loff_t pos;
- if (!req->sg_cnt || !nr_bvec) {
- nvmet_req_complete(req, 0);
- return;
- }
+
+ if (req->f.mpool_alloc && nr_bvec > NVMET_MAX_MPOOL_BVEC)
+ is_sync = true;
pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift;
if (unlikely(pos + req->data_len > req->ns->size)) {
- nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR);
- return;
- }
-
- if (nr_bvec > NVMET_MAX_INLINE_BIOVEC)
- req->f.bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
- GFP_KERNEL);
- else
- req->f.bvec = req->inline_bvec;
-
- req->f.mpool_alloc = false;
- if (unlikely(!req->f.bvec)) {
- /* fallback under memory pressure */
- req->f.bvec = mempool_alloc(req->ns->bvec_pool, GFP_KERNEL);
- req->f.mpool_alloc = true;
- if (nr_bvec > NVMET_MAX_MPOOL_BVEC)
- is_sync = true;
+ nvmet_req_complete(req, errno_to_nvme_status(req, -ENOSPC));
+ return true;
}
memset(&req->f.iocb, 0, sizeof(struct kiocb));
@@ -177,9 +157,10 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)
if (unlikely(is_sync) &&
(nr_bvec - 1 == 0 || bv_cnt == NVMET_MAX_MPOOL_BVEC)) {
- ret = nvmet_file_submit_bvec(req, pos, bv_cnt, len);
+ ret = nvmet_file_submit_bvec(req, pos, bv_cnt, len, 0);
if (ret < 0)
- goto out;
+ goto complete;
+
pos += len;
bv_cnt = 0;
len = 0;
@@ -187,35 +168,95 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)
nr_bvec--;
}
- if (WARN_ON_ONCE(total_len != req->data_len))
+ if (WARN_ON_ONCE(total_len != req->data_len)) {
ret = -EIO;
-out:
- if (unlikely(is_sync || ret)) {
- nvmet_file_io_done(&req->f.iocb, ret < 0 ? ret : total_len, 0);
- return;
+ goto complete;
+ }
+
+ if (unlikely(is_sync)) {
+ ret = total_len;
+ goto complete;
}
- req->f.iocb.ki_complete = nvmet_file_io_done;
- nvmet_file_submit_bvec(req, pos, bv_cnt, total_len);
+
+ /*
+ * A NULL ki_complete ask for synchronous execution, which we want
+ * for the IOCB_NOWAIT case.
+ */
+ if (!(ki_flags & IOCB_NOWAIT))
+ req->f.iocb.ki_complete = nvmet_file_io_done;
+
+ ret = nvmet_file_submit_bvec(req, pos, bv_cnt, total_len, ki_flags);
+
+ switch (ret) {
+ case -EIOCBQUEUED:
+ return true;
+ case -EAGAIN:
+ if (WARN_ON_ONCE(!(ki_flags & IOCB_NOWAIT)))
+ goto complete;
+ return false;
+ case -EOPNOTSUPP:
+ /*
+ * For file systems returning error -EOPNOTSUPP, handle
+ * IOCB_NOWAIT error case separately and retry without
+ * IOCB_NOWAIT.
+ */
+ if ((ki_flags & IOCB_NOWAIT))
+ return false;
+ break;
+ }
+
+complete:
+ nvmet_file_io_done(&req->f.iocb, ret, 0);
+ return true;
}
static void nvmet_file_buffered_io_work(struct work_struct *w)
{
struct nvmet_req *req = container_of(w, struct nvmet_req, f.work);
- nvmet_file_execute_rw(req);
+ nvmet_file_execute_io(req, 0);
}
-static void nvmet_file_execute_rw_buffered_io(struct nvmet_req *req)
+static void nvmet_file_submit_buffered_io(struct nvmet_req *req)
{
INIT_WORK(&req->f.work, nvmet_file_buffered_io_work);
queue_work(buffered_io_wq, &req->f.work);
}
+static void nvmet_file_execute_rw(struct nvmet_req *req)
+{
+ ssize_t nr_bvec = DIV_ROUND_UP(req->data_len, PAGE_SIZE);
+
+ if (!req->sg_cnt || !nr_bvec) {
+ nvmet_req_complete(req, 0);
+ return;
+ }
+
+ if (nr_bvec > NVMET_MAX_INLINE_BIOVEC)
+ req->f.bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
+ GFP_KERNEL);
+ else
+ req->f.bvec = req->inline_bvec;
+
+ if (unlikely(!req->f.bvec)) {
+ /* fallback under memory pressure */
+ req->f.bvec = mempool_alloc(req->ns->bvec_pool, GFP_KERNEL);
+ req->f.mpool_alloc = true;
+ } else
+ req->f.mpool_alloc = false;
+
+ if (req->ns->buffered_io) {
+ if (likely(!req->f.mpool_alloc) &&
+ nvmet_file_execute_io(req, IOCB_NOWAIT))
+ return;
+ nvmet_file_submit_buffered_io(req);
+ } else
+ nvmet_file_execute_io(req, 0);
+}
+
u16 nvmet_file_flush(struct nvmet_req *req)
{
- if (vfs_fsync(req->ns->file, 1) < 0)
- return NVME_SC_INTERNAL | NVME_SC_DNR;
- return 0;
+ return errno_to_nvme_status(req, vfs_fsync(req->ns->file, 1));
}
static void nvmet_file_flush_work(struct work_struct *w)
@@ -236,30 +277,34 @@ static void nvmet_file_execute_discard(struct nvmet_req *req)
int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
struct nvme_dsm_range range;
loff_t offset, len;
- u16 ret;
+ u16 status = 0;
+ int ret;
int i;
for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) {
- ret = nvmet_copy_from_sgl(req, i * sizeof(range), &range,
+ status = nvmet_copy_from_sgl(req, i * sizeof(range), &range,
sizeof(range));
- if (ret)
+ if (status)
break;
offset = le64_to_cpu(range.slba) << req->ns->blksize_shift;
len = le32_to_cpu(range.nlb);
len <<= req->ns->blksize_shift;
if (offset + len > req->ns->size) {
- ret = NVME_SC_LBA_RANGE | NVME_SC_DNR;
+ req->error_slba = le64_to_cpu(range.slba);
+ status = errno_to_nvme_status(req, -ENOSPC);
break;
}
- if (vfs_fallocate(req->ns->file, mode, offset, len)) {
- ret = NVME_SC_INTERNAL | NVME_SC_DNR;
+ ret = vfs_fallocate(req->ns->file, mode, offset, len);
+ if (ret) {
+ req->error_slba = le64_to_cpu(range.slba);
+ status = errno_to_nvme_status(req, ret);
break;
}
}
- nvmet_req_complete(req, ret);
+ nvmet_req_complete(req, status);
}
static void nvmet_file_dsm_work(struct work_struct *w)
@@ -299,12 +344,12 @@ static void nvmet_file_write_zeroes_work(struct work_struct *w)
req->ns->blksize_shift);
if (unlikely(offset + len > req->ns->size)) {
- nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR);
+ nvmet_req_complete(req, errno_to_nvme_status(req, -ENOSPC));
return;
}
ret = vfs_fallocate(req->ns->file, mode, offset, len);
- nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
+ nvmet_req_complete(req, ret < 0 ? errno_to_nvme_status(req, ret) : 0);
}
static void nvmet_file_execute_write_zeroes(struct nvmet_req *req)
@@ -320,10 +365,7 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req)
switch (cmd->common.opcode) {
case nvme_cmd_read:
case nvme_cmd_write:
- if (req->ns->buffered_io)
- req->execute = nvmet_file_execute_rw_buffered_io;
- else
- req->execute = nvmet_file_execute_rw;
+ req->execute = nvmet_file_execute_rw;
req->data_len = nvmet_rw_len(req);
return 0;
case nvme_cmd_flush:
@@ -342,6 +384,7 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req)
default:
pr_err("unhandled cmd for file ns %d on qid %d\n",
cmd->common.opcode, req->sq->qid);
+ req->error_loc = offsetof(struct nvme_common_command, opcode);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
}
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 9908082b32c4..4aac1b4a8112 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -345,7 +345,7 @@ static int nvme_loop_connect_io_queues(struct nvme_loop_ctrl *ctrl)
int i, ret;
for (i = 1; i < ctrl->ctrl.queue_count; i++) {
- ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
+ ret = nvmf_connect_io_queue(&ctrl->ctrl, i, false);
if (ret)
return ret;
set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[i].flags);
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index c2b4d9ee6391..3e4719fdba85 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -30,12 +30,15 @@
#define NVMET_ASYNC_EVENTS 4
#define NVMET_ERROR_LOG_SLOTS 128
+#define NVMET_NO_ERROR_LOC ((u16)-1)
/*
* Supported optional AENs:
*/
#define NVMET_AEN_CFG_OPTIONAL \
(NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_ANA_CHANGE)
+#define NVMET_DISC_AEN_CFG_OPTIONAL \
+ (NVME_AEN_CFG_DISC_CHANGE)
/*
* Plus mandatory SMART AENs (we'll never send them, but allow enabling them):
@@ -104,6 +107,7 @@ struct nvmet_sq {
u16 qid;
u16 size;
u32 sqhd;
+ bool sqhd_disabled;
struct completion free_done;
struct completion confirm_done;
};
@@ -137,6 +141,7 @@ struct nvmet_port {
struct list_head subsystems;
struct config_group referrals_group;
struct list_head referrals;
+ struct list_head global_entry;
struct config_group ana_groups_group;
struct nvmet_ana_group ana_default_group;
enum nvme_ana_state *ana_state;
@@ -163,6 +168,8 @@ struct nvmet_ctrl {
struct nvmet_cq **cqs;
struct nvmet_sq **sqs;
+ bool cmd_seen;
+
struct mutex lock;
u64 cap;
u32 cc;
@@ -194,8 +201,12 @@ struct nvmet_ctrl {
char subsysnqn[NVMF_NQN_FIELD_LEN];
char hostnqn[NVMF_NQN_FIELD_LEN];
- struct device *p2p_client;
- struct radix_tree_root p2p_ns_map;
+ struct device *p2p_client;
+ struct radix_tree_root p2p_ns_map;
+
+ spinlock_t error_lock;
+ u64 err_counter;
+ struct nvme_error_slot slots[NVMET_ERROR_LOG_SLOTS];
};
struct nvmet_subsys {
@@ -273,6 +284,7 @@ struct nvmet_fabrics_ops {
void (*delete_ctrl)(struct nvmet_ctrl *ctrl);
void (*disc_traddr)(struct nvmet_req *req,
struct nvmet_port *port, char *traddr);
+ u16 (*install_queue)(struct nvmet_sq *nvme_sq);
};
#define NVMET_MAX_INLINE_BIOVEC 8
@@ -308,17 +320,14 @@ struct nvmet_req {
void (*execute)(struct nvmet_req *req);
const struct nvmet_fabrics_ops *ops;
- struct pci_dev *p2p_dev;
- struct device *p2p_client;
+ struct pci_dev *p2p_dev;
+ struct device *p2p_client;
+ u16 error_loc;
+ u64 error_slba;
};
extern struct workqueue_struct *buffered_io_wq;
-static inline void nvmet_set_status(struct nvmet_req *req, u16 status)
-{
- req->rsp->status = cpu_to_le16(status << 1);
-}
-
static inline void nvmet_set_result(struct nvmet_req *req, u32 result)
{
req->rsp->result.u32 = cpu_to_le32(result);
@@ -340,6 +349,27 @@ struct nvmet_async_event {
u8 log_page;
};
+static inline void nvmet_clear_aen_bit(struct nvmet_req *req, u32 bn)
+{
+ int rae = le32_to_cpu(req->cmd->common.cdw10) & 1 << 15;
+
+ if (!rae)
+ clear_bit(bn, &req->sq->ctrl->aen_masked);
+}
+
+static inline bool nvmet_aen_bit_disabled(struct nvmet_ctrl *ctrl, u32 bn)
+{
+ if (!(READ_ONCE(ctrl->aen_enabled) & (1 << bn)))
+ return true;
+ return test_and_set_bit(bn, &ctrl->aen_masked);
+}
+
+void nvmet_get_feat_kato(struct nvmet_req *req);
+void nvmet_get_feat_async_event(struct nvmet_req *req);
+u16 nvmet_set_feat_kato(struct nvmet_req *req);
+u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask);
+void nvmet_execute_async_event(struct nvmet_req *req);
+
u16 nvmet_parse_connect_cmd(struct nvmet_req *req);
u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req);
u16 nvmet_file_parse_io_cmd(struct nvmet_req *req);
@@ -355,6 +385,8 @@ void nvmet_req_complete(struct nvmet_req *req, u16 status);
int nvmet_req_alloc_sgl(struct nvmet_req *req);
void nvmet_req_free_sgl(struct nvmet_req *req);
+void nvmet_execute_keep_alive(struct nvmet_req *req);
+
void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid,
u16 size);
void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid,
@@ -395,7 +427,7 @@ int nvmet_enable_port(struct nvmet_port *port);
void nvmet_disable_port(struct nvmet_port *port);
void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port);
-void nvmet_referral_disable(struct nvmet_port *port);
+void nvmet_referral_disable(struct nvmet_port *parent, struct nvmet_port *port);
u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf,
size_t len);
@@ -405,6 +437,14 @@ u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len);
u32 nvmet_get_log_page_len(struct nvme_command *cmd);
+extern struct list_head *nvmet_ports;
+void nvmet_port_disc_changed(struct nvmet_port *port,
+ struct nvmet_subsys *subsys);
+void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys,
+ struct nvmet_host *host);
+void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
+ u8 event_info, u8 log_page);
+
#define NVMET_QUEUE_SIZE 1024
#define NVMET_NR_QUEUES 128
#define NVMET_MAX_CMD NVMET_QUEUE_SIZE
@@ -425,7 +465,7 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd);
#define NVMET_DEFAULT_ANA_GRPID 1
#define NVMET_KAS 10
-#define NVMET_DISC_KATO 120
+#define NVMET_DISC_KATO_MS 120000
int __init nvmet_init_configfs(void);
void __exit nvmet_exit_configfs(void);
@@ -434,15 +474,13 @@ int __init nvmet_init_discovery(void);
void nvmet_exit_discovery(void);
extern struct nvmet_subsys *nvmet_disc_subsys;
-extern u64 nvmet_genctr;
extern struct rw_semaphore nvmet_config_sem;
extern u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1];
extern u64 nvmet_ana_chgcnt;
extern struct rw_semaphore nvmet_ana_sem;
-bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys,
- const char *hostnqn);
+bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn);
int nvmet_bdev_ns_enable(struct nvmet_ns *ns);
int nvmet_file_ns_enable(struct nvmet_ns *ns);
@@ -457,4 +495,6 @@ static inline u32 nvmet_rw_len(struct nvmet_req *req)
return ((u32)le16_to_cpu(req->cmd->rw.length) + 1) <<
req->ns->blksize_shift;
}
+
+u16 errno_to_nvme_status(struct nvmet_req *req, int errno);
#endif /* _NVMET_H */
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 583086dd9cb9..a8d23eb80192 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -196,7 +196,7 @@ nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
{
unsigned long flags;
- if (rsp->allocated) {
+ if (unlikely(rsp->allocated)) {
kfree(rsp);
return;
}
@@ -630,8 +630,11 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
u64 off = le64_to_cpu(sgl->addr);
u32 len = le32_to_cpu(sgl->length);
- if (!nvme_is_write(rsp->req.cmd))
+ if (!nvme_is_write(rsp->req.cmd)) {
+ rsp->req.error_loc =
+ offsetof(struct nvme_common_command, opcode);
return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+ }
if (off + len > rsp->queue->dev->inline_data_size) {
pr_err("invalid inline data offset!\n");
@@ -696,6 +699,8 @@ static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
return nvmet_rdma_map_sgl_inline(rsp);
default:
pr_err("invalid SGL subtype: %#x\n", sgl->type);
+ rsp->req.error_loc =
+ offsetof(struct nvme_common_command, dptr);
return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
}
case NVME_KEY_SGL_FMT_DATA_DESC:
@@ -706,10 +711,13 @@ static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
return nvmet_rdma_map_sgl_keyed(rsp, sgl, false);
default:
pr_err("invalid SGL subtype: %#x\n", sgl->type);
+ rsp->req.error_loc =
+ offsetof(struct nvme_common_command, dptr);
return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
}
default:
pr_err("invalid SGL type: %#x\n", sgl->type);
+ rsp->req.error_loc = offsetof(struct nvme_common_command, dptr);
return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR;
}
}
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
new file mode 100644
index 000000000000..44b37b202e39
--- /dev/null
+++ b/drivers/nvme/target/tcp.c
@@ -0,0 +1,1737 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe over Fabrics TCP target.
+ * Copyright (c) 2018 Lightbits Labs. All rights reserved.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/nvme-tcp.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/inet.h>
+#include <linux/llist.h>
+#include <crypto/hash.h>
+
+#include "nvmet.h"
+
+#define NVMET_TCP_DEF_INLINE_DATA_SIZE (4 * PAGE_SIZE)
+
+#define NVMET_TCP_RECV_BUDGET 8
+#define NVMET_TCP_SEND_BUDGET 8
+#define NVMET_TCP_IO_WORK_BUDGET 64
+
+enum nvmet_tcp_send_state {
+ NVMET_TCP_SEND_DATA_PDU,
+ NVMET_TCP_SEND_DATA,
+ NVMET_TCP_SEND_R2T,
+ NVMET_TCP_SEND_DDGST,
+ NVMET_TCP_SEND_RESPONSE
+};
+
+enum nvmet_tcp_recv_state {
+ NVMET_TCP_RECV_PDU,
+ NVMET_TCP_RECV_DATA,
+ NVMET_TCP_RECV_DDGST,
+ NVMET_TCP_RECV_ERR,
+};
+
+enum {
+ NVMET_TCP_F_INIT_FAILED = (1 << 0),
+};
+
+struct nvmet_tcp_cmd {
+ struct nvmet_tcp_queue *queue;
+ struct nvmet_req req;
+
+ struct nvme_tcp_cmd_pdu *cmd_pdu;
+ struct nvme_tcp_rsp_pdu *rsp_pdu;
+ struct nvme_tcp_data_pdu *data_pdu;
+ struct nvme_tcp_r2t_pdu *r2t_pdu;
+
+ u32 rbytes_done;
+ u32 wbytes_done;
+
+ u32 pdu_len;
+ u32 pdu_recv;
+ int sg_idx;
+ int nr_mapped;
+ struct msghdr recv_msg;
+ struct kvec *iov;
+ u32 flags;
+
+ struct list_head entry;
+ struct llist_node lentry;
+
+ /* send state */
+ u32 offset;
+ struct scatterlist *cur_sg;
+ enum nvmet_tcp_send_state state;
+
+ __le32 exp_ddgst;
+ __le32 recv_ddgst;
+};
+
+enum nvmet_tcp_queue_state {
+ NVMET_TCP_Q_CONNECTING,
+ NVMET_TCP_Q_LIVE,
+ NVMET_TCP_Q_DISCONNECTING,
+};
+
+struct nvmet_tcp_queue {
+ struct socket *sock;
+ struct nvmet_tcp_port *port;
+ struct work_struct io_work;
+ int cpu;
+ struct nvmet_cq nvme_cq;
+ struct nvmet_sq nvme_sq;
+
+ /* send state */
+ struct nvmet_tcp_cmd *cmds;
+ unsigned int nr_cmds;
+ struct list_head free_list;
+ struct llist_head resp_list;
+ struct list_head resp_send_list;
+ int send_list_len;
+ struct nvmet_tcp_cmd *snd_cmd;
+
+ /* recv state */
+ int offset;
+ int left;
+ enum nvmet_tcp_recv_state rcv_state;
+ struct nvmet_tcp_cmd *cmd;
+ union nvme_tcp_pdu pdu;
+
+ /* digest state */
+ bool hdr_digest;
+ bool data_digest;
+ struct ahash_request *snd_hash;
+ struct ahash_request *rcv_hash;
+
+ spinlock_t state_lock;
+ enum nvmet_tcp_queue_state state;
+
+ struct sockaddr_storage sockaddr;
+ struct sockaddr_storage sockaddr_peer;
+ struct work_struct release_work;
+
+ int idx;
+ struct list_head queue_list;
+
+ struct nvmet_tcp_cmd connect;
+
+ struct page_frag_cache pf_cache;
+
+ void (*data_ready)(struct sock *);
+ void (*state_change)(struct sock *);
+ void (*write_space)(struct sock *);
+};
+
+struct nvmet_tcp_port {
+ struct socket *sock;
+ struct work_struct accept_work;
+ struct nvmet_port *nport;
+ struct sockaddr_storage addr;
+ int last_cpu;
+ void (*data_ready)(struct sock *);
+};
+
+static DEFINE_IDA(nvmet_tcp_queue_ida);
+static LIST_HEAD(nvmet_tcp_queue_list);
+static DEFINE_MUTEX(nvmet_tcp_queue_mutex);
+
+static struct workqueue_struct *nvmet_tcp_wq;
+static struct nvmet_fabrics_ops nvmet_tcp_ops;
+static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
+static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);
+
+static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
+ struct nvmet_tcp_cmd *cmd)
+{
+ return cmd - queue->cmds;
+}
+
+static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd)
+{
+ return nvme_is_write(cmd->req.cmd) &&
+ cmd->rbytes_done < cmd->req.transfer_len;
+}
+
+static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd)
+{
+ return nvmet_tcp_has_data_in(cmd) && !cmd->req.rsp->status;
+}
+
+static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd)
+{
+ return !nvme_is_write(cmd->req.cmd) &&
+ cmd->req.transfer_len > 0 &&
+ !cmd->req.rsp->status;
+}
+
+static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd)
+{
+ return nvme_is_write(cmd->req.cmd) && cmd->pdu_len &&
+ !cmd->rbytes_done;
+}
+
+static inline struct nvmet_tcp_cmd *
+nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue)
+{
+ struct nvmet_tcp_cmd *cmd;
+
+ cmd = list_first_entry_or_null(&queue->free_list,
+ struct nvmet_tcp_cmd, entry);
+ if (!cmd)
+ return NULL;
+ list_del_init(&cmd->entry);
+
+ cmd->rbytes_done = cmd->wbytes_done = 0;
+ cmd->pdu_len = 0;
+ cmd->pdu_recv = 0;
+ cmd->iov = NULL;
+ cmd->flags = 0;
+ return cmd;
+}
+
+static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd)
+{
+ if (unlikely(cmd == &cmd->queue->connect))
+ return;
+
+ list_add_tail(&cmd->entry, &cmd->queue->free_list);
+}
+
+static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
+{
+ return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
+}
+
+static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
+{
+ return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
+}
+
+static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
+ void *pdu, size_t len)
+{
+ struct scatterlist sg;
+
+ sg_init_one(&sg, pdu, len);
+ ahash_request_set_crypt(hash, &sg, pdu + len, len);
+ crypto_ahash_digest(hash);
+}
+
+static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
+ void *pdu, size_t len)
+{
+ struct nvme_tcp_hdr *hdr = pdu;
+ __le32 recv_digest;
+ __le32 exp_digest;
+
+ if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
+ pr_err("queue %d: header digest enabled but no header digest\n",
+ queue->idx);
+ return -EPROTO;
+ }
+
+ recv_digest = *(__le32 *)(pdu + hdr->hlen);
+ nvmet_tcp_hdgst(queue->rcv_hash, pdu, len);
+ exp_digest = *(__le32 *)(pdu + hdr->hlen);
+ if (recv_digest != exp_digest) {
+ pr_err("queue %d: header digest error: recv %#x expected %#x\n",
+ queue->idx, le32_to_cpu(recv_digest),
+ le32_to_cpu(exp_digest));
+ return -EPROTO;
+ }
+
+ return 0;
+}
+
+static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
+{
+ struct nvme_tcp_hdr *hdr = pdu;
+ u8 digest_len = nvmet_tcp_hdgst_len(queue);
+ u32 len;
+
+ len = le32_to_cpu(hdr->plen) - hdr->hlen -
+ (hdr->flags & NVME_TCP_F_HDGST ? digest_len : 0);
+
+ if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
+ pr_err("queue %d: data digest flag is cleared\n", queue->idx);
+ return -EPROTO;
+ }
+
+ return 0;
+}
+
+static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd)
+{
+ struct scatterlist *sg;
+ int i;
+
+ sg = &cmd->req.sg[cmd->sg_idx];
+
+ for (i = 0; i < cmd->nr_mapped; i++)
+ kunmap(sg_page(&sg[i]));
+}
+
+static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd)
+{
+ struct kvec *iov = cmd->iov;
+ struct scatterlist *sg;
+ u32 length, offset, sg_offset;
+
+ length = cmd->pdu_len;
+ cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE);
+ offset = cmd->rbytes_done;
+ cmd->sg_idx = DIV_ROUND_UP(offset, PAGE_SIZE);
+ sg_offset = offset % PAGE_SIZE;
+ sg = &cmd->req.sg[cmd->sg_idx];
+
+ while (length) {
+ u32 iov_len = min_t(u32, length, sg->length - sg_offset);
+
+ iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset;
+ iov->iov_len = iov_len;
+
+ length -= iov_len;
+ sg = sg_next(sg);
+ iov++;
+ }
+
+ iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov,
+ cmd->nr_mapped, cmd->pdu_len);
+}
+
+static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue)
+{
+ queue->rcv_state = NVMET_TCP_RECV_ERR;
+ if (queue->nvme_sq.ctrl)
+ nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
+ else
+ kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+}
+
+static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
+{
+ struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl;
+ u32 len = le32_to_cpu(sgl->length);
+
+ if (!cmd->req.data_len)
+ return 0;
+
+ if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) |
+ NVME_SGL_FMT_OFFSET)) {
+ if (!nvme_is_write(cmd->req.cmd))
+ return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+
+ if (len > cmd->req.port->inline_data_size)
+ return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
+ cmd->pdu_len = len;
+ }
+ cmd->req.transfer_len += len;
+
+ cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt);
+ if (!cmd->req.sg)
+ return NVME_SC_INTERNAL;
+ cmd->cur_sg = cmd->req.sg;
+
+ if (nvmet_tcp_has_data_in(cmd)) {
+ cmd->iov = kmalloc_array(cmd->req.sg_cnt,
+ sizeof(*cmd->iov), GFP_KERNEL);
+ if (!cmd->iov)
+ goto err;
+ }
+
+ return 0;
+err:
+ sgl_free(cmd->req.sg);
+ return NVME_SC_INTERNAL;
+}
+
+static void nvmet_tcp_ddgst(struct ahash_request *hash,
+ struct nvmet_tcp_cmd *cmd)
+{
+ ahash_request_set_crypt(hash, cmd->req.sg,
+ (void *)&cmd->exp_ddgst, cmd->req.transfer_len);
+ crypto_ahash_digest(hash);
+}
+
+static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
+{
+ struct nvme_tcp_data_pdu *pdu = cmd->data_pdu;
+ struct nvmet_tcp_queue *queue = cmd->queue;
+ u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
+ u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue);
+
+ cmd->offset = 0;
+ cmd->state = NVMET_TCP_SEND_DATA_PDU;
+
+ pdu->hdr.type = nvme_tcp_c2h_data;
+ pdu->hdr.flags = NVME_TCP_F_DATA_LAST;
+ pdu->hdr.hlen = sizeof(*pdu);
+ pdu->hdr.pdo = pdu->hdr.hlen + hdgst;
+ pdu->hdr.plen =
+ cpu_to_le32(pdu->hdr.hlen + hdgst +
+ cmd->req.transfer_len + ddgst);
+ pdu->command_id = cmd->req.rsp->command_id;
+ pdu->data_length = cpu_to_le32(cmd->req.transfer_len);
+ pdu->data_offset = cpu_to_le32(cmd->wbytes_done);
+
+ if (queue->data_digest) {
+ pdu->hdr.flags |= NVME_TCP_F_DDGST;
+ nvmet_tcp_ddgst(queue->snd_hash, cmd);
+ }
+
+ if (cmd->queue->hdr_digest) {
+ pdu->hdr.flags |= NVME_TCP_F_HDGST;
+ nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+ }
+}
+
+static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
+{
+ struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu;
+ struct nvmet_tcp_queue *queue = cmd->queue;
+ u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
+
+ cmd->offset = 0;
+ cmd->state = NVMET_TCP_SEND_R2T;
+
+ pdu->hdr.type = nvme_tcp_r2t;
+ pdu->hdr.flags = 0;
+ pdu->hdr.hlen = sizeof(*pdu);
+ pdu->hdr.pdo = 0;
+ pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
+
+ pdu->command_id = cmd->req.cmd->common.command_id;
+ pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd);
+ pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done);
+ pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done);
+ if (cmd->queue->hdr_digest) {
+ pdu->hdr.flags |= NVME_TCP_F_HDGST;
+ nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+ }
+}
+
+static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
+{
+ struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
+ struct nvmet_tcp_queue *queue = cmd->queue;
+ u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
+
+ cmd->offset = 0;
+ cmd->state = NVMET_TCP_SEND_RESPONSE;
+
+ pdu->hdr.type = nvme_tcp_rsp;
+ pdu->hdr.flags = 0;
+ pdu->hdr.hlen = sizeof(*pdu);
+ pdu->hdr.pdo = 0;
+ pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
+ if (cmd->queue->hdr_digest) {
+ pdu->hdr.flags |= NVME_TCP_F_HDGST;
+ nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+ }
+}
+
+static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
+{
+ struct llist_node *node;
+
+ node = llist_del_all(&queue->resp_list);
+ if (!node)
+ return;
+
+ while (node) {
+ struct nvmet_tcp_cmd *cmd = llist_entry(node,
+ struct nvmet_tcp_cmd, lentry);
+
+ list_add(&cmd->entry, &queue->resp_send_list);
+ node = node->next;
+ queue->send_list_len++;
+ }
+}
+
+static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue)
+{
+ queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list,
+ struct nvmet_tcp_cmd, entry);
+ if (!queue->snd_cmd) {
+ nvmet_tcp_process_resp_list(queue);
+ queue->snd_cmd =
+ list_first_entry_or_null(&queue->resp_send_list,
+ struct nvmet_tcp_cmd, entry);
+ if (unlikely(!queue->snd_cmd))
+ return NULL;
+ }
+
+ list_del_init(&queue->snd_cmd->entry);
+ queue->send_list_len--;
+
+ if (nvmet_tcp_need_data_out(queue->snd_cmd))
+ nvmet_setup_c2h_data_pdu(queue->snd_cmd);
+ else if (nvmet_tcp_need_data_in(queue->snd_cmd))
+ nvmet_setup_r2t_pdu(queue->snd_cmd);
+ else
+ nvmet_setup_response_pdu(queue->snd_cmd);
+
+ return queue->snd_cmd;
+}
+
+static void nvmet_tcp_queue_response(struct nvmet_req *req)
+{
+ struct nvmet_tcp_cmd *cmd =
+ container_of(req, struct nvmet_tcp_cmd, req);
+ struct nvmet_tcp_queue *queue = cmd->queue;
+
+ llist_add(&cmd->lentry, &queue->resp_list);
+ queue_work_on(cmd->queue->cpu, nvmet_tcp_wq, &cmd->queue->io_work);
+}
+
+static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
+{
+ u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
+ int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst;
+ int ret;
+
+ ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu),
+ offset_in_page(cmd->data_pdu) + cmd->offset,
+ left, MSG_DONTWAIT | MSG_MORE);
+ if (ret <= 0)
+ return ret;
+
+ cmd->offset += ret;
+ left -= ret;
+
+ if (left)
+ return -EAGAIN;
+
+ cmd->state = NVMET_TCP_SEND_DATA;
+ cmd->offset = 0;
+ return 1;
+}
+
+static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd)
+{
+ struct nvmet_tcp_queue *queue = cmd->queue;
+ int ret;
+
+ while (cmd->cur_sg) {
+ struct page *page = sg_page(cmd->cur_sg);
+ u32 left = cmd->cur_sg->length - cmd->offset;
+
+ ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset,
+ left, MSG_DONTWAIT | MSG_MORE);
+ if (ret <= 0)
+ return ret;
+
+ cmd->offset += ret;
+ cmd->wbytes_done += ret;
+
+ /* Done with sg?*/
+ if (cmd->offset == cmd->cur_sg->length) {
+ cmd->cur_sg = sg_next(cmd->cur_sg);
+ cmd->offset = 0;
+ }
+ }
+
+ if (queue->data_digest) {
+ cmd->state = NVMET_TCP_SEND_DDGST;
+ cmd->offset = 0;
+ } else {
+ nvmet_setup_response_pdu(cmd);
+ }
+ return 1;
+
+}
+
+static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
+ bool last_in_batch)
+{
+ u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
+ int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
+ int flags = MSG_DONTWAIT;
+ int ret;
+
+ if (!last_in_batch && cmd->queue->send_list_len)
+ flags |= MSG_MORE;
+ else
+ flags |= MSG_EOR;
+
+ ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu),
+ offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags);
+ if (ret <= 0)
+ return ret;
+ cmd->offset += ret;
+ left -= ret;
+
+ if (left)
+ return -EAGAIN;
+
+ kfree(cmd->iov);
+ sgl_free(cmd->req.sg);
+ cmd->queue->snd_cmd = NULL;
+ nvmet_tcp_put_cmd(cmd);
+ return 1;
+}
+
+static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
+{
+ u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
+ int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst;
+ int flags = MSG_DONTWAIT;
+ int ret;
+
+ if (!last_in_batch && cmd->queue->send_list_len)
+ flags |= MSG_MORE;
+ else
+ flags |= MSG_EOR;
+
+ ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu),
+ offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags);
+ if (ret <= 0)
+ return ret;
+ cmd->offset += ret;
+ left -= ret;
+
+ if (left)
+ return -EAGAIN;
+
+ cmd->queue->snd_cmd = NULL;
+ return 1;
+}
+
+static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd)
+{
+ struct nvmet_tcp_queue *queue = cmd->queue;
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
+ struct kvec iov = {
+ .iov_base = &cmd->exp_ddgst + cmd->offset,
+ .iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset
+ };
+ int ret;
+
+ ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
+ if (unlikely(ret <= 0))
+ return ret;
+
+ cmd->offset += ret;
+ nvmet_setup_response_pdu(cmd);
+ return 1;
+}
+
+static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
+ bool last_in_batch)
+{
+ struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
+ int ret = 0;
+
+ if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
+ cmd = nvmet_tcp_fetch_cmd(queue);
+ if (unlikely(!cmd))
+ return 0;
+ }
+
+ if (cmd->state == NVMET_TCP_SEND_DATA_PDU) {
+ ret = nvmet_try_send_data_pdu(cmd);
+ if (ret <= 0)
+ goto done_send;
+ }
+
+ if (cmd->state == NVMET_TCP_SEND_DATA) {
+ ret = nvmet_try_send_data(cmd);
+ if (ret <= 0)
+ goto done_send;
+ }
+
+ if (cmd->state == NVMET_TCP_SEND_DDGST) {
+ ret = nvmet_try_send_ddgst(cmd);
+ if (ret <= 0)
+ goto done_send;
+ }
+
+ if (cmd->state == NVMET_TCP_SEND_R2T) {
+ ret = nvmet_try_send_r2t(cmd, last_in_batch);
+ if (ret <= 0)
+ goto done_send;
+ }
+
+ if (cmd->state == NVMET_TCP_SEND_RESPONSE)
+ ret = nvmet_try_send_response(cmd, last_in_batch);
+
+done_send:
+ if (ret < 0) {
+ if (ret == -EAGAIN)
+ return 0;
+ return ret;
+ }
+
+ return 1;
+}
+
+static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue,
+ int budget, int *sends)
+{
+ int i, ret = 0;
+
+ for (i = 0; i < budget; i++) {
+ ret = nvmet_tcp_try_send_one(queue, i == budget - 1);
+ if (ret <= 0)
+ break;
+ (*sends)++;
+ }
+
+ return ret;
+}
+
+static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue)
+{
+ queue->offset = 0;
+ queue->left = sizeof(struct nvme_tcp_hdr);
+ queue->cmd = NULL;
+ queue->rcv_state = NVMET_TCP_RECV_PDU;
+}
+
+static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue)
+{
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
+
+ ahash_request_free(queue->rcv_hash);
+ ahash_request_free(queue->snd_hash);
+ crypto_free_ahash(tfm);
+}
+
+static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue)
+{
+ struct crypto_ahash *tfm;
+
+ tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
+ if (!queue->snd_hash)
+ goto free_tfm;
+ ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
+
+ queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
+ if (!queue->rcv_hash)
+ goto free_snd_hash;
+ ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
+
+ return 0;
+free_snd_hash:
+ ahash_request_free(queue->snd_hash);
+free_tfm:
+ crypto_free_ahash(tfm);
+ return -ENOMEM;
+}
+
+
+static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
+{
+ struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq;
+ struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp;
+ struct msghdr msg = {};
+ struct kvec iov;
+ int ret;
+
+ if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) {
+ pr_err("bad nvme-tcp pdu length (%d)\n",
+ le32_to_cpu(icreq->hdr.plen));
+ nvmet_tcp_fatal_error(queue);
+ }
+
+ if (icreq->pfv != NVME_TCP_PFV_1_0) {
+ pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv);
+ return -EPROTO;
+ }
+
+ if (icreq->hpda != 0) {
+ pr_err("queue %d: unsupported hpda %d\n", queue->idx,
+ icreq->hpda);
+ return -EPROTO;
+ }
+
+ if (icreq->maxr2t != 0) {
+ pr_err("queue %d: unsupported maxr2t %d\n", queue->idx,
+ le32_to_cpu(icreq->maxr2t) + 1);
+ return -EPROTO;
+ }
+
+ queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE);
+ queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE);
+ if (queue->hdr_digest || queue->data_digest) {
+ ret = nvmet_tcp_alloc_crypto(queue);
+ if (ret)
+ return ret;
+ }
+
+ memset(icresp, 0, sizeof(*icresp));
+ icresp->hdr.type = nvme_tcp_icresp;
+ icresp->hdr.hlen = sizeof(*icresp);
+ icresp->hdr.pdo = 0;
+ icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen);
+ icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
+ icresp->maxdata = cpu_to_le32(0xffff); /* FIXME: support r2t */
+ icresp->cpda = 0;
+ if (queue->hdr_digest)
+ icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
+ if (queue->data_digest)
+ icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
+
+ iov.iov_base = icresp;
+ iov.iov_len = sizeof(*icresp);
+ ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
+ if (ret < 0)
+ goto free_crypto;
+
+ queue->state = NVMET_TCP_Q_LIVE;
+ nvmet_prepare_receive_pdu(queue);
+ return 0;
+free_crypto:
+ if (queue->hdr_digest || queue->data_digest)
+ nvmet_tcp_free_crypto(queue);
+ return ret;
+}
+
+static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue,
+ struct nvmet_tcp_cmd *cmd, struct nvmet_req *req)
+{
+ int ret;
+
+ /* recover the expected data transfer length */
+ req->data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length);
+
+ if (!nvme_is_write(cmd->req.cmd) ||
+ req->data_len > cmd->req.port->inline_data_size) {
+ nvmet_prepare_receive_pdu(queue);
+ return;
+ }
+
+ ret = nvmet_tcp_map_data(cmd);
+ if (unlikely(ret)) {
+ pr_err("queue %d: failed to map data\n", queue->idx);
+ nvmet_tcp_fatal_error(queue);
+ return;
+ }
+
+ queue->rcv_state = NVMET_TCP_RECV_DATA;
+ nvmet_tcp_map_pdu_iovec(cmd);
+ cmd->flags |= NVMET_TCP_F_INIT_FAILED;
+}
+
+static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
+{
+ struct nvme_tcp_data_pdu *data = &queue->pdu.data;
+ struct nvmet_tcp_cmd *cmd;
+
+ cmd = &queue->cmds[data->ttag];
+
+ if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
+ pr_err("ttag %u unexpected data offset %u (expected %u)\n",
+ data->ttag, le32_to_cpu(data->data_offset),
+ cmd->rbytes_done);
+ /* FIXME: use path and transport errors */
+ nvmet_req_complete(&cmd->req,
+ NVME_SC_INVALID_FIELD | NVME_SC_DNR);
+ return -EPROTO;
+ }
+
+ cmd->pdu_len = le32_to_cpu(data->data_length);
+ cmd->pdu_recv = 0;
+ nvmet_tcp_map_pdu_iovec(cmd);
+ queue->cmd = cmd;
+ queue->rcv_state = NVMET_TCP_RECV_DATA;
+
+ return 0;
+}
+
+static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
+{
+ struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
+ struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd;
+ struct nvmet_req *req;
+ int ret;
+
+ if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
+ if (hdr->type != nvme_tcp_icreq) {
+ pr_err("unexpected pdu type (%d) before icreq\n",
+ hdr->type);
+ nvmet_tcp_fatal_error(queue);
+ return -EPROTO;
+ }
+ return nvmet_tcp_handle_icreq(queue);
+ }
+
+ if (hdr->type == nvme_tcp_h2c_data) {
+ ret = nvmet_tcp_handle_h2c_data_pdu(queue);
+ if (unlikely(ret))
+ return ret;
+ return 0;
+ }
+
+ queue->cmd = nvmet_tcp_get_cmd(queue);
+ if (unlikely(!queue->cmd)) {
+ /* This should never happen */
+ pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d",
+ queue->idx, queue->nr_cmds, queue->send_list_len,
+ nvme_cmd->common.opcode);
+ nvmet_tcp_fatal_error(queue);
+ return -ENOMEM;
+ }
+
+ req = &queue->cmd->req;
+ memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));
+
+ if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
+ &queue->nvme_sq, &nvmet_tcp_ops))) {
+ pr_err("failed cmd %p id %d opcode %d, data_len: %d\n",
+ req->cmd, req->cmd->common.command_id,
+ req->cmd->common.opcode,
+ le32_to_cpu(req->cmd->common.dptr.sgl.length));
+
+ nvmet_tcp_handle_req_failure(queue, queue->cmd, req);
+ return -EAGAIN;
+ }
+
+ ret = nvmet_tcp_map_data(queue->cmd);
+ if (unlikely(ret)) {
+ pr_err("queue %d: failed to map data\n", queue->idx);
+ if (nvmet_tcp_has_inline_data(queue->cmd))
+ nvmet_tcp_fatal_error(queue);
+ else
+ nvmet_req_complete(req, ret);
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ if (nvmet_tcp_need_data_in(queue->cmd)) {
+ if (nvmet_tcp_has_inline_data(queue->cmd)) {
+ queue->rcv_state = NVMET_TCP_RECV_DATA;
+ nvmet_tcp_map_pdu_iovec(queue->cmd);
+ return 0;
+ }
+ /* send back R2T */
+ nvmet_tcp_queue_response(&queue->cmd->req);
+ goto out;
+ }
+
+ nvmet_req_execute(&queue->cmd->req);
+out:
+ nvmet_prepare_receive_pdu(queue);
+ return ret;
+}
+
+static const u8 nvme_tcp_pdu_sizes[] = {
+ [nvme_tcp_icreq] = sizeof(struct nvme_tcp_icreq_pdu),
+ [nvme_tcp_cmd] = sizeof(struct nvme_tcp_cmd_pdu),
+ [nvme_tcp_h2c_data] = sizeof(struct nvme_tcp_data_pdu),
+};
+
+static inline u8 nvmet_tcp_pdu_size(u8 type)
+{
+ size_t idx = type;
+
+ return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) &&
+ nvme_tcp_pdu_sizes[idx]) ?
+ nvme_tcp_pdu_sizes[idx] : 0;
+}
+
+static inline bool nvmet_tcp_pdu_valid(u8 type)
+{
+ switch (type) {
+ case nvme_tcp_icreq:
+ case nvme_tcp_cmd:
+ case nvme_tcp_h2c_data:
+ /* fallthru */
+ return true;
+ }
+
+ return false;
+}
+
+static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue)
+{
+ struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
+ int len;
+ struct kvec iov;
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
+
+recv:
+ iov.iov_base = (void *)&queue->pdu + queue->offset;
+ iov.iov_len = queue->left;
+ len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
+ iov.iov_len, msg.msg_flags);
+ if (unlikely(len < 0))
+ return len;
+
+ queue->offset += len;
+ queue->left -= len;
+ if (queue->left)
+ return -EAGAIN;
+
+ if (queue->offset == sizeof(struct nvme_tcp_hdr)) {
+ u8 hdgst = nvmet_tcp_hdgst_len(queue);
+
+ if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) {
+ pr_err("unexpected pdu type %d\n", hdr->type);
+ nvmet_tcp_fatal_error(queue);
+ return -EIO;
+ }
+
+ if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) {
+ pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen);
+ return -EIO;
+ }
+
+ queue->left = hdr->hlen - queue->offset + hdgst;
+ goto recv;
+ }
+
+ if (queue->hdr_digest &&
+ nvmet_tcp_verify_hdgst(queue, &queue->pdu, queue->offset)) {
+ nvmet_tcp_fatal_error(queue); /* fatal */
+ return -EPROTO;
+ }
+
+ if (queue->data_digest &&
+ nvmet_tcp_check_ddgst(queue, &queue->pdu)) {
+ nvmet_tcp_fatal_error(queue); /* fatal */
+ return -EPROTO;
+ }
+
+ return nvmet_tcp_done_recv_pdu(queue);
+}
+
+static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
+{
+ struct nvmet_tcp_queue *queue = cmd->queue;
+
+ nvmet_tcp_ddgst(queue->rcv_hash, cmd);
+ queue->offset = 0;
+ queue->left = NVME_TCP_DIGEST_LENGTH;
+ queue->rcv_state = NVMET_TCP_RECV_DDGST;
+}
+
+static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
+{
+ struct nvmet_tcp_cmd *cmd = queue->cmd;
+ int ret;
+
+ while (msg_data_left(&cmd->recv_msg)) {
+ ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
+ cmd->recv_msg.msg_flags);
+ if (ret <= 0)
+ return ret;
+
+ cmd->pdu_recv += ret;
+ cmd->rbytes_done += ret;
+ }
+
+ nvmet_tcp_unmap_pdu_iovec(cmd);
+
+ if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
+ cmd->rbytes_done == cmd->req.transfer_len) {
+ if (queue->data_digest) {
+ nvmet_tcp_prep_recv_ddgst(cmd);
+ return 0;
+ }
+ nvmet_req_execute(&cmd->req);
+ }
+
+ nvmet_prepare_receive_pdu(queue);
+ return 0;
+}
+
+static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
+{
+ struct nvmet_tcp_cmd *cmd = queue->cmd;
+ int ret;
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
+ struct kvec iov = {
+ .iov_base = (void *)&cmd->recv_ddgst + queue->offset,
+ .iov_len = queue->left
+ };
+
+ ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
+ iov.iov_len, msg.msg_flags);
+ if (unlikely(ret < 0))
+ return ret;
+
+ queue->offset += ret;
+ queue->left -= ret;
+ if (queue->left)
+ return -EAGAIN;
+
+ if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) {
+ pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n",
+ queue->idx, cmd->req.cmd->common.command_id,
+ queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst),
+ le32_to_cpu(cmd->exp_ddgst));
+ nvmet_tcp_finish_cmd(cmd);
+ nvmet_tcp_fatal_error(queue);
+ ret = -EPROTO;
+ goto out;
+ }
+
+ if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
+ cmd->rbytes_done == cmd->req.transfer_len)
+ nvmet_req_execute(&cmd->req);
+ ret = 0;
+out:
+ nvmet_prepare_receive_pdu(queue);
+ return ret;
+}
+
+static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue)
+{
+ int result;
+
+ if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR))
+ return 0;
+
+ if (queue->rcv_state == NVMET_TCP_RECV_PDU) {
+ result = nvmet_tcp_try_recv_pdu(queue);
+ if (result != 0)
+ goto done_recv;
+ }
+
+ if (queue->rcv_state == NVMET_TCP_RECV_DATA) {
+ result = nvmet_tcp_try_recv_data(queue);
+ if (result != 0)
+ goto done_recv;
+ }
+
+ if (queue->rcv_state == NVMET_TCP_RECV_DDGST) {
+ result = nvmet_tcp_try_recv_ddgst(queue);
+ if (result != 0)
+ goto done_recv;
+ }
+
+done_recv:
+ if (result < 0) {
+ if (result == -EAGAIN)
+ return 0;
+ return result;
+ }
+ return 1;
+}
+
+static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue,
+ int budget, int *recvs)
+{
+ int i, ret = 0;
+
+ for (i = 0; i < budget; i++) {
+ ret = nvmet_tcp_try_recv_one(queue);
+ if (ret <= 0)
+ break;
+ (*recvs)++;
+ }
+
+ return ret;
+}
+
+static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue)
+{
+ spin_lock(&queue->state_lock);
+ if (queue->state != NVMET_TCP_Q_DISCONNECTING) {
+ queue->state = NVMET_TCP_Q_DISCONNECTING;
+ schedule_work(&queue->release_work);
+ }
+ spin_unlock(&queue->state_lock);
+}
+
+static void nvmet_tcp_io_work(struct work_struct *w)
+{
+ struct nvmet_tcp_queue *queue =
+ container_of(w, struct nvmet_tcp_queue, io_work);
+ bool pending;
+ int ret, ops = 0;
+
+ do {
+ pending = false;
+
+ ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
+ if (ret > 0) {
+ pending = true;
+ } else if (ret < 0) {
+ if (ret == -EPIPE || ret == -ECONNRESET)
+ kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+ else
+ nvmet_tcp_fatal_error(queue);
+ return;
+ }
+
+ ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
+ if (ret > 0) {
+ /* transmitted message/data */
+ pending = true;
+ } else if (ret < 0) {
+ if (ret == -EPIPE || ret == -ECONNRESET)
+ kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+ else
+ nvmet_tcp_fatal_error(queue);
+ return;
+ }
+
+ } while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);
+
+ /*
+ * We exahusted our budget, requeue our selves
+ */
+ if (pending)
+ queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
+}
+
+static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
+ struct nvmet_tcp_cmd *c)
+{
+ u8 hdgst = nvmet_tcp_hdgst_len(queue);
+
+ c->queue = queue;
+ c->req.port = queue->port->nport;
+
+ c->cmd_pdu = page_frag_alloc(&queue->pf_cache,
+ sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
+ if (!c->cmd_pdu)
+ return -ENOMEM;
+ c->req.cmd = &c->cmd_pdu->cmd;
+
+ c->rsp_pdu = page_frag_alloc(&queue->pf_cache,
+ sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
+ if (!c->rsp_pdu)
+ goto out_free_cmd;
+ c->req.rsp = &c->rsp_pdu->cqe;
+
+ c->data_pdu = page_frag_alloc(&queue->pf_cache,
+ sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
+ if (!c->data_pdu)
+ goto out_free_rsp;
+
+ c->r2t_pdu = page_frag_alloc(&queue->pf_cache,
+ sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
+ if (!c->r2t_pdu)
+ goto out_free_data;
+
+ c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
+
+ list_add_tail(&c->entry, &queue->free_list);
+
+ return 0;
+out_free_data:
+ page_frag_free(c->data_pdu);
+out_free_rsp:
+ page_frag_free(c->rsp_pdu);
+out_free_cmd:
+ page_frag_free(c->cmd_pdu);
+ return -ENOMEM;
+}
+
+static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c)
+{
+ page_frag_free(c->r2t_pdu);
+ page_frag_free(c->data_pdu);
+ page_frag_free(c->rsp_pdu);
+ page_frag_free(c->cmd_pdu);
+}
+
+static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue)
+{
+ struct nvmet_tcp_cmd *cmds;
+ int i, ret = -EINVAL, nr_cmds = queue->nr_cmds;
+
+ cmds = kcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL);
+ if (!cmds)
+ goto out;
+
+ for (i = 0; i < nr_cmds; i++) {
+ ret = nvmet_tcp_alloc_cmd(queue, cmds + i);
+ if (ret)
+ goto out_free;
+ }
+
+ queue->cmds = cmds;
+
+ return 0;
+out_free:
+ while (--i >= 0)
+ nvmet_tcp_free_cmd(cmds + i);
+ kfree(cmds);
+out:
+ return ret;
+}
+
+static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue)
+{
+ struct nvmet_tcp_cmd *cmds = queue->cmds;
+ int i;
+
+ for (i = 0; i < queue->nr_cmds; i++)
+ nvmet_tcp_free_cmd(cmds + i);
+
+ nvmet_tcp_free_cmd(&queue->connect);
+ kfree(cmds);
+}
+
+static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
+{
+ struct socket *sock = queue->sock;
+
+ write_lock_bh(&sock->sk->sk_callback_lock);
+ sock->sk->sk_data_ready = queue->data_ready;
+ sock->sk->sk_state_change = queue->state_change;
+ sock->sk->sk_write_space = queue->write_space;
+ sock->sk->sk_user_data = NULL;
+ write_unlock_bh(&sock->sk->sk_callback_lock);
+}
+
+static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
+{
+ nvmet_req_uninit(&cmd->req);
+ nvmet_tcp_unmap_pdu_iovec(cmd);
+ sgl_free(cmd->req.sg);
+}
+
+static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
+{
+ struct nvmet_tcp_cmd *cmd = queue->cmds;
+ int i;
+
+ for (i = 0; i < queue->nr_cmds; i++, cmd++) {
+ if (nvmet_tcp_need_data_in(cmd))
+ nvmet_tcp_finish_cmd(cmd);
+ }
+
+ if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) {
+ /* failed in connect */
+ nvmet_tcp_finish_cmd(&queue->connect);
+ }
+}
+
+static void nvmet_tcp_release_queue_work(struct work_struct *w)
+{
+ struct nvmet_tcp_queue *queue =
+ container_of(w, struct nvmet_tcp_queue, release_work);
+
+ mutex_lock(&nvmet_tcp_queue_mutex);
+ list_del_init(&queue->queue_list);
+ mutex_unlock(&nvmet_tcp_queue_mutex);
+
+ nvmet_tcp_restore_socket_callbacks(queue);
+ flush_work(&queue->io_work);
+
+ nvmet_tcp_uninit_data_in_cmds(queue);
+ nvmet_sq_destroy(&queue->nvme_sq);
+ cancel_work_sync(&queue->io_work);
+ sock_release(queue->sock);
+ nvmet_tcp_free_cmds(queue);
+ if (queue->hdr_digest || queue->data_digest)
+ nvmet_tcp_free_crypto(queue);
+ ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
+
+ kfree(queue);
+}
+
+static void nvmet_tcp_data_ready(struct sock *sk)
+{
+ struct nvmet_tcp_queue *queue;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ queue = sk->sk_user_data;
+ if (likely(queue))
+ queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
+ read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void nvmet_tcp_write_space(struct sock *sk)
+{
+ struct nvmet_tcp_queue *queue;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ queue = sk->sk_user_data;
+ if (unlikely(!queue))
+ goto out;
+
+ if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
+ queue->write_space(sk);
+ goto out;
+ }
+
+ if (sk_stream_is_writeable(sk)) {
+ clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
+ }
+out:
+ read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void nvmet_tcp_state_change(struct sock *sk)
+{
+ struct nvmet_tcp_queue *queue;
+
+ write_lock_bh(&sk->sk_callback_lock);
+ queue = sk->sk_user_data;
+ if (!queue)
+ goto done;
+
+ switch (sk->sk_state) {
+ case TCP_FIN_WAIT1:
+ case TCP_CLOSE_WAIT:
+ case TCP_CLOSE:
+ /* FALLTHRU */
+ sk->sk_user_data = NULL;
+ nvmet_tcp_schedule_release_queue(queue);
+ break;
+ default:
+ pr_warn("queue %d unhandled state %d\n",
+ queue->idx, sk->sk_state);
+ }
+done:
+ write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
+{
+ struct socket *sock = queue->sock;
+ struct linger sol = { .l_onoff = 1, .l_linger = 0 };
+ int ret;
+
+ ret = kernel_getsockname(sock,
+ (struct sockaddr *)&queue->sockaddr);
+ if (ret < 0)
+ return ret;
+
+ ret = kernel_getpeername(sock,
+ (struct sockaddr *)&queue->sockaddr_peer);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Cleanup whatever is sitting in the TCP transmit queue on socket
+ * close. This is done to prevent stale data from being sent should
+ * the network connection be restored before TCP times out.
+ */
+ ret = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
+ (char *)&sol, sizeof(sol));
+ if (ret)
+ return ret;
+
+ write_lock_bh(&sock->sk->sk_callback_lock);
+ sock->sk->sk_user_data = queue;
+ queue->data_ready = sock->sk->sk_data_ready;
+ sock->sk->sk_data_ready = nvmet_tcp_data_ready;
+ queue->state_change = sock->sk->sk_state_change;
+ sock->sk->sk_state_change = nvmet_tcp_state_change;
+ queue->write_space = sock->sk->sk_write_space;
+ sock->sk->sk_write_space = nvmet_tcp_write_space;
+ write_unlock_bh(&sock->sk->sk_callback_lock);
+
+ return 0;
+}
+
+static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
+ struct socket *newsock)
+{
+ struct nvmet_tcp_queue *queue;
+ int ret;
+
+ queue = kzalloc(sizeof(*queue), GFP_KERNEL);
+ if (!queue)
+ return -ENOMEM;
+
+ INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
+ INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
+ queue->sock = newsock;
+ queue->port = port;
+ queue->nr_cmds = 0;
+ spin_lock_init(&queue->state_lock);
+ queue->state = NVMET_TCP_Q_CONNECTING;
+ INIT_LIST_HEAD(&queue->free_list);
+ init_llist_head(&queue->resp_list);
+ INIT_LIST_HEAD(&queue->resp_send_list);
+
+ queue->idx = ida_simple_get(&nvmet_tcp_queue_ida, 0, 0, GFP_KERNEL);
+ if (queue->idx < 0) {
+ ret = queue->idx;
+ goto out_free_queue;
+ }
+
+ ret = nvmet_tcp_alloc_cmd(queue, &queue->connect);
+ if (ret)
+ goto out_ida_remove;
+
+ ret = nvmet_sq_init(&queue->nvme_sq);
+ if (ret)
+ goto out_free_connect;
+
+ port->last_cpu = cpumask_next_wrap(port->last_cpu,
+ cpu_online_mask, -1, false);
+ queue->cpu = port->last_cpu;
+ nvmet_prepare_receive_pdu(queue);
+
+ mutex_lock(&nvmet_tcp_queue_mutex);
+ list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list);
+ mutex_unlock(&nvmet_tcp_queue_mutex);
+
+ ret = nvmet_tcp_set_queue_sock(queue);
+ if (ret)
+ goto out_destroy_sq;
+
+ queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
+
+ return 0;
+out_destroy_sq:
+ mutex_lock(&nvmet_tcp_queue_mutex);
+ list_del_init(&queue->queue_list);
+ mutex_unlock(&nvmet_tcp_queue_mutex);
+ nvmet_sq_destroy(&queue->nvme_sq);
+out_free_connect:
+ nvmet_tcp_free_cmd(&queue->connect);
+out_ida_remove:
+ ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
+out_free_queue:
+ kfree(queue);
+ return ret;
+}
+
+static void nvmet_tcp_accept_work(struct work_struct *w)
+{
+ struct nvmet_tcp_port *port =
+ container_of(w, struct nvmet_tcp_port, accept_work);
+ struct socket *newsock;
+ int ret;
+
+ while (true) {
+ ret = kernel_accept(port->sock, &newsock, O_NONBLOCK);
+ if (ret < 0) {
+ if (ret != -EAGAIN)
+ pr_warn("failed to accept err=%d\n", ret);
+ return;
+ }
+ ret = nvmet_tcp_alloc_queue(port, newsock);
+ if (ret) {
+ pr_err("failed to allocate queue\n");
+ sock_release(newsock);
+ }
+ }
+}
+
+static void nvmet_tcp_listen_data_ready(struct sock *sk)
+{
+ struct nvmet_tcp_port *port;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ port = sk->sk_user_data;
+ if (!port)
+ goto out;
+
+ if (sk->sk_state == TCP_LISTEN)
+ schedule_work(&port->accept_work);
+out:
+ read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static int nvmet_tcp_add_port(struct nvmet_port *nport)
+{
+ struct nvmet_tcp_port *port;
+ __kernel_sa_family_t af;
+ int opt, ret;
+
+ port = kzalloc(sizeof(*port), GFP_KERNEL);
+ if (!port)
+ return -ENOMEM;
+
+ switch (nport->disc_addr.adrfam) {
+ case NVMF_ADDR_FAMILY_IP4:
+ af = AF_INET;
+ break;
+ case NVMF_ADDR_FAMILY_IP6:
+ af = AF_INET6;
+ break;
+ default:
+ pr_err("address family %d not supported\n",
+ nport->disc_addr.adrfam);
+ ret = -EINVAL;
+ goto err_port;
+ }
+
+ ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
+ nport->disc_addr.trsvcid, &port->addr);
+ if (ret) {
+ pr_err("malformed ip/port passed: %s:%s\n",
+ nport->disc_addr.traddr, nport->disc_addr.trsvcid);
+ goto err_port;
+ }
+
+ port->nport = nport;
+ port->last_cpu = -1;
+ INIT_WORK(&port->accept_work, nvmet_tcp_accept_work);
+ if (port->nport->inline_data_size < 0)
+ port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE;
+
+ ret = sock_create(port->addr.ss_family, SOCK_STREAM,
+ IPPROTO_TCP, &port->sock);
+ if (ret) {
+ pr_err("failed to create a socket\n");
+ goto err_port;
+ }
+
+ port->sock->sk->sk_user_data = port;
+ port->data_ready = port->sock->sk->sk_data_ready;
+ port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready;
+
+ opt = 1;
+ ret = kernel_setsockopt(port->sock, IPPROTO_TCP,
+ TCP_NODELAY, (char *)&opt, sizeof(opt));
+ if (ret) {
+ pr_err("failed to set TCP_NODELAY sock opt %d\n", ret);
+ goto err_sock;
+ }
+
+ ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_REUSEADDR,
+ (char *)&opt, sizeof(opt));
+ if (ret) {
+ pr_err("failed to set SO_REUSEADDR sock opt %d\n", ret);
+ goto err_sock;
+ }
+
+ ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr,
+ sizeof(port->addr));
+ if (ret) {
+ pr_err("failed to bind port socket %d\n", ret);
+ goto err_sock;
+ }
+
+ ret = kernel_listen(port->sock, 128);
+ if (ret) {
+ pr_err("failed to listen %d on port sock\n", ret);
+ goto err_sock;
+ }
+
+ nport->priv = port;
+ pr_info("enabling port %d (%pISpc)\n",
+ le16_to_cpu(nport->disc_addr.portid), &port->addr);
+
+ return 0;
+
+err_sock:
+ sock_release(port->sock);
+err_port:
+ kfree(port);
+ return ret;
+}
+
+static void nvmet_tcp_remove_port(struct nvmet_port *nport)
+{
+ struct nvmet_tcp_port *port = nport->priv;
+
+ write_lock_bh(&port->sock->sk->sk_callback_lock);
+ port->sock->sk->sk_data_ready = port->data_ready;
+ port->sock->sk->sk_user_data = NULL;
+ write_unlock_bh(&port->sock->sk->sk_callback_lock);
+ cancel_work_sync(&port->accept_work);
+
+ sock_release(port->sock);
+ kfree(port);
+}
+
+static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl)
+{
+ struct nvmet_tcp_queue *queue;
+
+ mutex_lock(&nvmet_tcp_queue_mutex);
+ list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
+ if (queue->nvme_sq.ctrl == ctrl)
+ kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+ mutex_unlock(&nvmet_tcp_queue_mutex);
+}
+
+static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq)
+{
+ struct nvmet_tcp_queue *queue =
+ container_of(sq, struct nvmet_tcp_queue, nvme_sq);
+
+ if (sq->qid == 0) {
+ /* Let inflight controller teardown complete */
+ flush_scheduled_work();
+ }
+
+ queue->nr_cmds = sq->size * 2;
+ if (nvmet_tcp_alloc_cmds(queue))
+ return NVME_SC_INTERNAL;
+ return 0;
+}
+
+static void nvmet_tcp_disc_port_addr(struct nvmet_req *req,
+ struct nvmet_port *nport, char *traddr)
+{
+ struct nvmet_tcp_port *port = nport->priv;
+
+ if (inet_addr_is_any((struct sockaddr *)&port->addr)) {
+ struct nvmet_tcp_cmd *cmd =
+ container_of(req, struct nvmet_tcp_cmd, req);
+ struct nvmet_tcp_queue *queue = cmd->queue;
+
+ sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr);
+ } else {
+ memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE);
+ }
+}
+
+static struct nvmet_fabrics_ops nvmet_tcp_ops = {
+ .owner = THIS_MODULE,
+ .type = NVMF_TRTYPE_TCP,
+ .msdbd = 1,
+ .has_keyed_sgls = 0,
+ .add_port = nvmet_tcp_add_port,
+ .remove_port = nvmet_tcp_remove_port,
+ .queue_response = nvmet_tcp_queue_response,
+ .delete_ctrl = nvmet_tcp_delete_ctrl,
+ .install_queue = nvmet_tcp_install_queue,
+ .disc_traddr = nvmet_tcp_disc_port_addr,
+};
+
+static int __init nvmet_tcp_init(void)
+{
+ int ret;
+
+ nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", WQ_HIGHPRI, 0);
+ if (!nvmet_tcp_wq)
+ return -ENOMEM;
+
+ ret = nvmet_register_transport(&nvmet_tcp_ops);
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ destroy_workqueue(nvmet_tcp_wq);
+ return ret;
+}
+
+static void __exit nvmet_tcp_exit(void)
+{
+ struct nvmet_tcp_queue *queue;
+
+ nvmet_unregister_transport(&nvmet_tcp_ops);
+
+ flush_scheduled_work();
+ mutex_lock(&nvmet_tcp_queue_mutex);
+ list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
+ kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+ mutex_unlock(&nvmet_tcp_queue_mutex);
+ flush_scheduled_work();
+
+ destroy_workqueue(nvmet_tcp_wq);
+}
+
+module_init(nvmet_tcp_init);
+module_exit(nvmet_tcp_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */
diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c
index 2016e0ed5865..8e26001dc11c 100644
--- a/drivers/s390/block/dasd_ioctl.c
+++ b/drivers/s390/block/dasd_ioctl.c
@@ -412,6 +412,7 @@ static int dasd_ioctl_information(struct dasd_block *block,
struct ccw_dev_id dev_id;
struct dasd_device *base;
struct ccw_device *cdev;
+ struct list_head *l;
unsigned long flags;
int rc;
@@ -462,23 +463,10 @@ static int dasd_ioctl_information(struct dasd_block *block,
memcpy(dasd_info->type, base->discipline->name, 4);
- if (block->request_queue->request_fn) {
- struct list_head *l;
-#ifdef DASD_EXTENDED_PROFILING
- {
- struct list_head *l;
- spin_lock_irqsave(&block->lock, flags);
- list_for_each(l, &block->request_queue->queue_head)
- dasd_info->req_queue_len++;
- spin_unlock_irqrestore(&block->lock, flags);
- }
-#endif /* DASD_EXTENDED_PROFILING */
- spin_lock_irqsave(get_ccwdev_lock(base->cdev), flags);
- list_for_each(l, &base->ccw_queue)
- dasd_info->chanq_len++;
- spin_unlock_irqrestore(get_ccwdev_lock(base->cdev),
- flags);
- }
+ spin_lock_irqsave(&block->queue_lock, flags);
+ list_for_each(l, &base->ccw_queue)
+ dasd_info->chanq_len++;
+ spin_unlock_irqrestore(&block->queue_lock, flags);
rc = 0;
if (copy_to_user(argp, dasd_info,
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 640cd1b31a18..f38882f6f37d 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -50,18 +50,6 @@ config SCSI_NETLINK
default n
depends on NET
-config SCSI_MQ_DEFAULT
- bool "SCSI: use blk-mq I/O path by default"
- default y
- depends on SCSI
- ---help---
- This option enables the blk-mq based I/O path for SCSI devices by
- default. With this option the scsi_mod.use_blk_mq module/boot
- option defaults to Y, without it to N, but it can still be
- overridden either way.
-
- If unsure say Y.
-
config SCSI_PROC_FS
bool "legacy /proc/scsi/ support"
depends on SCSI && PROC_FS
diff --git a/drivers/scsi/bnx2i/bnx2i_hwi.c b/drivers/scsi/bnx2i/bnx2i_hwi.c
index e9e669a6c2bc..6bad2689edd4 100644
--- a/drivers/scsi/bnx2i/bnx2i_hwi.c
+++ b/drivers/scsi/bnx2i/bnx2i_hwi.c
@@ -1906,7 +1906,6 @@ static int bnx2i_queue_scsi_cmd_resp(struct iscsi_session *session,
struct iscsi_task *task;
struct scsi_cmnd *sc;
int rc = 0;
- int cpu;
spin_lock(&session->back_lock);
task = iscsi_itt_to_task(bnx2i_conn->cls_conn->dd_data,
@@ -1917,14 +1916,9 @@ static int bnx2i_queue_scsi_cmd_resp(struct iscsi_session *session,
}
sc = task->sc;
- if (!blk_rq_cpu_valid(sc->request))
- cpu = smp_processor_id();
- else
- cpu = sc->request->cpu;
-
spin_unlock(&session->back_lock);
- p = &per_cpu(bnx2i_percpu, cpu);
+ p = &per_cpu(bnx2i_percpu, blk_mq_rq_cpu(sc->request));
spin_lock(&p->p_work_lock);
if (unlikely(!p->iothread)) {
rc = -EINVAL;
diff --git a/drivers/scsi/csiostor/csio_scsi.c b/drivers/scsi/csiostor/csio_scsi.c
index 8c15b7acb4b7..a95debbea0e4 100644
--- a/drivers/scsi/csiostor/csio_scsi.c
+++ b/drivers/scsi/csiostor/csio_scsi.c
@@ -1780,16 +1780,10 @@ csio_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmnd)
int nsge = 0;
int rv = SCSI_MLQUEUE_HOST_BUSY, nr;
int retval;
- int cpu;
struct csio_scsi_qset *sqset;
struct fc_rport *rport = starget_to_rport(scsi_target(cmnd->device));
- if (!blk_rq_cpu_valid(cmnd->request))
- cpu = smp_processor_id();
- else
- cpu = cmnd->request->cpu;
-
- sqset = &hw->sqset[ln->portid][cpu];
+ sqset = &hw->sqset[ln->portid][blk_mq_rq_cpu(cmnd->request)];
nr = fc_remote_port_chkready(rport);
if (nr) {
diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c
index 6637116529aa..abdc9eac4173 100644
--- a/drivers/scsi/cxlflash/main.c
+++ b/drivers/scsi/cxlflash/main.c
@@ -3088,12 +3088,6 @@ static ssize_t hwq_mode_store(struct device *dev,
return -EINVAL;
}
- if ((mode == HWQ_MODE_TAG) && !shost_use_blk_mq(shost)) {
- dev_info(cfgdev, "SCSI-MQ is not enabled, use a different "
- "HWQ steering mode.\n");
- return -EINVAL;
- }
-
afu->hwq_mode = mode;
return count;
diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c
index 12dc7100bb4c..d7ac498ba35a 100644
--- a/drivers/scsi/device_handler/scsi_dh_alua.c
+++ b/drivers/scsi/device_handler/scsi_dh_alua.c
@@ -1071,28 +1071,29 @@ static void alua_check(struct scsi_device *sdev, bool force)
* Fail I/O to all paths not in state
* active/optimized or active/non-optimized.
*/
-static int alua_prep_fn(struct scsi_device *sdev, struct request *req)
+static blk_status_t alua_prep_fn(struct scsi_device *sdev, struct request *req)
{
struct alua_dh_data *h = sdev->handler_data;
struct alua_port_group *pg;
unsigned char state = SCSI_ACCESS_STATE_OPTIMAL;
- int ret = BLKPREP_OK;
rcu_read_lock();
pg = rcu_dereference(h->pg);
if (pg)
state = pg->state;
rcu_read_unlock();
- if (state == SCSI_ACCESS_STATE_TRANSITIONING)
- ret = BLKPREP_DEFER;
- else if (state != SCSI_ACCESS_STATE_OPTIMAL &&
- state != SCSI_ACCESS_STATE_ACTIVE &&
- state != SCSI_ACCESS_STATE_LBA) {
- ret = BLKPREP_KILL;
+
+ switch (state) {
+ case SCSI_ACCESS_STATE_OPTIMAL:
+ case SCSI_ACCESS_STATE_ACTIVE:
+ case SCSI_ACCESS_STATE_LBA:
+ return BLK_STS_OK;
+ case SCSI_ACCESS_STATE_TRANSITIONING:
+ return BLK_STS_RESOURCE;
+ default:
req->rq_flags |= RQF_QUIET;
+ return BLK_STS_IOERR;
}
- return ret;
-
}
static void alua_rescan(struct scsi_device *sdev)
diff --git a/drivers/scsi/device_handler/scsi_dh_emc.c b/drivers/scsi/device_handler/scsi_dh_emc.c
index 95c47909a58f..bea8e13febb6 100644
--- a/drivers/scsi/device_handler/scsi_dh_emc.c
+++ b/drivers/scsi/device_handler/scsi_dh_emc.c
@@ -341,17 +341,17 @@ static int clariion_check_sense(struct scsi_device *sdev,
return SCSI_RETURN_NOT_HANDLED;
}
-static int clariion_prep_fn(struct scsi_device *sdev, struct request *req)
+static blk_status_t clariion_prep_fn(struct scsi_device *sdev,
+ struct request *req)
{
struct clariion_dh_data *h = sdev->handler_data;
- int ret = BLKPREP_OK;
if (h->lun_state != CLARIION_LUN_OWNED) {
- ret = BLKPREP_KILL;
req->rq_flags |= RQF_QUIET;
+ return BLK_STS_IOERR;
}
- return ret;
+ return BLK_STS_OK;
}
static int clariion_std_inquiry(struct scsi_device *sdev,
diff --git a/drivers/scsi/device_handler/scsi_dh_hp_sw.c b/drivers/scsi/device_handler/scsi_dh_hp_sw.c
index e65a0ebb4b54..80129b033855 100644
--- a/drivers/scsi/device_handler/scsi_dh_hp_sw.c
+++ b/drivers/scsi/device_handler/scsi_dh_hp_sw.c
@@ -172,17 +172,16 @@ retry:
return rc;
}
-static int hp_sw_prep_fn(struct scsi_device *sdev, struct request *req)
+static blk_status_t hp_sw_prep_fn(struct scsi_device *sdev, struct request *req)
{
struct hp_sw_dh_data *h = sdev->handler_data;
- int ret = BLKPREP_OK;
if (h->path_state != HP_SW_PATH_ACTIVE) {
- ret = BLKPREP_KILL;
req->rq_flags |= RQF_QUIET;
+ return BLK_STS_IOERR;
}
- return ret;
+ return BLK_STS_OK;
}
/*
diff --git a/drivers/scsi/device_handler/scsi_dh_rdac.c b/drivers/scsi/device_handler/scsi_dh_rdac.c
index d27fabae8ddd..65f1fe343c64 100644
--- a/drivers/scsi/device_handler/scsi_dh_rdac.c
+++ b/drivers/scsi/device_handler/scsi_dh_rdac.c
@@ -642,17 +642,16 @@ done:
return 0;
}
-static int rdac_prep_fn(struct scsi_device *sdev, struct request *req)
+static blk_status_t rdac_prep_fn(struct scsi_device *sdev, struct request *req)
{
struct rdac_dh_data *h = sdev->handler_data;
- int ret = BLKPREP_OK;
if (h->state != RDAC_STATE_ACTIVE) {
- ret = BLKPREP_KILL;
req->rq_flags |= RQF_QUIET;
+ return BLK_STS_IOERR;
}
- return ret;
+ return BLK_STS_OK;
}
static int rdac_check_sense(struct scsi_device *sdev,
diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c
index 96acfcecd540..cafbcfb85bfa 100644
--- a/drivers/scsi/fnic/fnic_scsi.c
+++ b/drivers/scsi/fnic/fnic_scsi.c
@@ -2274,7 +2274,7 @@ fnic_scsi_host_start_tag(struct fnic *fnic, struct scsi_cmnd *sc)
return SCSI_NO_TAG;
sc->tag = sc->request->tag = dummy->tag;
- sc->request->special = sc;
+ sc->host_scribble = (unsigned char *)dummy;
return dummy->tag;
}
@@ -2286,7 +2286,7 @@ fnic_scsi_host_start_tag(struct fnic *fnic, struct scsi_cmnd *sc)
static inline void
fnic_scsi_host_end_tag(struct fnic *fnic, struct scsi_cmnd *sc)
{
- struct request *dummy = sc->request->special;
+ struct request *dummy = (struct request *)sc->host_scribble;
blk_mq_free_request(dummy);
}
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index ea4b0bb0c1cd..cc71136ba300 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -222,18 +222,9 @@ int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev,
if (error)
goto fail;
- if (shost_use_blk_mq(shost)) {
- error = scsi_mq_setup_tags(shost);
- if (error)
- goto fail;
- } else {
- shost->bqt = blk_init_tags(shost->can_queue,
- shost->hostt->tag_alloc_policy);
- if (!shost->bqt) {
- error = -ENOMEM;
- goto fail;
- }
- }
+ error = scsi_mq_setup_tags(shost);
+ if (error)
+ goto fail;
if (!shost->shost_gendev.parent)
shost->shost_gendev.parent = dev ? dev : &platform_bus;
@@ -309,8 +300,7 @@ int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev,
pm_runtime_disable(&shost->shost_gendev);
pm_runtime_set_suspended(&shost->shost_gendev);
pm_runtime_put_noidle(&shost->shost_gendev);
- if (shost_use_blk_mq(shost))
- scsi_mq_destroy_tags(shost);
+ scsi_mq_destroy_tags(shost);
fail:
return error;
}
@@ -344,13 +334,8 @@ static void scsi_host_dev_release(struct device *dev)
kfree(dev_name(&shost->shost_dev));
}
- if (shost_use_blk_mq(shost)) {
- if (shost->tag_set.tags)
- scsi_mq_destroy_tags(shost);
- } else {
- if (shost->bqt)
- blk_free_tags(shost->bqt);
- }
+ if (shost->tag_set.tags)
+ scsi_mq_destroy_tags(shost);
kfree(shost->shost_data);
@@ -472,8 +457,6 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
else
shost->dma_boundary = 0xffffffff;
- shost->use_blk_mq = scsi_use_blk_mq || shost->hostt->force_blk_mq;
-
device_initialize(&shost->shost_gendev);
dev_set_name(&shost->shost_gendev, "host%d", shost->host_no);
shost->shost_gendev.bus = &scsi_bus_type;
diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
index 4f6cdf53e913..c90b278cc28c 100644
--- a/drivers/scsi/libsas/sas_ata.c
+++ b/drivers/scsi/libsas/sas_ata.c
@@ -601,12 +601,7 @@ void sas_ata_task_abort(struct sas_task *task)
/* Bounce SCSI-initiated commands to the SCSI EH */
if (qc->scsicmd) {
- struct request_queue *q = qc->scsicmd->device->request_queue;
- unsigned long flags;
-
- spin_lock_irqsave(q->queue_lock, flags);
blk_abort_request(qc->scsicmd->request);
- spin_unlock_irqrestore(q->queue_lock, flags);
return;
}
diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c
index 33229348dcb6..af085432c5fe 100644
--- a/drivers/scsi/libsas/sas_scsi_host.c
+++ b/drivers/scsi/libsas/sas_scsi_host.c
@@ -930,16 +930,10 @@ void sas_task_abort(struct sas_task *task)
return;
}
- if (dev_is_sata(task->dev)) {
+ if (dev_is_sata(task->dev))
sas_ata_task_abort(task);
- } else {
- struct request_queue *q = sc->device->request_queue;
- unsigned long flags;
-
- spin_lock_irqsave(q->queue_lock, flags);
+ else
blk_abort_request(sc->request);
- spin_unlock_irqrestore(q->queue_lock, flags);
- }
}
void sas_target_destroy(struct scsi_target *starget)
diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c
index 4fa6703a9ec9..baed2b891efb 100644
--- a/drivers/scsi/lpfc/lpfc_scsi.c
+++ b/drivers/scsi/lpfc/lpfc_scsi.c
@@ -3914,7 +3914,7 @@ int lpfc_sli4_scmd_to_wqidx_distr(struct lpfc_hba *phba,
uint32_t tag;
uint16_t hwq;
- if (cmnd && shost_use_blk_mq(cmnd->device->host)) {
+ if (cmnd) {
tag = blk_mq_unique_tag(cmnd->request);
hwq = blk_mq_unique_tag_to_hwq(tag);
diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index e19fa883376f..60cf7c5eb880 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -506,11 +506,11 @@ static void osd_request_async_done(struct request *req, blk_status_t error)
_set_error_resid(or, req, error);
if (req->next_rq) {
- __blk_put_request(req->q, req->next_rq);
+ blk_put_request(req->next_rq);
req->next_rq = NULL;
}
- __blk_put_request(req->q, req);
+ blk_put_request(req);
or->request = NULL;
or->in.req = NULL;
or->out.req = NULL;
diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c
index 7a1a1edde35d..664c1238a87f 100644
--- a/drivers/scsi/osst.c
+++ b/drivers/scsi/osst.c
@@ -341,7 +341,7 @@ static void osst_end_async(struct request *req, blk_status_t status)
blk_rq_unmap_user(SRpnt->bio);
}
- __blk_put_request(req->q, req);
+ blk_put_request(req);
}
/* osst_request memory management */
diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c
index 105b0e4d7818..311eb22068e1 100644
--- a/drivers/scsi/qedi/qedi_main.c
+++ b/drivers/scsi/qedi/qedi_main.c
@@ -644,8 +644,7 @@ static struct qedi_ctx *qedi_host_alloc(struct pci_dev *pdev)
qedi->max_active_conns = ISCSI_MAX_SESS_PER_HBA;
qedi->max_sqes = QEDI_SQ_SIZE;
- if (shost_use_blk_mq(shost))
- shost->nr_hw_queues = MIN_NUM_CPUS_MSIX(qedi);
+ shost->nr_hw_queues = MIN_NUM_CPUS_MSIX(qedi);
pci_set_drvdata(pdev, qedi);
diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c
index 7e78e7eff783..fccc733145fc 100644
--- a/drivers/scsi/qla2xxx/qla_nvme.c
+++ b/drivers/scsi/qla2xxx/qla_nvme.c
@@ -272,17 +272,6 @@ static void qla_nvme_fcp_abort(struct nvme_fc_local_port *lport,
schedule_work(&priv->abort_work);
}
-static void qla_nvme_poll(struct nvme_fc_local_port *lport, void *hw_queue_handle)
-{
- struct qla_qpair *qpair = hw_queue_handle;
- unsigned long flags;
- struct scsi_qla_host *vha = lport->private;
-
- spin_lock_irqsave(&qpair->qp_lock, flags);
- qla24xx_process_response_queue(vha, qpair->rsp);
- spin_unlock_irqrestore(&qpair->qp_lock, flags);
-}
-
static inline int qla2x00_start_nvme_mq(srb_t *sp)
{
unsigned long flags;
@@ -578,7 +567,6 @@ static struct nvme_fc_port_template qla_nvme_fc_transport = {
.ls_abort = qla_nvme_ls_abort,
.fcp_io = qla_nvme_post_cmd,
.fcp_abort = qla_nvme_fcp_abort,
- .poll_queue = qla_nvme_poll,
.max_hw_queues = 8,
.max_sgl_segments = 128,
.max_dif_sgl_segments = 64,
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index d0ecc729a90a..f92196ec5489 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -857,13 +857,9 @@ qla2xxx_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
}
if (ha->mqenable) {
- if (shost_use_blk_mq(vha->host)) {
- tag = blk_mq_unique_tag(cmd->request);
- hwq = blk_mq_unique_tag_to_hwq(tag);
- qpair = ha->queue_pair_map[hwq];
- } else if (vha->vp_idx && vha->qpair) {
- qpair = vha->qpair;
- }
+ tag = blk_mq_unique_tag(cmd->request);
+ hwq = blk_mq_unique_tag_to_hwq(tag);
+ qpair = ha->queue_pair_map[hwq];
if (qpair)
return qla2xxx_mqueuecommand(host, cmd, qpair);
@@ -1464,7 +1460,7 @@ __qla2xxx_eh_generic_reset(char *name, enum nexus_wait_type type,
goto eh_reset_failed;
}
err = 2;
- if (do_reset(fcport, cmd->device->lun, cmd->request->cpu + 1)
+ if (do_reset(fcport, cmd->device->lun, blk_mq_rq_cpu(cmd->request) + 1)
!= QLA_SUCCESS) {
ql_log(ql_log_warn, vha, 0x800c,
"do_reset failed for cmd=%p.\n", cmd);
@@ -3159,7 +3155,7 @@ qla2x00_probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
goto probe_failed;
}
- if (ha->mqenable && shost_use_blk_mq(host)) {
+ if (ha->mqenable) {
/* number of hardware queues supported by blk/scsi-mq*/
host->nr_hw_queues = ha->max_qpairs;
@@ -3271,25 +3267,17 @@ qla2x00_probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
base_vha->mgmt_svr_loop_id, host->sg_tablesize);
if (ha->mqenable) {
- bool mq = false;
bool startit = false;
- if (QLA_TGT_MODE_ENABLED()) {
- mq = true;
+ if (QLA_TGT_MODE_ENABLED())
startit = false;
- }
- if ((ql2x_ini_mode == QLA2XXX_INI_MODE_ENABLED) &&
- shost_use_blk_mq(host)) {
- mq = true;
+ if (ql2x_ini_mode == QLA2XXX_INI_MODE_ENABLED)
startit = true;
- }
- if (mq) {
- /* Create start of day qpairs for Block MQ */
- for (i = 0; i < ha->max_qpairs; i++)
- qla2xxx_create_qpair(base_vha, 5, 0, startit);
- }
+ /* Create start of day qpairs for Block MQ */
+ for (i = 0; i < ha->max_qpairs; i++)
+ qla2xxx_create_qpair(base_vha, 5, 0, startit);
}
if (ha->flags.running_gold_fw)
@@ -6952,11 +6940,12 @@ static int qla2xxx_map_queues(struct Scsi_Host *shost)
{
int rc;
scsi_qla_host_t *vha = (scsi_qla_host_t *)shost->hostdata;
+ struct blk_mq_queue_map *qmap = &shost->tag_set.map[0];
if (USER_CTRL_IRQ(vha->hw))
- rc = blk_mq_map_queues(&shost->tag_set);
+ rc = blk_mq_map_queues(qmap);
else
- rc = blk_mq_pci_map_queues(&shost->tag_set, vha->hw->pdev, 0);
+ rc = blk_mq_pci_map_queues(qmap, vha->hw->pdev, 0);
return rc;
}
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index fc1356d101b0..7675ff0ca2ea 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -780,11 +780,8 @@ MODULE_LICENSE("GPL");
module_param(scsi_logging_level, int, S_IRUGO|S_IWUSR);
MODULE_PARM_DESC(scsi_logging_level, "a bit mask of logging levels");
-#ifdef CONFIG_SCSI_MQ_DEFAULT
+/* This should go away in the future, it doesn't do anything anymore */
bool scsi_use_blk_mq = true;
-#else
-bool scsi_use_blk_mq = false;
-#endif
module_param_named(use_blk_mq, scsi_use_blk_mq, bool, S_IWUSR | S_IRUGO);
static int __init init_scsi(void)
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index 60bcc6df97a9..4740f1e9dd17 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -5881,8 +5881,7 @@ static int sdebug_driver_probe(struct device *dev)
}
/* Decide whether to tell scsi subsystem that we want mq */
/* Following should give the same answer for each host */
- if (shost_use_blk_mq(hpnt))
- hpnt->nr_hw_queues = submit_queues;
+ hpnt->nr_hw_queues = submit_queues;
sdbg_host->shost = hpnt;
*((struct sdebug_host_info **)hpnt->hostdata) = sdbg_host;
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index c736d61b1648..16eef068e9e9 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -297,19 +297,19 @@ enum blk_eh_timer_return scsi_times_out(struct request *req)
if (rtn == BLK_EH_DONE) {
/*
- * For blk-mq, we must set the request state to complete now
- * before sending the request to the scsi error handler. This
- * will prevent a use-after-free in the event the LLD manages
- * to complete the request before the error handler finishes
- * processing this timed out request.
+ * Set the command to complete first in order to prevent a real
+ * completion from releasing the command while error handling
+ * is using it. If the command was already completed, then the
+ * lower level driver beat the timeout handler, and it is safe
+ * to return without escalating error recovery.
*
- * If the request was already completed, then the LLD beat the
- * time out handler from transferring the request to the scsi
- * error handler. In that case we can return immediately as no
- * further action is required.
+ * If timeout handling lost the race to a real completion, the
+ * block layer may ignore that due to a fake timeout injection,
+ * so return RESET_TIMER to allow error handling another shot
+ * at this command.
*/
- if (req->q->mq_ops && !blk_mq_mark_complete(req))
- return rtn;
+ if (test_and_set_bit(SCMD_STATE_COMPLETE, &scmd->state))
+ return BLK_EH_RESET_TIMER;
if (scsi_abort_command(scmd) != SUCCESS) {
set_host_byte(scmd, DID_TIME_OUT);
scsi_eh_scmd_add(scmd);
@@ -1932,7 +1932,7 @@ maybe_retry:
static void eh_lock_door_done(struct request *req, blk_status_t status)
{
- __blk_put_request(req->q, req);
+ blk_put_request(req);
}
/**
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index fa6e0c3b3aa6..0dbf25512778 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -168,8 +168,6 @@ static void scsi_mq_requeue_cmd(struct scsi_cmnd *cmd)
static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, bool unbusy)
{
struct scsi_device *device = cmd->device;
- struct request_queue *q = device->request_queue;
- unsigned long flags;
SCSI_LOG_MLQUEUE(1, scmd_printk(KERN_INFO, cmd,
"Inserting command %p into mlqueue\n", cmd));
@@ -190,26 +188,20 @@ static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, bool unbusy)
* before blk_cleanup_queue() finishes.
*/
cmd->result = 0;
- if (q->mq_ops) {
- /*
- * Before a SCSI command is dispatched,
- * get_device(&sdev->sdev_gendev) is called and the host,
- * target and device busy counters are increased. Since
- * requeuing a request causes these actions to be repeated and
- * since scsi_device_unbusy() has already been called,
- * put_device(&device->sdev_gendev) must still be called. Call
- * put_device() after blk_mq_requeue_request() to avoid that
- * removal of the SCSI device can start before requeueing has
- * happened.
- */
- blk_mq_requeue_request(cmd->request, true);
- put_device(&device->sdev_gendev);
- return;
- }
- spin_lock_irqsave(q->queue_lock, flags);
- blk_requeue_request(q, cmd->request);
- kblockd_schedule_work(&device->requeue_work);
- spin_unlock_irqrestore(q->queue_lock, flags);
+
+ /*
+ * Before a SCSI command is dispatched,
+ * get_device(&sdev->sdev_gendev) is called and the host,
+ * target and device busy counters are increased. Since
+ * requeuing a request causes these actions to be repeated and
+ * since scsi_device_unbusy() has already been called,
+ * put_device(&device->sdev_gendev) must still be called. Call
+ * put_device() after blk_mq_requeue_request() to avoid that
+ * removal of the SCSI device can start before requeueing has
+ * happened.
+ */
+ blk_mq_requeue_request(cmd->request, true);
+ put_device(&device->sdev_gendev);
}
/*
@@ -370,10 +362,7 @@ void scsi_device_unbusy(struct scsi_device *sdev)
static void scsi_kick_queue(struct request_queue *q)
{
- if (q->mq_ops)
- blk_mq_run_hw_queues(q, false);
- else
- blk_run_queue(q);
+ blk_mq_run_hw_queues(q, false);
}
/*
@@ -534,10 +523,7 @@ static void scsi_run_queue(struct request_queue *q)
if (!list_empty(&sdev->host->starved_list))
scsi_starved_list_run(sdev->host);
- if (q->mq_ops)
- blk_mq_run_hw_queues(q, false);
- else
- blk_run_queue(q);
+ blk_mq_run_hw_queues(q, false);
}
void scsi_requeue_run_queue(struct work_struct *work)
@@ -550,42 +536,6 @@ void scsi_requeue_run_queue(struct work_struct *work)
scsi_run_queue(q);
}
-/*
- * Function: scsi_requeue_command()
- *
- * Purpose: Handle post-processing of completed commands.
- *
- * Arguments: q - queue to operate on
- * cmd - command that may need to be requeued.
- *
- * Returns: Nothing
- *
- * Notes: After command completion, there may be blocks left
- * over which weren't finished by the previous command
- * this can be for a number of reasons - the main one is
- * I/O errors in the middle of the request, in which case
- * we need to request the blocks that come after the bad
- * sector.
- * Notes: Upon return, cmd is a stale pointer.
- */
-static void scsi_requeue_command(struct request_queue *q, struct scsi_cmnd *cmd)
-{
- struct scsi_device *sdev = cmd->device;
- struct request *req = cmd->request;
- unsigned long flags;
-
- spin_lock_irqsave(q->queue_lock, flags);
- blk_unprep_request(req);
- req->special = NULL;
- scsi_put_command(cmd);
- blk_requeue_request(q, req);
- spin_unlock_irqrestore(q->queue_lock, flags);
-
- scsi_run_queue(q);
-
- put_device(&sdev->sdev_gendev);
-}
-
void scsi_run_host_queues(struct Scsi_Host *shost)
{
struct scsi_device *sdev;
@@ -626,42 +576,6 @@ static void scsi_mq_uninit_cmd(struct scsi_cmnd *cmd)
scsi_del_cmd_from_list(cmd);
}
-/*
- * Function: scsi_release_buffers()
- *
- * Purpose: Free resources allocate for a scsi_command.
- *
- * Arguments: cmd - command that we are bailing.
- *
- * Lock status: Assumed that no lock is held upon entry.
- *
- * Returns: Nothing
- *
- * Notes: In the event that an upper level driver rejects a
- * command, we must release resources allocated during
- * the __init_io() function. Primarily this would involve
- * the scatter-gather table.
- */
-static void scsi_release_buffers(struct scsi_cmnd *cmd)
-{
- if (cmd->sdb.table.nents)
- sg_free_table_chained(&cmd->sdb.table, false);
-
- memset(&cmd->sdb, 0, sizeof(cmd->sdb));
-
- if (scsi_prot_sg_count(cmd))
- sg_free_table_chained(&cmd->prot_sdb->table, false);
-}
-
-static void scsi_release_bidi_buffers(struct scsi_cmnd *cmd)
-{
- struct scsi_data_buffer *bidi_sdb = cmd->request->next_rq->special;
-
- sg_free_table_chained(&bidi_sdb->table, false);
- kmem_cache_free(scsi_sdb_cache, bidi_sdb);
- cmd->request->next_rq->special = NULL;
-}
-
/* Returns false when no more bytes to process, true if there are more */
static bool scsi_end_request(struct request *req, blk_status_t error,
unsigned int bytes, unsigned int bidi_bytes)
@@ -687,46 +601,30 @@ static bool scsi_end_request(struct request *req, blk_status_t error,
destroy_rcu_head(&cmd->rcu);
}
- if (req->mq_ctx) {
- /*
- * In the MQ case the command gets freed by __blk_mq_end_request,
- * so we have to do all cleanup that depends on it earlier.
- *
- * We also can't kick the queues from irq context, so we
- * will have to defer it to a workqueue.
- */
- scsi_mq_uninit_cmd(cmd);
-
- /*
- * queue is still alive, so grab the ref for preventing it
- * from being cleaned up during running queue.
- */
- percpu_ref_get(&q->q_usage_counter);
-
- __blk_mq_end_request(req, error);
-
- if (scsi_target(sdev)->single_lun ||
- !list_empty(&sdev->host->starved_list))
- kblockd_schedule_work(&sdev->requeue_work);
- else
- blk_mq_run_hw_queues(q, true);
-
- percpu_ref_put(&q->q_usage_counter);
- } else {
- unsigned long flags;
+ /*
+ * In the MQ case the command gets freed by __blk_mq_end_request,
+ * so we have to do all cleanup that depends on it earlier.
+ *
+ * We also can't kick the queues from irq context, so we
+ * will have to defer it to a workqueue.
+ */
+ scsi_mq_uninit_cmd(cmd);
- if (bidi_bytes)
- scsi_release_bidi_buffers(cmd);
- scsi_release_buffers(cmd);
- scsi_put_command(cmd);
+ /*
+ * queue is still alive, so grab the ref for preventing it
+ * from being cleaned up during running queue.
+ */
+ percpu_ref_get(&q->q_usage_counter);
- spin_lock_irqsave(q->queue_lock, flags);
- blk_finish_request(req, error);
- spin_unlock_irqrestore(q->queue_lock, flags);
+ __blk_mq_end_request(req, error);
- scsi_run_queue(q);
- }
+ if (scsi_target(sdev)->single_lun ||
+ !list_empty(&sdev->host->starved_list))
+ kblockd_schedule_work(&sdev->requeue_work);
+ else
+ blk_mq_run_hw_queues(q, true);
+ percpu_ref_put(&q->q_usage_counter);
put_device(&sdev->sdev_gendev);
return false;
}
@@ -774,13 +672,7 @@ static void scsi_io_completion_reprep(struct scsi_cmnd *cmd,
struct request_queue *q)
{
/* A new command will be prepared and issued. */
- if (q->mq_ops) {
- scsi_mq_requeue_cmd(cmd);
- } else {
- /* Unprep request and put it back at head of the queue. */
- scsi_release_buffers(cmd);
- scsi_requeue_command(q, cmd);
- }
+ scsi_mq_requeue_cmd(cmd);
}
/* Helper for scsi_io_completion() when special action required. */
@@ -1120,7 +1012,8 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
scsi_io_completion_action(cmd, result);
}
-static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb)
+static blk_status_t scsi_init_sgtable(struct request *req,
+ struct scsi_data_buffer *sdb)
{
int count;
@@ -1129,7 +1022,7 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb)
*/
if (unlikely(sg_alloc_table_chained(&sdb->table,
blk_rq_nr_phys_segments(req), sdb->table.sgl)))
- return BLKPREP_DEFER;
+ return BLK_STS_RESOURCE;
/*
* Next, walk the list, and fill in the addresses and sizes of
@@ -1139,7 +1032,7 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb)
BUG_ON(count > sdb->table.nents);
sdb->table.nents = count;
sdb->length = blk_rq_payload_bytes(req);
- return BLKPREP_OK;
+ return BLK_STS_OK;
}
/*
@@ -1149,62 +1042,48 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb)
*
* Arguments: cmd - Command descriptor we wish to initialize
*
- * Returns: 0 on success
- * BLKPREP_DEFER if the failure is retryable
- * BLKPREP_KILL if the failure is fatal
+ * Returns: BLK_STS_OK on success
+ * BLK_STS_RESOURCE if the failure is retryable
+ * BLK_STS_IOERR if the failure is fatal
*/
-int scsi_init_io(struct scsi_cmnd *cmd)
+blk_status_t scsi_init_io(struct scsi_cmnd *cmd)
{
- struct scsi_device *sdev = cmd->device;
struct request *rq = cmd->request;
- bool is_mq = (rq->mq_ctx != NULL);
- int error = BLKPREP_KILL;
+ blk_status_t ret;
if (WARN_ON_ONCE(!blk_rq_nr_phys_segments(rq)))
- goto err_exit;
+ return BLK_STS_IOERR;
- error = scsi_init_sgtable(rq, &cmd->sdb);
- if (error)
- goto err_exit;
+ ret = scsi_init_sgtable(rq, &cmd->sdb);
+ if (ret)
+ return ret;
if (blk_bidi_rq(rq)) {
- if (!rq->q->mq_ops) {
- struct scsi_data_buffer *bidi_sdb =
- kmem_cache_zalloc(scsi_sdb_cache, GFP_ATOMIC);
- if (!bidi_sdb) {
- error = BLKPREP_DEFER;
- goto err_exit;
- }
-
- rq->next_rq->special = bidi_sdb;
- }
-
- error = scsi_init_sgtable(rq->next_rq, rq->next_rq->special);
- if (error)
- goto err_exit;
+ ret = scsi_init_sgtable(rq->next_rq, rq->next_rq->special);
+ if (ret)
+ goto out_free_sgtables;
}
if (blk_integrity_rq(rq)) {
struct scsi_data_buffer *prot_sdb = cmd->prot_sdb;
int ivecs, count;
- if (prot_sdb == NULL) {
+ if (WARN_ON_ONCE(!prot_sdb)) {
/*
* This can happen if someone (e.g. multipath)
* queues a command to a device on an adapter
* that does not support DIX.
*/
- WARN_ON_ONCE(1);
- error = BLKPREP_KILL;
- goto err_exit;
+ ret = BLK_STS_IOERR;
+ goto out_free_sgtables;
}
ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio);
if (sg_alloc_table_chained(&prot_sdb->table, ivecs,
prot_sdb->table.sgl)) {
- error = BLKPREP_DEFER;
- goto err_exit;
+ ret = BLK_STS_RESOURCE;
+ goto out_free_sgtables;
}
count = blk_rq_map_integrity_sg(rq->q, rq->bio,
@@ -1216,17 +1095,10 @@ int scsi_init_io(struct scsi_cmnd *cmd)
cmd->prot_sdb->table.nents = count;
}
- return BLKPREP_OK;
-err_exit:
- if (is_mq) {
- scsi_mq_free_sgtables(cmd);
- } else {
- scsi_release_buffers(cmd);
- cmd->request->special = NULL;
- scsi_put_command(cmd);
- put_device(&sdev->sdev_gendev);
- }
- return error;
+ return BLK_STS_OK;
+out_free_sgtables:
+ scsi_mq_free_sgtables(cmd);
+ return ret;
}
EXPORT_SYMBOL(scsi_init_io);
@@ -1312,7 +1184,8 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
scsi_add_cmd_to_list(cmd);
}
-static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req)
+static blk_status_t scsi_setup_scsi_cmnd(struct scsi_device *sdev,
+ struct request *req)
{
struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
@@ -1323,8 +1196,8 @@ static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req)
* submit a request without an attached bio.
*/
if (req->bio) {
- int ret = scsi_init_io(cmd);
- if (unlikely(ret))
+ blk_status_t ret = scsi_init_io(cmd);
+ if (unlikely(ret != BLK_STS_OK))
return ret;
} else {
BUG_ON(blk_rq_bytes(req));
@@ -1336,20 +1209,21 @@ static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req)
cmd->cmnd = scsi_req(req)->cmd;
cmd->transfersize = blk_rq_bytes(req);
cmd->allowed = scsi_req(req)->retries;
- return BLKPREP_OK;
+ return BLK_STS_OK;
}
/*
* Setup a normal block command. These are simple request from filesystems
* that still need to be translated to SCSI CDBs from the ULD.
*/
-static int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req)
+static blk_status_t scsi_setup_fs_cmnd(struct scsi_device *sdev,
+ struct request *req)
{
struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
if (unlikely(sdev->handler && sdev->handler->prep_fn)) {
- int ret = sdev->handler->prep_fn(sdev, req);
- if (ret != BLKPREP_OK)
+ blk_status_t ret = sdev->handler->prep_fn(sdev, req);
+ if (ret != BLK_STS_OK)
return ret;
}
@@ -1358,7 +1232,8 @@ static int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req)
return scsi_cmd_to_driver(cmd)->init_command(cmd);
}
-static int scsi_setup_cmnd(struct scsi_device *sdev, struct request *req)
+static blk_status_t scsi_setup_cmnd(struct scsi_device *sdev,
+ struct request *req)
{
struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
@@ -1375,129 +1250,48 @@ static int scsi_setup_cmnd(struct scsi_device *sdev, struct request *req)
return scsi_setup_fs_cmnd(sdev, req);
}
-static int
+static blk_status_t
scsi_prep_state_check(struct scsi_device *sdev, struct request *req)
{
- int ret = BLKPREP_OK;
-
- /*
- * If the device is not in running state we will reject some
- * or all commands.
- */
- if (unlikely(sdev->sdev_state != SDEV_RUNNING)) {
- switch (sdev->sdev_state) {
- case SDEV_OFFLINE:
- case SDEV_TRANSPORT_OFFLINE:
- /*
- * If the device is offline we refuse to process any
- * commands. The device must be brought online
- * before trying any recovery commands.
- */
- sdev_printk(KERN_ERR, sdev,
- "rejecting I/O to offline device\n");
- ret = BLKPREP_KILL;
- break;
- case SDEV_DEL:
- /*
- * If the device is fully deleted, we refuse to
- * process any commands as well.
- */
- sdev_printk(KERN_ERR, sdev,
- "rejecting I/O to dead device\n");
- ret = BLKPREP_KILL;
- break;
- case SDEV_BLOCK:
- case SDEV_CREATED_BLOCK:
- ret = BLKPREP_DEFER;
- break;
- case SDEV_QUIESCE:
- /*
- * If the devices is blocked we defer normal commands.
- */
- if (req && !(req->rq_flags & RQF_PREEMPT))
- ret = BLKPREP_DEFER;
- break;
- default:
- /*
- * For any other not fully online state we only allow
- * special commands. In particular any user initiated
- * command is not allowed.
- */
- if (req && !(req->rq_flags & RQF_PREEMPT))
- ret = BLKPREP_KILL;
- break;
- }
- }
- return ret;
-}
-
-static int
-scsi_prep_return(struct request_queue *q, struct request *req, int ret)
-{
- struct scsi_device *sdev = q->queuedata;
-
- switch (ret) {
- case BLKPREP_KILL:
- case BLKPREP_INVALID:
- scsi_req(req)->result = DID_NO_CONNECT << 16;
- /* release the command and kill it */
- if (req->special) {
- struct scsi_cmnd *cmd = req->special;
- scsi_release_buffers(cmd);
- scsi_put_command(cmd);
- put_device(&sdev->sdev_gendev);
- req->special = NULL;
- }
- break;
- case BLKPREP_DEFER:
+ switch (sdev->sdev_state) {
+ case SDEV_OFFLINE:
+ case SDEV_TRANSPORT_OFFLINE:
/*
- * If we defer, the blk_peek_request() returns NULL, but the
- * queue must be restarted, so we schedule a callback to happen
- * shortly.
+ * If the device is offline we refuse to process any
+ * commands. The device must be brought online
+ * before trying any recovery commands.
*/
- if (atomic_read(&sdev->device_busy) == 0)
- blk_delay_queue(q, SCSI_QUEUE_DELAY);
- break;
+ sdev_printk(KERN_ERR, sdev,
+ "rejecting I/O to offline device\n");
+ return BLK_STS_IOERR;
+ case SDEV_DEL:
+ /*
+ * If the device is fully deleted, we refuse to
+ * process any commands as well.
+ */
+ sdev_printk(KERN_ERR, sdev,
+ "rejecting I/O to dead device\n");
+ return BLK_STS_IOERR;
+ case SDEV_BLOCK:
+ case SDEV_CREATED_BLOCK:
+ return BLK_STS_RESOURCE;
+ case SDEV_QUIESCE:
+ /*
+ * If the devices is blocked we defer normal commands.
+ */
+ if (req && !(req->rq_flags & RQF_PREEMPT))
+ return BLK_STS_RESOURCE;
+ return BLK_STS_OK;
default:
- req->rq_flags |= RQF_DONTPREP;
- }
-
- return ret;
-}
-
-static int scsi_prep_fn(struct request_queue *q, struct request *req)
-{
- struct scsi_device *sdev = q->queuedata;
- struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
- int ret;
-
- ret = scsi_prep_state_check(sdev, req);
- if (ret != BLKPREP_OK)
- goto out;
-
- if (!req->special) {
- /* Bail if we can't get a reference to the device */
- if (unlikely(!get_device(&sdev->sdev_gendev))) {
- ret = BLKPREP_DEFER;
- goto out;
- }
-
- scsi_init_command(sdev, cmd);
- req->special = cmd;
+ /*
+ * For any other not fully online state we only allow
+ * special commands. In particular any user initiated
+ * command is not allowed.
+ */
+ if (req && !(req->rq_flags & RQF_PREEMPT))
+ return BLK_STS_IOERR;
+ return BLK_STS_OK;
}
-
- cmd->tag = req->tag;
- cmd->request = req;
- cmd->prot_op = SCSI_PROT_NORMAL;
-
- ret = scsi_setup_cmnd(sdev, req);
-out:
- return scsi_prep_return(q, req, ret);
-}
-
-static void scsi_unprep_fn(struct request_queue *q, struct request *req)
-{
- scsi_uninit_cmd(blk_mq_rq_to_pdu(req));
}
/*
@@ -1519,14 +1313,8 @@ static inline int scsi_dev_queue_ready(struct request_queue *q,
/*
* unblock after device_blocked iterates to zero
*/
- if (atomic_dec_return(&sdev->device_blocked) > 0) {
- /*
- * For the MQ case we take care of this in the caller.
- */
- if (!q->mq_ops)
- blk_delay_queue(q, SCSI_QUEUE_DELAY);
+ if (atomic_dec_return(&sdev->device_blocked) > 0)
goto out_dec;
- }
SCSI_LOG_MLQUEUE(3, sdev_printk(KERN_INFO, sdev,
"unblocking device at zero depth\n"));
}
@@ -1661,13 +1449,13 @@ out_dec:
* needs to return 'not busy'. Otherwise, request stacking drivers
* may hold requests forever.
*/
-static int scsi_lld_busy(struct request_queue *q)
+static bool scsi_mq_lld_busy(struct request_queue *q)
{
struct scsi_device *sdev = q->queuedata;
struct Scsi_Host *shost;
if (blk_queue_dying(q))
- return 0;
+ return false;
shost = sdev->host;
@@ -1678,43 +1466,9 @@ static int scsi_lld_busy(struct request_queue *q)
* in SCSI layer.
*/
if (scsi_host_in_recovery(shost) || scsi_device_is_busy(sdev))
- return 1;
-
- return 0;
-}
-
-/*
- * Kill a request for a dead device
- */
-static void scsi_kill_request(struct request *req, struct request_queue *q)
-{
- struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
- struct scsi_device *sdev;
- struct scsi_target *starget;
- struct Scsi_Host *shost;
-
- blk_start_request(req);
-
- scmd_printk(KERN_INFO, cmd, "killing request\n");
-
- sdev = cmd->device;
- starget = scsi_target(sdev);
- shost = sdev->host;
- scsi_init_cmd_errh(cmd);
- cmd->result = DID_NO_CONNECT << 16;
- atomic_inc(&cmd->device->iorequest_cnt);
-
- /*
- * SCSI request completion path will do scsi_device_unbusy(),
- * bump busy counts. To bump the counters, we need to dance
- * with the locks as normal issue path does.
- */
- atomic_inc(&sdev->device_busy);
- atomic_inc(&shost->host_busy);
- if (starget->can_queue > 0)
- atomic_inc(&starget->target_busy);
+ return true;
- blk_complete_request(req);
+ return false;
}
static void scsi_softirq_done(struct request *rq)
@@ -1837,170 +1591,6 @@ static int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
return 0;
}
-/**
- * scsi_done - Invoke completion on finished SCSI command.
- * @cmd: The SCSI Command for which a low-level device driver (LLDD) gives
- * ownership back to SCSI Core -- i.e. the LLDD has finished with it.
- *
- * Description: This function is the mid-level's (SCSI Core) interrupt routine,
- * which regains ownership of the SCSI command (de facto) from a LLDD, and
- * calls blk_complete_request() for further processing.
- *
- * This function is interrupt context safe.
- */
-static void scsi_done(struct scsi_cmnd *cmd)
-{
- trace_scsi_dispatch_cmd_done(cmd);
- blk_complete_request(cmd->request);
-}
-
-/*
- * Function: scsi_request_fn()
- *
- * Purpose: Main strategy routine for SCSI.
- *
- * Arguments: q - Pointer to actual queue.
- *
- * Returns: Nothing
- *
- * Lock status: request queue lock assumed to be held when called.
- *
- * Note: See sd_zbc.c sd_zbc_write_lock_zone() for write order
- * protection for ZBC disks.
- */
-static void scsi_request_fn(struct request_queue *q)
- __releases(q->queue_lock)
- __acquires(q->queue_lock)
-{
- struct scsi_device *sdev = q->queuedata;
- struct Scsi_Host *shost;
- struct scsi_cmnd *cmd;
- struct request *req;
-
- /*
- * To start with, we keep looping until the queue is empty, or until
- * the host is no longer able to accept any more requests.
- */
- shost = sdev->host;
- for (;;) {
- int rtn;
- /*
- * get next queueable request. We do this early to make sure
- * that the request is fully prepared even if we cannot
- * accept it.
- */
- req = blk_peek_request(q);
- if (!req)
- break;
-
- if (unlikely(!scsi_device_online(sdev))) {
- sdev_printk(KERN_ERR, sdev,
- "rejecting I/O to offline device\n");
- scsi_kill_request(req, q);
- continue;
- }
-
- if (!scsi_dev_queue_ready(q, sdev))
- break;
-
- /*
- * Remove the request from the request list.
- */
- if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req)))
- blk_start_request(req);
-
- spin_unlock_irq(q->queue_lock);
- cmd = blk_mq_rq_to_pdu(req);
- if (cmd != req->special) {
- printk(KERN_CRIT "impossible request in %s.\n"
- "please mail a stack trace to "
- "linux-scsi@vger.kernel.org\n",
- __func__);
- blk_dump_rq_flags(req, "foo");
- BUG();
- }
-
- /*
- * We hit this when the driver is using a host wide
- * tag map. For device level tag maps the queue_depth check
- * in the device ready fn would prevent us from trying
- * to allocate a tag. Since the map is a shared host resource
- * we add the dev to the starved list so it eventually gets
- * a run when a tag is freed.
- */
- if (blk_queue_tagged(q) && !(req->rq_flags & RQF_QUEUED)) {
- spin_lock_irq(shost->host_lock);
- if (list_empty(&sdev->starved_entry))
- list_add_tail(&sdev->starved_entry,
- &shost->starved_list);
- spin_unlock_irq(shost->host_lock);
- goto not_ready;
- }
-
- if (!scsi_target_queue_ready(shost, sdev))
- goto not_ready;
-
- if (!scsi_host_queue_ready(q, shost, sdev))
- goto host_not_ready;
-
- if (sdev->simple_tags)
- cmd->flags |= SCMD_TAGGED;
- else
- cmd->flags &= ~SCMD_TAGGED;
-
- /*
- * Finally, initialize any error handling parameters, and set up
- * the timers for timeouts.
- */
- scsi_init_cmd_errh(cmd);
-
- /*
- * Dispatch the command to the low-level driver.
- */
- cmd->scsi_done = scsi_done;
- rtn = scsi_dispatch_cmd(cmd);
- if (rtn) {
- scsi_queue_insert(cmd, rtn);
- spin_lock_irq(q->queue_lock);
- goto out_delay;
- }
- spin_lock_irq(q->queue_lock);
- }
-
- return;
-
- host_not_ready:
- if (scsi_target(sdev)->can_queue > 0)
- atomic_dec(&scsi_target(sdev)->target_busy);
- not_ready:
- /*
- * lock q, handle tag, requeue req, and decrement device_busy. We
- * must return with queue_lock held.
- *
- * Decrementing device_busy without checking it is OK, as all such
- * cases (host limits or settings) should run the queue at some
- * later time.
- */
- spin_lock_irq(q->queue_lock);
- blk_requeue_request(q, req);
- atomic_dec(&sdev->device_busy);
-out_delay:
- if (!atomic_read(&sdev->device_busy) && !scsi_device_blocked(sdev))
- blk_delay_queue(q, SCSI_QUEUE_DELAY);
-}
-
-static inline blk_status_t prep_to_mq(int ret)
-{
- switch (ret) {
- case BLKPREP_OK:
- return BLK_STS_OK;
- case BLKPREP_DEFER:
- return BLK_STS_RESOURCE;
- default:
- return BLK_STS_IOERR;
- }
-}
-
/* Size in bytes of the sg-list stored in the scsi-mq command-private data. */
static unsigned int scsi_mq_sgl_size(struct Scsi_Host *shost)
{
@@ -2008,7 +1598,7 @@ static unsigned int scsi_mq_sgl_size(struct Scsi_Host *shost)
sizeof(struct scatterlist);
}
-static int scsi_mq_prep_fn(struct request *req)
+static blk_status_t scsi_mq_prep_fn(struct request *req)
{
struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
struct scsi_device *sdev = req->q->queuedata;
@@ -2052,8 +1642,18 @@ static int scsi_mq_prep_fn(struct request *req)
static void scsi_mq_done(struct scsi_cmnd *cmd)
{
+ if (unlikely(test_and_set_bit(SCMD_STATE_COMPLETE, &cmd->state)))
+ return;
trace_scsi_dispatch_cmd_done(cmd);
- blk_mq_complete_request(cmd->request);
+
+ /*
+ * If the block layer didn't complete the request due to a timeout
+ * injection, scsi must clear its internal completed state so that the
+ * timeout handler will see it needs to escalate its own error
+ * recovery.
+ */
+ if (unlikely(!blk_mq_complete_request(cmd->request)))
+ clear_bit(SCMD_STATE_COMPLETE, &cmd->state);
}
static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx)
@@ -2096,9 +1696,15 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
blk_status_t ret;
int reason;
- ret = prep_to_mq(scsi_prep_state_check(sdev, req));
- if (ret != BLK_STS_OK)
- goto out_put_budget;
+ /*
+ * If the device is not in running state we will reject some or all
+ * commands.
+ */
+ if (unlikely(sdev->sdev_state != SDEV_RUNNING)) {
+ ret = scsi_prep_state_check(sdev, req);
+ if (ret != BLK_STS_OK)
+ goto out_put_budget;
+ }
ret = BLK_STS_RESOURCE;
if (!scsi_target_queue_ready(shost, sdev))
@@ -2106,8 +1712,9 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
if (!scsi_host_queue_ready(q, shost, sdev))
goto out_dec_target_busy;
+ clear_bit(SCMD_STATE_COMPLETE, &cmd->state);
if (!(req->rq_flags & RQF_DONTPREP)) {
- ret = prep_to_mq(scsi_mq_prep_fn(req));
+ ret = scsi_mq_prep_fn(req);
if (ret != BLK_STS_OK)
goto out_dec_host_busy;
req->rq_flags |= RQF_DONTPREP;
@@ -2208,7 +1815,7 @@ static int scsi_map_queues(struct blk_mq_tag_set *set)
if (shost->hostt->map_queues)
return shost->hostt->map_queues(shost);
- return blk_mq_map_queues(set);
+ return blk_mq_map_queues(&set->map[0]);
}
void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q)
@@ -2251,77 +1858,6 @@ void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q)
}
EXPORT_SYMBOL_GPL(__scsi_init_queue);
-static int scsi_old_init_rq(struct request_queue *q, struct request *rq,
- gfp_t gfp)
-{
- struct Scsi_Host *shost = q->rq_alloc_data;
- const bool unchecked_isa_dma = shost->unchecked_isa_dma;
- struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq);
-
- memset(cmd, 0, sizeof(*cmd));
-
- if (unchecked_isa_dma)
- cmd->flags |= SCMD_UNCHECKED_ISA_DMA;
- cmd->sense_buffer = scsi_alloc_sense_buffer(unchecked_isa_dma, gfp,
- NUMA_NO_NODE);
- if (!cmd->sense_buffer)
- goto fail;
- cmd->req.sense = cmd->sense_buffer;
-
- if (scsi_host_get_prot(shost) >= SHOST_DIX_TYPE0_PROTECTION) {
- cmd->prot_sdb = kmem_cache_zalloc(scsi_sdb_cache, gfp);
- if (!cmd->prot_sdb)
- goto fail_free_sense;
- }
-
- return 0;
-
-fail_free_sense:
- scsi_free_sense_buffer(unchecked_isa_dma, cmd->sense_buffer);
-fail:
- return -ENOMEM;
-}
-
-static void scsi_old_exit_rq(struct request_queue *q, struct request *rq)
-{
- struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq);
-
- if (cmd->prot_sdb)
- kmem_cache_free(scsi_sdb_cache, cmd->prot_sdb);
- scsi_free_sense_buffer(cmd->flags & SCMD_UNCHECKED_ISA_DMA,
- cmd->sense_buffer);
-}
-
-struct request_queue *scsi_old_alloc_queue(struct scsi_device *sdev)
-{
- struct Scsi_Host *shost = sdev->host;
- struct request_queue *q;
-
- q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
- if (!q)
- return NULL;
- q->cmd_size = sizeof(struct scsi_cmnd) + shost->hostt->cmd_size;
- q->rq_alloc_data = shost;
- q->request_fn = scsi_request_fn;
- q->init_rq_fn = scsi_old_init_rq;
- q->exit_rq_fn = scsi_old_exit_rq;
- q->initialize_rq_fn = scsi_initialize_rq;
-
- if (blk_init_allocated_queue(q) < 0) {
- blk_cleanup_queue(q);
- return NULL;
- }
-
- __scsi_init_queue(shost, q);
- blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, q);
- blk_queue_prep_rq(q, scsi_prep_fn);
- blk_queue_unprep_rq(q, scsi_unprep_fn);
- blk_queue_softirq_done(q, scsi_softirq_done);
- blk_queue_rq_timed_out(q, scsi_times_out);
- blk_queue_lld_busy(q, scsi_lld_busy);
- return q;
-}
-
static const struct blk_mq_ops scsi_mq_ops = {
.get_budget = scsi_mq_get_budget,
.put_budget = scsi_mq_put_budget,
@@ -2334,6 +1870,7 @@ static const struct blk_mq_ops scsi_mq_ops = {
.init_request = scsi_mq_init_request,
.exit_request = scsi_mq_exit_request,
.initialize_rq_fn = scsi_initialize_rq,
+ .busy = scsi_mq_lld_busy,
.map_queues = scsi_map_queues,
};
@@ -2388,10 +1925,7 @@ struct scsi_device *scsi_device_from_queue(struct request_queue *q)
{
struct scsi_device *sdev = NULL;
- if (q->mq_ops) {
- if (q->mq_ops == &scsi_mq_ops)
- sdev = q->queuedata;
- } else if (q->request_fn == scsi_request_fn)
+ if (q->mq_ops == &scsi_mq_ops)
sdev = q->queuedata;
if (!sdev || !get_device(&sdev->sdev_gendev))
sdev = NULL;
@@ -2995,39 +2529,6 @@ void sdev_evt_send_simple(struct scsi_device *sdev,
EXPORT_SYMBOL_GPL(sdev_evt_send_simple);
/**
- * scsi_request_fn_active() - number of kernel threads inside scsi_request_fn()
- * @sdev: SCSI device to count the number of scsi_request_fn() callers for.
- */
-static int scsi_request_fn_active(struct scsi_device *sdev)
-{
- struct request_queue *q = sdev->request_queue;
- int request_fn_active;
-
- WARN_ON_ONCE(sdev->host->use_blk_mq);
-
- spin_lock_irq(q->queue_lock);
- request_fn_active = q->request_fn_active;
- spin_unlock_irq(q->queue_lock);
-
- return request_fn_active;
-}
-
-/**
- * scsi_wait_for_queuecommand() - wait for ongoing queuecommand() calls
- * @sdev: SCSI device pointer.
- *
- * Wait until the ongoing shost->hostt->queuecommand() calls that are
- * invoked from scsi_request_fn() have finished.
- */
-static void scsi_wait_for_queuecommand(struct scsi_device *sdev)
-{
- WARN_ON_ONCE(sdev->host->use_blk_mq);
-
- while (scsi_request_fn_active(sdev))
- msleep(20);
-}
-
-/**
* scsi_device_quiesce - Block user issued commands.
* @sdev: scsi device to quiesce.
*
@@ -3150,7 +2651,6 @@ EXPORT_SYMBOL(scsi_target_resume);
int scsi_internal_device_block_nowait(struct scsi_device *sdev)
{
struct request_queue *q = sdev->request_queue;
- unsigned long flags;
int err = 0;
err = scsi_device_set_state(sdev, SDEV_BLOCK);
@@ -3166,14 +2666,7 @@ int scsi_internal_device_block_nowait(struct scsi_device *sdev)
* block layer from calling the midlayer with this device's
* request queue.
*/
- if (q->mq_ops) {
- blk_mq_quiesce_queue_nowait(q);
- } else {
- spin_lock_irqsave(q->queue_lock, flags);
- blk_stop_queue(q);
- spin_unlock_irqrestore(q->queue_lock, flags);
- }
-
+ blk_mq_quiesce_queue_nowait(q);
return 0;
}
EXPORT_SYMBOL_GPL(scsi_internal_device_block_nowait);
@@ -3204,12 +2697,8 @@ static int scsi_internal_device_block(struct scsi_device *sdev)
mutex_lock(&sdev->state_mutex);
err = scsi_internal_device_block_nowait(sdev);
- if (err == 0) {
- if (q->mq_ops)
- blk_mq_quiesce_queue(q);
- else
- scsi_wait_for_queuecommand(sdev);
- }
+ if (err == 0)
+ blk_mq_quiesce_queue(q);
mutex_unlock(&sdev->state_mutex);
return err;
@@ -3218,15 +2707,8 @@ static int scsi_internal_device_block(struct scsi_device *sdev)
void scsi_start_queue(struct scsi_device *sdev)
{
struct request_queue *q = sdev->request_queue;
- unsigned long flags;
- if (q->mq_ops) {
- blk_mq_unquiesce_queue(q);
- } else {
- spin_lock_irqsave(q->queue_lock, flags);
- blk_start_queue(q);
- spin_unlock_irqrestore(q->queue_lock, flags);
- }
+ blk_mq_unquiesce_queue(q);
}
/**
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
index 99f1db5e467e..5f21547b2ad2 100644
--- a/drivers/scsi/scsi_priv.h
+++ b/drivers/scsi/scsi_priv.h
@@ -92,7 +92,6 @@ extern void scsi_queue_insert(struct scsi_cmnd *cmd, int reason);
extern void scsi_io_completion(struct scsi_cmnd *, unsigned int);
extern void scsi_run_host_queues(struct Scsi_Host *shost);
extern void scsi_requeue_run_queue(struct work_struct *work);
-extern struct request_queue *scsi_old_alloc_queue(struct scsi_device *sdev);
extern struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev);
extern void scsi_start_queue(struct scsi_device *sdev);
extern int scsi_mq_setup_tags(struct Scsi_Host *shost);
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 78ca63dfba4a..dd0d516f65e2 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -266,10 +266,7 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
*/
sdev->borken = 1;
- if (shost_use_blk_mq(shost))
- sdev->request_queue = scsi_mq_alloc_queue(sdev);
- else
- sdev->request_queue = scsi_old_alloc_queue(sdev);
+ sdev->request_queue = scsi_mq_alloc_queue(sdev);
if (!sdev->request_queue) {
/* release fn is set up in scsi_sysfs_device_initialise, so
* have to free and put manually here */
@@ -280,11 +277,6 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
WARN_ON_ONCE(!blk_get_queue(sdev->request_queue));
sdev->request_queue->queuedata = sdev;
- if (!shost_use_blk_mq(sdev->host)) {
- blk_queue_init_tags(sdev->request_queue,
- sdev->host->cmd_per_lun, shost->bqt,
- shost->hostt->tag_alloc_policy);
- }
scsi_change_queue_depth(sdev, sdev->host->cmd_per_lun ?
sdev->host->cmd_per_lun : 1);
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 3aee9464a7bf..6a9040faed00 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -367,7 +367,6 @@ store_shost_eh_deadline(struct device *dev, struct device_attribute *attr,
static DEVICE_ATTR(eh_deadline, S_IRUGO | S_IWUSR, show_shost_eh_deadline, store_shost_eh_deadline);
-shost_rd_attr(use_blk_mq, "%d\n");
shost_rd_attr(unique_id, "%u\n");
shost_rd_attr(cmd_per_lun, "%hd\n");
shost_rd_attr(can_queue, "%hd\n");
@@ -386,6 +385,13 @@ show_host_busy(struct device *dev, struct device_attribute *attr, char *buf)
}
static DEVICE_ATTR(host_busy, S_IRUGO, show_host_busy, NULL);
+static ssize_t
+show_use_blk_mq(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "1\n");
+}
+static DEVICE_ATTR(use_blk_mq, S_IRUGO, show_use_blk_mq, NULL);
+
static struct attribute *scsi_sysfs_shost_attrs[] = {
&dev_attr_use_blk_mq.attr,
&dev_attr_unique_id.attr,
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 381668fa135d..d7035270d274 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -3592,7 +3592,7 @@ fc_bsg_job_timeout(struct request *req)
/* the blk_end_sync_io() doesn't check the error */
if (inflight)
- __blk_complete_request(req);
+ blk_mq_end_request(req, BLK_STS_IOERR);
return BLK_EH_DONE;
}
@@ -3684,14 +3684,9 @@ static void
fc_bsg_goose_queue(struct fc_rport *rport)
{
struct request_queue *q = rport->rqst_q;
- unsigned long flags;
-
- if (!q)
- return;
- spin_lock_irqsave(q->queue_lock, flags);
- blk_run_queue_async(q);
- spin_unlock_irqrestore(q->queue_lock, flags);
+ if (q)
+ blk_mq_run_hw_queues(q, true);
}
/**
@@ -3759,6 +3754,37 @@ static int fc_bsg_dispatch(struct bsg_job *job)
return fc_bsg_host_dispatch(shost, job);
}
+static blk_status_t fc_bsg_rport_prep(struct fc_rport *rport)
+{
+ if (rport->port_state == FC_PORTSTATE_BLOCKED &&
+ !(rport->flags & FC_RPORT_FAST_FAIL_TIMEDOUT))
+ return BLK_STS_RESOURCE;
+
+ if (rport->port_state != FC_PORTSTATE_ONLINE)
+ return BLK_STS_IOERR;
+
+ return BLK_STS_OK;
+}
+
+
+static int fc_bsg_dispatch_prep(struct bsg_job *job)
+{
+ struct fc_rport *rport = fc_bsg_to_rport(job);
+ blk_status_t ret;
+
+ ret = fc_bsg_rport_prep(rport);
+ switch (ret) {
+ case BLK_STS_OK:
+ break;
+ case BLK_STS_RESOURCE:
+ return -EAGAIN;
+ default:
+ return -EIO;
+ }
+
+ return fc_bsg_dispatch(job);
+}
+
/**
* fc_bsg_hostadd - Create and add the bsg hooks so we can receive requests
* @shost: shost for fc_host
@@ -3780,7 +3806,8 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct fc_host_attrs *fc_host)
snprintf(bsg_name, sizeof(bsg_name),
"fc_host%d", shost->host_no);
- q = bsg_setup_queue(dev, bsg_name, fc_bsg_dispatch, i->f->dd_bsg_size);
+ q = bsg_setup_queue(dev, bsg_name, fc_bsg_dispatch, fc_bsg_job_timeout,
+ i->f->dd_bsg_size);
if (IS_ERR(q)) {
dev_err(dev,
"fc_host%d: bsg interface failed to initialize - setup queue\n",
@@ -3788,26 +3815,11 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct fc_host_attrs *fc_host)
return PTR_ERR(q);
}
__scsi_init_queue(shost, q);
- blk_queue_rq_timed_out(q, fc_bsg_job_timeout);
blk_queue_rq_timeout(q, FC_DEFAULT_BSG_TIMEOUT);
fc_host->rqst_q = q;
return 0;
}
-static int fc_bsg_rport_prep(struct request_queue *q, struct request *req)
-{
- struct fc_rport *rport = dev_to_rport(q->queuedata);
-
- if (rport->port_state == FC_PORTSTATE_BLOCKED &&
- !(rport->flags & FC_RPORT_FAST_FAIL_TIMEDOUT))
- return BLKPREP_DEFER;
-
- if (rport->port_state != FC_PORTSTATE_ONLINE)
- return BLKPREP_KILL;
-
- return BLKPREP_OK;
-}
-
/**
* fc_bsg_rportadd - Create and add the bsg hooks so we can receive requests
* @shost: shost that rport is attached to
@@ -3825,15 +3837,13 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport)
if (!i->f->bsg_request)
return -ENOTSUPP;
- q = bsg_setup_queue(dev, dev_name(dev), fc_bsg_dispatch,
- i->f->dd_bsg_size);
+ q = bsg_setup_queue(dev, dev_name(dev), fc_bsg_dispatch_prep,
+ fc_bsg_job_timeout, i->f->dd_bsg_size);
if (IS_ERR(q)) {
dev_err(dev, "failed to setup bsg queue\n");
return PTR_ERR(q);
}
__scsi_init_queue(shost, q);
- blk_queue_prep_rq(q, fc_bsg_rport_prep);
- blk_queue_rq_timed_out(q, fc_bsg_job_timeout);
blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
rport->rqst_q = q;
return 0;
@@ -3852,10 +3862,7 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport)
static void
fc_bsg_remove(struct request_queue *q)
{
- if (q) {
- bsg_unregister_queue(q);
- blk_cleanup_queue(q);
- }
+ bsg_remove_queue(q);
}
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 6fd2fe210fc3..ff123023e5a5 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -1542,7 +1542,7 @@ iscsi_bsg_host_add(struct Scsi_Host *shost, struct iscsi_cls_host *ihost)
return -ENOTSUPP;
snprintf(bsg_name, sizeof(bsg_name), "iscsi_host%d", shost->host_no);
- q = bsg_setup_queue(dev, bsg_name, iscsi_bsg_host_dispatch, 0);
+ q = bsg_setup_queue(dev, bsg_name, iscsi_bsg_host_dispatch, NULL, 0);
if (IS_ERR(q)) {
shost_printk(KERN_ERR, shost, "bsg interface failed to "
"initialize - no request queue\n");
@@ -1576,10 +1576,7 @@ static int iscsi_remove_host(struct transport_container *tc,
struct Scsi_Host *shost = dev_to_shost(dev);
struct iscsi_cls_host *ihost = shost->shost_data;
- if (ihost->bsg_q) {
- bsg_unregister_queue(ihost->bsg_q);
- blk_cleanup_queue(ihost->bsg_q);
- }
+ bsg_remove_queue(ihost->bsg_q);
return 0;
}
diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c
index 0a165b2b3e81..692b46937e52 100644
--- a/drivers/scsi/scsi_transport_sas.c
+++ b/drivers/scsi/scsi_transport_sas.c
@@ -198,7 +198,7 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy)
if (rphy) {
q = bsg_setup_queue(&rphy->dev, dev_name(&rphy->dev),
- sas_smp_dispatch, 0);
+ sas_smp_dispatch, NULL, 0);
if (IS_ERR(q))
return PTR_ERR(q);
rphy->q = q;
@@ -207,7 +207,7 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy)
snprintf(name, sizeof(name), "sas_host%d", shost->host_no);
q = bsg_setup_queue(&shost->shost_gendev, name,
- sas_smp_dispatch, 0);
+ sas_smp_dispatch, NULL, 0);
if (IS_ERR(q))
return PTR_ERR(q);
to_sas_host_attrs(shost)->q = q;
@@ -246,11 +246,7 @@ static int sas_host_remove(struct transport_container *tc, struct device *dev,
struct Scsi_Host *shost = dev_to_shost(dev);
struct request_queue *q = to_sas_host_attrs(shost)->q;
- if (q) {
- bsg_unregister_queue(q);
- blk_cleanup_queue(q);
- }
-
+ bsg_remove_queue(q);
return 0;
}
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index bd0a5c694a97..a1a44f52e0e8 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -114,7 +114,7 @@ static int sd_suspend_system(struct device *);
static int sd_suspend_runtime(struct device *);
static int sd_resume(struct device *);
static void sd_rescan(struct device *);
-static int sd_init_command(struct scsi_cmnd *SCpnt);
+static blk_status_t sd_init_command(struct scsi_cmnd *SCpnt);
static void sd_uninit_command(struct scsi_cmnd *SCpnt);
static int sd_done(struct scsi_cmnd *);
static void sd_eh_reset(struct scsi_cmnd *);
@@ -751,7 +751,7 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
}
-static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)
+static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)
{
struct scsi_device *sdp = cmd->device;
struct request *rq = cmd->request;
@@ -762,7 +762,7 @@ static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)
rq->special_vec.bv_page = mempool_alloc(sd_page_pool, GFP_ATOMIC);
if (!rq->special_vec.bv_page)
- return BLKPREP_DEFER;
+ return BLK_STS_RESOURCE;
clear_highpage(rq->special_vec.bv_page);
rq->special_vec.bv_offset = 0;
rq->special_vec.bv_len = data_len;
@@ -786,7 +786,8 @@ static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)
return scsi_init_io(cmd);
}
-static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap)
+static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd,
+ bool unmap)
{
struct scsi_device *sdp = cmd->device;
struct request *rq = cmd->request;
@@ -796,7 +797,7 @@ static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap)
rq->special_vec.bv_page = mempool_alloc(sd_page_pool, GFP_ATOMIC);
if (!rq->special_vec.bv_page)
- return BLKPREP_DEFER;
+ return BLK_STS_RESOURCE;
clear_highpage(rq->special_vec.bv_page);
rq->special_vec.bv_offset = 0;
rq->special_vec.bv_len = data_len;
@@ -817,7 +818,8 @@ static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap)
return scsi_init_io(cmd);
}
-static int sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap)
+static blk_status_t sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd,
+ bool unmap)
{
struct scsi_device *sdp = cmd->device;
struct request *rq = cmd->request;
@@ -827,7 +829,7 @@ static int sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap)
rq->special_vec.bv_page = mempool_alloc(sd_page_pool, GFP_ATOMIC);
if (!rq->special_vec.bv_page)
- return BLKPREP_DEFER;
+ return BLK_STS_RESOURCE;
clear_highpage(rq->special_vec.bv_page);
rq->special_vec.bv_offset = 0;
rq->special_vec.bv_len = data_len;
@@ -848,7 +850,7 @@ static int sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap)
return scsi_init_io(cmd);
}
-static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd)
+static blk_status_t sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd)
{
struct request *rq = cmd->request;
struct scsi_device *sdp = cmd->device;
@@ -866,7 +868,7 @@ static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd)
}
if (sdp->no_write_same)
- return BLKPREP_INVALID;
+ return BLK_STS_TARGET;
if (sdkp->ws16 || sector > 0xffffffff || nr_sectors > 0xffff)
return sd_setup_write_same16_cmnd(cmd, false);
@@ -943,7 +945,7 @@ out:
* Will set up either WRITE SAME(10) or WRITE SAME(16) depending on
* the preference indicated by the target device.
**/
-static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
+static blk_status_t sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
{
struct request *rq = cmd->request;
struct scsi_device *sdp = cmd->device;
@@ -952,10 +954,10 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
sector_t sector = blk_rq_pos(rq);
unsigned int nr_sectors = blk_rq_sectors(rq);
unsigned int nr_bytes = blk_rq_bytes(rq);
- int ret;
+ blk_status_t ret;
if (sdkp->device->no_write_same)
- return BLKPREP_INVALID;
+ return BLK_STS_TARGET;
BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp->sector_size);
@@ -996,7 +998,7 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
return ret;
}
-static int sd_setup_flush_cmnd(struct scsi_cmnd *cmd)
+static blk_status_t sd_setup_flush_cmnd(struct scsi_cmnd *cmd)
{
struct request *rq = cmd->request;
@@ -1009,10 +1011,10 @@ static int sd_setup_flush_cmnd(struct scsi_cmnd *cmd)
cmd->allowed = SD_MAX_RETRIES;
rq->timeout = rq->q->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER;
- return BLKPREP_OK;
+ return BLK_STS_OK;
}
-static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
+static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
{
struct request *rq = SCpnt->request;
struct scsi_device *sdp = SCpnt->device;
@@ -1022,18 +1024,14 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
sector_t threshold;
unsigned int this_count = blk_rq_sectors(rq);
unsigned int dif, dix;
- int ret;
unsigned char protect;
+ blk_status_t ret;
ret = scsi_init_io(SCpnt);
- if (ret != BLKPREP_OK)
+ if (ret != BLK_STS_OK)
return ret;
WARN_ON_ONCE(SCpnt != rq->special);
- /* from here on until we're complete, any goto out
- * is used for a killable error condition */
- ret = BLKPREP_KILL;
-
SCSI_LOG_HLQUEUE(1,
scmd_printk(KERN_INFO, SCpnt,
"%s: block=%llu, count=%d\n",
@@ -1046,7 +1044,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
blk_rq_sectors(rq)));
SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt,
"Retry with 0x%p\n", SCpnt));
- goto out;
+ return BLK_STS_IOERR;
}
if (sdp->changed) {
@@ -1055,7 +1053,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
* the changed bit has been reset
*/
/* printk("SCSI disk has been changed or is not present. Prohibiting further I/O.\n"); */
- goto out;
+ return BLK_STS_IOERR;
}
/*
@@ -1093,31 +1091,28 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
if ((block & 1) || (blk_rq_sectors(rq) & 1)) {
scmd_printk(KERN_ERR, SCpnt,
"Bad block number requested\n");
- goto out;
- } else {
- block = block >> 1;
- this_count = this_count >> 1;
+ return BLK_STS_IOERR;
}
+ block = block >> 1;
+ this_count = this_count >> 1;
}
if (sdp->sector_size == 2048) {
if ((block & 3) || (blk_rq_sectors(rq) & 3)) {
scmd_printk(KERN_ERR, SCpnt,
"Bad block number requested\n");
- goto out;
- } else {
- block = block >> 2;
- this_count = this_count >> 2;
+ return BLK_STS_IOERR;
}
+ block = block >> 2;
+ this_count = this_count >> 2;
}
if (sdp->sector_size == 4096) {
if ((block & 7) || (blk_rq_sectors(rq) & 7)) {
scmd_printk(KERN_ERR, SCpnt,
"Bad block number requested\n");
- goto out;
- } else {
- block = block >> 3;
- this_count = this_count >> 3;
+ return BLK_STS_IOERR;
}
+ block = block >> 3;
+ this_count = this_count >> 3;
}
if (rq_data_dir(rq) == WRITE) {
SCpnt->cmnd[0] = WRITE_6;
@@ -1129,7 +1124,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
SCpnt->cmnd[0] = READ_6;
} else {
scmd_printk(KERN_ERR, SCpnt, "Unknown command %d\n", req_op(rq));
- goto out;
+ return BLK_STS_IOERR;
}
SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt,
@@ -1149,10 +1144,8 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
if (protect && sdkp->protection_type == T10_PI_TYPE2_PROTECTION) {
SCpnt->cmnd = mempool_alloc(sd_cdb_pool, GFP_ATOMIC);
- if (unlikely(SCpnt->cmnd == NULL)) {
- ret = BLKPREP_DEFER;
- goto out;
- }
+ if (unlikely(!SCpnt->cmnd))
+ return BLK_STS_RESOURCE;
SCpnt->cmd_len = SD_EXT_CDB_SIZE;
memset(SCpnt->cmnd, 0, SCpnt->cmd_len);
@@ -1220,7 +1213,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
*/
scmd_printk(KERN_ERR, SCpnt,
"FUA write on READ/WRITE(6) drive\n");
- goto out;
+ return BLK_STS_IOERR;
}
SCpnt->cmnd[1] |= (unsigned char) ((block >> 16) & 0x1f);
@@ -1244,12 +1237,10 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
* This indicates that the command is ready from our end to be
* queued.
*/
- ret = BLKPREP_OK;
- out:
- return ret;
+ return BLK_STS_OK;
}
-static int sd_init_command(struct scsi_cmnd *cmd)
+static blk_status_t sd_init_command(struct scsi_cmnd *cmd)
{
struct request *rq = cmd->request;
@@ -1265,7 +1256,7 @@ static int sd_init_command(struct scsi_cmnd *cmd)
case SD_LBP_ZERO:
return sd_setup_write_same10_cmnd(cmd, false);
default:
- return BLKPREP_INVALID;
+ return BLK_STS_TARGET;
}
case REQ_OP_WRITE_ZEROES:
return sd_setup_write_zeroes_cmnd(cmd);
@@ -1280,7 +1271,7 @@ static int sd_init_command(struct scsi_cmnd *cmd)
return sd_zbc_setup_reset_cmnd(cmd);
default:
WARN_ON_ONCE(1);
- return BLKPREP_KILL;
+ return BLK_STS_NOTSUPP;
}
}
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 1d63f3a23ffb..7f43e6839bce 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -271,7 +271,7 @@ static inline int sd_is_zoned(struct scsi_disk *sdkp)
extern int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buffer);
extern void sd_zbc_print_zones(struct scsi_disk *sdkp);
-extern int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd);
+extern blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd);
extern void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
struct scsi_sense_hdr *sshdr);
extern int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
@@ -288,9 +288,9 @@ static inline int sd_zbc_read_zones(struct scsi_disk *sdkp,
static inline void sd_zbc_print_zones(struct scsi_disk *sdkp) {}
-static inline int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
+static inline blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
{
- return BLKPREP_INVALID;
+ return BLK_STS_TARGET;
}
static inline void sd_zbc_complete(struct scsi_cmnd *cmd,
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index e06c48c866e4..83365b29a4d8 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -185,7 +185,7 @@ static inline sector_t sd_zbc_zone_sectors(struct scsi_disk *sdkp)
*
* Called from sd_init_command() for a REQ_OP_ZONE_RESET request.
*/
-int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
+blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
{
struct request *rq = cmd->request;
struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
@@ -194,14 +194,14 @@ int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
if (!sd_is_zoned(sdkp))
/* Not a zoned device */
- return BLKPREP_KILL;
+ return BLK_STS_IOERR;
if (sdkp->device->changed)
- return BLKPREP_KILL;
+ return BLK_STS_IOERR;
if (sector & (sd_zbc_zone_sectors(sdkp) - 1))
/* Unaligned request */
- return BLKPREP_KILL;
+ return BLK_STS_IOERR;
cmd->cmd_len = 16;
memset(cmd->cmnd, 0, cmd->cmd_len);
@@ -214,7 +214,7 @@ int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
cmd->transfersize = 0;
cmd->allowed = 0;
- return BLKPREP_OK;
+ return BLK_STS_OK;
}
/**
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index c6ad00703c5b..4e27460ec926 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1390,7 +1390,7 @@ sg_rq_end_io(struct request *rq, blk_status_t status)
*/
srp->rq = NULL;
scsi_req_free_cmd(scsi_req(rq));
- __blk_put_request(rq->q, rq);
+ blk_put_request(rq);
write_lock_irqsave(&sfp->rq_list_lock, iflags);
if (unlikely(srp->orphan)) {
diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c
index a25a07a0b7f0..bac084260d80 100644
--- a/drivers/scsi/smartpqi/smartpqi_init.c
+++ b/drivers/scsi/smartpqi/smartpqi_init.c
@@ -5319,7 +5319,8 @@ static int pqi_map_queues(struct Scsi_Host *shost)
{
struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost);
- return blk_mq_pci_map_queues(&shost->tag_set, ctrl_info->pci_dev, 0);
+ return blk_mq_pci_map_queues(&shost->tag_set.map[0],
+ ctrl_info->pci_dev, 0);
}
static int pqi_getpciinfo_ioctl(struct pqi_ctrl_info *ctrl_info,
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 54dd70ae9731..38ddbbfe5f3c 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -80,7 +80,7 @@ MODULE_ALIAS_SCSI_DEVICE(TYPE_WORM);
static DEFINE_MUTEX(sr_mutex);
static int sr_probe(struct device *);
static int sr_remove(struct device *);
-static int sr_init_command(struct scsi_cmnd *SCpnt);
+static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt);
static int sr_done(struct scsi_cmnd *);
static int sr_runtime_suspend(struct device *dev);
@@ -384,22 +384,22 @@ static int sr_done(struct scsi_cmnd *SCpnt)
return good_bytes;
}
-static int sr_init_command(struct scsi_cmnd *SCpnt)
+static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt)
{
int block = 0, this_count, s_size;
struct scsi_cd *cd;
struct request *rq = SCpnt->request;
- int ret;
+ blk_status_t ret;
ret = scsi_init_io(SCpnt);
- if (ret != BLKPREP_OK)
+ if (ret != BLK_STS_OK)
goto out;
WARN_ON_ONCE(SCpnt != rq->special);
cd = scsi_cd(rq->rq_disk);
/* from here on until we're complete, any goto out
* is used for a killable error condition */
- ret = BLKPREP_KILL;
+ ret = BLK_STS_IOERR;
SCSI_LOG_HLQUEUE(1, scmd_printk(KERN_INFO, SCpnt,
"Doing sr request, block = %d\n", block));
@@ -516,7 +516,7 @@ static int sr_init_command(struct scsi_cmnd *SCpnt)
* This indicates that the command is ready from our end to be
* queued.
*/
- ret = BLKPREP_OK;
+ ret = BLK_STS_OK;
out:
return ret;
}
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 307df2fa39a3..7ff22d3f03e3 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -530,7 +530,7 @@ static void st_scsi_execute_end(struct request *req, blk_status_t status)
complete(SRpnt->waiting);
blk_rq_unmap_user(tmp);
- __blk_put_request(req->q, req);
+ blk_put_request(req);
}
static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
diff --git a/drivers/scsi/ufs/ufs_bsg.c b/drivers/scsi/ufs/ufs_bsg.c
index e5f8e54bf644..775bb4e5e36e 100644
--- a/drivers/scsi/ufs/ufs_bsg.c
+++ b/drivers/scsi/ufs/ufs_bsg.c
@@ -157,7 +157,7 @@ void ufs_bsg_remove(struct ufs_hba *hba)
if (!hba->bsg_queue)
return;
- bsg_unregister_queue(hba->bsg_queue);
+ bsg_remove_queue(hba->bsg_queue);
device_del(bsg_dev);
put_device(bsg_dev);
@@ -193,7 +193,7 @@ int ufs_bsg_probe(struct ufs_hba *hba)
if (ret)
goto out;
- q = bsg_setup_queue(bsg_dev, dev_name(bsg_dev), ufs_bsg_request, 0);
+ q = bsg_setup_queue(bsg_dev, dev_name(bsg_dev), ufs_bsg_request, NULL, 0);
if (IS_ERR(q)) {
ret = PTR_ERR(q);
goto out;
diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 1c72db94270e..c3c95b314286 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -719,8 +719,9 @@ static void virtscsi_target_destroy(struct scsi_target *starget)
static int virtscsi_map_queues(struct Scsi_Host *shost)
{
struct virtio_scsi *vscsi = shost_priv(shost);
+ struct blk_mq_queue_map *qmap = &shost->tag_set.map[0];
- return blk_mq_virtio_map_queues(&shost->tag_set, vscsi->vdev, 2);
+ return blk_mq_virtio_map_queues(qmap, vscsi->vdev, 2);
}
/*
diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c
index 36b742932c72..86987da86dd6 100644
--- a/drivers/target/iscsi/iscsi_target_util.c
+++ b/drivers/target/iscsi/iscsi_target_util.c
@@ -150,24 +150,26 @@ void iscsit_free_r2ts_from_list(struct iscsi_cmd *cmd)
static int iscsit_wait_for_tag(struct se_session *se_sess, int state, int *cpup)
{
int tag = -1;
- DEFINE_WAIT(wait);
+ DEFINE_SBQ_WAIT(wait);
struct sbq_wait_state *ws;
+ struct sbitmap_queue *sbq;
if (state == TASK_RUNNING)
return tag;
- ws = &se_sess->sess_tag_pool.ws[0];
+ sbq = &se_sess->sess_tag_pool;
+ ws = &sbq->ws[0];
for (;;) {
- prepare_to_wait_exclusive(&ws->wait, &wait, state);
+ sbitmap_prepare_to_wait(sbq, ws, &wait, state);
if (signal_pending_state(state, current))
break;
- tag = sbitmap_queue_get(&se_sess->sess_tag_pool, cpup);
+ tag = sbitmap_queue_get(sbq, cpup);
if (tag >= 0)
break;
schedule();
}
- finish_wait(&ws->wait, &wait);
+ sbitmap_finish_wait(sbq, ws, &wait);
return tag;
}
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index 47d76c862014..c062d363dce3 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -1094,7 +1094,7 @@ static void pscsi_req_done(struct request *req, blk_status_t status)
break;
}
- __blk_put_request(req->q, req);
+ blk_put_request(req);
kfree(pt);
}