author     Linus Torvalds <torvalds@linux-foundation.org>   2023-06-26 21:47:20 +0200
committer  Linus Torvalds <torvalds@linux-foundation.org>   2023-06-26 21:47:20 +0200
commit     a0433f8cae3ac51f59b4b1863032822aaa2d8164
tree       9eb7b096aa9f7fa53921e6ff247488f3a55471f5 /block/blk-mq.c
parent     Merge tag 'for-6.5/io_uring-2023-06-23' of git://git.kernel.dk/linux
parent     scsi/sg: don't grab scsi host module reference
Merge tag 'for-6.5/block-2023-06-23' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe:

 - NVMe pull request via Keith:
      - Various cleanups all around (Irvin, Chaitanya, Christophe)
      - Better struct packing (Christophe JAILLET)
      - Reduce controller error logs for optional commands (Keith)
      - Support for >=64KiB block sizes (Daniel Gomez)
      - Fabrics fixes and code organization (Max, Chaitanya, Daniel Wagner)

 - bcache updates via Coly:
      - Fix a race at init time (Mingzhe Zou)
      - Misc fixes and cleanups (Andrea, Thomas, Zheng, Ye)

 - use page pinning in the block layer for dio (David)

 - convert old block dio code to page pinning (David, Christoph)

 - cleanups for pktcdvd (Andy)

 - cleanups for rnbd (Guoqing)

 - use the unchecked __bio_add_page() for the initial single page
   additions (Johannes)

 - fix overflows in the Amiga partition handling code (Michael)

 - improve mq-deadline zoned device support (Bart)

 - keep passthrough requests out of the IO schedulers (Christoph, Ming)

 - improve support for flush requests, making them less special to deal
   with (Christoph)

 - add bdev holder ops and shutdown methods (Christoph)

 - fix the name_to_dev_t() situation and use cases (Christoph)

 - decouple the block open flags from fmode_t (Christoph)

 - ublk updates and cleanups, including adding user copy support (Ming)

 - BFQ sanity checking (Bart)

 - convert brd from radix to xarray (Pankaj)

 - constify various structures (Thomas, Ivan)

 - more fine grained persistent reservation ioctl capability checks
   (Jingbo)

 - misc fixes and cleanups (Arnd, Azeem, Demi, Ed, Hengqi, Hou, Jan,
   Jordy, Li, Min, Yu, Zhong, Waiman)

* tag 'for-6.5/block-2023-06-23' of git://git.kernel.dk/linux: (266 commits)
  scsi/sg: don't grab scsi host module reference
  ext4: Fix warning in blkdev_put()
  block: don't return -EINVAL for not found names in devt_from_devname
  cdrom: Fix spectre-v1 gadget
  block: Improve kernel-doc headers
  blk-mq: don't insert passthrough request into sw queue
  bsg: make bsg_class a static const structure
  ublk: make ublk_chr_class a static const structure
  aoe: make aoe_class a static const structure
  block/rnbd: make all 'class' structures const
  block: fix the exclusive open mask in disk_scan_partitions
  block: add overflow checks for Amiga partition support
  block: change all __u32 annotations to __be32 in affs_hardblocks.h
  block: fix signed int overflow in Amiga partition support
  block: add capacity validation in bdev_add_partition()
  block: fine-granular CAP_SYS_ADMIN for Persistent Reservation
  block: disallow Persistent Reservation on partitions
  reiserfs: fix blkdev_put() warning from release_journal_dev()
  block: fix wrong mode for blkdev_get_by_dev() from disk_scan_partitions()
  block: document the holder argument to blkdev_get_by_path
  ...
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r--  block/blk-mq.c  141
1 file changed, 73 insertions(+), 68 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 850bfb844ed2..decb6ab2d508 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -45,6 +45,8 @@
static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
static void blk_mq_insert_request(struct request *rq, blk_insert_t flags);
+static void blk_mq_request_bypass_insert(struct request *rq,
+ blk_insert_t flags);
static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
struct list_head *list);
@@ -354,12 +356,12 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
data->rq_flags |= RQF_IO_STAT;
rq->rq_flags = data->rq_flags;
- if (!(data->rq_flags & RQF_ELV)) {
- rq->tag = tag;
- rq->internal_tag = BLK_MQ_NO_TAG;
- } else {
+ if (data->rq_flags & RQF_SCHED_TAGS) {
rq->tag = BLK_MQ_NO_TAG;
rq->internal_tag = tag;
+ } else {
+ rq->tag = tag;
+ rq->internal_tag = BLK_MQ_NO_TAG;
}
rq->timeout = 0;
@@ -386,17 +388,14 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
WRITE_ONCE(rq->deadline, 0);
req_ref_set(rq, 1);
- if (rq->rq_flags & RQF_ELV) {
+ if (rq->rq_flags & RQF_USE_SCHED) {
struct elevator_queue *e = data->q->elevator;
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
- if (!op_is_flush(data->cmd_flags) &&
- e->type->ops.prepare_request) {
+ if (e->type->ops.prepare_request)
e->type->ops.prepare_request(rq);
- rq->rq_flags |= RQF_ELVPRIV;
- }
}
return rq;
@@ -449,26 +448,32 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
data->flags |= BLK_MQ_REQ_NOWAIT;
if (q->elevator) {
- struct elevator_queue *e = q->elevator;
-
- data->rq_flags |= RQF_ELV;
+ /*
+ * All requests use scheduler tags when an I/O scheduler is
+ * enabled for the queue.
+ */
+ data->rq_flags |= RQF_SCHED_TAGS;
/*
* Flush/passthrough requests are special and go directly to the
- * dispatch list. Don't include reserved tags in the
- * limiting, as it isn't useful.
+ * dispatch list.
*/
- if (!op_is_flush(data->cmd_flags) &&
- !blk_op_is_passthrough(data->cmd_flags) &&
- e->type->ops.limit_depth &&
- !(data->flags & BLK_MQ_REQ_RESERVED))
- e->type->ops.limit_depth(data->cmd_flags, data);
+ if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH &&
+ !blk_op_is_passthrough(data->cmd_flags)) {
+ struct elevator_mq_ops *ops = &q->elevator->type->ops;
+
+ WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED);
+
+ data->rq_flags |= RQF_USE_SCHED;
+ if (ops->limit_depth)
+ ops->limit_depth(data->cmd_flags, data);
+ }
}
retry:
data->ctx = blk_mq_get_ctx(q);
data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
- if (!(data->rq_flags & RQF_ELV))
+ if (!(data->rq_flags & RQF_SCHED_TAGS))
blk_mq_tag_busy(data->hctx);
if (data->flags & BLK_MQ_REQ_RESERVED)
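
Note on the hunks above: the single RQF_ELV flag is split into RQF_SCHED_TAGS (the request carries a scheduler tag rather than a driver tag) and RQF_USE_SCHED (the request is actually routed through the I/O scheduler). With an elevator attached every request gets scheduler tags, but flush and passthrough requests never get RQF_USE_SCHED. The standalone C sketch below models that split; the toy_* types and helpers are invented for illustration, only the flag names, BLK_MQ_NO_TAG and the decision logic come from the diff.

#include <stdbool.h>
#include <stdio.h>

#define BLK_MQ_NO_TAG (-1)

/* Flag names follow the diff; the numeric values here are arbitrary. */
enum {
	RQF_SCHED_TAGS = 1 << 0,	/* tag was taken from the scheduler tag set */
	RQF_USE_SCHED  = 1 << 1,	/* request is routed through the I/O scheduler */
};

struct toy_request {
	unsigned int rq_flags;
	int tag;		/* driver tag */
	int internal_tag;	/* scheduler tag */
};

/* Mirrors the allocation-time decision: with an elevator every request is
 * scheduler-tagged, but only non-flush, non-passthrough requests are handed
 * to the scheduler itself. */
static unsigned int toy_alloc_flags(bool has_elevator, bool is_flush, bool is_passthrough)
{
	unsigned int rq_flags = 0;

	if (has_elevator) {
		rq_flags |= RQF_SCHED_TAGS;
		if (!is_flush && !is_passthrough)
			rq_flags |= RQF_USE_SCHED;
	}
	return rq_flags;
}

/* Mirrors blk_mq_rq_ctx_init(): which tag field the allocated tag lands in
 * depends on whether it is a scheduler tag or a driver tag. */
static void toy_rq_init(struct toy_request *rq, unsigned int rq_flags, int tag)
{
	rq->rq_flags = rq_flags;
	if (rq_flags & RQF_SCHED_TAGS) {
		rq->tag = BLK_MQ_NO_TAG;
		rq->internal_tag = tag;
	} else {
		rq->tag = tag;
		rq->internal_tag = BLK_MQ_NO_TAG;
	}
}

int main(void)
{
	struct toy_request rq;

	toy_rq_init(&rq, toy_alloc_flags(true, false, false), 7);
	printf("sched rq:    tag=%d internal_tag=%d use_sched=%d\n",
	       rq.tag, rq.internal_tag, !!(rq.rq_flags & RQF_USE_SCHED));

	toy_rq_init(&rq, toy_alloc_flags(true, false, true), 7);
	printf("passthrough: tag=%d internal_tag=%d use_sched=%d\n",
	       rq.tag, rq.internal_tag, !!(rq.rq_flags & RQF_USE_SCHED));

	toy_rq_init(&rq, toy_alloc_flags(false, false, false), 7);
	printf("no elevator: tag=%d internal_tag=%d use_sched=%d\n",
	       rq.tag, rq.internal_tag, !!(rq.rq_flags & RQF_USE_SCHED));
	return 0;
}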
@@ -648,10 +653,10 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
goto out_queue_exit;
data.ctx = __blk_mq_get_ctx(q, cpu);
- if (!q->elevator)
- blk_mq_tag_busy(data.hctx);
+ if (q->elevator)
+ data.rq_flags |= RQF_SCHED_TAGS;
else
- data.rq_flags |= RQF_ELV;
+ blk_mq_tag_busy(data.hctx);
if (flags & BLK_MQ_REQ_RESERVED)
data.rq_flags |= RQF_RESV;
@@ -699,7 +704,7 @@ void blk_mq_free_request(struct request *rq)
{
struct request_queue *q = rq->q;
- if ((rq->rq_flags & RQF_ELVPRIV) &&
+ if ((rq->rq_flags & RQF_USE_SCHED) &&
q->elevator->type->ops.finish_request)
q->elevator->type->ops.finish_request(rq);
@@ -957,6 +962,8 @@ EXPORT_SYMBOL_GPL(blk_update_request);
static inline void blk_account_io_done(struct request *req, u64 now)
{
+ trace_block_io_done(req);
+
/*
* Account IO completion. flush_rq isn't accounted as a
* normal IO on queueing nor completion. Accounting the
@@ -976,6 +983,8 @@ static inline void blk_account_io_done(struct request *req, u64 now)
static inline void blk_account_io_start(struct request *req)
{
+ trace_block_io_start(req);
+
if (blk_do_io_stat(req)) {
/*
* All non-passthrough requests are created from a bio with one
@@ -1176,8 +1185,9 @@ bool blk_mq_complete_request_remote(struct request *rq)
* or a polled request, always complete locally,
* it's pointless to redirect the completion.
*/
- if (rq->mq_hctx->nr_ctx == 1 ||
- rq->cmd_flags & REQ_POLLED)
+ if ((rq->mq_hctx->nr_ctx == 1 &&
+ rq->mq_ctx->cpu == raw_smp_processor_id()) ||
+ rq->cmd_flags & REQ_POLLED)
return false;
if (blk_mq_complete_need_ipi(rq)) {
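
Note on the hunk above: the "complete locally" shortcut for a single-ctx hardware queue is now only taken when that ctx actually maps to the CPU the completion runs on; polled requests still always complete locally. A minimal standalone sketch of the predicate, with toy_* names invented for illustration:

#include <stdbool.h>
#include <stdio.h>

struct toy_rq {
	int nr_ctx;		/* software ctxs mapped to the hardware queue */
	int ctx_cpu;		/* CPU the request's ctx belongs to */
	bool polled;		/* REQ_POLLED */
};

/* True when the completion should run on the current CPU instead of being
 * redirected via IPI or a remote completion list. */
static bool toy_complete_locally(const struct toy_rq *rq, int cur_cpu)
{
	return (rq->nr_ctx == 1 && rq->ctx_cpu == cur_cpu) || rq->polled;
}

int main(void)
{
	struct toy_rq rq = { .nr_ctx = 1, .ctx_cpu = 3, .polled = false };

	printf("on cpu 3: %d\n", toy_complete_locally(&rq, 3));	/* 1: complete here */
	printf("on cpu 0: %d\n", toy_complete_locally(&rq, 0));	/* 0: redirect */
	return 0;
}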
@@ -1270,7 +1280,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
if (!plug->multiple_queues && last && last->q != rq->q)
plug->multiple_queues = true;
- if (!plug->has_elevator && (rq->rq_flags & RQF_ELV))
+ if (!plug->has_elevator && (rq->rq_flags & RQF_USE_SCHED))
plug->has_elevator = true;
rq->rq_next = NULL;
rq_list_add(&plug->mq_list, rq);
@@ -1411,13 +1421,16 @@ static void __blk_mq_requeue_request(struct request *rq)
void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
{
struct request_queue *q = rq->q;
+ unsigned long flags;
__blk_mq_requeue_request(rq);
/* this request will be re-inserted to io scheduler queue */
blk_mq_sched_requeue_request(rq);
- blk_mq_add_to_requeue_list(rq, BLK_MQ_INSERT_AT_HEAD);
+ spin_lock_irqsave(&q->requeue_lock, flags);
+ list_add_tail(&rq->queuelist, &q->requeue_list);
+ spin_unlock_irqrestore(&q->requeue_lock, flags);
if (kick_requeue_list)
blk_mq_kick_requeue_list(q);
@@ -1429,13 +1442,16 @@ static void blk_mq_requeue_work(struct work_struct *work)
struct request_queue *q =
container_of(work, struct request_queue, requeue_work.work);
LIST_HEAD(rq_list);
- struct request *rq, *next;
+ LIST_HEAD(flush_list);
+ struct request *rq;
spin_lock_irq(&q->requeue_lock);
list_splice_init(&q->requeue_list, &rq_list);
+ list_splice_init(&q->flush_list, &flush_list);
spin_unlock_irq(&q->requeue_lock);
- list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
+ while (!list_empty(&rq_list)) {
+ rq = list_entry(rq_list.next, struct request, queuelist);
/*
* If RQF_DONTPREP is set, the request has been started by the
* driver already and might have driver-specific data allocated
@@ -1443,18 +1459,16 @@ static void blk_mq_requeue_work(struct work_struct *work)
* block layer merges for the request.
*/
if (rq->rq_flags & RQF_DONTPREP) {
- rq->rq_flags &= ~RQF_SOFTBARRIER;
list_del_init(&rq->queuelist);
blk_mq_request_bypass_insert(rq, 0);
- } else if (rq->rq_flags & RQF_SOFTBARRIER) {
- rq->rq_flags &= ~RQF_SOFTBARRIER;
+ } else {
list_del_init(&rq->queuelist);
blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD);
}
}
- while (!list_empty(&rq_list)) {
- rq = list_entry(rq_list.next, struct request, queuelist);
+ while (!list_empty(&flush_list)) {
+ rq = list_entry(flush_list.next, struct request, queuelist);
list_del_init(&rq->queuelist);
blk_mq_insert_request(rq, 0);
}
@@ -1462,27 +1476,6 @@ static void blk_mq_requeue_work(struct work_struct *work)
blk_mq_run_hw_queues(q, false);
}
-void blk_mq_add_to_requeue_list(struct request *rq, blk_insert_t insert_flags)
-{
- struct request_queue *q = rq->q;
- unsigned long flags;
-
- /*
- * We abuse this flag that is otherwise used by the I/O scheduler to
- * request head insertion from the workqueue.
- */
- BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
-
- spin_lock_irqsave(&q->requeue_lock, flags);
- if (insert_flags & BLK_MQ_INSERT_AT_HEAD) {
- rq->rq_flags |= RQF_SOFTBARRIER;
- list_add(&rq->queuelist, &q->requeue_list);
- } else {
- list_add_tail(&rq->queuelist, &q->requeue_list);
- }
- spin_unlock_irqrestore(&q->requeue_lock, flags);
-}
-
void blk_mq_kick_requeue_list(struct request_queue *q)
{
kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
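
Note on the requeue changes above: blk_mq_add_to_requeue_list() and the RQF_SOFTBARRIER trick are gone. blk_mq_requeue_request() now links the request onto q->requeue_list directly under requeue_lock, and the worker splices both q->requeue_list and the new q->flush_list while holding the lock, then reinserts the entries without it. A standalone sketch of that splice-then-drain pattern, using a pthread mutex and a toy singly linked list as stand-ins for the kernel's spinlock and list_head:

#include <stdio.h>
#include <pthread.h>

struct toy_rq {
	int id;
	struct toy_rq *next;
};

struct toy_queue {
	pthread_mutex_t requeue_lock;
	struct toy_rq *requeue_list;
	struct toy_rq *flush_list;
};

static void toy_requeue(struct toy_queue *q, struct toy_rq *rq)
{
	pthread_mutex_lock(&q->requeue_lock);
	rq->next = q->requeue_list;	/* head insert keeps the sketch short */
	q->requeue_list = rq;
	pthread_mutex_unlock(&q->requeue_lock);
}

static void toy_requeue_work(struct toy_queue *q)
{
	struct toy_rq *rq_list, *flush_list, *rq;

	/* Take both lists in one critical section, as the worker does. */
	pthread_mutex_lock(&q->requeue_lock);
	rq_list = q->requeue_list;	q->requeue_list = NULL;
	flush_list = q->flush_list;	q->flush_list = NULL;
	pthread_mutex_unlock(&q->requeue_lock);

	/* Reinsert without the lock held. */
	for (rq = rq_list; rq; rq = rq->next)
		printf("reinsert requeued rq %d\n", rq->id);
	for (rq = flush_list; rq; rq = rq->next)
		printf("reinsert flush rq %d\n", rq->id);
}

int main(void)
{
	struct toy_queue q = { .requeue_lock = PTHREAD_MUTEX_INITIALIZER };
	struct toy_rq a = { .id = 1 }, b = { .id = 2 };

	toy_requeue(&q, &a);
	toy_requeue(&q, &b);
	toy_requeue_work(&q);
	return 0;
}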
@@ -2427,7 +2420,7 @@ static void blk_mq_run_work_fn(struct work_struct *work)
* Should only be used carefully, when the caller knows we want to
* bypass a potential IO scheduler on the target device.
*/
-void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags)
+static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags)
{
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
@@ -2492,7 +2485,7 @@ static void blk_mq_insert_request(struct request *rq, blk_insert_t flags)
* dispatch it given we prioritize requests in hctx->dispatch.
*/
blk_mq_request_bypass_insert(rq, flags);
- } else if (rq->rq_flags & RQF_FLUSH_SEQ) {
+ } else if (req_op(rq) == REQ_OP_FLUSH) {
/*
* Firstly normal IO request is inserted to scheduler queue or
* sw queue, meantime we add flush request to dispatch queue(
@@ -2622,7 +2615,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
return;
}
- if ((rq->rq_flags & RQF_ELV) || !blk_mq_get_budget_and_tag(rq)) {
+ if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) {
blk_mq_insert_request(rq, 0);
blk_mq_run_hw_queue(hctx, false);
return;
@@ -2711,6 +2704,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
struct request *requeue_list = NULL;
struct request **requeue_lastp = &requeue_list;
unsigned int depth = 0;
+ bool is_passthrough = false;
LIST_HEAD(list);
do {
@@ -2719,7 +2713,9 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
if (!this_hctx) {
this_hctx = rq->mq_hctx;
this_ctx = rq->mq_ctx;
- } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) {
+ is_passthrough = blk_rq_is_passthrough(rq);
+ } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx ||
+ is_passthrough != blk_rq_is_passthrough(rq)) {
rq_list_add_tail(&requeue_lastp, rq);
continue;
}
@@ -2731,7 +2727,13 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
trace_block_unplug(this_hctx->queue, depth, !from_sched);
percpu_ref_get(&this_hctx->queue->q_usage_counter);
- if (this_hctx->queue->elevator) {
+ /* passthrough requests should never be issued to the I/O scheduler */
+ if (is_passthrough) {
+ spin_lock(&this_hctx->lock);
+ list_splice_tail_init(&list, &this_hctx->dispatch);
+ spin_unlock(&this_hctx->lock);
+ blk_mq_run_hw_queue(this_hctx, from_sched);
+ } else if (this_hctx->queue->elevator) {
this_hctx->queue->elevator->type->ops.insert_requests(this_hctx,
&list, 0);
blk_mq_run_hw_queue(this_hctx, from_sched);
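
Note on the hunk above: the plug flush now also keys each batch on blk_rq_is_passthrough(), and a passthrough batch is spliced straight onto hctx->dispatch instead of being handed to the elevator. The sketch below models the grouping rule and the insertion choice; it walks the list sequentially rather than requeueing mismatched requests the way the real code does, and all toy_* names are invented:

#include <stdbool.h>
#include <stdio.h>

struct toy_rq {
	int hctx;		/* hardware queue index */
	int ctx;		/* software queue index */
	bool passthrough;
};

/* Where a homogeneous batch is inserted, mirroring the new branch order:
 * passthrough batches bypass the scheduler even when one is configured. */
static const char *toy_insert_path(bool batch_is_passthrough, bool has_elevator)
{
	if (batch_is_passthrough)
		return "hctx->dispatch (bypasses the scheduler)";
	if (has_elevator)
		return "elevator ->insert_requests()";
	return "per-ctx software queue";
}

int main(void)
{
	struct toy_rq plug[] = {
		{ .hctx = 0, .ctx = 0, .passthrough = false },
		{ .hctx = 0, .ctx = 0, .passthrough = false },
		{ .hctx = 0, .ctx = 0, .passthrough = true },	/* breaks the batch */
	};
	int n = sizeof(plug) / sizeof(plug[0]);
	int i, start = 0;

	for (i = 1; i <= n; i++) {
		/* A batch ends when hctx, ctx or passthrough-ness changes. */
		bool boundary = i == n ||
				plug[i].hctx != plug[start].hctx ||
				plug[i].ctx != plug[start].ctx ||
				plug[i].passthrough != plug[start].passthrough;

		if (!boundary)
			continue;
		printf("requests %d..%d -> %s\n", start, i - 1,
		       toy_insert_path(plug[start].passthrough, true));
		start = i;
	}
	return 0;
}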
@@ -2970,10 +2972,8 @@ void blk_mq_submit_bio(struct bio *bio)
return;
}
- if (op_is_flush(bio->bi_opf)) {
- blk_insert_flush(rq);
+ if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
return;
- }
if (plug) {
blk_add_rq_to_plug(plug, rq);
@@ -2981,7 +2981,7 @@ void blk_mq_submit_bio(struct bio *bio)
}
hctx = rq->mq_hctx;
- if ((rq->rq_flags & RQF_ELV) ||
+ if ((rq->rq_flags & RQF_USE_SCHED) ||
(hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) {
blk_mq_insert_request(rq, 0);
blk_mq_run_hw_queue(hctx, true);
@@ -4232,6 +4232,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
blk_mq_update_poll_flag(q);
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
+ INIT_LIST_HEAD(&q->flush_list);
INIT_LIST_HEAD(&q->requeue_list);
spin_lock_init(&q->requeue_lock);
@@ -4608,9 +4609,6 @@ static bool blk_mq_elv_switch_none(struct list_head *head,
{
struct blk_mq_qe_pair *qe;
- if (!q->elevator)
- return true;
-
qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
if (!qe)
return false;
@@ -4618,6 +4616,12 @@ static bool blk_mq_elv_switch_none(struct list_head *head,
/* q->elevator needs protection from ->sysfs_lock */
mutex_lock(&q->sysfs_lock);
+ /* the check has to be done with holding sysfs_lock */
+ if (!q->elevator) {
+ kfree(qe);
+ goto unlock;
+ }
+
INIT_LIST_HEAD(&qe->node);
qe->q = q;
qe->type = q->elevator->type;
@@ -4625,6 +4629,7 @@ static bool blk_mq_elv_switch_none(struct list_head *head,
__elevator_get(qe->type);
list_add(&qe->node, head);
elevator_disable(q);
+unlock:
mutex_unlock(&q->sysfs_lock);
return true;
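
Note on the last two hunks: blk_mq_elv_switch_none() now allocates its tracking entry first and only checks q->elevator after taking sysfs_lock, freeing the allocation and returning success when no elevator is attached, since the answer is only stable while the lock is held. A generic sketch of that allocate-then-recheck-under-lock pattern (all names invented, pthread mutex standing in for the kernel mutex):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

struct toy_entry { int unused; };

struct toy_queue {
	pthread_mutex_t sysfs_lock;
	bool elevator_attached;		/* may be changed by concurrent switches */
};

/* Allocate outside the lock (allocations may sleep or fail), but only trust
 * the "is an elevator attached?" check while sysfs_lock is held. */
static bool toy_switch_none(struct toy_queue *q, struct toy_entry **saved)
{
	struct toy_entry *qe = malloc(sizeof(*qe));

	if (!qe)
		return false;

	pthread_mutex_lock(&q->sysfs_lock);
	if (!q->elevator_attached) {
		free(qe);		/* nothing to switch away from */
		goto unlock;
	}
	*saved = qe;			/* stands in for list_add(&qe->node, head) */
	q->elevator_attached = false;	/* stands in for elevator_disable(q) */
unlock:
	pthread_mutex_unlock(&q->sysfs_lock);
	return true;
}

int main(void)
{
	struct toy_queue q = {
		.sysfs_lock = PTHREAD_MUTEX_INITIALIZER,
		.elevator_attached = true,
	};
	struct toy_entry *saved = NULL;

	printf("switch: %s, saved entry: %s\n",
	       toy_switch_none(&q, &saved) ? "ok" : "enomem",
	       saved ? "recorded" : "none");
	free(saved);
	return 0;
}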