author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-11-01 17:19:50 +0100 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-11-01 17:19:50 +0100 |
commit | 33c8846c814c1c27c6e33af005042d15061f948b (patch) | |
tree | da7c105b61758094d1d55ec1326ff28b521dbe9e /block/blk-mq.c | |
parent | Merge tag 'locks-v5.16' of git://git.kernel.org/pub/scm/linux/kernel/git/jlay... (diff) | |
parent | blk-mq-debugfs: Show active requests per queue for shared tags (diff) | |
Merge tag 'for-5.16/block-2021-10-29' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
- mq-deadline accounting improvements (Bart)
- blk-wbt timer fix (Andrea)
- Untangle the block layer includes (Christoph)
- Rework the poll support to be bio based, which will enable adding
support for polling for bio based drivers (Christoph)
- Block layer core support for multi-actuator drives (Damien)
- blk-crypto improvements (Eric)
- Batched tag allocation support (me)
- Request completion batching support (me)
- Plugging improvements (me)
- Shared tag set improvements (John)
- Concurrent queue quiesce support (Ming)
- Cache bdev in ->private_data for block devices (Pavel)
- bdev dio improvements (Pavel)
- Block device invalidation and block size improvements (Xie)
- Various cleanups, fixes, and improvements (Christoph, Jackie,
Masahira, Tejun, Yu, Pavel, Zheng, me)
* tag 'for-5.16/block-2021-10-29' of git://git.kernel.dk/linux-block: (174 commits)
blk-mq-debugfs: Show active requests per queue for shared tags
block: improve readability of blk_mq_end_request_batch()
virtio-blk: Use blk_validate_block_size() to validate block size
loop: Use blk_validate_block_size() to validate block size
nbd: Use blk_validate_block_size() to validate block size
block: Add a helper to validate the block size
block: re-flow blk_mq_rq_ctx_init()
block: prefetch request to be initialized
block: pass in blk_mq_tags to blk_mq_rq_ctx_init()
block: add rq_flags to struct blk_mq_alloc_data
block: add async version of bio_set_polled
block: kill DIO_MULTI_BIO
block: kill unused polling bits in __blkdev_direct_IO()
block: avoid extra iter advance with async iocb
block: Add independent access ranges support
blk-mq: don't issue request directly in case that current is to be blocked
sbitmap: silence data race warning
blk-cgroup: synchronize blkg creation against policy deactivation
block: refactor bio_iov_bvec_set()
block: add single bio async direct IO helper
...
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r-- | block/blk-mq.c | 1034 |
1 file changed, 668 insertions, 366 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c index 652a31fc3bb3..07eb1412760b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -10,14 +10,15 @@ #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/blk-integrity.h> #include <linux/kmemleak.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/smp.h> +#include <linux/interrupt.h> #include <linux/llist.h> -#include <linux/list_sort.h> #include <linux/cpu.h> #include <linux/cache.h> #include <linux/sched/sysctl.h> @@ -63,6 +64,32 @@ static int blk_mq_poll_stats_bkt(const struct request *rq) return bucket; } +#define BLK_QC_T_SHIFT 16 +#define BLK_QC_T_INTERNAL (1U << 31) + +static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q, + blk_qc_t qc) +{ + return q->queue_hw_ctx[(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT]; +} + +static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx, + blk_qc_t qc) +{ + unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1); + + if (qc & BLK_QC_T_INTERNAL) + return blk_mq_tag_to_rq(hctx->sched_tags, tag); + return blk_mq_tag_to_rq(hctx->tags, tag); +} + +static inline blk_qc_t blk_rq_to_qc(struct request *rq) +{ + return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) | + (rq->tag != -1 ? + rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL)); +} + /* * Check if any of the ctx, dispatch list or elevator * have pending work in this hardware queue. @@ -214,7 +241,12 @@ EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); */ void blk_mq_quiesce_queue_nowait(struct request_queue *q) { - blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q); + unsigned long flags; + + spin_lock_irqsave(&q->queue_lock, flags); + if (!q->quiesce_depth++) + blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q); + spin_unlock_irqrestore(&q->queue_lock, flags); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); @@ -255,10 +287,21 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); */ void blk_mq_unquiesce_queue(struct request_queue *q) { - blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); + unsigned long flags; + bool run_queue = false; + + spin_lock_irqsave(&q->queue_lock, flags); + if (WARN_ON_ONCE(q->quiesce_depth <= 0)) { + ; + } else if (!--q->quiesce_depth) { + blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); + run_queue = true; + } + spin_unlock_irqrestore(&q->queue_lock, flags); /* dispatch requests which are inserted during quiescing */ - blk_mq_run_hw_queues(q, true); + if (run_queue) + blk_mq_run_hw_queues(q, true); } EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); @@ -272,74 +315,67 @@ void blk_mq_wake_waiters(struct request_queue *q) blk_mq_tag_wakeup_all(hctx->tags, true); } -/* - * Only need start/end time stamping if we have iostat or - * blk stats enabled, or using an IO scheduler. 
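The blk_qc_to_hctx()/blk_qc_to_rq()/blk_rq_to_qc() helpers added in the hunk above define the new poll-cookie layout: the hardware queue number sits above BLK_QC_T_SHIFT, the tag occupies the low 16 bits, and bit 31 (BLK_QC_T_INTERNAL) marks a scheduler-internal tag. The following stand-alone userspace sketch only mirrors that bit layout; the function names and the main() driver are invented for illustration.

/* Sketch of the blk_qc_t bit layout used above:
 * bit  31    : "internal tag" flag (scheduler tag vs. driver tag)
 * bits 30..16: hardware queue index
 * bits 15..0 : tag number
 * Everything except the bit layout is invented for illustration.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define QC_T_SHIFT    16
#define QC_T_INTERNAL (1U << 31)

static uint32_t pack_qc(unsigned int hctx_idx, unsigned int tag, int internal)
{
	uint32_t qc = (hctx_idx << QC_T_SHIFT) | (tag & ((1U << QC_T_SHIFT) - 1));

	return internal ? (qc | QC_T_INTERNAL) : qc;
}

static void unpack_qc(uint32_t qc, unsigned int *hctx_idx,
		      unsigned int *tag, int *internal)
{
	*internal = !!(qc & QC_T_INTERNAL);
	*hctx_idx = (qc & ~QC_T_INTERNAL) >> QC_T_SHIFT;
	*tag = qc & ((1U << QC_T_SHIFT) - 1);
}

int main(void)
{
	unsigned int hctx, tag;
	int internal;
	uint32_t qc = pack_qc(3, 42, 1);

	unpack_qc(qc, &hctx, &tag, &internal);
	assert(hctx == 3 && tag == 42 && internal == 1);
	printf("qc=%#x -> hctx=%u tag=%u internal=%d\n",
	       (unsigned int)qc, hctx, tag, internal);
	return 0;
}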
- */ -static inline bool blk_mq_need_time_stamp(struct request *rq) -{ - return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator; -} - static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, - unsigned int tag, u64 alloc_time_ns) + struct blk_mq_tags *tags, unsigned int tag, u64 alloc_time_ns) { - struct blk_mq_tags *tags = blk_mq_tags_from_data(data); + struct blk_mq_ctx *ctx = data->ctx; + struct blk_mq_hw_ctx *hctx = data->hctx; + struct request_queue *q = data->q; struct request *rq = tags->static_rqs[tag]; - if (data->q->elevator) { - rq->tag = BLK_MQ_NO_TAG; - rq->internal_tag = tag; - } else { + rq->q = q; + rq->mq_ctx = ctx; + rq->mq_hctx = hctx; + rq->cmd_flags = data->cmd_flags; + + if (data->flags & BLK_MQ_REQ_PM) + data->rq_flags |= RQF_PM; + if (blk_queue_io_stat(q)) + data->rq_flags |= RQF_IO_STAT; + rq->rq_flags = data->rq_flags; + + if (!(data->rq_flags & RQF_ELV)) { rq->tag = tag; rq->internal_tag = BLK_MQ_NO_TAG; + } else { + rq->tag = BLK_MQ_NO_TAG; + rq->internal_tag = tag; } + rq->timeout = 0; - /* csd/requeue_work/fifo_time is initialized before use */ - rq->q = data->q; - rq->mq_ctx = data->ctx; - rq->mq_hctx = data->hctx; - rq->rq_flags = 0; - rq->cmd_flags = data->cmd_flags; - if (data->flags & BLK_MQ_REQ_PM) - rq->rq_flags |= RQF_PM; - if (blk_queue_io_stat(data->q)) - rq->rq_flags |= RQF_IO_STAT; - INIT_LIST_HEAD(&rq->queuelist); - INIT_HLIST_NODE(&rq->hash); - RB_CLEAR_NODE(&rq->rb_node); + if (blk_mq_need_time_stamp(rq)) + rq->start_time_ns = ktime_get_ns(); + else + rq->start_time_ns = 0; rq->rq_disk = NULL; rq->part = NULL; #ifdef CONFIG_BLK_RQ_ALLOC_TIME rq->alloc_time_ns = alloc_time_ns; #endif - if (blk_mq_need_time_stamp(rq)) - rq->start_time_ns = ktime_get_ns(); - else - rq->start_time_ns = 0; rq->io_start_time_ns = 0; rq->stats_sectors = 0; rq->nr_phys_segments = 0; #if defined(CONFIG_BLK_DEV_INTEGRITY) rq->nr_integrity_segments = 0; #endif - blk_crypto_rq_set_defaults(rq); - /* tag was already set */ - WRITE_ONCE(rq->deadline, 0); - - rq->timeout = 0; - rq->end_io = NULL; rq->end_io_data = NULL; - data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++; + blk_crypto_rq_set_defaults(rq); + INIT_LIST_HEAD(&rq->queuelist); + /* tag was already set */ + WRITE_ONCE(rq->deadline, 0); refcount_set(&rq->ref, 1); - if (!op_is_flush(data->cmd_flags)) { + if (rq->rq_flags & RQF_ELV) { struct elevator_queue *e = data->q->elevator; rq->elv.icq = NULL; - if (e && e->type->ops.prepare_request) { + INIT_HLIST_NODE(&rq->hash); + RB_CLEAR_NODE(&rq->rb_node); + + if (!op_is_flush(data->cmd_flags) && + e->type->ops.prepare_request) { if (e->type->icq_cache) blk_mq_sched_assign_ioc(rq); @@ -348,15 +384,44 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, } } - data->hctx->queued++; return rq; } -static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data) +static inline struct request * +__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data, + u64 alloc_time_ns) +{ + unsigned int tag, tag_offset; + struct blk_mq_tags *tags; + struct request *rq; + unsigned long tag_mask; + int i, nr = 0; + + tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset); + if (unlikely(!tag_mask)) + return NULL; + + tags = blk_mq_tags_from_data(data); + for (i = 0; tag_mask; i++) { + if (!(tag_mask & (1UL << i))) + continue; + prefetch(tags->static_rqs[tag]); + tag = tag_offset + i; + tag_mask &= ~(1UL << i); + rq = blk_mq_rq_ctx_init(data, tags, tag, alloc_time_ns); + rq_list_add(data->cached_rq, rq); + } + 
data->nr_tags -= nr; + + return rq_list_pop(data->cached_rq); +} + +static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) { struct request_queue *q = data->q; struct elevator_queue *e = q->elevator; u64 alloc_time_ns = 0; + struct request *rq; unsigned int tag; /* alloc_time includes depth and tag waits */ @@ -386,6 +451,16 @@ retry: blk_mq_tag_busy(data->hctx); /* + * Try batched alloc if we want more than 1 tag. + */ + if (data->nr_tags > 1) { + rq = __blk_mq_alloc_requests_batch(data, alloc_time_ns); + if (rq) + return rq; + data->nr_tags = 1; + } + + /* * Waiting allocations only fail because of an inactive hctx. In that * case just retry the hctx assignment and tag allocation as CPU hotplug * should have migrated us to an online CPU by now. @@ -394,16 +469,18 @@ retry: if (tag == BLK_MQ_NO_TAG) { if (data->flags & BLK_MQ_REQ_NOWAIT) return NULL; - /* - * Give up the CPU and sleep for a random short time to ensure - * that thread using a realtime scheduling class are migrated - * off the CPU, and thus off the hctx that is going away. + * Give up the CPU and sleep for a random short time to + * ensure that thread using a realtime scheduling class + * are migrated off the CPU, and thus off the hctx that + * is going away. */ msleep(3); goto retry; } - return blk_mq_rq_ctx_init(data, tag, alloc_time_ns); + + return blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag, + alloc_time_ns); } struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, @@ -413,6 +490,8 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, .q = q, .flags = flags, .cmd_flags = op, + .rq_flags = q->elevator ? RQF_ELV : 0, + .nr_tags = 1, }; struct request *rq; int ret; @@ -421,7 +500,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, if (ret) return ERR_PTR(ret); - rq = __blk_mq_alloc_request(&data); + rq = __blk_mq_alloc_requests(&data); if (!rq) goto out_queue_exit; rq->__data_len = 0; @@ -441,6 +520,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, .q = q, .flags = flags, .cmd_flags = op, + .rq_flags = q->elevator ? 
RQF_ELV : 0, + .nr_tags = 1, }; u64 alloc_time_ns = 0; unsigned int cpu; @@ -485,7 +566,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, tag = blk_mq_get_tag(&data); if (tag == BLK_MQ_NO_TAG) goto out_queue_exit; - return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns); + return blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag, + alloc_time_ns); out_queue_exit: blk_queue_exit(q); @@ -514,12 +596,12 @@ static void __blk_mq_free_request(struct request *rq) void blk_mq_free_request(struct request *rq) { struct request_queue *q = rq->q; - struct elevator_queue *e = q->elevator; - struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; if (rq->rq_flags & RQF_ELVPRIV) { - if (e && e->type->ops.finish_request) + struct elevator_queue *e = q->elevator; + + if (e->type->ops.finish_request) e->type->ops.finish_request(rq); if (rq->elv.icq) { put_io_context(rq->elv.icq->ioc); @@ -527,7 +609,6 @@ void blk_mq_free_request(struct request *rq) } } - ctx->rq_completed[rq_is_sync(rq)]++; if (rq->rq_flags & RQF_MQ_INFLIGHT) __blk_mq_dec_active_requests(hctx); @@ -542,21 +623,173 @@ void blk_mq_free_request(struct request *rq) } EXPORT_SYMBOL_GPL(blk_mq_free_request); -inline void __blk_mq_end_request(struct request *rq, blk_status_t error) +void blk_mq_free_plug_rqs(struct blk_plug *plug) { - u64 now = 0; + struct request *rq; - if (blk_mq_need_time_stamp(rq)) - now = ktime_get_ns(); + while ((rq = rq_list_pop(&plug->cached_rq)) != NULL) { + percpu_ref_get(&rq->q->q_usage_counter); + blk_mq_free_request(rq); + } +} +static void req_bio_endio(struct request *rq, struct bio *bio, + unsigned int nbytes, blk_status_t error) +{ + if (unlikely(error)) { + bio->bi_status = error; + } else if (req_op(rq) == REQ_OP_ZONE_APPEND) { + /* + * Partial zone append completions cannot be supported as the + * BIO fragments may end up not being written sequentially. + */ + if (bio->bi_iter.bi_size != nbytes) + bio->bi_status = BLK_STS_IOERR; + else + bio->bi_iter.bi_sector = rq->__sector; + } + + bio_advance(bio, nbytes); + + if (unlikely(rq->rq_flags & RQF_QUIET)) + bio_set_flag(bio, BIO_QUIET); + /* don't actually finish bio if it's part of flush sequence */ + if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) + bio_endio(bio); +} + +static void blk_account_io_completion(struct request *req, unsigned int bytes) +{ + if (req->part && blk_do_io_stat(req)) { + const int sgrp = op_stat_group(req_op(req)); + + part_stat_lock(); + part_stat_add(req->part, sectors[sgrp], bytes >> 9); + part_stat_unlock(); + } +} + +/** + * blk_update_request - Complete multiple bytes without completing the request + * @req: the request being processed + * @error: block status code + * @nr_bytes: number of bytes to complete for @req + * + * Description: + * Ends I/O on a number of bytes attached to @req, but doesn't complete + * the request structure even if @req doesn't have leftover. + * If @req has leftover, sets it up for the next range of segments. + * + * Passing the result of blk_rq_bytes() as @nr_bytes guarantees + * %false return from this function. + * + * Note: + * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function + * except in the consistency check at the end of this function. 
+ * + * Return: + * %false - this request doesn't have any more data + * %true - this request has more data + **/ +bool blk_update_request(struct request *req, blk_status_t error, + unsigned int nr_bytes) +{ + int total_bytes; + + trace_block_rq_complete(req, error, nr_bytes); + + if (!req->bio) + return false; + +#ifdef CONFIG_BLK_DEV_INTEGRITY + if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && + error == BLK_STS_OK) + req->q->integrity.profile->complete_fn(req, nr_bytes); +#endif + + if (unlikely(error && !blk_rq_is_passthrough(req) && + !(req->rq_flags & RQF_QUIET))) + blk_print_req_error(req, error); + + blk_account_io_completion(req, nr_bytes); + + total_bytes = 0; + while (req->bio) { + struct bio *bio = req->bio; + unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); + + if (bio_bytes == bio->bi_iter.bi_size) + req->bio = bio->bi_next; + + /* Completion has already been traced */ + bio_clear_flag(bio, BIO_TRACE_COMPLETION); + req_bio_endio(req, bio, bio_bytes, error); + + total_bytes += bio_bytes; + nr_bytes -= bio_bytes; + + if (!nr_bytes) + break; + } + + /* + * completely done + */ + if (!req->bio) { + /* + * Reset counters so that the request stacking driver + * can find how many bytes remain in the request + * later. + */ + req->__data_len = 0; + return false; + } + + req->__data_len -= total_bytes; + + /* update sector only for requests with clear definition of sector */ + if (!blk_rq_is_passthrough(req)) + req->__sector += total_bytes >> 9; + + /* mixed attributes always follow the first bio */ + if (req->rq_flags & RQF_MIXED_MERGE) { + req->cmd_flags &= ~REQ_FAILFAST_MASK; + req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; + } + + if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) { + /* + * If total number of sectors is less than the first segment + * size, something has gone terribly wrong. 
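blk_update_request() above walks the request's bio chain from the front, retires nr_bytes from it, ends each bio that is fully consumed, and leaves a partially consumed bio at the head so the request can be re-driven for the remainder. Below is a minimal userspace sketch of that front-consumption accounting over a singly linked segment list; the struct and names are invented, only the pattern follows the kernel function.

/* Sketch of partial-completion accounting: consume nbytes from the front of
 * a segment list, "ending" each segment that is fully consumed. Invented
 * types and names; the pattern mirrors blk_update_request() above.
 */
#include <stdbool.h>
#include <stdio.h>

struct seg {
	unsigned int size;	/* bytes still outstanding in this segment */
	struct seg *next;
};

/* Returns true if the request still has data left after the update. */
static bool update_request(struct seg **head, unsigned int nbytes)
{
	while (*head) {
		struct seg *s = *head;
		unsigned int n = s->size < nbytes ? s->size : nbytes;

		s->size -= n;
		nbytes -= n;
		if (!s->size) {
			printf("segment done\n");	/* bio_endio() stand-in */
			*head = s->next;
		}
		if (!nbytes)
			break;
	}
	return *head != NULL;
}

int main(void)
{
	struct seg c = { 4096, NULL }, b = { 4096, &c }, a = { 4096, &b };
	struct seg *head = &a;

	/* Complete 6 KiB: segment a ends, b is left half consumed. */
	printf("more data: %d\n", update_request(&head, 6144));
	/* Complete the rest. */
	printf("more data: %d\n", update_request(&head, 4096 + 2048));
	return 0;
}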
+ */ + if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { + blk_dump_rq_flags(req, "request botched"); + req->__data_len = blk_rq_cur_bytes(req); + } + + /* recalculate the number of segments */ + req->nr_phys_segments = blk_recalc_rq_segments(req); + } + + return true; +} +EXPORT_SYMBOL_GPL(blk_update_request); + +static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) +{ if (rq->rq_flags & RQF_STATS) { blk_mq_poll_stats_start(rq->q); blk_stat_add(rq, now); } blk_mq_sched_completed_request(rq, now); - blk_account_io_done(rq, now); +} + +inline void __blk_mq_end_request(struct request *rq, blk_status_t error) +{ + if (blk_mq_need_time_stamp(rq)) + __blk_mq_end_request_acct(rq, ktime_get_ns()); if (rq->end_io) { rq_qos_done(rq->q, rq); @@ -575,6 +808,57 @@ void blk_mq_end_request(struct request *rq, blk_status_t error) } EXPORT_SYMBOL(blk_mq_end_request); +#define TAG_COMP_BATCH 32 + +static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx, + int *tag_array, int nr_tags) +{ + struct request_queue *q = hctx->queue; + + blk_mq_put_tags(hctx->tags, tag_array, nr_tags); + percpu_ref_put_many(&q->q_usage_counter, nr_tags); +} + +void blk_mq_end_request_batch(struct io_comp_batch *iob) +{ + int tags[TAG_COMP_BATCH], nr_tags = 0; + struct blk_mq_hw_ctx *cur_hctx = NULL; + struct request *rq; + u64 now = 0; + + if (iob->need_ts) + now = ktime_get_ns(); + + while ((rq = rq_list_pop(&iob->req_list)) != NULL) { + prefetch(rq->bio); + prefetch(rq->rq_next); + + blk_update_request(rq, BLK_STS_OK, blk_rq_bytes(rq)); + if (iob->need_ts) + __blk_mq_end_request_acct(rq, now); + + WRITE_ONCE(rq->state, MQ_RQ_IDLE); + if (!refcount_dec_and_test(&rq->ref)) + continue; + + blk_crypto_free_request(rq); + blk_pm_mark_last_busy(rq); + rq_qos_done(rq->q, rq); + + if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) { + if (cur_hctx) + blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); + nr_tags = 0; + cur_hctx = rq->mq_hctx; + } + tags[nr_tags++] = rq->tag; + } + + if (nr_tags) + blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); +} +EXPORT_SYMBOL_GPL(blk_mq_end_request_batch); + static void blk_complete_reqs(struct llist_head *list) { struct llist_node *entry = llist_reverse_order(llist_del_all(list)); @@ -658,7 +942,7 @@ bool blk_mq_complete_request_remote(struct request *rq) * For a polled request, always complete locallly, it's pointless * to redirect the completion. 
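blk_mq_end_request_batch() above amortizes tag freeing by collecting up to TAG_COMP_BATCH tag numbers in a local array and releasing them in one call whenever the array fills or the next request belongs to a different hardware queue. The self-contained sketch below shows that accumulate-and-flush pattern; an integer "owner" stands in for the hctx pointer and all names are illustrative.

/* Accumulate-and-flush pattern from blk_mq_end_request_batch(): gather tags
 * in a small array and release them in one shot when the array is full or
 * the owning queue changes. All names are illustrative.
 */
#include <stdio.h>

#define TAG_BATCH 32

struct item { int owner; int tag; };

static void flush_tags(int owner, const int *tags, int nr)
{
	printf("owner %d: releasing %d tags in one call\n", owner, nr);
}

static void complete_batch(const struct item *items, int nr_items)
{
	int tags[TAG_BATCH], nr_tags = 0;
	int cur_owner = -1;

	for (int i = 0; i < nr_items; i++) {
		const struct item *it = &items[i];

		/* per-item completion work would happen here */

		if (nr_tags == TAG_BATCH || (nr_tags && cur_owner != it->owner)) {
			flush_tags(cur_owner, tags, nr_tags);
			nr_tags = 0;
		}
		cur_owner = it->owner;
		tags[nr_tags++] = it->tag;
	}
	if (nr_tags)
		flush_tags(cur_owner, tags, nr_tags);
}

int main(void)
{
	struct item items[] = {
		{ 0, 1 }, { 0, 2 }, { 0, 3 },	/* same queue: one flush */
		{ 1, 7 }, { 1, 9 },		/* queue change: forces a flush */
	};

	complete_batch(items, sizeof(items) / sizeof(items[0]));
	return 0;
}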
*/ - if (rq->cmd_flags & REQ_HIPRI) + if (rq->cmd_flags & REQ_POLLED) return false; if (blk_mq_complete_need_ipi(rq)) { @@ -723,7 +1007,14 @@ void blk_mq_start_request(struct request *rq) trace_block_rq_issue(rq); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { - rq->io_start_time_ns = ktime_get_ns(); + u64 start_time; +#ifdef CONFIG_BLK_CGROUP + if (rq->bio) + start_time = bio_issue_time(&rq->bio->bi_issue); + else +#endif + start_time = ktime_get_ns(); + rq->io_start_time_ns = start_time; rq->stats_sectors = blk_rq_sectors(rq); rq->rq_flags |= RQF_STATS; rq_qos_issue(q, rq); @@ -738,6 +1029,8 @@ void blk_mq_start_request(struct request *rq) if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) q->integrity.profile->prepare_fn(rq); #endif + if (rq->bio && rq->bio->bi_opf & REQ_POLLED) + WRITE_ONCE(rq->bio->bi_cookie, blk_rq_to_qc(rq)); } EXPORT_SYMBOL(blk_mq_start_request); @@ -763,7 +1056,6 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) /* this request will be re-inserted to io scheduler queue */ blk_mq_sched_requeue_request(rq); - BUG_ON(!list_empty(&rq->queuelist)); blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); } EXPORT_SYMBOL(blk_mq_requeue_request); @@ -844,17 +1136,6 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q, } EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); -struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) -{ - if (tag < tags->nr_tags) { - prefetch(tags->rqs[tag]); - return tags->rqs[tag]; - } - - return NULL; -} -EXPORT_SYMBOL(blk_mq_tag_to_rq); - static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { @@ -1059,24 +1340,16 @@ struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, return data.rq; } -static inline unsigned int queued_to_index(unsigned int queued) -{ - if (!queued) - return 0; - - return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); -} - -static bool __blk_mq_get_driver_tag(struct request *rq) +static bool __blk_mq_alloc_driver_tag(struct request *rq) { - struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags; + struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; int tag; blk_mq_tag_busy(rq->mq_hctx); if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { - bt = rq->mq_hctx->tags->breserved_tags; + bt = &rq->mq_hctx->tags->breserved_tags; tag_offset = 0; } else { if (!hctx_may_queue(rq->mq_hctx, bt)) @@ -1091,11 +1364,9 @@ static bool __blk_mq_get_driver_tag(struct request *rq) return true; } -bool blk_mq_get_driver_tag(struct request *rq) +bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - - if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq)) + if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq)) return false; if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) && @@ -1119,7 +1390,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, struct sbitmap_queue *sbq; list_del_init(&wait->entry); - sbq = hctx->tags->bitmap_tags; + sbq = &hctx->tags->bitmap_tags; atomic_dec(&sbq->ws_active); } spin_unlock(&hctx->dispatch_wait_lock); @@ -1137,7 +1408,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct sbitmap_queue *sbq = hctx->tags->bitmap_tags; + struct sbitmap_queue *sbq = 
&hctx->tags->bitmap_tags; struct wait_queue_head *wq; wait_queue_entry_t *wait; bool ret; @@ -1394,8 +1665,6 @@ out: if (!list_empty(&zone_list)) list_splice_tail_init(&zone_list, list); - hctx->dispatched[queued_to_index(queued)]++; - /* If we didn't flush the entire list, we could have told the driver * there was more coming, but that turned out to be a lie. */ @@ -1899,54 +2168,106 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, spin_unlock(&ctx->lock); } -static int plug_rq_cmp(void *priv, const struct list_head *a, - const struct list_head *b) +static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int *queued, + bool from_schedule) { - struct request *rqa = container_of(a, struct request, queuelist); - struct request *rqb = container_of(b, struct request, queuelist); + if (hctx->queue->mq_ops->commit_rqs) { + trace_block_unplug(hctx->queue, *queued, !from_schedule); + hctx->queue->mq_ops->commit_rqs(hctx); + } + *queued = 0; +} - if (rqa->mq_ctx != rqb->mq_ctx) - return rqa->mq_ctx > rqb->mq_ctx; - if (rqa->mq_hctx != rqb->mq_hctx) - return rqa->mq_hctx > rqb->mq_hctx; +static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule) +{ + struct blk_mq_hw_ctx *hctx = NULL; + struct request *rq; + int queued = 0; + int errors = 0; - return blk_rq_pos(rqa) > blk_rq_pos(rqb); + while ((rq = rq_list_pop(&plug->mq_list))) { + bool last = rq_list_empty(plug->mq_list); + blk_status_t ret; + + if (hctx != rq->mq_hctx) { + if (hctx) + blk_mq_commit_rqs(hctx, &queued, from_schedule); + hctx = rq->mq_hctx; + } + + ret = blk_mq_request_issue_directly(rq, last); + switch (ret) { + case BLK_STS_OK: + queued++; + break; + case BLK_STS_RESOURCE: + case BLK_STS_DEV_RESOURCE: + blk_mq_request_bypass_insert(rq, false, last); + blk_mq_commit_rqs(hctx, &queued, from_schedule); + return; + default: + blk_mq_end_request(rq, ret); + errors++; + break; + } + } + + /* + * If we didn't flush the entire list, we could have told the driver + * there was more coming, but that turned out to be a lie. 
+ */ + if (errors) + blk_mq_commit_rqs(hctx, &queued, from_schedule); } void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { + struct blk_mq_hw_ctx *this_hctx; + struct blk_mq_ctx *this_ctx; + unsigned int depth; LIST_HEAD(list); - if (list_empty(&plug->mq_list)) + if (rq_list_empty(plug->mq_list)) return; - list_splice_init(&plug->mq_list, &list); - - if (plug->rq_count > 2 && plug->multiple_queues) - list_sort(NULL, &list, plug_rq_cmp); - plug->rq_count = 0; + if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { + blk_mq_plug_issue_direct(plug, from_schedule); + if (rq_list_empty(plug->mq_list)) + return; + } + + this_hctx = NULL; + this_ctx = NULL; + depth = 0; do { - struct list_head rq_list; - struct request *rq, *head_rq = list_entry_rq(list.next); - struct list_head *pos = &head_rq->queuelist; /* skip first */ - struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx; - struct blk_mq_ctx *this_ctx = head_rq->mq_ctx; - unsigned int depth = 1; - - list_for_each_continue(pos, &list) { - rq = list_entry_rq(pos); - BUG_ON(!rq->q); - if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) - break; - depth++; + struct request *rq; + + rq = rq_list_pop(&plug->mq_list); + + if (!this_hctx) { + this_hctx = rq->mq_hctx; + this_ctx = rq->mq_ctx; + } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) { + trace_block_unplug(this_hctx->queue, depth, + !from_schedule); + blk_mq_sched_insert_requests(this_hctx, this_ctx, + &list, from_schedule); + depth = 0; + this_hctx = rq->mq_hctx; + this_ctx = rq->mq_ctx; + } - list_cut_before(&rq_list, &list, pos); - trace_block_unplug(head_rq->q, depth, !from_schedule); - blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list, + list_add(&rq->queuelist, &list); + depth++; + } while (!rq_list_empty(plug->mq_list)); + + if (!list_empty(&list)) { + trace_block_unplug(this_hctx->queue, depth, !from_schedule); + blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, from_schedule); - } while(!list_empty(&list)); + } } static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, @@ -1969,19 +2290,15 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, } static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, - blk_qc_t *cookie, bool last) + struct request *rq, bool last) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { .rq = rq, .last = last, }; - blk_qc_t new_cookie; blk_status_t ret; - new_cookie = request_to_qc_t(hctx, rq); - /* * For OK queue, we are done. For error, caller may kill it. 
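The reworked blk_mq_flush_plug_list() above no longer list_sort()s the plug list; it drains the singly linked list and hands off each run of consecutive requests that share the same (hctx, ctx) pair in a single blk_mq_sched_insert_requests() call. Below is a userspace sketch of batching runs while draining a list; an integer key stands in for the hctx/ctx pair and all names are invented.

/* Drain a singly linked list and dispatch runs of consecutive entries that
 * share the same key in one call, as blk_mq_flush_plug_list() now does for
 * requests sharing an (hctx, ctx) pair. Names are invented for illustration.
 */
#include <stdio.h>

struct node { int key; int id; struct node *next; };

static struct node *list_pop(struct node **head)
{
	struct node *n = *head;

	if (n)
		*head = n->next;
	return n;
}

static void insert_batch(int key, int depth)
{
	printf("key %d: inserting a batch of %d entries\n", key, depth);
}

static void flush_list(struct node **head)
{
	int cur_key = 0, depth = 0;

	while (*head) {
		struct node *n = list_pop(head);

		if (depth && n->key != cur_key) {
			insert_batch(cur_key, depth);	/* hand off previous run */
			depth = 0;
		}
		cur_key = n->key;
		depth++;	/* n would be appended to the pending batch here */
	}
	if (depth)
		insert_batch(cur_key, depth);
}

int main(void)
{
	struct node e = { 2, 5, NULL }, d = { 2, 4, &e }, c = { 1, 3, &d };
	struct node b = { 1, 2, &c }, a = { 1, 1, &b };
	struct node *plug = &a;

	flush_list(&plug);	/* -> batch of 3 for key 1, batch of 2 for key 2 */
	return 0;
}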
* Any other error (busy), just add it to our list as we @@ -1991,7 +2308,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, switch (ret) { case BLK_STS_OK: blk_mq_update_dispatch_busy(hctx, false); - *cookie = new_cookie; break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: @@ -2000,7 +2316,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, break; default: blk_mq_update_dispatch_busy(hctx, false); - *cookie = BLK_QC_T_NONE; break; } @@ -2009,7 +2324,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, - blk_qc_t *cookie, bool bypass_insert, bool last) { struct request_queue *q = rq->q; @@ -2029,7 +2343,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, goto insert; } - if (q->elevator && !bypass_insert) + if ((rq->rq_flags & RQF_ELV) && !bypass_insert) goto insert; budget_token = blk_mq_get_dispatch_budget(q); @@ -2043,7 +2357,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, goto insert; } - return __blk_mq_issue_directly(hctx, rq, cookie, last); + return __blk_mq_issue_directly(hctx, rq, last); insert: if (bypass_insert) return BLK_STS_RESOURCE; @@ -2057,7 +2371,6 @@ insert: * blk_mq_try_issue_directly - Try to send a request directly to device driver. * @hctx: Pointer of the associated hardware queue. * @rq: Pointer to request to be sent. - * @cookie: Request queue cookie. * * If the device has enough resources to accept a new request now, send the * request directly to device driver. Else, insert at hctx->dispatch queue, so @@ -2065,7 +2378,7 @@ insert: * queue have higher priority. */ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, blk_qc_t *cookie) + struct request *rq) { blk_status_t ret; int srcu_idx; @@ -2074,7 +2387,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, hctx_lock(hctx, &srcu_idx); - ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true); + ret = __blk_mq_try_issue_directly(hctx, rq, false, true); if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) blk_mq_request_bypass_insert(rq, false, true); else if (ret != BLK_STS_OK) @@ -2087,11 +2400,10 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) { blk_status_t ret; int srcu_idx; - blk_qc_t unused_cookie; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; hctx_lock(hctx, &srcu_idx); - ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last); + ret = __blk_mq_try_issue_directly(hctx, rq, true, last); hctx_unlock(hctx, srcu_idx); return ret; @@ -2135,27 +2447,28 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) { - list_add_tail(&rq->queuelist, &plug->mq_list); - plug->rq_count++; - if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) { - struct request *tmp; + if (!plug->multiple_queues) { + struct request *nxt = rq_list_peek(&plug->mq_list); - tmp = list_first_entry(&plug->mq_list, struct request, - queuelist); - if (tmp->q != rq->q) + if (nxt && nxt->q != rq->q) plug->multiple_queues = true; } + if (!plug->has_elevator && (rq->rq_flags & RQF_ELV)) + plug->has_elevator = true; + rq->rq_next = NULL; + rq_list_add(&plug->mq_list, rq); + plug->rq_count++; } /* - * Allow 4x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple + * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug 
queue for multiple * queues. This is important for md arrays to benefit from merging * requests. */ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) { if (plug->multiple_queues) - return BLK_MAX_REQUEST_COUNT * 4; + return BLK_MAX_REQUEST_COUNT * 2; return BLK_MAX_REQUEST_COUNT; } @@ -2171,57 +2484,63 @@ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) * * It will not queue the request if there is an error with the bio, or at the * request creation. - * - * Returns: Request queue cookie. */ -blk_qc_t blk_mq_submit_bio(struct bio *bio) +void blk_mq_submit_bio(struct bio *bio) { - struct request_queue *q = bio->bi_bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue(bio->bi_bdev); const int is_sync = op_is_sync(bio->bi_opf); - const int is_flush_fua = op_is_flush(bio->bi_opf); - struct blk_mq_alloc_data data = { - .q = q, - }; struct request *rq; struct blk_plug *plug; - struct request *same_queue_rq = NULL; - unsigned int nr_segs; - blk_qc_t cookie; + bool same_queue_rq = false; + unsigned int nr_segs = 1; blk_status_t ret; - bool hipri; blk_queue_bounce(q, &bio); - __blk_queue_split(&bio, &nr_segs); + if (blk_may_split(q, bio)) + __blk_queue_split(q, &bio, &nr_segs); if (!bio_integrity_prep(bio)) goto queue_exit; - if (!is_flush_fua && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq)) - goto queue_exit; - - if (blk_mq_sched_bio_merge(q, bio, nr_segs)) - goto queue_exit; + if (!blk_queue_nomerges(q) && bio_mergeable(bio)) { + if (blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq)) + goto queue_exit; + if (blk_mq_sched_bio_merge(q, bio, nr_segs)) + goto queue_exit; + } rq_qos_throttle(q, bio); - hipri = bio->bi_opf & REQ_HIPRI; - - data.cmd_flags = bio->bi_opf; - rq = __blk_mq_alloc_request(&data); - if (unlikely(!rq)) { - rq_qos_cleanup(q, bio); - if (bio->bi_opf & REQ_NOWAIT) - bio_wouldblock_error(bio); - goto queue_exit; + plug = blk_mq_plug(q, bio); + if (plug && plug->cached_rq) { + rq = rq_list_pop(&plug->cached_rq); + INIT_LIST_HEAD(&rq->queuelist); + } else { + struct blk_mq_alloc_data data = { + .q = q, + .nr_tags = 1, + .cmd_flags = bio->bi_opf, + .rq_flags = q->elevator ? RQF_ELV : 0, + }; + + if (plug) { + data.nr_tags = plug->nr_ios; + plug->nr_ios = 1; + data.cached_rq = &plug->cached_rq; + } + rq = __blk_mq_alloc_requests(&data); + if (unlikely(!rq)) { + rq_qos_cleanup(q, bio); + if (bio->bi_opf & REQ_NOWAIT) + bio_wouldblock_error(bio); + goto queue_exit; + } } trace_block_getrq(bio); rq_qos_track(q, rq, bio); - cookie = request_to_qc_t(data.hctx, rq); - blk_mq_bio_to_request(rq, bio, nr_segs); ret = blk_crypto_init_request(rq); @@ -2229,17 +2548,15 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) bio->bi_status = ret; bio_endio(bio); blk_mq_free_request(rq); - return BLK_QC_T_NONE; + return; } - plug = blk_mq_plug(q, bio); - if (unlikely(is_flush_fua)) { - /* Bypass scheduler for flush requests */ - blk_insert_flush(rq); - blk_mq_run_hw_queue(data.hctx, true); - } else if (plug && (q->nr_hw_queues == 1 || - blk_mq_is_sbitmap_shared(rq->mq_hctx->flags) || - q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) { + if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) + return; + + if (plug && (q->nr_hw_queues == 1 || + blk_mq_is_shared_tags(rq->mq_hctx->flags) || + q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) { /* * Use plugging if we have a ->commit_rqs() hook as well, as * we know the driver uses bd->last in a smart fashion. 
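In the new blk_mq_submit_bio() path above, a plugged submitter first tries to pop a request from plug->cached_rq; only on a miss does it call __blk_mq_alloc_requests(), and by setting data.nr_tags = plug->nr_ios it over-allocates so the surplus requests refill the cache for later bios. The sketch below shows that pop-or-refill shape with a trivial counter standing in for real tag allocation; everything here is illustrative.

/* Pop-or-refill pattern from blk_mq_submit_bio(): serve from a per-plug
 * cache, and on a miss allocate a whole batch and park the surplus in the
 * cache. The "allocator" is a simple counter; all names are illustrative.
 */
#include <stdio.h>

struct req { int tag; struct req *next; };

struct plug {
	struct req *cached;	/* stand-in for plug->cached_rq */
	int nr_ios;		/* how many to over-allocate on a miss */
};

static struct req pool[64];
static int next_tag;

static struct req *get_request(struct plug *plug)
{
	struct req *rq = plug->cached;

	if (!rq) {
		/* Cache miss: grab a batch and park it in the plug. */
		printf("refilling cache with %d requests\n", plug->nr_ios);
		for (int i = 0; i < plug->nr_ios; i++) {
			struct req *r = &pool[next_tag];

			r->tag = next_tag++;
			r->next = plug->cached;
			plug->cached = r;
		}
		rq = plug->cached;
	}
	plug->cached = rq->next;	/* hand out the head of the cache */
	return rq;
}

int main(void)
{
	struct plug plug = { NULL, 4 };

	for (int i = 0; i < 6; i++)
		printf("got tag %d\n", get_request(&plug)->tag);
	return 0;
}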
@@ -2250,22 +2567,26 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) unsigned int request_count = plug->rq_count; struct request *last = NULL; - if (!request_count) + if (!request_count) { trace_block_plug(q); - else - last = list_entry_rq(plug->mq_list.prev); + } else if (!blk_queue_nomerges(q)) { + last = rq_list_peek(&plug->mq_list); + if (blk_rq_bytes(last) < BLK_PLUG_FLUSH_SIZE) + last = NULL; + } - if (request_count >= blk_plug_max_rq_count(plug) || (last && - blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { - blk_flush_plug_list(plug, false); + if (request_count >= blk_plug_max_rq_count(plug) || last) { + blk_mq_flush_plug_list(plug, false); trace_block_plug(q); } blk_add_rq_to_plug(plug, rq); - } else if (q->elevator) { + } else if (rq->rq_flags & RQF_ELV) { /* Insert the request at the IO scheduler queue */ blk_mq_sched_insert_request(rq, false, true, true); } else if (plug && !blk_queue_nomerges(q)) { + struct request *next_rq = NULL; + /* * We do limited plugging. If the bio can be merged, do that. * Otherwise the existing request in the plug list will be @@ -2273,39 +2594,32 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) * The plug list might get flushed before this. If that happens, * the plug list is empty, and same_queue_rq is invalid. */ - if (list_empty(&plug->mq_list)) - same_queue_rq = NULL; if (same_queue_rq) { - list_del_init(&same_queue_rq->queuelist); + next_rq = rq_list_pop(&plug->mq_list); plug->rq_count--; } blk_add_rq_to_plug(plug, rq); trace_block_plug(q); - if (same_queue_rq) { - data.hctx = same_queue_rq->mq_hctx; + if (next_rq) { trace_block_unplug(q, 1, true); - blk_mq_try_issue_directly(data.hctx, same_queue_rq, - &cookie); + blk_mq_try_issue_directly(next_rq->mq_hctx, next_rq); } } else if ((q->nr_hw_queues > 1 && is_sync) || - !data.hctx->dispatch_busy) { + !rq->mq_hctx->dispatch_busy) { /* * There is no scheduler and we can try to send directly * to the hardware. */ - blk_mq_try_issue_directly(data.hctx, rq, &cookie); + blk_mq_try_issue_directly(rq->mq_hctx, rq); } else { /* Default case. 
*/ blk_mq_sched_insert_request(rq, false, true, true); } - if (!hipri) - return BLK_QC_T_NONE; - return cookie; + return; queue_exit: blk_queue_exit(q); - return BLK_QC_T_NONE; } static size_t order_to_size(unsigned int order) @@ -2314,19 +2628,22 @@ static size_t order_to_size(unsigned int order) } /* called before freeing request pool in @tags */ -static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set, - struct blk_mq_tags *tags, unsigned int hctx_idx) +static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, + struct blk_mq_tags *tags) { - struct blk_mq_tags *drv_tags = set->tags[hctx_idx]; struct page *page; unsigned long flags; + /* There is no need to clear a driver tags own mapping */ + if (drv_tags == tags) + return; + list_for_each_entry(page, &tags->page_list, lru) { unsigned long start = (unsigned long)page_address(page); unsigned long end = start + order_to_size(page->private); int i; - for (i = 0; i < set->queue_depth; i++) { + for (i = 0; i < drv_tags->nr_tags; i++) { struct request *rq = drv_tags->rqs[i]; unsigned long rq_addr = (unsigned long)rq; @@ -2350,9 +2667,15 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set, void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { + struct blk_mq_tags *drv_tags; struct page *page; - if (tags->rqs && set->ops->exit_request) { + if (blk_mq_is_shared_tags(set->flags)) + drv_tags = set->shared_tags; + else + drv_tags = set->tags[hctx_idx]; + + if (tags->static_rqs && set->ops->exit_request) { int i; for (i = 0; i < tags->nr_tags; i++) { @@ -2365,7 +2688,7 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, } } - blk_mq_clear_rq_mapping(set, tags, hctx_idx); + blk_mq_clear_rq_mapping(drv_tags, tags); while (!list_empty(&tags->page_list)) { page = list_first_entry(&tags->page_list, struct page, lru); @@ -2379,21 +2702,20 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, } } -void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags) +void blk_mq_free_rq_map(struct blk_mq_tags *tags) { kfree(tags->rqs); tags->rqs = NULL; kfree(tags->static_rqs); tags->static_rqs = NULL; - blk_mq_free_tags(tags, flags); + blk_mq_free_tags(tags); } -struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, - unsigned int hctx_idx, - unsigned int nr_tags, - unsigned int reserved_tags, - unsigned int flags) +static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, + unsigned int hctx_idx, + unsigned int nr_tags, + unsigned int reserved_tags) { struct blk_mq_tags *tags; int node; @@ -2402,7 +2724,8 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, if (node == NUMA_NO_NODE) node = set->numa_node; - tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags); + tags = blk_mq_init_tags(nr_tags, reserved_tags, node, + BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); if (!tags) return NULL; @@ -2410,7 +2733,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); if (!tags->rqs) { - blk_mq_free_tags(tags, flags); + blk_mq_free_tags(tags); return NULL; } @@ -2419,7 +2742,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, node); if (!tags->static_rqs) { kfree(tags->rqs); - blk_mq_free_tags(tags, flags); + blk_mq_free_tags(tags); return NULL; } @@ -2441,8 +2764,9 @@ static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, return 0; } -int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, 
struct blk_mq_tags *tags, - unsigned int hctx_idx, unsigned int depth) +static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, + struct blk_mq_tags *tags, + unsigned int hctx_idx, unsigned int depth) { unsigned int i, j, entries_per_page, max_order = 4; size_t rq_size, left; @@ -2853,37 +3177,58 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, } } -static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set, - int hctx_idx) +struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, + unsigned int hctx_idx, + unsigned int depth) { - unsigned int flags = set->flags; - int ret = 0; + struct blk_mq_tags *tags; + int ret; - set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx, - set->queue_depth, set->reserved_tags, flags); - if (!set->tags[hctx_idx]) - return false; + tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags); + if (!tags) + return NULL; - ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx, - set->queue_depth); - if (!ret) - return true; + ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth); + if (ret) { + blk_mq_free_rq_map(tags); + return NULL; + } - blk_mq_free_rq_map(set->tags[hctx_idx], flags); - set->tags[hctx_idx] = NULL; - return false; + return tags; } -static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, - unsigned int hctx_idx) +static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, + int hctx_idx) { - unsigned int flags = set->flags; + if (blk_mq_is_shared_tags(set->flags)) { + set->tags[hctx_idx] = set->shared_tags; - if (set->tags && set->tags[hctx_idx]) { - blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); - blk_mq_free_rq_map(set->tags[hctx_idx], flags); - set->tags[hctx_idx] = NULL; + return true; } + + set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx, + set->queue_depth); + + return set->tags[hctx_idx]; +} + +void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, + struct blk_mq_tags *tags, + unsigned int hctx_idx) +{ + if (tags) { + blk_mq_free_rqs(set, tags, hctx_idx); + blk_mq_free_rq_map(tags); + } +} + +static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + if (!blk_mq_is_shared_tags(set->flags)) + blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx); + + set->tags[hctx_idx] = NULL; } static void blk_mq_map_swqueue(struct request_queue *q) @@ -2916,7 +3261,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) hctx_idx = set->map[j].mq_map[i]; /* unmapped hw queue can be remapped after CPU topo changed */ if (!set->tags[hctx_idx] && - !__blk_mq_alloc_map_and_request(set, hctx_idx)) { + !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) { /* * If tags initialization fail for some hctx, * that hctx won't be brought online. 
In this @@ -2963,8 +3308,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) * fallback in case of a new remap fails * allocation */ - if (i && set->tags[i]) - blk_mq_free_map_and_requests(set, i); + if (i) + __blk_mq_free_map_and_rqs(set, i); hctx->tags = NULL; continue; @@ -3260,8 +3605,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx = hctxs[j]; if (hctx) { - if (hctx->tags) - blk_mq_free_map_and_requests(set, j); + __blk_mq_free_map_and_rqs(set, j); blk_mq_exit_hctx(q, set, hctx, j); hctxs[j] = NULL; } @@ -3348,8 +3692,16 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) { int i; + if (blk_mq_is_shared_tags(set->flags)) { + set->shared_tags = blk_mq_alloc_map_and_rqs(set, + BLK_MQ_NO_HCTX_IDX, + set->queue_depth); + if (!set->shared_tags) + return -ENOMEM; + } + for (i = 0; i < set->nr_hw_queues; i++) { - if (!__blk_mq_alloc_map_and_request(set, i)) + if (!__blk_mq_alloc_map_and_rqs(set, i)) goto out_unwind; cond_resched(); } @@ -3358,7 +3710,12 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) out_unwind: while (--i >= 0) - blk_mq_free_map_and_requests(set, i); + __blk_mq_free_map_and_rqs(set, i); + + if (blk_mq_is_shared_tags(set->flags)) { + blk_mq_free_map_and_rqs(set, set->shared_tags, + BLK_MQ_NO_HCTX_IDX); + } return -ENOMEM; } @@ -3368,7 +3725,7 @@ out_unwind: * may reduce the depth asked for, if memory is tight. set->queue_depth * will be updated to reflect the allocated depth. */ -static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set) +static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set) { unsigned int depth; int err; @@ -3534,27 +3891,15 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (ret) goto out_free_mq_map; - ret = blk_mq_alloc_map_and_requests(set); + ret = blk_mq_alloc_set_map_and_rqs(set); if (ret) goto out_free_mq_map; - if (blk_mq_is_sbitmap_shared(set->flags)) { - atomic_set(&set->active_queues_shared_sbitmap, 0); - - if (blk_mq_init_shared_sbitmap(set)) { - ret = -ENOMEM; - goto out_free_mq_rq_maps; - } - } - mutex_init(&set->tag_list_lock); INIT_LIST_HEAD(&set->tag_list); return 0; -out_free_mq_rq_maps: - for (i = 0; i < set->nr_hw_queues; i++) - blk_mq_free_map_and_requests(set, i); out_free_mq_map: for (i = 0; i < set->nr_maps; i++) { kfree(set->map[i].mq_map); @@ -3587,10 +3932,12 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) int i, j; for (i = 0; i < set->nr_hw_queues; i++) - blk_mq_free_map_and_requests(set, i); + __blk_mq_free_map_and_rqs(set, i); - if (blk_mq_is_sbitmap_shared(set->flags)) - blk_mq_exit_shared_sbitmap(set); + if (blk_mq_is_shared_tags(set->flags)) { + blk_mq_free_map_and_rqs(set, set->shared_tags, + BLK_MQ_NO_HCTX_IDX); + } for (j = 0; j < set->nr_maps; j++) { kfree(set->map[j].mq_map); @@ -3625,20 +3972,12 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) * If we're using an MQ scheduler, just update the scheduler * queue depth. This is similar to what the old code would do. 
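__blk_mq_alloc_rq_maps() above follows the usual unwind-on-failure shape: optionally set up the shared tag map, allocate one map per hardware queue in a loop, and on the first failure walk back down and free everything allocated so far. A generic sketch of that pattern is below; the resources are plain heap buffers and the names are invented.

/* Unwind-on-failure pattern from __blk_mq_alloc_rq_maps(): allocate one
 * resource per queue, and free the already-allocated ones if any step
 * fails. Resources are plain heap buffers here; names are invented.
 */
#include <stdio.h>
#include <stdlib.h>

#define NR_QUEUES 4

static void *maps[NR_QUEUES];

/* Pretend allocator that fails for queue 2 to exercise the unwind path. */
static void *alloc_map(int i)
{
	return i == 2 ? NULL : malloc(128);
}

static int alloc_all_maps(void)
{
	int i;

	for (i = 0; i < NR_QUEUES; i++) {
		maps[i] = alloc_map(i);
		if (!maps[i])
			goto out_unwind;
	}
	return 0;

out_unwind:
	while (--i >= 0) {
		free(maps[i]);
		maps[i] = NULL;
	}
	return -1;
}

int main(void)
{
	printf("alloc_all_maps() = %d\n", alloc_all_maps());
	return 0;
}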
*/ - if (!hctx->sched_tags) { - ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, - false); - if (!ret && blk_mq_is_sbitmap_shared(set->flags)) - blk_mq_tag_resize_shared_sbitmap(set, nr); - } else { + if (hctx->sched_tags) { ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, - nr, true); - if (blk_mq_is_sbitmap_shared(set->flags)) { - hctx->sched_tags->bitmap_tags = - &q->sched_bitmap_tags; - hctx->sched_tags->breserved_tags = - &q->sched_breserved_tags; - } + nr, true); + } else { + ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, + false); } if (ret) break; @@ -3647,9 +3986,12 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) } if (!ret) { q->nr_requests = nr; - if (q->elevator && blk_mq_is_sbitmap_shared(set->flags)) - sbitmap_queue_resize(&q->sched_bitmap_tags, - nr - set->reserved_tags); + if (blk_mq_is_shared_tags(set->flags)) { + if (q->elevator) + blk_mq_tag_update_sched_shared_tags(q); + else + blk_mq_tag_resize_shared_tags(set, nr); + } } blk_mq_unquiesce_queue(q); @@ -3868,15 +4210,20 @@ static unsigned long blk_mq_poll_nsecs(struct request_queue *q, return ret; } -static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, - struct request *rq) +static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc) { + struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, qc); + struct request *rq = blk_qc_to_rq(hctx, qc); struct hrtimer_sleeper hs; enum hrtimer_mode mode; unsigned int nsecs; ktime_t kt; - if (rq->rq_flags & RQF_MQ_POLL_SLEPT) + /* + * If a request has completed on queue that uses an I/O scheduler, we + * won't get back a request from blk_qc_to_rq. + */ + if (!rq || (rq->rq_flags & RQF_MQ_POLL_SLEPT)) return false; /* @@ -3918,92 +4265,37 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, __set_current_state(TASK_RUNNING); destroy_hrtimer_on_stack(&hs.timer); - return true; -} - -static bool blk_mq_poll_hybrid(struct request_queue *q, - struct blk_mq_hw_ctx *hctx, blk_qc_t cookie) -{ - struct request *rq; - - if (q->poll_nsec == BLK_MQ_POLL_CLASSIC) - return false; - - if (!blk_qc_t_is_internal(cookie)) - rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); - else { - rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie)); - /* - * With scheduling, if the request has completed, we'll - * get a NULL return here, as we clear the sched tag when - * that happens. The request still remains valid, like always, - * so we should be safe with just the NULL check. - */ - if (!rq) - return false; - } - - return blk_mq_poll_hybrid_sleep(q, rq); -} - -/** - * blk_poll - poll for IO completions - * @q: the queue - * @cookie: cookie passed back at IO submission time - * @spin: whether to spin for completions - * - * Description: - * Poll for completions on the passed in queue. Returns number of - * completed entries found. If @spin is true, then blk_poll will continue - * looping until at least one completion is found, unless the task is - * otherwise marked running (or we need to reschedule). - */ -int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) -{ - struct blk_mq_hw_ctx *hctx; - unsigned int state; - - if (!blk_qc_t_valid(cookie) || - !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) - return 0; - - if (current->plug) - blk_flush_plug_list(current->plug, false); - - hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; /* - * If we sleep, have the caller restart the poll loop to reset - * the state. 
Like for the other success return cases, the - * caller is responsible for checking if the IO completed. If - * the IO isn't complete, we'll get called again and will go - * straight to the busy poll loop. If specified not to spin, - * we also should not sleep. + * If we sleep, have the caller restart the poll loop to reset the + * state. Like for the other success return cases, the caller is + * responsible for checking if the IO completed. If the IO isn't + * complete, we'll get called again and will go straight to the busy + * poll loop. */ - if (spin && blk_mq_poll_hybrid(q, hctx, cookie)) - return 1; + return true; +} - hctx->poll_considered++; +static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie, + struct io_comp_batch *iob, unsigned int flags) +{ + struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie); + long state = get_current_state(); + int ret; - state = get_current_state(); do { - int ret; - - hctx->poll_invoked++; - - ret = q->mq_ops->poll(hctx); + ret = q->mq_ops->poll(hctx, iob); if (ret > 0) { - hctx->poll_success++; __set_current_state(TASK_RUNNING); return ret; } if (signal_pending_state(state, current)) __set_current_state(TASK_RUNNING); - if (task_is_running(current)) return 1; - if (ret < 0 || !spin) + + if (ret < 0 || (flags & BLK_POLL_ONESHOT)) break; cpu_relax(); } while (!need_resched()); @@ -4011,7 +4303,17 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) __set_current_state(TASK_RUNNING); return 0; } -EXPORT_SYMBOL_GPL(blk_poll); + +int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, + unsigned int flags) +{ + if (!(flags & BLK_POLL_NOSLEEP) && + q->poll_nsec != BLK_MQ_POLL_CLASSIC) { + if (blk_mq_poll_hybrid(q, cookie)) + return 1; + } + return blk_mq_poll_classic(q, cookie, iob, flags); +} unsigned int blk_mq_rq_cpu(struct request *rq) { |
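blk_mq_poll_classic() above spins on the driver's ->poll() hook until it reports completions, the task becomes runnable again, or BLK_POLL_ONESHOT limits it to a single pass. The stripped-down userspace sketch below reproduces only that control flow; the fake counter-driven poll callback and every name in it are invented, and signal/need_resched handling is reduced to a comment.

/* Control flow of a classic poll loop: call the poll hook until it finds
 * completions or we are told to stop after one pass (ONESHOT). The "driver"
 * is a counter that completes after a few calls; everything is illustrative.
 */
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>

#define POLL_ONESHOT	(1U << 0)

static int polls_until_done = 3;

/* Fake driver ->poll(): returns the number of completions it found. */
static int driver_poll(void)
{
	return --polls_until_done <= 0 ? 1 : 0;
}

static int poll_classic(unsigned int flags)
{
	do {
		int ret = driver_poll();

		if (ret > 0)
			return ret;	/* found completed requests */
		if (ret < 0 || (flags & POLL_ONESHOT))
			break;		/* error, or caller wants one pass only */
		sched_yield();		/* stand-in for cpu_relax() */
	} while (true);			/* the kernel also stops on need_resched() */

	return 0;
}

int main(void)
{
	printf("oneshot poll found %d completions\n", poll_classic(POLL_ONESHOT));
	printf("spinning poll found %d completions\n", poll_classic(0));
	return 0;
}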