diff options
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r-- | block/blk-mq.c | 411 |
1 files changed, 298 insertions, 113 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c index 8e56884fd2e9..9a36ac1c1fa1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -26,6 +26,7 @@ #include <linux/delay.h> #include <linux/crash_dump.h> #include <linux/prefetch.h> +#include <linux/blk-crypto.h> #include <trace/events/block.h> @@ -270,14 +271,14 @@ static inline bool blk_mq_need_time_stamp(struct request *rq) } static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, - unsigned int tag, unsigned int op, u64 alloc_time_ns) + unsigned int tag, u64 alloc_time_ns) { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct request *rq = tags->static_rqs[tag]; req_flags_t rq_flags = 0; if (data->flags & BLK_MQ_REQ_INTERNAL) { - rq->tag = -1; + rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = tag; } else { if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) { @@ -285,7 +286,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, atomic_inc(&data->hctx->nr_active); } rq->tag = tag; - rq->internal_tag = -1; + rq->internal_tag = BLK_MQ_NO_TAG; data->hctx->tags->rqs[rq->tag] = rq; } @@ -294,7 +295,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->mq_ctx = data->ctx; rq->mq_hctx = data->hctx; rq->rq_flags = rq_flags; - rq->cmd_flags = op; + rq->cmd_flags = data->cmd_flags; if (data->flags & BLK_MQ_REQ_PREEMPT) rq->rq_flags |= RQF_PREEMPT; if (blk_queue_io_stat(data->q)) @@ -317,8 +318,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, #if defined(CONFIG_BLK_DEV_INTEGRITY) rq->nr_integrity_segments = 0; #endif + blk_crypto_rq_set_defaults(rq); /* tag was already set */ - rq->extra_len = 0; WRITE_ONCE(rq->deadline, 0); rq->timeout = 0; @@ -326,35 +327,37 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->end_io = NULL; rq->end_io_data = NULL; - data->ctx->rq_dispatched[op_is_sync(op)]++; + data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++; refcount_set(&rq->ref, 1); + + if 
(!op_is_flush(data->cmd_flags)) { + struct elevator_queue *e = data->q->elevator; + + rq->elv.icq = NULL; + if (e && e->type->ops.prepare_request) { + if (e->type->icq_cache) + blk_mq_sched_assign_ioc(rq); + + e->type->ops.prepare_request(rq); + rq->rq_flags |= RQF_ELVPRIV; + } + } + + data->hctx->queued++; return rq; } -static struct request *blk_mq_get_request(struct request_queue *q, - struct bio *bio, - struct blk_mq_alloc_data *data) +static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data) { + struct request_queue *q = data->q; struct elevator_queue *e = q->elevator; - struct request *rq; - unsigned int tag; - bool clear_ctx_on_error = false; u64 alloc_time_ns = 0; - - blk_queue_enter_live(q); + unsigned int tag; /* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) alloc_time_ns = ktime_get_ns(); - data->q = q; - if (likely(!data->ctx)) { - data->ctx = blk_mq_get_ctx(q); - clear_ctx_on_error = true; - } - if (likely(!data->hctx)) - data->hctx = blk_mq_map_queue(q, data->cmd_flags, - data->ctx); if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; @@ -370,37 +373,43 @@ static struct request *blk_mq_get_request(struct request_queue *q, e->type->ops.limit_depth && !(data->flags & BLK_MQ_REQ_RESERVED)) e->type->ops.limit_depth(data->cmd_flags, data); - } else { - blk_mq_tag_busy(data->hctx); } - tag = blk_mq_get_tag(data); - if (tag == BLK_MQ_TAG_FAIL) { - if (clear_ctx_on_error) - data->ctx = NULL; - blk_queue_exit(q); - return NULL; - } +retry: + data->ctx = blk_mq_get_ctx(q); + data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); + if (!(data->flags & BLK_MQ_REQ_INTERNAL)) + blk_mq_tag_busy(data->hctx); - rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags, alloc_time_ns); - if (!op_is_flush(data->cmd_flags)) { - rq->elv.icq = NULL; - if (e && e->type->ops.prepare_request) { - if (e->type->icq_cache) - blk_mq_sched_assign_ioc(rq); + /* + * Waiting allocations only fail because of an inactive 
hctx. In that + * case just retry the hctx assignment and tag allocation as CPU hotplug + * should have migrated us to an online CPU by now. + */ + tag = blk_mq_get_tag(data); + if (tag == BLK_MQ_NO_TAG) { + if (data->flags & BLK_MQ_REQ_NOWAIT) + return NULL; - e->type->ops.prepare_request(rq, bio); - rq->rq_flags |= RQF_ELVPRIV; - } + /* + * Give up the CPU and sleep for a random short time to ensure + * that threads using a realtime scheduling class are migrated + * off the CPU, and thus off the hctx that is going away. + */ + msleep(3); + goto retry; } - data->hctx->queued++; - return rq; + return blk_mq_rq_ctx_init(data, tag, alloc_time_ns); } struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, blk_mq_req_flags_t flags) { - struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op }; + struct blk_mq_alloc_data data = { + .q = q, + .flags = flags, + .cmd_flags = op, + }; struct request *rq; int ret; @@ -408,34 +417,43 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, if (ret) return ERR_PTR(ret); - rq = blk_mq_get_request(q, NULL, &alloc_data); - blk_queue_exit(q); - + rq = __blk_mq_alloc_request(&data); if (!rq) - return ERR_PTR(-EWOULDBLOCK); - + goto out_queue_exit; rq->__data_len = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; return rq; +out_queue_exit: + blk_queue_exit(q); + return ERR_PTR(-EWOULDBLOCK); } EXPORT_SYMBOL(blk_mq_alloc_request); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx) { - struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op }; - struct request *rq; + struct blk_mq_alloc_data data = { + .q = q, + .flags = flags, + .cmd_flags = op, + }; + u64 alloc_time_ns = 0; unsigned int cpu; + unsigned int tag; int ret; + /* alloc_time includes depth and tag waits */ + if (blk_queue_rq_alloc_time(q)) + alloc_time_ns = ktime_get_ns(); + /* + * If the tag
allocator sleeps we could get an allocation for a * different hardware context. No need to complicate the low level * allocator for this for the rare use case of a command tied to * a specific queue. */ - if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT))) + if (WARN_ON_ONCE(!(flags & (BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED)))) return ERR_PTR(-EINVAL); if (hctx_idx >= q->nr_hw_queues) @@ -449,21 +467,27 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, * Check if the hardware context is actually mapped to anything. * If not tell the caller that it should skip this queue. */ - alloc_data.hctx = q->queue_hw_ctx[hctx_idx]; - if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) { - blk_queue_exit(q); - return ERR_PTR(-EXDEV); - } - cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask); - alloc_data.ctx = __blk_mq_get_ctx(q, cpu); - - rq = blk_mq_get_request(q, NULL, &alloc_data); - blk_queue_exit(q); + ret = -EXDEV; + data.hctx = q->queue_hw_ctx[hctx_idx]; + if (!blk_mq_hw_queue_mapped(data.hctx)) + goto out_queue_exit; + cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); + data.ctx = __blk_mq_get_ctx(q, cpu); + + if (q->elevator) + data.flags |= BLK_MQ_REQ_INTERNAL; + else + blk_mq_tag_busy(data.hctx); - if (!rq) - return ERR_PTR(-EWOULDBLOCK); + ret = -EWOULDBLOCK; + tag = blk_mq_get_tag(&data); + if (tag == BLK_MQ_NO_TAG) + goto out_queue_exit; + return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns); - return rq; +out_queue_exit: + blk_queue_exit(q); + return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); @@ -474,11 +498,12 @@ static void __blk_mq_free_request(struct request *rq) struct blk_mq_hw_ctx *hctx = rq->mq_hctx; const int sched_tag = rq->internal_tag; + blk_crypto_free_request(rq); blk_pm_mark_last_busy(rq); rq->mq_hctx = NULL; - if (rq->tag != -1) + if (rq->tag != BLK_MQ_NO_TAG) blk_mq_put_tag(hctx->tags, ctx, rq->tag); - if (sched_tag != -1) + if (sched_tag != BLK_MQ_NO_TAG) blk_mq_put_tag(hctx->sched_tags, 
ctx, sched_tag); blk_mq_sched_restart(hctx); blk_queue_exit(q); @@ -527,7 +552,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error) blk_stat_add(rq, now); } - if (rq->internal_tag != -1) + if (rq->internal_tag != BLK_MQ_NO_TAG) blk_mq_sched_completed_request(rq, now); blk_account_io_done(rq, now); @@ -557,7 +582,17 @@ static void __blk_mq_complete_request_remote(void *data) q->mq_ops->complete(rq); } -static void __blk_mq_complete_request(struct request *rq) +/** + * blk_mq_force_complete_rq() - Force complete the request, bypassing any error + * injection that could drop the completion. + * @rq: Request to be force completed + * + * Drivers should use blk_mq_complete_request() to complete requests in their + * normal IO path. For timeout error recovery, drivers may call this forced + * completion routine after they've reclaimed timed out requests to bypass + * potentially subsequent fake timeouts. + */ +void blk_mq_force_complete_rq(struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; struct request_queue *q = rq->q; @@ -603,6 +638,7 @@ static void __blk_mq_complete_request(struct request *rq) } put_cpu(); } +EXPORT_SYMBOL_GPL(blk_mq_force_complete_rq); static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) __releases(hctx->srcu) @@ -636,7 +672,7 @@ bool blk_mq_complete_request(struct request *rq) { if (unlikely(blk_should_fake_timeout(rq->q))) return false; - __blk_mq_complete_request(rq); + blk_mq_force_complete_rq(rq); return true; } EXPORT_SYMBOL(blk_mq_complete_request); @@ -667,15 +703,6 @@ void blk_mq_start_request(struct request *rq) blk_add_timer(rq); WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); - if (q->dma_drain_size && blk_rq_bytes(rq)) { - /* - * Make sure space for the drain appears. We know we can do - * this because max_hw_segments has been adjusted to be one - * fewer than the device can handle. 
- */ - rq->nr_phys_segments++; - } - #ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) q->integrity.profile->prepare_fn(rq); @@ -695,8 +722,6 @@ static void __blk_mq_requeue_request(struct request *rq) if (blk_mq_request_started(rq)) { WRITE_ONCE(rq->state, MQ_RQ_IDLE); rq->rq_flags &= ~RQF_TIMED_OUT; - if (q->dma_drain_size && blk_rq_bytes(rq)) - rq->nr_phys_segments--; } } @@ -1037,7 +1062,7 @@ bool blk_mq_get_driver_tag(struct request *rq) }; bool shared; - if (rq->tag != -1) + if (rq->tag != BLK_MQ_NO_TAG) return true; if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) @@ -1053,7 +1078,7 @@ bool blk_mq_get_driver_tag(struct request *rq) data.hctx->tags->rqs[rq->tag] = rq; } - return rq->tag != -1; + return rq->tag != BLK_MQ_NO_TAG; } static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, @@ -1195,6 +1220,19 @@ static void blk_mq_handle_dev_resource(struct request *rq, __blk_mq_requeue_request(rq); } +static void blk_mq_handle_zone_resource(struct request *rq, + struct list_head *zone_list) +{ + /* + * If we end up here it is because we cannot dispatch a request to a + * specific zone due to LLD level zone-write locking or other zone + * related resource not being available. In this case, set the request + * aside in zone_list for retrying it later. + */ + list_add(&rq->queuelist, zone_list); + __blk_mq_requeue_request(rq); +} + /* * Returns true if we did some work AND can potentially do more. 
*/ @@ -1206,6 +1244,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, bool no_tag = false; int errors, queued; blk_status_t ret = BLK_STS_OK; + bool no_budget_avail = false; + LIST_HEAD(zone_list); if (list_empty(list)) return false; @@ -1222,8 +1262,11 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, rq = list_first_entry(list, struct request, queuelist); hctx = rq->mq_hctx; - if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) + if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) { + blk_mq_put_driver_tag(rq); + no_budget_avail = true; break; + } if (!blk_mq_get_driver_tag(rq)) { /* @@ -1264,6 +1307,16 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { blk_mq_handle_dev_resource(rq, list); break; + } else if (ret == BLK_STS_ZONE_RESOURCE) { + /* + * Move the request to zone_list and keep going through + * the dispatch list to find more requests the drive can + * accept. + */ + blk_mq_handle_zone_resource(rq, &zone_list); + if (list_empty(list)) + break; + continue; } if (unlikely(ret != BLK_STS_OK)) { @@ -1275,6 +1328,9 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, queued++; } while (!list_empty(list)); + if (!list_empty(&zone_list)) + list_splice_tail_init(&zone_list, list); + hctx->dispatched[queued_to_index(queued)]++; /* @@ -1318,13 +1374,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * * If driver returns BLK_STS_RESOURCE and SCHED_RESTART * bit is set, run queue after a delay to avoid IO stalls - * that could otherwise occur if the queue is idle. + * that could otherwise occur if the queue is idle. We'll do + * similar if we couldn't get budget and SCHED_RESTART is set. 
*/ needs_restart = blk_mq_sched_needs_restart(hctx); if (!needs_restart || (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); - else if (needs_restart && (ret == BLK_STS_RESOURCE)) + else if (needs_restart && (ret == BLK_STS_RESOURCE || + no_budget_avail)) blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); blk_mq_update_dispatch_busy(hctx, true); @@ -1540,6 +1598,25 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async) EXPORT_SYMBOL(blk_mq_run_hw_queues); /** + * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously. + * @q: Pointer to the request queue to run. + * @msecs: Milliseconds of delay to wait before running the queues. + */ +void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + if (blk_mq_hctx_stopped(hctx)) + continue; + + blk_mq_delay_run_hw_queue(hctx, msecs); + } +} +EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); + +/** * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped * @q: request queue. * @@ -1780,8 +1857,9 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, rq->__sector = bio->bi_iter.bi_sector; rq->write_hint = bio->bi_write_hint; blk_rq_bio_prep(rq, bio, nr_segs); + blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO); - blk_account_io_start(rq, true); + blk_account_io_start(rq); } static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, @@ -1971,39 +2049,42 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) * * Returns: Request queue cookie.
*/ -static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) +blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = op_is_flush(bio->bi_opf); - struct blk_mq_alloc_data data = { .flags = 0}; + struct blk_mq_alloc_data data = { + .q = q, + }; struct request *rq; struct blk_plug *plug; struct request *same_queue_rq = NULL; unsigned int nr_segs; blk_qc_t cookie; + blk_status_t ret; blk_queue_bounce(q, &bio); __blk_queue_split(q, &bio, &nr_segs); if (!bio_integrity_prep(bio)) - return BLK_QC_T_NONE; + goto queue_exit; if (!is_flush_fua && !blk_queue_nomerges(q) && blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq)) - return BLK_QC_T_NONE; + goto queue_exit; if (blk_mq_sched_bio_merge(q, bio, nr_segs)) - return BLK_QC_T_NONE; + goto queue_exit; rq_qos_throttle(q, bio); data.cmd_flags = bio->bi_opf; - rq = blk_mq_get_request(q, bio, &data); + rq = __blk_mq_alloc_request(&data); if (unlikely(!rq)) { rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); - return BLK_QC_T_NONE; + goto queue_exit; } trace_block_getrq(q, bio, bio->bi_opf); @@ -2014,6 +2095,14 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_bio_to_request(rq, bio, nr_segs); + ret = blk_crypto_init_request(rq); + if (ret != BLK_STS_OK) { + bio->bi_status = ret; + bio_endio(bio); + blk_mq_free_request(rq); + return BLK_QC_T_NONE; + } + plug = blk_mq_plug(q, bio); if (unlikely(is_flush_fua)) { /* Bypass scheduler for flush requests */ @@ -2082,7 +2171,11 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) } return cookie; +queue_exit: + blk_queue_exit(q); + return BLK_QC_T_NONE; } +EXPORT_SYMBOL_GPL(blk_mq_make_request); /* only for request based dm */ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) @@ -2258,6 +2351,86 @@ fail: return -ENOMEM; } +struct 
rq_iter_data { + struct blk_mq_hw_ctx *hctx; + bool has_rq; +}; + +static bool blk_mq_has_request(struct request *rq, void *data, bool reserved) +{ + struct rq_iter_data *iter_data = data; + + if (rq->mq_hctx != iter_data->hctx) + return true; + iter_data->has_rq = true; + return false; +} + +static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_tags *tags = hctx->sched_tags ? + hctx->sched_tags : hctx->tags; + struct rq_iter_data data = { + .hctx = hctx, + }; + + blk_mq_all_tag_iter(tags, blk_mq_has_request, &data); + return data.has_rq; +} + +static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu, + struct blk_mq_hw_ctx *hctx) +{ + if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu) + return false; + if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids) + return false; + return true; +} + +static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) +{ + struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, + struct blk_mq_hw_ctx, cpuhp_online); + + if (!cpumask_test_cpu(cpu, hctx->cpumask) || + !blk_mq_last_cpu_in_hctx(cpu, hctx)) + return 0; + + /* + * Prevent new requests from being allocated on the current hctx. + * + * The smp_mb__after_atomic() pairs with the implied barrier in + * test_and_set_bit_lock in sbitmap_get(). Ensures the inactive flag is + * seen once we return from the tag allocator. + */ + set_bit(BLK_MQ_S_INACTIVE, &hctx->state); + smp_mb__after_atomic(); + + /* + * Try to grab a reference to the queue and wait for any outstanding + * requests. If we could not grab a reference the queue has been + * frozen and there are no requests.
+ */ + if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) { + while (blk_mq_hctx_has_requests(hctx)) + msleep(5); + percpu_ref_put(&hctx->queue->q_usage_counter); + } + + return 0; +} + +static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node) +{ + struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, + struct blk_mq_hw_ctx, cpuhp_online); + + if (cpumask_test_cpu(cpu, hctx->cpumask)) + clear_bit(BLK_MQ_S_INACTIVE, &hctx->state); + return 0; +} + /* * 'cpu' is going away. splice any existing rq_list entries from this * software queue to the hw queue dispatch list, and ensure that it @@ -2271,6 +2444,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) enum hctx_type type; hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); + if (!cpumask_test_cpu(cpu, hctx->cpumask)) + return 0; + ctx = __blk_mq_get_ctx(hctx->queue, cpu); type = hctx->type; @@ -2294,6 +2470,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) { + if (!(hctx->flags & BLK_MQ_F_STACKING)) + cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, + &hctx->cpuhp_online); cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); } @@ -2353,6 +2532,9 @@ static int blk_mq_init_hctx(struct request_queue *q, { hctx->queue_num = hctx_idx; + if (!(hctx->flags & BLK_MQ_F_STACKING)) + cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, + &hctx->cpuhp_online); cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); hctx->tags = set->tags[hctx_idx]; @@ -2471,7 +2653,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, } } -static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx) +static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set, + int hctx_idx) { int ret = 0; @@ -2519,18 +2702,6 @@ static void blk_mq_map_swqueue(struct request_queue *q) * If the cpu isn't present, the 
cpu is mapped to first hctx. */ for_each_possible_cpu(i) { - hctx_idx = set->map[HCTX_TYPE_DEFAULT].mq_map[i]; - /* unmapped hw queue can be remapped after CPU topo changed */ - if (!set->tags[hctx_idx] && - !__blk_mq_alloc_rq_map(set, hctx_idx)) { - /* - * If tags initialization fail for some hctx, - * that hctx won't be brought online. In this - * case, remap the current ctx to hctx[0] which - * is guaranteed to always have tags allocated - */ - set->map[HCTX_TYPE_DEFAULT].mq_map[i] = 0; - } ctx = per_cpu_ptr(q->queue_ctx, i); for (j = 0; j < set->nr_maps; j++) { @@ -2539,6 +2710,18 @@ static void blk_mq_map_swqueue(struct request_queue *q) HCTX_TYPE_DEFAULT, i); continue; } + hctx_idx = set->map[j].mq_map[i]; + /* unmapped hw queue can be remapped after CPU topo changed */ + if (!set->tags[hctx_idx] && + !__blk_mq_alloc_map_and_request(set, hctx_idx)) { + /* + * If tags initialization fail for some hctx, + * that hctx won't be brought online. In this + * case, remap the current ctx to hctx[0] which + * is guaranteed to always have tags allocated + */ + set->map[j].mq_map[i] = 0; + } hctx = blk_mq_map_queue_type(q, j, i); ctx->hctxs[j] = hctx; @@ -2942,7 +3125,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); - q->make_request_fn = blk_mq_make_request; q->nr_requests = set->queue_depth; /* @@ -2986,14 +3168,14 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) int i; for (i = 0; i < set->nr_hw_queues; i++) - if (!__blk_mq_alloc_rq_map(set, i)) + if (!__blk_mq_alloc_map_and_request(set, i)) goto out_unwind; return 0; out_unwind: while (--i >= 0) - blk_mq_free_rq_map(set->tags[i]); + blk_mq_free_map_and_requests(set, i); return -ENOMEM; } @@ -3003,7 +3185,7 @@ out_unwind: * may reduce the depth asked for, if memory is tight. set->queue_depth * will be updated to reflect the allocated depth. 
*/ -static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) +static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set) { unsigned int depth; int err; @@ -3163,7 +3345,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (ret) goto out_free_mq_map; - ret = blk_mq_alloc_rq_maps(set); + ret = blk_mq_alloc_map_and_requests(set); if (ret) goto out_free_mq_map; @@ -3345,14 +3527,14 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, blk_mq_sysfs_unregister(q); } + prev_nr_hw_queues = set->nr_hw_queues; if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) < 0) goto reregister; - prev_nr_hw_queues = set->nr_hw_queues; set->nr_hw_queues = nr_hw_queues; - blk_mq_update_queue_map(set); fallback: + blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_realloc_hw_ctxs(set, q); if (q->nr_hw_queues != set->nr_hw_queues) { @@ -3607,6 +3789,9 @@ static int __init blk_mq_init(void) { cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); + cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online", + blk_mq_hctx_notify_online, + blk_mq_hctx_notify_offline); return 0; } subsys_initcall(blk_mq_init); |