diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-10-18 20:53:51 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-10-18 20:53:51 +0200 |
commit | d3dc366bbaf07c125561e90d6da4bb147741101a (patch) | |
tree | 6eb7e79a8ec9df1fa705393c6d15ccea3d104661 /block/blk-mq.c | |
parent | Merge tag 'for-linus-20141015' of git://git.infradead.org/linux-mtd (diff) | |
parent | block: Remove REQ_KERNEL (diff) | |
download | linux-d3dc366bbaf07c125561e90d6da4bb147741101a.tar.xz linux-d3dc366bbaf07c125561e90d6da4bb147741101a.zip |
Merge branch 'for-3.18/core' of git://git.kernel.dk/linux-block
Pull core block layer changes from Jens Axboe:
"This is the core block IO pull request for 3.18. Apart from the new
and improved flush machinery for blk-mq, this is all mostly bug fixes
and cleanups.
- blk-mq timeout updates and fixes from Christoph.
- Removal of REQ_END, also from Christoph. We pass it through the
->queue_rq() hook for blk-mq instead, freeing up one of the request
bits. The space was overly tight on 32-bit, so Martin also killed
REQ_KERNEL since it's no longer used.
- blk integrity updates and fixes from Martin and Gu Zheng.
- Update to the flush machinery for blk-mq from Ming Lei. Now we
have a per hardware context flush request, which both cleans up the
code should scale better for flush intensive workloads on blk-mq.
- Improve the error printing, from Rob Elliott.
- Backing device improvements and cleanups from Tejun.
- Fixup of a misplaced rq_complete() tracepoint from Hannes.
- Make blk_get_request() return error pointers, fixing up issues
where we NULL deref when a device goes bad or missing. From Joe
Lawrence.
- Prep work for drastically reducing the memory consumption of dm
devices from Junichi Nomura. This allows creating clone bio sets
without preallocating a lot of memory.
- Fix a blk-mq hang on certain combinations of queue depths and
hardware queues from me.
- Limit memory consumption for blk-mq devices for crash dump
scenarios and drivers that use crazy high depths (certain SCSI
shared tag setups). We now just use a single queue and limited
depth for that"
* 'for-3.18/core' of git://git.kernel.dk/linux-block: (58 commits)
block: Remove REQ_KERNEL
blk-mq: allocate cpumask on the home node
bio-integrity: remove the needless fail handle of bip_slab creating
block: include func name in __get_request prints
block: make blk_update_request print prefix match ratelimited prefix
blk-merge: don't compute bi_phys_segments from bi_vcnt for cloned bio
block: fix alignment_offset math that assumes io_min is a power-of-2
blk-mq: Make bt_clear_tag() easier to read
blk-mq: fix potential hang if rolling wakeup depth is too high
block: add bioset_create_nobvec()
block: use bio_clone_fast() in blk_rq_prep_clone()
block: misplaced rq_complete tracepoint
sd: Honor block layer integrity handling flags
block: Replace strnicmp with strncasecmp
block: Add T10 Protection Information functions
block: Don't merge requests if integrity flags differ
block: Integrity checksum flag
block: Relocate bio integrity flags
block: Add a disk flag to block integrity profile
block: Add prefix to block integrity profile flags
...
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r-- | block/blk-mq.c | 364 |
1 files changed, 187 insertions, 177 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c index 38f4a165640d..68929bad9a6a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -20,6 +20,7 @@ #include <linux/cache.h> #include <linux/sched/sysctl.h> #include <linux/delay.h> +#include <linux/crash_dump.h> #include <trace/events/block.h> @@ -223,9 +224,11 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, struct blk_mq_hw_ctx *hctx; struct request *rq; struct blk_mq_alloc_data alloc_data; + int ret; - if (blk_mq_queue_enter(q)) - return NULL; + ret = blk_mq_queue_enter(q); + if (ret) + return ERR_PTR(ret); ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); @@ -245,6 +248,8 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, ctx = alloc_data.ctx; } blk_mq_put_ctx(ctx); + if (!rq) + return ERR_PTR(-EWOULDBLOCK); return rq; } EXPORT_SYMBOL(blk_mq_alloc_request); @@ -276,27 +281,7 @@ void blk_mq_free_request(struct request *rq) __blk_mq_free_request(hctx, ctx, rq); } -/* - * Clone all relevant state from a request that has been put on hold in - * the flush state machine into the preallocated flush request that hangs - * off the request queue. - * - * For a driver the flush request should be invisible, that's why we are - * impersonating the original request here. - */ -void blk_mq_clone_flush_request(struct request *flush_rq, - struct request *orig_rq) -{ - struct blk_mq_hw_ctx *hctx = - orig_rq->q->mq_ops->map_queue(orig_rq->q, orig_rq->mq_ctx->cpu); - - flush_rq->mq_ctx = orig_rq->mq_ctx; - flush_rq->tag = orig_rq->tag; - memcpy(blk_mq_rq_to_pdu(flush_rq), blk_mq_rq_to_pdu(orig_rq), - hctx->cmd_size); -} - -inline void __blk_mq_end_io(struct request *rq, int error) +inline void __blk_mq_end_request(struct request *rq, int error) { blk_account_io_done(rq); @@ -308,15 +293,15 @@ inline void __blk_mq_end_io(struct request *rq, int error) blk_mq_free_request(rq); } } -EXPORT_SYMBOL(__blk_mq_end_io); +EXPORT_SYMBOL(__blk_mq_end_request); -void blk_mq_end_io(struct request *rq, int error) +void blk_mq_end_request(struct request *rq, int error) { if (blk_update_request(rq, error, blk_rq_bytes(rq))) BUG(); - __blk_mq_end_io(rq, error); + __blk_mq_end_request(rq, error); } -EXPORT_SYMBOL(blk_mq_end_io); +EXPORT_SYMBOL(blk_mq_end_request); static void __blk_mq_complete_request_remote(void *data) { @@ -356,7 +341,7 @@ void __blk_mq_complete_request(struct request *rq) struct request_queue *q = rq->q; if (!q->softirq_done_fn) - blk_mq_end_io(rq, rq->errors); + blk_mq_end_request(rq, rq->errors); else blk_mq_ipi_complete_request(rq); } @@ -380,7 +365,7 @@ void blk_mq_complete_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_complete_request); -static void blk_mq_start_request(struct request *rq, bool last) +void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; @@ -417,35 +402,24 @@ static void blk_mq_start_request(struct request *rq, bool last) */ rq->nr_phys_segments++; } - - /* - * Flag the last request in the series so that drivers know when IO - * should be kicked off, if they don't do it on a per-request basis. - * - * Note: the flag isn't the only condition drivers should do kick off. - * If drive is busy, the last request might not have the bit set. - */ - if (last) - rq->cmd_flags |= REQ_END; } +EXPORT_SYMBOL(blk_mq_start_request); static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; trace_block_rq_requeue(q, rq); - clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - - rq->cmd_flags &= ~REQ_END; - if (q->dma_drain_size && blk_rq_bytes(rq)) - rq->nr_phys_segments--; + if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { + if (q->dma_drain_size && blk_rq_bytes(rq)) + rq->nr_phys_segments--; + } } void blk_mq_requeue_request(struct request *rq) { __blk_mq_requeue_request(rq); - blk_clear_rq_complete(rq); BUG_ON(blk_queued_rq(rq)); blk_mq_add_to_requeue_list(rq, true); @@ -514,78 +488,35 @@ void blk_mq_kick_requeue_list(struct request_queue *q) } EXPORT_SYMBOL(blk_mq_kick_requeue_list); -static inline bool is_flush_request(struct request *rq, unsigned int tag) +static inline bool is_flush_request(struct request *rq, + struct blk_flush_queue *fq, unsigned int tag) { return ((rq->cmd_flags & REQ_FLUSH_SEQ) && - rq->q->flush_rq->tag == tag); + fq->flush_rq->tag == tag); } struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { struct request *rq = tags->rqs[tag]; + /* mq_ctx of flush rq is always cloned from the corresponding req */ + struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx); - if (!is_flush_request(rq, tag)) + if (!is_flush_request(rq, fq, tag)) return rq; - return rq->q->flush_rq; + return fq->flush_rq; } EXPORT_SYMBOL(blk_mq_tag_to_rq); struct blk_mq_timeout_data { - struct blk_mq_hw_ctx *hctx; - unsigned long *next; - unsigned int *next_set; + unsigned long next; + unsigned int next_set; }; -static void blk_mq_timeout_check(void *__data, unsigned long *free_tags) +void blk_mq_rq_timed_out(struct request *req, bool reserved) { - struct blk_mq_timeout_data *data = __data; - struct blk_mq_hw_ctx *hctx = data->hctx; - unsigned int tag; - - /* It may not be in flight yet (this is where - * the REQ_ATOMIC_STARTED flag comes in). The requests are - * statically allocated, so we know it's always safe to access the - * memory associated with a bit offset into ->rqs[]. - */ - tag = 0; - do { - struct request *rq; - - tag = find_next_zero_bit(free_tags, hctx->tags->nr_tags, tag); - if (tag >= hctx->tags->nr_tags) - break; - - rq = blk_mq_tag_to_rq(hctx->tags, tag++); - if (rq->q != hctx->queue) - continue; - if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) - continue; - - blk_rq_check_expired(rq, data->next, data->next_set); - } while (1); -} - -static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx, - unsigned long *next, - unsigned int *next_set) -{ - struct blk_mq_timeout_data data = { - .hctx = hctx, - .next = next, - .next_set = next_set, - }; - - /* - * Ask the tagging code to iterate busy requests, so we can - * check them for timeout. - */ - blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data); -} - -static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq) -{ - struct request_queue *q = rq->q; + struct blk_mq_ops *ops = req->q->mq_ops; + enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; /* * We know that complete is set at this point. If STARTED isn't set @@ -596,21 +527,54 @@ static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq) * we both flags will get cleared. So check here again, and ignore * a timeout event with a request that isn't active. */ - if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) - return BLK_EH_NOT_HANDLED; + if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) + return; + + if (ops->timeout) + ret = ops->timeout(req, reserved); + + switch (ret) { + case BLK_EH_HANDLED: + __blk_mq_complete_request(req); + break; + case BLK_EH_RESET_TIMER: + blk_add_timer(req); + blk_clear_rq_complete(req); + break; + case BLK_EH_NOT_HANDLED: + break; + default: + printk(KERN_ERR "block: bad eh return: %d\n", ret); + break; + } +} + +static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, + struct request *rq, void *priv, bool reserved) +{ + struct blk_mq_timeout_data *data = priv; - if (!q->mq_ops->timeout) - return BLK_EH_RESET_TIMER; + if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + return; - return q->mq_ops->timeout(rq); + if (time_after_eq(jiffies, rq->deadline)) { + if (!blk_mark_rq_complete(rq)) + blk_mq_rq_timed_out(rq, reserved); + } else if (!data->next_set || time_after(data->next, rq->deadline)) { + data->next = rq->deadline; + data->next_set = 1; + } } -static void blk_mq_rq_timer(unsigned long data) +static void blk_mq_rq_timer(unsigned long priv) { - struct request_queue *q = (struct request_queue *) data; + struct request_queue *q = (struct request_queue *)priv; + struct blk_mq_timeout_data data = { + .next = 0, + .next_set = 0, + }; struct blk_mq_hw_ctx *hctx; - unsigned long next = 0; - int i, next_set = 0; + int i; queue_for_each_hw_ctx(q, hctx, i) { /* @@ -620,12 +584,12 @@ static void blk_mq_rq_timer(unsigned long data) if (!hctx->nr_ctx || !hctx->tags) continue; - blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); + blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data); } - if (next_set) { - next = blk_rq_timeout(round_jiffies_up(next)); - mod_timer(&q->timeout, next); + if (data.next_set) { + data.next = blk_rq_timeout(round_jiffies_up(data.next)); + mod_timer(&q->timeout, data.next); } else { queue_for_each_hw_ctx(q, hctx, i) blk_mq_tag_idle(hctx); @@ -751,9 +715,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) rq = list_first_entry(&rq_list, struct request, queuelist); list_del_init(&rq->queuelist); - blk_mq_start_request(rq, list_empty(&rq_list)); - - ret = q->mq_ops->queue_rq(hctx, rq); + ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list)); switch (ret) { case BLK_MQ_RQ_QUEUE_OK: queued++; @@ -766,7 +728,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) pr_err("blk-mq: bad return on queue: %d\n", ret); case BLK_MQ_RQ_QUEUE_ERROR: rq->errors = -EIO; - blk_mq_end_io(rq, rq->errors); + blk_mq_end_request(rq, rq->errors); break; } @@ -1194,14 +1156,13 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) int ret; blk_mq_bio_to_request(rq, bio); - blk_mq_start_request(rq, true); /* * For OK queue, we are done. For error, kill it. Any other * error (busy), just add it to our list as we previously * would have done */ - ret = q->mq_ops->queue_rq(data.hctx, rq); + ret = q->mq_ops->queue_rq(data.hctx, rq, true); if (ret == BLK_MQ_RQ_QUEUE_OK) goto done; else { @@ -1209,7 +1170,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) if (ret == BLK_MQ_RQ_QUEUE_ERROR) { rq->errors = -EIO; - blk_mq_end_io(rq, rq->errors); + blk_mq_end_request(rq, rq->errors); goto done; } } @@ -1531,6 +1492,28 @@ static int blk_mq_hctx_notify(void *data, unsigned long action, return NOTIFY_OK; } +static void blk_mq_exit_hctx(struct request_queue *q, + struct blk_mq_tag_set *set, + struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ + unsigned flush_start_tag = set->queue_depth; + + blk_mq_tag_idle(hctx); + + if (set->ops->exit_request) + set->ops->exit_request(set->driver_data, + hctx->fq->flush_rq, hctx_idx, + flush_start_tag + hctx_idx); + + if (set->ops->exit_hctx) + set->ops->exit_hctx(hctx, hctx_idx); + + blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); + blk_free_flush_queue(hctx->fq); + kfree(hctx->ctxs); + blk_mq_free_bitmap(&hctx->ctx_map); +} + static void blk_mq_exit_hw_queues(struct request_queue *q, struct blk_mq_tag_set *set, int nr_queue) { @@ -1540,17 +1523,8 @@ static void blk_mq_exit_hw_queues(struct request_queue *q, queue_for_each_hw_ctx(q, hctx, i) { if (i == nr_queue) break; - - blk_mq_tag_idle(hctx); - - if (set->ops->exit_hctx) - set->ops->exit_hctx(hctx, i); - - blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); - kfree(hctx->ctxs); - blk_mq_free_bitmap(&hctx->ctx_map); + blk_mq_exit_hctx(q, set, hctx, i); } - } static void blk_mq_free_hw_queues(struct request_queue *q, @@ -1565,53 +1539,88 @@ static void blk_mq_free_hw_queues(struct request_queue *q, } } -static int blk_mq_init_hw_queues(struct request_queue *q, - struct blk_mq_tag_set *set) +static int blk_mq_init_hctx(struct request_queue *q, + struct blk_mq_tag_set *set, + struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) { - struct blk_mq_hw_ctx *hctx; - unsigned int i; + int node; + unsigned flush_start_tag = set->queue_depth; + + node = hctx->numa_node; + if (node == NUMA_NO_NODE) + node = hctx->numa_node = set->numa_node; + + INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); + INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); + spin_lock_init(&hctx->lock); + INIT_LIST_HEAD(&hctx->dispatch); + hctx->queue = q; + hctx->queue_num = hctx_idx; + hctx->flags = set->flags; + hctx->cmd_size = set->cmd_size; + + blk_mq_init_cpu_notifier(&hctx->cpu_notifier, + blk_mq_hctx_notify, hctx); + blk_mq_register_cpu_notifier(&hctx->cpu_notifier); + + hctx->tags = set->tags[hctx_idx]; /* - * Initialize hardware queues + * Allocate space for all possible cpus to avoid allocation at + * runtime */ - queue_for_each_hw_ctx(q, hctx, i) { - int node; + hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), + GFP_KERNEL, node); + if (!hctx->ctxs) + goto unregister_cpu_notifier; - node = hctx->numa_node; - if (node == NUMA_NO_NODE) - node = hctx->numa_node = set->numa_node; + if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) + goto free_ctxs; - INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); - INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); - spin_lock_init(&hctx->lock); - INIT_LIST_HEAD(&hctx->dispatch); - hctx->queue = q; - hctx->queue_num = i; - hctx->flags = set->flags; - hctx->cmd_size = set->cmd_size; + hctx->nr_ctx = 0; - blk_mq_init_cpu_notifier(&hctx->cpu_notifier, - blk_mq_hctx_notify, hctx); - blk_mq_register_cpu_notifier(&hctx->cpu_notifier); + if (set->ops->init_hctx && + set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) + goto free_bitmap; - hctx->tags = set->tags[i]; + hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size); + if (!hctx->fq) + goto exit_hctx; - /* - * Allocate space for all possible cpus to avoid allocation at - * runtime - */ - hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), - GFP_KERNEL, node); - if (!hctx->ctxs) - break; + if (set->ops->init_request && + set->ops->init_request(set->driver_data, + hctx->fq->flush_rq, hctx_idx, + flush_start_tag + hctx_idx, node)) + goto free_fq; - if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) - break; + return 0; - hctx->nr_ctx = 0; + free_fq: + kfree(hctx->fq); + exit_hctx: + if (set->ops->exit_hctx) + set->ops->exit_hctx(hctx, hctx_idx); + free_bitmap: + blk_mq_free_bitmap(&hctx->ctx_map); + free_ctxs: + kfree(hctx->ctxs); + unregister_cpu_notifier: + blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); + + return -1; +} + +static int blk_mq_init_hw_queues(struct request_queue *q, + struct blk_mq_tag_set *set) +{ + struct blk_mq_hw_ctx *hctx; + unsigned int i; - if (set->ops->init_hctx && - set->ops->init_hctx(hctx, set->driver_data, i)) + /* + * Initialize hardware queues + */ + queue_for_each_hw_ctx(q, hctx, i) { + if (blk_mq_init_hctx(q, set, hctx, i)) break; } @@ -1765,6 +1774,16 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (!ctx) return ERR_PTR(-ENOMEM); + /* + * If a crashdump is active, then we are potentially in a very + * memory constrained environment. Limit us to 1 queue and + * 64 tags to prevent using too much memory. + */ + if (is_kdump_kernel()) { + set->nr_hw_queues = 1; + set->queue_depth = min(64U, set->queue_depth); + } + hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, set->numa_node); @@ -1783,7 +1802,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (!hctxs[i]) goto err_hctxs; - if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL)) + if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL, + node)) goto err_hctxs; atomic_set(&hctxs[i]->nr_active, 0); @@ -1830,7 +1850,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) else blk_queue_make_request(q, blk_sq_make_request); - blk_queue_rq_timed_out(q, blk_mq_rq_timed_out); if (set->timeout) blk_queue_rq_timeout(q, set->timeout); @@ -1842,17 +1861,10 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (set->ops->complete) blk_queue_softirq_done(q, set->ops->complete); - blk_mq_init_flush(q); blk_mq_init_cpu_queues(q, set->nr_hw_queues); - q->flush_rq = kzalloc(round_up(sizeof(struct request) + - set->cmd_size, cache_line_size()), - GFP_KERNEL); - if (!q->flush_rq) - goto err_hw; - if (blk_mq_init_hw_queues(q, set)) - goto err_flush_rq; + goto err_hw; mutex_lock(&all_q_mutex); list_add_tail(&q->all_q_node, &all_q_list); @@ -1864,8 +1876,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) return q; -err_flush_rq: - kfree(q->flush_rq); err_hw: blk_cleanup_queue(q); err_hctxs: |