summaryrefslogtreecommitdiffstats
path: root/block/blk-mq.c
diff options
context:
space:
mode:
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r--block/blk-mq.c574
1 files changed, 438 insertions, 136 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f3d27a6dee09..c3400b5444a7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -30,6 +30,8 @@
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
+#include "blk-stat.h"
+#include "blk-wbt.h"
static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);
@@ -115,6 +117,33 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
+/**
+ * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have finished
+ * @q: request queue.
+ *
+ * Note: this function does not prevent that the struct request end_io()
+ * callback function is invoked. Additionally, it is not prevented that
+ * new queue_rq() calls occur unless the queue has been stopped first.
+ */
+void blk_mq_quiesce_queue(struct request_queue *q)
+{
+ struct blk_mq_hw_ctx *hctx;
+ unsigned int i;
+ bool rcu = false;
+
+ blk_mq_stop_hw_queues(q);
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (hctx->flags & BLK_MQ_F_BLOCKING)
+ synchronize_srcu(&hctx->queue_rq_srcu);
+ else
+ rcu = true;
+ }
+ if (rcu)
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
+
void blk_mq_wake_waiters(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
@@ -139,17 +168,15 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
EXPORT_SYMBOL(blk_mq_can_queue);
static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
- struct request *rq, int op,
- unsigned int op_flags)
+ struct request *rq, unsigned int op)
{
- if (blk_queue_io_stat(q))
- op_flags |= REQ_IO_STAT;
-
INIT_LIST_HEAD(&rq->queuelist);
/* csd/requeue_work/fifo_time is initialized before use */
rq->q = q;
rq->mq_ctx = ctx;
- req_set_op_attrs(rq, op, op_flags);
+ rq->cmd_flags = op;
+ if (blk_queue_io_stat(q))
+ rq->rq_flags |= RQF_IO_STAT;
/* do not touch atomic flags, it needs atomic ops against the timer */
rq->cpu = -1;
INIT_HLIST_NODE(&rq->hash);
@@ -184,11 +211,11 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
rq->end_io_data = NULL;
rq->next_rq = NULL;
- ctx->rq_dispatched[rw_is_sync(op, op_flags)]++;
+ ctx->rq_dispatched[op_is_sync(op)]++;
}
static struct request *
-__blk_mq_alloc_request(struct blk_mq_alloc_data *data, int op, int op_flags)
+__blk_mq_alloc_request(struct blk_mq_alloc_data *data, unsigned int op)
{
struct request *rq;
unsigned int tag;
@@ -198,12 +225,12 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int op, int op_flags)
rq = data->hctx->tags->rqs[tag];
if (blk_mq_tag_busy(data->hctx)) {
- rq->cmd_flags = REQ_MQ_INFLIGHT;
+ rq->rq_flags = RQF_MQ_INFLIGHT;
atomic_inc(&data->hctx->nr_active);
}
rq->tag = tag;
- blk_mq_rq_ctx_init(data->q, data->ctx, rq, op, op_flags);
+ blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
return rq;
}
@@ -226,7 +253,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
ctx = blk_mq_get_ctx(q);
hctx = blk_mq_map_queue(q, ctx->cpu);
blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
- rq = __blk_mq_alloc_request(&alloc_data, rw, 0);
+ rq = __blk_mq_alloc_request(&alloc_data, rw);
blk_mq_put_ctx(ctx);
if (!rq) {
@@ -278,7 +305,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask));
blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
- rq = __blk_mq_alloc_request(&alloc_data, rw, 0);
+ rq = __blk_mq_alloc_request(&alloc_data, rw);
if (!rq) {
ret = -EWOULDBLOCK;
goto out_queue_exit;
@@ -298,11 +325,14 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
const int tag = rq->tag;
struct request_queue *q = rq->q;
- if (rq->cmd_flags & REQ_MQ_INFLIGHT)
+ if (rq->rq_flags & RQF_MQ_INFLIGHT)
atomic_dec(&hctx->nr_active);
- rq->cmd_flags = 0;
+
+ wbt_done(q->rq_wb, &rq->issue_stat);
+ rq->rq_flags = 0;
clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+ clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
blk_mq_put_tag(hctx, ctx, tag);
blk_queue_exit(q);
}
@@ -328,6 +358,7 @@ inline void __blk_mq_end_request(struct request *rq, int error)
blk_account_io_done(rq);
if (rq->end_io) {
+ wbt_done(rq->q->rq_wb, &rq->issue_stat);
rq->end_io(rq, error);
} else {
if (unlikely(blk_bidi_rq(rq)))
@@ -378,10 +409,27 @@ static void blk_mq_ipi_complete_request(struct request *rq)
put_cpu();
}
+static void blk_mq_stat_add(struct request *rq)
+{
+ if (rq->rq_flags & RQF_STATS) {
+ /*
+ * We could rq->mq_ctx here, but there's less of a risk
+ * of races if we have the completion event add the stats
+ * to the local software queue.
+ */
+ struct blk_mq_ctx *ctx;
+
+ ctx = __blk_mq_get_ctx(rq->q, raw_smp_processor_id());
+ blk_stat_add(&ctx->stat[rq_data_dir(rq)], rq);
+ }
+}
+
static void __blk_mq_complete_request(struct request *rq)
{
struct request_queue *q = rq->q;
+ blk_mq_stat_add(rq);
+
if (!q->softirq_done_fn)
blk_mq_end_request(rq, rq->errors);
else
@@ -425,6 +473,12 @@ void blk_mq_start_request(struct request *rq)
if (unlikely(blk_bidi_rq(rq)))
rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
+ if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
+ blk_stat_set_issue_time(&rq->issue_stat);
+ rq->rq_flags |= RQF_STATS;
+ wbt_issue(q->rq_wb, &rq->issue_stat);
+ }
+
blk_add_timer(rq);
/*
@@ -460,6 +514,7 @@ static void __blk_mq_requeue_request(struct request *rq)
struct request_queue *q = rq->q;
trace_block_rq_requeue(q, rq);
+ wbt_requeue(q->rq_wb, &rq->issue_stat);
if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
if (q->dma_drain_size && blk_rq_bytes(rq))
@@ -467,12 +522,12 @@ static void __blk_mq_requeue_request(struct request *rq)
}
}
-void blk_mq_requeue_request(struct request *rq)
+void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
{
__blk_mq_requeue_request(rq);
BUG_ON(blk_queued_rq(rq));
- blk_mq_add_to_requeue_list(rq, true);
+ blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
}
EXPORT_SYMBOL(blk_mq_requeue_request);
@@ -489,10 +544,10 @@ static void blk_mq_requeue_work(struct work_struct *work)
spin_unlock_irqrestore(&q->requeue_lock, flags);
list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
- if (!(rq->cmd_flags & REQ_SOFTBARRIER))
+ if (!(rq->rq_flags & RQF_SOFTBARRIER))
continue;
- rq->cmd_flags &= ~REQ_SOFTBARRIER;
+ rq->rq_flags &= ~RQF_SOFTBARRIER;
list_del_init(&rq->queuelist);
blk_mq_insert_request(rq, true, false, false);
}
@@ -503,14 +558,11 @@ static void blk_mq_requeue_work(struct work_struct *work)
blk_mq_insert_request(rq, false, false, false);
}
- /*
- * Use the start variant of queue running here, so that running
- * the requeue work will kick stopped queues.
- */
- blk_mq_start_hw_queues(q);
+ blk_mq_run_hw_queues(q, false);
}
-void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
+void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
+ bool kick_requeue_list)
{
struct request_queue *q = rq->q;
unsigned long flags;
@@ -519,24 +571,21 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
* We abuse this flag that is otherwise used by the I/O scheduler to
* request head insertation from the workqueue.
*/
- BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER);
+ BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
spin_lock_irqsave(&q->requeue_lock, flags);
if (at_head) {
- rq->cmd_flags |= REQ_SOFTBARRIER;
+ rq->rq_flags |= RQF_SOFTBARRIER;
list_add(&rq->queuelist, &q->requeue_list);
} else {
list_add_tail(&rq->queuelist, &q->requeue_list);
}
spin_unlock_irqrestore(&q->requeue_lock, flags);
-}
-EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
-void blk_mq_cancel_requeue_work(struct request_queue *q)
-{
- cancel_delayed_work_sync(&q->requeue_work);
+ if (kick_requeue_list)
+ blk_mq_kick_requeue_list(q);
}
-EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work);
+EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q)
{
@@ -772,44 +821,13 @@ static inline unsigned int queued_to_index(unsigned int queued)
return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
}
-/*
- * Run this hardware queue, pulling any software queues mapped to it in.
- * Note that this function currently has various problems around ordering
- * of IO. In particular, we'd like FIFO behaviour on handling existing
- * items on the hctx->dispatch list. Ignore that for now.
- */
-static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
+bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
struct request_queue *q = hctx->queue;
struct request *rq;
- LIST_HEAD(rq_list);
LIST_HEAD(driver_list);
struct list_head *dptr;
- int queued;
-
- if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
- return;
-
- WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
- cpu_online(hctx->next_cpu));
-
- hctx->run++;
-
- /*
- * Touch any software queue that has pending entries.
- */
- flush_busy_ctxs(hctx, &rq_list);
-
- /*
- * If we have previous entries on our dispatch list, grab them
- * and stuff them at the front for more fair dispatch.
- */
- if (!list_empty_careful(&hctx->dispatch)) {
- spin_lock(&hctx->lock);
- if (!list_empty(&hctx->dispatch))
- list_splice_init(&hctx->dispatch, &rq_list);
- spin_unlock(&hctx->lock);
- }
+ int queued, ret = BLK_MQ_RQ_QUEUE_OK;
/*
* Start off with dptr being NULL, so we start the first request
@@ -821,16 +839,15 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
* Now process all the entries, sending them to the driver.
*/
queued = 0;
- while (!list_empty(&rq_list)) {
+ while (!list_empty(list)) {
struct blk_mq_queue_data bd;
- int ret;
- rq = list_first_entry(&rq_list, struct request, queuelist);
+ rq = list_first_entry(list, struct request, queuelist);
list_del_init(&rq->queuelist);
bd.rq = rq;
bd.list = dptr;
- bd.last = list_empty(&rq_list);
+ bd.last = list_empty(list);
ret = q->mq_ops->queue_rq(hctx, &bd);
switch (ret) {
@@ -838,7 +855,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
queued++;
break;
case BLK_MQ_RQ_QUEUE_BUSY:
- list_add(&rq->queuelist, &rq_list);
+ list_add(&rq->queuelist, list);
__blk_mq_requeue_request(rq);
break;
default:
@@ -856,7 +873,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
* We've done the first request. If we have more than 1
* left in the list, set dptr to defer issue.
*/
- if (!dptr && rq_list.next != rq_list.prev)
+ if (!dptr && list->next != list->prev)
dptr = &driver_list;
}
@@ -866,10 +883,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
* Any items that need requeuing? Stuff them into hctx->dispatch,
* that is where we will continue on next queue run.
*/
- if (!list_empty(&rq_list)) {
+ if (!list_empty(list)) {
spin_lock(&hctx->lock);
- list_splice(&rq_list, &hctx->dispatch);
+ list_splice(list, &hctx->dispatch);
spin_unlock(&hctx->lock);
+
/*
* the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but
* it's possible the queue is stopped and restarted again
@@ -881,6 +899,60 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
**/
blk_mq_run_hw_queue(hctx, true);
}
+
+ return ret != BLK_MQ_RQ_QUEUE_BUSY;
+}
+
+/*
+ * Run this hardware queue, pulling any software queues mapped to it in.
+ * Note that this function currently has various problems around ordering
+ * of IO. In particular, we'd like FIFO behaviour on handling existing
+ * items on the hctx->dispatch list. Ignore that for now.
+ */
+static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
+{
+ LIST_HEAD(rq_list);
+
+ if (unlikely(blk_mq_hctx_stopped(hctx)))
+ return;
+
+ hctx->run++;
+
+ /*
+ * Touch any software queue that has pending entries.
+ */
+ flush_busy_ctxs(hctx, &rq_list);
+
+ /*
+ * If we have previous entries on our dispatch list, grab them
+ * and stuff them at the front for more fair dispatch.
+ */
+ if (!list_empty_careful(&hctx->dispatch)) {
+ spin_lock(&hctx->lock);
+ if (!list_empty(&hctx->dispatch))
+ list_splice_init(&hctx->dispatch, &rq_list);
+ spin_unlock(&hctx->lock);
+ }
+
+ blk_mq_dispatch_rq_list(hctx, &rq_list);
+}
+
+static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
+{
+ int srcu_idx;
+
+ WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
+ cpu_online(hctx->next_cpu));
+
+ if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
+ rcu_read_lock();
+ blk_mq_process_rq_list(hctx);
+ rcu_read_unlock();
+ } else {
+ srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
+ blk_mq_process_rq_list(hctx);
+ srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
+ }
}
/*
@@ -895,7 +967,7 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
return WORK_CPU_UNBOUND;
if (--hctx->next_cpu_batch <= 0) {
- int cpu = hctx->next_cpu, next_cpu;
+ int next_cpu;
next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
if (next_cpu >= nr_cpu_ids)
@@ -903,8 +975,6 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
hctx->next_cpu = next_cpu;
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
-
- return cpu;
}
return hctx->next_cpu;
@@ -912,8 +982,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
- if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) ||
- !blk_mq_hw_queue_mapped(hctx)))
+ if (unlikely(blk_mq_hctx_stopped(hctx) ||
+ !blk_mq_hw_queue_mapped(hctx)))
return;
if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
@@ -938,7 +1008,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
queue_for_each_hw_ctx(q, hctx, i) {
if ((!blk_mq_hctx_has_pending(hctx) &&
list_empty_careful(&hctx->dispatch)) ||
- test_bit(BLK_MQ_S_STOPPED, &hctx->state))
+ blk_mq_hctx_stopped(hctx))
continue;
blk_mq_run_hw_queue(hctx, async);
@@ -946,6 +1016,26 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
}
EXPORT_SYMBOL(blk_mq_run_hw_queues);
+/**
+ * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
+ * @q: request queue.
+ *
+ * The caller is responsible for serializing this function against
+ * blk_mq_{start,stop}_hw_queue().
+ */
+bool blk_mq_queue_stopped(struct request_queue *q)
+{
+ struct blk_mq_hw_ctx *hctx;
+ int i;
+
+ queue_for_each_hw_ctx(q, hctx, i)
+ if (blk_mq_hctx_stopped(hctx))
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL(blk_mq_queue_stopped);
+
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
cancel_work(&hctx->run_work);
@@ -982,18 +1072,23 @@ void blk_mq_start_hw_queues(struct request_queue *q)
}
EXPORT_SYMBOL(blk_mq_start_hw_queues);
+void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
+{
+ if (!blk_mq_hctx_stopped(hctx))
+ return;
+
+ clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
+ blk_mq_run_hw_queue(hctx, async);
+}
+EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
+
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
{
struct blk_mq_hw_ctx *hctx;
int i;
- queue_for_each_hw_ctx(q, hctx, i) {
- if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
- continue;
-
- clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
- blk_mq_run_hw_queue(hctx, async);
- }
+ queue_for_each_hw_ctx(q, hctx, i)
+ blk_mq_start_stopped_hw_queue(hctx, async);
}
EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
@@ -1155,7 +1250,7 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
{
init_request_from_bio(rq, bio);
- blk_account_io_start(rq, 1);
+ blk_account_io_start(rq, true);
}
static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
@@ -1190,40 +1285,27 @@ insert_rq:
}
}
-struct blk_map_ctx {
- struct blk_mq_hw_ctx *hctx;
- struct blk_mq_ctx *ctx;
-};
-
static struct request *blk_mq_map_request(struct request_queue *q,
struct bio *bio,
- struct blk_map_ctx *data)
+ struct blk_mq_alloc_data *data)
{
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
struct request *rq;
- int op = bio_data_dir(bio);
- int op_flags = 0;
- struct blk_mq_alloc_data alloc_data;
blk_queue_enter_live(q);
ctx = blk_mq_get_ctx(q);
hctx = blk_mq_map_queue(q, ctx->cpu);
- if (rw_is_sync(bio_op(bio), bio->bi_opf))
- op_flags |= REQ_SYNC;
-
- trace_block_getrq(q, bio, op);
- blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx);
- rq = __blk_mq_alloc_request(&alloc_data, op, op_flags);
+ trace_block_getrq(q, bio, bio->bi_opf);
+ blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
+ rq = __blk_mq_alloc_request(data, bio->bi_opf);
- data->hctx = alloc_data.hctx;
- data->ctx = alloc_data.ctx;
data->hctx->queued++;
return rq;
}
-static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
+static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
{
int ret;
struct request_queue *q = rq->q;
@@ -1235,6 +1317,9 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
};
blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num);
+ if (blk_mq_hctx_stopped(hctx))
+ goto insert;
+
/*
* For OK queue, we are done. For error, kill it. Any other
* error (busy), just add it to our list as we previously
@@ -1243,7 +1328,7 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
ret = q->mq_ops->queue_rq(hctx, &bd);
if (ret == BLK_MQ_RQ_QUEUE_OK) {
*cookie = new_cookie;
- return 0;
+ return;
}
__blk_mq_requeue_request(rq);
@@ -1252,10 +1337,11 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
*cookie = BLK_QC_T_NONE;
rq->errors = -EIO;
blk_mq_end_request(rq, rq->errors);
- return 0;
+ return;
}
- return -1;
+insert:
+ blk_mq_insert_request(rq, false, true, true);
}
/*
@@ -1265,14 +1351,15 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
*/
static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
{
- const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf);
+ const int is_sync = op_is_sync(bio->bi_opf);
const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
- struct blk_map_ctx data;
+ struct blk_mq_alloc_data data;
struct request *rq;
- unsigned int request_count = 0;
+ unsigned int request_count = 0, srcu_idx;
struct blk_plug *plug;
struct request *same_queue_rq = NULL;
blk_qc_t cookie;
+ unsigned int wb_acct;
blk_queue_bounce(q, &bio);
@@ -1287,9 +1374,15 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
return BLK_QC_T_NONE;
+ wb_acct = wbt_wait(q->rq_wb, bio, NULL);
+
rq = blk_mq_map_request(q, bio, &data);
- if (unlikely(!rq))
+ if (unlikely(!rq)) {
+ __wbt_done(q->rq_wb, wb_acct);
return BLK_QC_T_NONE;
+ }
+
+ wbt_track(&rq->issue_stat, wb_acct);
cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
@@ -1312,7 +1405,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_mq_bio_to_request(rq, bio);
/*
- * We do limited pluging. If the bio can be merged, do that.
+ * We do limited plugging. If the bio can be merged, do that.
* Otherwise the existing request in the plug list will be
* issued. So the plug list will have one request at most
*/
@@ -1332,9 +1425,16 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_mq_put_ctx(data.ctx);
if (!old_rq)
goto done;
- if (!blk_mq_direct_issue_request(old_rq, &cookie))
- goto done;
- blk_mq_insert_request(old_rq, false, true, true);
+
+ if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) {
+ rcu_read_lock();
+ blk_mq_try_issue_directly(old_rq, &cookie);
+ rcu_read_unlock();
+ } else {
+ srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu);
+ blk_mq_try_issue_directly(old_rq, &cookie);
+ srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx);
+ }
goto done;
}
@@ -1359,13 +1459,14 @@ done:
*/
static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
{
- const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf);
+ const int is_sync = op_is_sync(bio->bi_opf);
const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
struct blk_plug *plug;
unsigned int request_count = 0;
- struct blk_map_ctx data;
+ struct blk_mq_alloc_data data;
struct request *rq;
blk_qc_t cookie;
+ unsigned int wb_acct;
blk_queue_bounce(q, &bio);
@@ -1382,9 +1483,15 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
} else
request_count = blk_plug_queued_count(q);
+ wb_acct = wbt_wait(q->rq_wb, bio, NULL);
+
rq = blk_mq_map_request(q, bio, &data);
- if (unlikely(!rq))
+ if (unlikely(!rq)) {
+ __wbt_done(q->rq_wb, wb_acct);
return BLK_QC_T_NONE;
+ }
+
+ wbt_track(&rq->issue_stat, wb_acct);
cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
@@ -1401,13 +1508,25 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
*/
plug = current->plug;
if (plug) {
+ struct request *last = NULL;
+
blk_mq_bio_to_request(rq, bio);
+
+ /*
+ * @request_count may become stale because of schedule
+ * out, so check the list again.
+ */
+ if (list_empty(&plug->mq_list))
+ request_count = 0;
if (!request_count)
trace_block_plug(q);
+ else
+ last = list_entry_rq(plug->mq_list.prev);
blk_mq_put_ctx(data.ctx);
- if (request_count >= BLK_MAX_REQUEST_COUNT) {
+ if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
+ blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
blk_flush_plug_list(plug, false);
trace_block_plug(q);
}
@@ -1485,7 +1604,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
INIT_LIST_HEAD(&tags->page_list);
tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
- GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
+ GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
set->numa_node);
if (!tags->rqs) {
blk_mq_free_tags(tags);
@@ -1511,7 +1630,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
do {
page = alloc_pages_node(set->numa_node,
- GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
+ GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
this_order);
if (page)
break;
@@ -1532,7 +1651,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
* Allow kmemleak to scan these pages as they contain pointers
* to additional allocations like via ops->init_request().
*/
- kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL);
+ kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
entries_per_page = order_to_size(this_order) / rq_size;
to_do = min(entries_per_page, set->queue_depth - i);
left -= to_do * rq_size;
@@ -1613,6 +1732,9 @@ static void blk_mq_exit_hctx(struct request_queue *q,
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
+ if (hctx->flags & BLK_MQ_F_BLOCKING)
+ cleanup_srcu_struct(&hctx->queue_rq_srcu);
+
blk_mq_remove_cpuhp(hctx);
blk_free_flush_queue(hctx->fq);
sbitmap_free(&hctx->ctx_map);
@@ -1693,6 +1815,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
flush_start_tag + hctx_idx, node))
goto free_fq;
+ if (hctx->flags & BLK_MQ_F_BLOCKING)
+ init_srcu_struct(&hctx->queue_rq_srcu);
+
return 0;
free_fq:
@@ -1723,6 +1848,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
spin_lock_init(&__ctx->lock);
INIT_LIST_HEAD(&__ctx->rq_list);
__ctx->queue = q;
+ blk_stat_init(&__ctx->stat[BLK_STAT_READ]);
+ blk_stat_init(&__ctx->stat[BLK_STAT_WRITE]);
/* If the cpu isn't online, the cpu is mapped to first hctx */
if (!cpu_online(i))
@@ -1742,7 +1869,7 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
static void blk_mq_map_swqueue(struct request_queue *q,
const struct cpumask *online_mask)
{
- unsigned int i;
+ unsigned int i, hctx_idx;
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
struct blk_mq_tag_set *set = q->tag_set;
@@ -1765,6 +1892,21 @@ static void blk_mq_map_swqueue(struct request_queue *q,
if (!cpumask_test_cpu(i, online_mask))
continue;
+ hctx_idx = q->mq_map[i];
+ /* unmapped hw queue can be remapped after CPU topo changed */
+ if (!set->tags[hctx_idx]) {
+ set->tags[hctx_idx] = blk_mq_init_rq_map(set, hctx_idx);
+
+ /*
+ * If tags initialization fail for some hctx,
+ * that hctx won't be brought online. In this
+ * case, remap the current ctx to hctx[0] which
+ * is guaranteed to always have tags allocated
+ */
+ if (!set->tags[hctx_idx])
+ q->mq_map[i] = 0;
+ }
+
ctx = per_cpu_ptr(q->queue_ctx, i);
hctx = blk_mq_map_queue(q, i);
@@ -1781,7 +1923,11 @@ static void blk_mq_map_swqueue(struct request_queue *q,
* disable it and free the request entries.
*/
if (!hctx->nr_ctx) {
- if (set->tags[i]) {
+ /* Never unmap queue 0. We need it as a
+ * fallback in case of a new remap fails
+ * allocation
+ */
+ if (i && set->tags[i]) {
blk_mq_free_rq_map(set, set->tags[i], i);
set->tags[i] = NULL;
}
@@ -1789,9 +1935,6 @@ static void blk_mq_map_swqueue(struct request_queue *q,
continue;
}
- /* unmapped hw queue can be remapped after CPU topo changed */
- if (!set->tags[i])
- set->tags[i] = blk_mq_init_rq_map(set, i);
hctx->tags = set->tags[i];
WARN_ON(!hctx->tags);
@@ -2018,6 +2161,11 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
*/
q->nr_requests = set->queue_depth;
+ /*
+ * Default to classic polling
+ */
+ q->poll_nsec = -1;
+
if (set->ops->complete)
blk_queue_softirq_done(q, set->ops->complete);
@@ -2053,6 +2201,8 @@ void blk_mq_free_queue(struct request_queue *q)
list_del_init(&q->all_q_node);
mutex_unlock(&all_q_mutex);
+ wbt_exit(q);
+
blk_mq_del_queue_tag_set(q);
blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
@@ -2099,16 +2249,9 @@ static void blk_mq_queue_reinit_work(void)
*/
list_for_each_entry(q, &all_q_list, all_q_node)
blk_mq_freeze_queue_start(q);
- list_for_each_entry(q, &all_q_list, all_q_node) {
+ list_for_each_entry(q, &all_q_list, all_q_node)
blk_mq_freeze_queue_wait(q);
- /*
- * timeout handler can't touch hw queue during the
- * reinitialization
- */
- del_timer_sync(&q->timeout);
- }
-
list_for_each_entry(q, &all_q_list, all_q_node)
blk_mq_queue_reinit(q, &cpuhp_online_new);
@@ -2353,6 +2496,165 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
+static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
+ struct blk_mq_hw_ctx *hctx,
+ struct request *rq)
+{
+ struct blk_rq_stat stat[2];
+ unsigned long ret = 0;
+
+ /*
+ * If stats collection isn't on, don't sleep but turn it on for
+ * future users
+ */
+ if (!blk_stat_enable(q))
+ return 0;
+
+ /*
+ * We don't have to do this once per IO, should optimize this
+ * to just use the current window of stats until it changes
+ */
+ memset(&stat, 0, sizeof(stat));
+ blk_hctx_stat_get(hctx, stat);
+
+ /*
+ * As an optimistic guess, use half of the mean service time
+ * for this type of request. We can (and should) make this smarter.
+ * For instance, if the completion latencies are tight, we can
+ * get closer than just half the mean. This is especially
+ * important on devices where the completion latencies are longer
+ * than ~10 usec.
+ */
+ if (req_op(rq) == REQ_OP_READ && stat[BLK_STAT_READ].nr_samples)
+ ret = (stat[BLK_STAT_READ].mean + 1) / 2;
+ else if (req_op(rq) == REQ_OP_WRITE && stat[BLK_STAT_WRITE].nr_samples)
+ ret = (stat[BLK_STAT_WRITE].mean + 1) / 2;
+
+ return ret;
+}
+
+static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
+ struct blk_mq_hw_ctx *hctx,
+ struct request *rq)
+{
+ struct hrtimer_sleeper hs;
+ enum hrtimer_mode mode;
+ unsigned int nsecs;
+ ktime_t kt;
+
+ if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
+ return false;
+
+ /*
+ * poll_nsec can be:
+ *
+ * -1: don't ever hybrid sleep
+ * 0: use half of prev avg
+ * >0: use this specific value
+ */
+ if (q->poll_nsec == -1)
+ return false;
+ else if (q->poll_nsec > 0)
+ nsecs = q->poll_nsec;
+ else
+ nsecs = blk_mq_poll_nsecs(q, hctx, rq);
+
+ if (!nsecs)
+ return false;
+
+ set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
+
+ /*
+ * This will be replaced with the stats tracking code, using
+ * 'avg_completion_time / 2' as the pre-sleep target.
+ */
+ kt = nsecs;
+
+ mode = HRTIMER_MODE_REL;
+ hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
+ hrtimer_set_expires(&hs.timer, kt);
+
+ hrtimer_init_sleeper(&hs, current);
+ do {
+ if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
+ break;
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ hrtimer_start_expires(&hs.timer, mode);
+ if (hs.task)
+ io_schedule();
+ hrtimer_cancel(&hs.timer);
+ mode = HRTIMER_MODE_ABS;
+ } while (hs.task && !signal_pending(current));
+
+ __set_current_state(TASK_RUNNING);
+ destroy_hrtimer_on_stack(&hs.timer);
+ return true;
+}
+
+static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+ struct request_queue *q = hctx->queue;
+ long state;
+
+ /*
+ * If we sleep, have the caller restart the poll loop to reset
+ * the state. Like for the other success return cases, the
+ * caller is responsible for checking if the IO completed. If
+ * the IO isn't complete, we'll get called again and will go
+ * straight to the busy poll loop.
+ */
+ if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
+ return true;
+
+ hctx->poll_considered++;
+
+ state = current->state;
+ while (!need_resched()) {
+ int ret;
+
+ hctx->poll_invoked++;
+
+ ret = q->mq_ops->poll(hctx, rq->tag);
+ if (ret > 0) {
+ hctx->poll_success++;
+ set_current_state(TASK_RUNNING);
+ return true;
+ }
+
+ if (signal_pending_state(state, current))
+ set_current_state(TASK_RUNNING);
+
+ if (current->state == TASK_RUNNING)
+ return true;
+ if (ret < 0)
+ break;
+ cpu_relax();
+ }
+
+ return false;
+}
+
+bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
+{
+ struct blk_mq_hw_ctx *hctx;
+ struct blk_plug *plug;
+ struct request *rq;
+
+ if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) ||
+ !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+ return false;
+
+ plug = current->plug;
+ if (plug)
+ blk_flush_plug_list(plug, false);
+
+ hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
+ rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
+
+ return __blk_mq_poll(hctx, rq);
+}
+EXPORT_SYMBOL_GPL(blk_mq_poll);
+
void blk_mq_disable_hotplug(void)
{
mutex_lock(&all_q_mutex);