summaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to 'block')
-rw-r--r--block/blk-mq-cpumap.c9
-rw-r--r--block/blk-mq-sysfs.c34
-rw-r--r--block/blk-mq-tag.c27
-rw-r--r--block/blk-mq-tag.h2
-rw-r--r--block/blk-mq.c118
-rw-r--r--block/blk-mq.h3
6 files changed, 128 insertions, 65 deletions
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 1e28ddb656b8..8764c241e5bb 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -31,7 +31,8 @@ static int get_first_sibling(unsigned int cpu)
return cpu;
}
-int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
+int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
+ const struct cpumask *online_mask)
{
unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
cpumask_var_t cpus;
@@ -41,7 +42,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
cpumask_clear(cpus);
nr_cpus = nr_uniq_cpus = 0;
- for_each_online_cpu(i) {
+ for_each_cpu(i, online_mask) {
nr_cpus++;
first_sibling = get_first_sibling(i);
if (!cpumask_test_cpu(first_sibling, cpus))
@@ -51,7 +52,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
queue = 0;
for_each_possible_cpu(i) {
- if (!cpu_online(i)) {
+ if (!cpumask_test_cpu(i, online_mask)) {
map[i] = 0;
continue;
}
@@ -95,7 +96,7 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
if (!map)
return NULL;
- if (!blk_mq_update_queue_map(map, set->nr_hw_queues))
+ if (!blk_mq_update_queue_map(map, set->nr_hw_queues, cpu_online_mask))
return map;
kfree(map);
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 279c5d674edf..788fffd9b409 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -229,8 +229,6 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
unsigned int i, first = 1;
ssize_t ret = 0;
- blk_mq_disable_hotplug();
-
for_each_cpu(i, hctx->cpumask) {
if (first)
ret += sprintf(ret + page, "%u", i);
@@ -240,8 +238,6 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
first = 0;
}
- blk_mq_enable_hotplug();
-
ret += sprintf(ret + page, "\n");
return ret;
}
@@ -343,7 +339,7 @@ static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx)
struct blk_mq_ctx *ctx;
int i;
- if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
+ if (!hctx->nr_ctx)
return;
hctx_for_each_ctx(hctx, ctx, i)
@@ -358,7 +354,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
struct blk_mq_ctx *ctx;
int i, ret;
- if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
+ if (!hctx->nr_ctx)
return 0;
ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num);
@@ -381,6 +377,8 @@ void blk_mq_unregister_disk(struct gendisk *disk)
struct blk_mq_ctx *ctx;
int i, j;
+ blk_mq_disable_hotplug();
+
queue_for_each_hw_ctx(q, hctx, i) {
blk_mq_unregister_hctx(hctx);
@@ -395,6 +393,9 @@ void blk_mq_unregister_disk(struct gendisk *disk)
kobject_put(&q->mq_kobj);
kobject_put(&disk_to_dev(disk)->kobj);
+
+ q->mq_sysfs_init_done = false;
+ blk_mq_enable_hotplug();
}
static void blk_mq_sysfs_init(struct request_queue *q)
@@ -425,27 +426,30 @@ int blk_mq_register_disk(struct gendisk *disk)
struct blk_mq_hw_ctx *hctx;
int ret, i;
+ blk_mq_disable_hotplug();
+
blk_mq_sysfs_init(q);
ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
if (ret < 0)
- return ret;
+ goto out;
kobject_uevent(&q->mq_kobj, KOBJ_ADD);
queue_for_each_hw_ctx(q, hctx, i) {
- hctx->flags |= BLK_MQ_F_SYSFS_UP;
ret = blk_mq_register_hctx(hctx);
if (ret)
break;
}
- if (ret) {
+ if (ret)
blk_mq_unregister_disk(disk);
- return ret;
- }
+ else
+ q->mq_sysfs_init_done = true;
+out:
+ blk_mq_enable_hotplug();
- return 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(blk_mq_register_disk);
@@ -454,6 +458,9 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;
+ if (!q->mq_sysfs_init_done)
+ return;
+
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_unregister_hctx(hctx);
}
@@ -463,6 +470,9 @@ int blk_mq_sysfs_register(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i, ret = 0;
+ if (!q->mq_sysfs_init_done)
+ return ret;
+
queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_register_hctx(hctx);
if (ret)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 9115c6d59948..ed96474d75cb 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -471,17 +471,30 @@ void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
}
EXPORT_SYMBOL(blk_mq_all_tag_busy_iter);
-void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
+void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
void *priv)
{
- struct blk_mq_tags *tags = hctx->tags;
+ struct blk_mq_hw_ctx *hctx;
+ int i;
+
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ struct blk_mq_tags *tags = hctx->tags;
+
+ /*
+ * If not software queues are currently mapped to this
+ * hardware queue, there's nothing to check
+ */
+ if (!blk_mq_hw_queue_mapped(hctx))
+ continue;
+
+ if (tags->nr_reserved_tags)
+ bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true);
+ bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
+ false);
+ }
- if (tags->nr_reserved_tags)
- bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true);
- bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
- false);
}
-EXPORT_SYMBOL(blk_mq_tag_busy_iter);
static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt)
{
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 9eb2cf4f01cb..d468a79f2c4a 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -58,6 +58,8 @@ extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag);
extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth);
extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
+void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
+ void *priv);
enum {
BLK_MQ_TAG_CACHE_MIN = 1,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f2d67b4047a0..7785ae96267a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -393,14 +393,16 @@ void __blk_mq_complete_request(struct request *rq)
* Ends all I/O on a request. It does not handle partial completions.
* The actual completion happens out-of-order, through a IPI handler.
**/
-void blk_mq_complete_request(struct request *rq)
+void blk_mq_complete_request(struct request *rq, int error)
{
struct request_queue *q = rq->q;
if (unlikely(blk_should_fake_timeout(q)))
return;
- if (!blk_mark_rq_complete(rq))
+ if (!blk_mark_rq_complete(rq)) {
+ rq->errors = error;
__blk_mq_complete_request(rq);
+ }
}
EXPORT_SYMBOL(blk_mq_complete_request);
@@ -616,10 +618,8 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
* If a request wasn't started before the queue was
* marked dying, kill it here or it'll go unnoticed.
*/
- if (unlikely(blk_queue_dying(rq->q))) {
- rq->errors = -EIO;
- blk_mq_complete_request(rq);
- }
+ if (unlikely(blk_queue_dying(rq->q)))
+ blk_mq_complete_request(rq, -EIO);
return;
}
if (rq->cmd_flags & REQ_NO_TIMEOUT)
@@ -641,24 +641,16 @@ static void blk_mq_rq_timer(unsigned long priv)
.next = 0,
.next_set = 0,
};
- struct blk_mq_hw_ctx *hctx;
int i;
- queue_for_each_hw_ctx(q, hctx, i) {
- /*
- * If not software queues are currently mapped to this
- * hardware queue, there's nothing to check
- */
- if (!blk_mq_hw_queue_mapped(hctx))
- continue;
-
- blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data);
- }
+ blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
if (data.next_set) {
data.next = blk_rq_timeout(round_jiffies_up(data.next));
mod_timer(&q->timeout, data.next);
} else {
+ struct blk_mq_hw_ctx *hctx;
+
queue_for_each_hw_ctx(q, hctx, i) {
/* the hctx may be unmapped, so check it here */
if (blk_mq_hw_queue_mapped(hctx))
@@ -1789,13 +1781,19 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
}
}
-static void blk_mq_map_swqueue(struct request_queue *q)
+static void blk_mq_map_swqueue(struct request_queue *q,
+ const struct cpumask *online_mask)
{
unsigned int i;
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
struct blk_mq_tag_set *set = q->tag_set;
+ /*
+ * Avoid others reading imcomplete hctx->cpumask through sysfs
+ */
+ mutex_lock(&q->sysfs_lock);
+
queue_for_each_hw_ctx(q, hctx, i) {
cpumask_clear(hctx->cpumask);
hctx->nr_ctx = 0;
@@ -1806,16 +1804,17 @@ static void blk_mq_map_swqueue(struct request_queue *q)
*/
queue_for_each_ctx(q, ctx, i) {
/* If the cpu isn't online, the cpu is mapped to first hctx */
- if (!cpu_online(i))
+ if (!cpumask_test_cpu(i, online_mask))
continue;
hctx = q->mq_ops->map_queue(q, i);
cpumask_set_cpu(i, hctx->cpumask);
- cpumask_set_cpu(i, hctx->tags->cpumask);
ctx->index_hw = hctx->nr_ctx;
hctx->ctxs[hctx->nr_ctx++] = ctx;
}
+ mutex_unlock(&q->sysfs_lock);
+
queue_for_each_hw_ctx(q, hctx, i) {
struct blk_mq_ctxmap *map = &hctx->ctx_map;
@@ -1851,6 +1850,14 @@ static void blk_mq_map_swqueue(struct request_queue *q)
hctx->next_cpu = cpumask_first(hctx->cpumask);
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
}
+
+ queue_for_each_ctx(q, ctx, i) {
+ if (!cpumask_test_cpu(i, online_mask))
+ continue;
+
+ hctx = q->mq_ops->map_queue(q, i);
+ cpumask_set_cpu(i, hctx->tags->cpumask);
+ }
}
static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
@@ -1918,6 +1925,9 @@ void blk_mq_release(struct request_queue *q)
kfree(hctx);
}
+ kfree(q->mq_map);
+ q->mq_map = NULL;
+
kfree(q->queue_hw_ctx);
/* ctx kobj stays in queue_ctx */
@@ -2027,13 +2037,15 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
if (blk_mq_init_hw_queues(q, set))
goto err_hctxs;
+ get_online_cpus();
mutex_lock(&all_q_mutex);
- list_add_tail(&q->all_q_node, &all_q_list);
- mutex_unlock(&all_q_mutex);
+ list_add_tail(&q->all_q_node, &all_q_list);
blk_mq_add_queue_tag_set(set, q);
+ blk_mq_map_swqueue(q, cpu_online_mask);
- blk_mq_map_swqueue(q);
+ mutex_unlock(&all_q_mutex);
+ put_online_cpus();
return q;
@@ -2057,30 +2069,27 @@ void blk_mq_free_queue(struct request_queue *q)
{
struct blk_mq_tag_set *set = q->tag_set;
+ mutex_lock(&all_q_mutex);
+ list_del_init(&q->all_q_node);
+ mutex_unlock(&all_q_mutex);
+
blk_mq_del_queue_tag_set(q);
blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
blk_mq_free_hw_queues(q, set);
percpu_ref_exit(&q->mq_usage_counter);
-
- kfree(q->mq_map);
-
- q->mq_map = NULL;
-
- mutex_lock(&all_q_mutex);
- list_del_init(&q->all_q_node);
- mutex_unlock(&all_q_mutex);
}
/* Basically redo blk_mq_init_queue with queue frozen */
-static void blk_mq_queue_reinit(struct request_queue *q)
+static void blk_mq_queue_reinit(struct request_queue *q,
+ const struct cpumask *online_mask)
{
WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
blk_mq_sysfs_unregister(q);
- blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
+ blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask);
/*
* redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
@@ -2088,7 +2097,7 @@ static void blk_mq_queue_reinit(struct request_queue *q)
* involves free and re-allocate memory, worthy doing?)
*/
- blk_mq_map_swqueue(q);
+ blk_mq_map_swqueue(q, online_mask);
blk_mq_sysfs_register(q);
}
@@ -2097,16 +2106,43 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
unsigned long action, void *hcpu)
{
struct request_queue *q;
+ int cpu = (unsigned long)hcpu;
+ /*
+ * New online cpumask which is going to be set in this hotplug event.
+ * Declare this cpumasks as global as cpu-hotplug operation is invoked
+ * one-by-one and dynamically allocating this could result in a failure.
+ */
+ static struct cpumask online_new;
/*
- * Before new mappings are established, hotadded cpu might already
- * start handling requests. This doesn't break anything as we map
- * offline CPUs to first hardware queue. We will re-init the queue
- * below to get optimal settings.
+ * Before hotadded cpu starts handling requests, new mappings must
+ * be established. Otherwise, these requests in hw queue might
+ * never be dispatched.
+ *
+ * For example, there is a single hw queue (hctx) and two CPU queues
+ * (ctx0 for CPU0, and ctx1 for CPU1).
+ *
+ * Now CPU1 is just onlined and a request is inserted into
+ * ctx1->rq_list and set bit0 in pending bitmap as ctx1->index_hw is
+ * still zero.
+ *
+ * And then while running hw queue, flush_busy_ctxs() finds bit0 is
+ * set in pending bitmap and tries to retrieve requests in
+ * hctx->ctxs[0]->rq_list. But htx->ctxs[0] is a pointer to ctx0,
+ * so the request in ctx1->rq_list is ignored.
*/
- if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
- action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DEAD:
+ case CPU_UP_CANCELED:
+ cpumask_copy(&online_new, cpu_online_mask);
+ break;
+ case CPU_UP_PREPARE:
+ cpumask_copy(&online_new, cpu_online_mask);
+ cpumask_set_cpu(cpu, &online_new);
+ break;
+ default:
return NOTIFY_OK;
+ }
mutex_lock(&all_q_mutex);
@@ -2130,7 +2166,7 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
}
list_for_each_entry(q, &all_q_list, all_q_node)
- blk_mq_queue_reinit(q);
+ blk_mq_queue_reinit(q, &online_new);
list_for_each_entry(q, &all_q_list, all_q_node)
blk_mq_unfreeze_queue(q);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 6a48c4c0d8a2..f4fea7964910 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -51,7 +51,8 @@ void blk_mq_disable_hotplug(void);
* CPU -> queue mappings
*/
extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
-extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
+extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
+ const struct cpumask *online_mask);
extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
/*