summaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
authorJens Axboe <axboe@kernel.dk>2020-07-20 23:38:23 +0200
committerJens Axboe <axboe@kernel.dk>2020-07-20 23:38:23 +0200
commit9caaa66c918c020fd16e84d1c6ebcce9960df1b2 (patch)
tree578d455a53453a1aead77da399c5134e73224a66 /block
parentLinux 5.8-rc6 (diff)
parentblk-cgroup: show global disk stats in root cgroup io.stat (diff)
downloadlinux-9caaa66c918c020fd16e84d1c6ebcce9960df1b2.tar.xz
linux-9caaa66c918c020fd16e84d1c6ebcce9960df1b2.zip
Merge branch 'for-5.9/block' into for-5.9/block-merge
* for-5.9/block: (124 commits) blk-cgroup: show global disk stats in root cgroup io.stat blk-cgroup: make iostat functions visible to stat printing block: improve discard bio alignment in __blkdev_issue_discard() block: change REQ_OP_ZONE_RESET and REQ_OP_ZONE_RESET_ALL to be odd numbers block: defer flush request no matter whether we have elevator block: make blk_timeout_init() static block: remove retry loop in ioc_release_fn() block: remove unnecessary ioc nested locking block: integrate bd_start_claiming into __blkdev_get block: use bd_prepare_to_claim directly in the loop driver block: refactor bd_start_claiming block: simplify the restart case in __blkdev_get Revert "blk-rq-qos: remove redundant finish_wait to rq_qos_wait." block: always remove partitions from blk_drop_partitions() block: relax jiffies rounding for timeouts blk-mq: remove redundant validation in __blk_mq_end_request() blk-mq: Remove unnecessary local variable writeback: remove bdi->congested_fn writeback: remove struct bdi_writeback_congested writeback: remove {set,clear}_wb_congested ...
Diffstat (limited to 'block')
-rw-r--r--block/Makefile2
-rw-r--r--block/bio.c161
-rw-r--r--block/blk-cgroup.c402
-rw-r--r--block/blk-core.c286
-rw-r--r--block/blk-crypto-fallback.c2
-rw-r--r--block/blk-crypto.c2
-rw-r--r--block/blk-flush.c23
-rw-r--r--block/blk-ioc.c42
-rw-r--r--block/blk-iocost.c3
-rw-r--r--block/blk-iolatency.c3
-rw-r--r--block/blk-lib.c31
-rw-r--r--block/blk-merge.c25
-rw-r--r--block/blk-mq-debugfs.c8
-rw-r--r--block/blk-mq-sched.c101
-rw-r--r--block/blk-mq-tag.c62
-rw-r--r--block/blk-mq-tag.h41
-rw-r--r--block/blk-mq.c390
-rw-r--r--block/blk-mq.h17
-rw-r--r--block/blk-softirq.c156
-rw-r--r--block/blk-sysfs.c52
-rw-r--r--block/blk-throttle.c14
-rw-r--r--block/blk-timeout.c28
-rw-r--r--block/blk.h37
-rw-r--r--block/bounce.c2
-rw-r--r--block/bsg-lib.c5
-rw-r--r--block/genhd.c85
-rw-r--r--block/partitions/core.c2
27 files changed, 1043 insertions, 939 deletions
diff --git a/block/Makefile b/block/Makefile
index 78719169fb2a..8d841f5f986f 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
- blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
+ blk-exec.o blk-merge.o blk-timeout.o \
blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o
diff --git a/block/bio.c b/block/bio.c
index a7366c02c9b5..ef91782fd668 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -234,8 +234,12 @@ fallback:
void bio_uninit(struct bio *bio)
{
- bio_disassociate_blkg(bio);
-
+#ifdef CONFIG_BLK_CGROUP
+ if (bio->bi_blkg) {
+ blkg_put(bio->bi_blkg);
+ bio->bi_blkg = NULL;
+ }
+#endif
if (bio_integrity(bio))
bio_integrity_free(bio);
@@ -354,7 +358,7 @@ static void bio_alloc_rescue(struct work_struct *work)
if (!bio)
break;
- generic_make_request(bio);
+ submit_bio_noacct(bio);
}
}
@@ -412,19 +416,19 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
* submit the previously allocated bio for IO before attempting to allocate
* a new one. Failure to do so can cause deadlocks under memory pressure.
*
- * Note that when running under generic_make_request() (i.e. any block
+ * Note that when running under submit_bio_noacct() (i.e. any block
* driver), bios are not submitted until after you return - see the code in
- * generic_make_request() that converts recursion into iteration, to prevent
+ * submit_bio_noacct() that converts recursion into iteration, to prevent
* stack overflows.
*
* This would normally mean allocating multiple bios under
- * generic_make_request() would be susceptible to deadlocks, but we have
+ * submit_bio_noacct() would be susceptible to deadlocks, but we have
* deadlock avoidance code that resubmits any blocked bios from a rescuer
* thread.
*
* However, we do not guarantee forward progress for allocations from other
* mempools. Doing multiple allocations from the same mempool under
- * generic_make_request() should be avoided - instead, use bio_set's front_pad
+ * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad
* for per bio allocations.
*
* RETURNS:
@@ -444,9 +448,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
if (nr_iovecs > UIO_MAXIOV)
return NULL;
- p = kmalloc(sizeof(struct bio) +
- nr_iovecs * sizeof(struct bio_vec),
- gfp_mask);
+ p = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask);
front_pad = 0;
inline_vecs = nr_iovecs;
} else {
@@ -455,14 +457,14 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
nr_iovecs > 0))
return NULL;
/*
- * generic_make_request() converts recursion to iteration; this
+ * submit_bio_noacct() converts recursion to iteration; this
* means if we're running beneath it, any bios we allocate and
* submit will not be submitted (and thus freed) until after we
* return.
*
* This exposes us to a potential deadlock if we allocate
* multiple bios from the same bio_set() while running
- * underneath generic_make_request(). If we were to allocate
+ * underneath submit_bio_noacct(). If we were to allocate
* multiple bios (say a stacking block driver that was splitting
* bios), we would deadlock if we exhausted the mempool's
* reserve.
@@ -1625,141 +1627,6 @@ int bioset_init_from_src(struct bio_set *bs, struct bio_set *src)
}
EXPORT_SYMBOL(bioset_init_from_src);
-#ifdef CONFIG_BLK_CGROUP
-
-/**
- * bio_disassociate_blkg - puts back the blkg reference if associated
- * @bio: target bio
- *
- * Helper to disassociate the blkg from @bio if a blkg is associated.
- */
-void bio_disassociate_blkg(struct bio *bio)
-{
- if (bio->bi_blkg) {
- blkg_put(bio->bi_blkg);
- bio->bi_blkg = NULL;
- }
-}
-EXPORT_SYMBOL_GPL(bio_disassociate_blkg);
-
-/**
- * __bio_associate_blkg - associate a bio with the a blkg
- * @bio: target bio
- * @blkg: the blkg to associate
- *
- * This tries to associate @bio with the specified @blkg. Association failure
- * is handled by walking up the blkg tree. Therefore, the blkg associated can
- * be anything between @blkg and the root_blkg. This situation only happens
- * when a cgroup is dying and then the remaining bios will spill to the closest
- * alive blkg.
- *
- * A reference will be taken on the @blkg and will be released when @bio is
- * freed.
- */
-static void __bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
-{
- bio_disassociate_blkg(bio);
-
- bio->bi_blkg = blkg_tryget_closest(blkg);
-}
-
-/**
- * bio_associate_blkg_from_css - associate a bio with a specified css
- * @bio: target bio
- * @css: target css
- *
- * Associate @bio with the blkg found by combining the css's blkg and the
- * request_queue of the @bio. This falls back to the queue's root_blkg if
- * the association fails with the css.
- */
-void bio_associate_blkg_from_css(struct bio *bio,
- struct cgroup_subsys_state *css)
-{
- struct request_queue *q = bio->bi_disk->queue;
- struct blkcg_gq *blkg;
-
- rcu_read_lock();
-
- if (!css || !css->parent)
- blkg = q->root_blkg;
- else
- blkg = blkg_lookup_create(css_to_blkcg(css), q);
-
- __bio_associate_blkg(bio, blkg);
-
- rcu_read_unlock();
-}
-EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
-
-#ifdef CONFIG_MEMCG
-/**
- * bio_associate_blkg_from_page - associate a bio with the page's blkg
- * @bio: target bio
- * @page: the page to lookup the blkcg from
- *
- * Associate @bio with the blkg from @page's owning memcg and the respective
- * request_queue. If cgroup_e_css returns %NULL, fall back to the queue's
- * root_blkg.
- */
-void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
-{
- struct cgroup_subsys_state *css;
-
- if (!page->mem_cgroup)
- return;
-
- rcu_read_lock();
-
- css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
- bio_associate_blkg_from_css(bio, css);
-
- rcu_read_unlock();
-}
-#endif /* CONFIG_MEMCG */
-
-/**
- * bio_associate_blkg - associate a bio with a blkg
- * @bio: target bio
- *
- * Associate @bio with the blkg found from the bio's css and request_queue.
- * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is
- * already associated, the css is reused and association redone as the
- * request_queue may have changed.
- */
-void bio_associate_blkg(struct bio *bio)
-{
- struct cgroup_subsys_state *css;
-
- rcu_read_lock();
-
- if (bio->bi_blkg)
- css = &bio_blkcg(bio)->css;
- else
- css = blkcg_css();
-
- bio_associate_blkg_from_css(bio, css);
-
- rcu_read_unlock();
-}
-EXPORT_SYMBOL_GPL(bio_associate_blkg);
-
-/**
- * bio_clone_blkg_association - clone blkg association from src to dst bio
- * @dst: destination bio
- * @src: source bio
- */
-void bio_clone_blkg_association(struct bio *dst, struct bio *src)
-{
- rcu_read_lock();
-
- if (src->bi_blkg)
- __bio_associate_blkg(dst, src->bi_blkg);
-
- rcu_read_unlock();
-}
-EXPORT_SYMBOL_GPL(bio_clone_blkg_association);
-#endif /* CONFIG_BLK_CGROUP */
-
static void __init biovec_init_slabs(void)
{
int i;
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0ecc897b225c..619a79b51068 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -95,9 +95,6 @@ static void __blkg_release(struct rcu_head *rcu)
css_put(&blkg->blkcg->css);
if (blkg->parent)
blkg_put(blkg->parent);
-
- wb_congested_put(blkg->wb_congested);
-
blkg_free(blkg);
}
@@ -227,7 +224,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
struct blkcg_gq *new_blkg)
{
struct blkcg_gq *blkg;
- struct bdi_writeback_congested *wb_congested;
int i, ret;
WARN_ON_ONCE(!rcu_read_lock_held());
@@ -245,31 +241,22 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
goto err_free_blkg;
}
- wb_congested = wb_congested_get_create(q->backing_dev_info,
- blkcg->css.id,
- GFP_NOWAIT | __GFP_NOWARN);
- if (!wb_congested) {
- ret = -ENOMEM;
- goto err_put_css;
- }
-
/* allocate */
if (!new_blkg) {
new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
if (unlikely(!new_blkg)) {
ret = -ENOMEM;
- goto err_put_congested;
+ goto err_put_css;
}
}
blkg = new_blkg;
- blkg->wb_congested = wb_congested;
/* link parent */
if (blkcg_parent(blkcg)) {
blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
if (WARN_ON_ONCE(!blkg->parent)) {
ret = -ENODEV;
- goto err_put_congested;
+ goto err_put_css;
}
blkg_get(blkg->parent);
}
@@ -306,8 +293,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
blkg_put(blkg);
return ERR_PTR(ret);
-err_put_congested:
- wb_congested_put(wb_congested);
err_put_css:
css_put(&blkcg->css);
err_free_blkg:
@@ -316,30 +301,35 @@ err_free_blkg:
}
/**
- * __blkg_lookup_create - lookup blkg, try to create one if not there
+ * blkg_lookup_create - lookup blkg, try to create one if not there
* @blkcg: blkcg of interest
* @q: request_queue of interest
*
* Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to
* create one. blkg creation is performed recursively from blkcg_root such
* that all non-root blkg's have access to the parent blkg. This function
- * should be called under RCU read lock and @q->queue_lock.
+ * should be called under RCU read lock and takes @q->queue_lock.
*
* Returns the blkg or the closest blkg if blkg_create() fails as it walks
* down from root.
*/
-struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
- struct request_queue *q)
+static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
+ struct request_queue *q)
{
struct blkcg_gq *blkg;
+ unsigned long flags;
WARN_ON_ONCE(!rcu_read_lock_held());
- lockdep_assert_held(&q->queue_lock);
- blkg = __blkg_lookup(blkcg, q, true);
+ blkg = blkg_lookup(blkcg, q);
if (blkg)
return blkg;
+ spin_lock_irqsave(&q->queue_lock, flags);
+ blkg = __blkg_lookup(blkcg, q, true);
+ if (blkg)
+ goto found;
+
/*
* Create blkgs walking down from blkcg_root to @blkcg, so that all
* non-root blkgs have access to their parents. Returns the closest
@@ -362,34 +352,16 @@ struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
}
blkg = blkg_create(pos, q, NULL);
- if (IS_ERR(blkg))
- return ret_blkg;
+ if (IS_ERR(blkg)) {
+ blkg = ret_blkg;
+ break;
+ }
if (pos == blkcg)
- return blkg;
- }
-}
-
-/**
- * blkg_lookup_create - find or create a blkg
- * @blkcg: target block cgroup
- * @q: target request_queue
- *
- * This looks up or creates the blkg representing the unique pair
- * of the blkcg and the request_queue.
- */
-struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
- struct request_queue *q)
-{
- struct blkcg_gq *blkg = blkg_lookup(blkcg, q);
-
- if (unlikely(!blkg)) {
- unsigned long flags;
-
- spin_lock_irqsave(&q->queue_lock, flags);
- blkg = __blkg_lookup_create(blkcg, q);
- spin_unlock_irqrestore(&q->queue_lock, flags);
+ break;
}
+found:
+ spin_unlock_irqrestore(&q->queue_lock, flags);
return blkg;
}
@@ -739,12 +711,137 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx)
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
+static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
+{
+ int i;
+
+ for (i = 0; i < BLKG_IOSTAT_NR; i++) {
+ dst->bytes[i] = src->bytes[i];
+ dst->ios[i] = src->ios[i];
+ }
+}
+
+static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
+{
+ int i;
+
+ for (i = 0; i < BLKG_IOSTAT_NR; i++) {
+ dst->bytes[i] += src->bytes[i];
+ dst->ios[i] += src->ios[i];
+ }
+}
+
+static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
+{
+ int i;
+
+ for (i = 0; i < BLKG_IOSTAT_NR; i++) {
+ dst->bytes[i] -= src->bytes[i];
+ dst->ios[i] -= src->ios[i];
+ }
+}
+
+static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+ struct blkcg *blkcg = css_to_blkcg(css);
+ struct blkcg_gq *blkg;
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+ struct blkcg_gq *parent = blkg->parent;
+ struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
+ struct blkg_iostat cur, delta;
+ unsigned int seq;
+
+ /* fetch the current per-cpu values */
+ do {
+ seq = u64_stats_fetch_begin(&bisc->sync);
+ blkg_iostat_set(&cur, &bisc->cur);
+ } while (u64_stats_fetch_retry(&bisc->sync, seq));
+
+ /* propagate percpu delta to global */
+ u64_stats_update_begin(&blkg->iostat.sync);
+ blkg_iostat_set(&delta, &cur);
+ blkg_iostat_sub(&delta, &bisc->last);
+ blkg_iostat_add(&blkg->iostat.cur, &delta);
+ blkg_iostat_add(&bisc->last, &delta);
+ u64_stats_update_end(&blkg->iostat.sync);
+
+ /* propagate global delta to parent */
+ if (parent) {
+ u64_stats_update_begin(&parent->iostat.sync);
+ blkg_iostat_set(&delta, &blkg->iostat.cur);
+ blkg_iostat_sub(&delta, &blkg->iostat.last);
+ blkg_iostat_add(&parent->iostat.cur, &delta);
+ blkg_iostat_add(&blkg->iostat.last, &delta);
+ u64_stats_update_end(&parent->iostat.sync);
+ }
+ }
+
+ rcu_read_unlock();
+}
+
+/*
+ * The rstat algorithms intentionally don't handle the root cgroup to avoid
+ * incurring overhead when no cgroups are defined. For that reason,
+ * cgroup_rstat_flush in blkcg_print_stat does not actually fill out the
+ * iostat in the root cgroup's blkcg_gq.
+ *
+ * However, we would like to re-use the printing code between the root and
+ * non-root cgroups to the extent possible. For that reason, we simulate
+ * flushing the root cgroup's stats by explicitly filling in the iostat
+ * with disk level statistics.
+ */
+static void blkcg_fill_root_iostats(void)
+{
+ struct class_dev_iter iter;
+ struct device *dev;
+
+ class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
+ while ((dev = class_dev_iter_next(&iter))) {
+ struct gendisk *disk = dev_to_disk(dev);
+ struct hd_struct *part = disk_get_part(disk, 0);
+ struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue);
+ struct blkg_iostat tmp;
+ int cpu;
+
+ memset(&tmp, 0, sizeof(tmp));
+ for_each_possible_cpu(cpu) {
+ struct disk_stats *cpu_dkstats;
+
+ cpu_dkstats = per_cpu_ptr(part->dkstats, cpu);
+ tmp.ios[BLKG_IOSTAT_READ] +=
+ cpu_dkstats->ios[STAT_READ];
+ tmp.ios[BLKG_IOSTAT_WRITE] +=
+ cpu_dkstats->ios[STAT_WRITE];
+ tmp.ios[BLKG_IOSTAT_DISCARD] +=
+ cpu_dkstats->ios[STAT_DISCARD];
+ // convert sectors to bytes
+ tmp.bytes[BLKG_IOSTAT_READ] +=
+ cpu_dkstats->sectors[STAT_READ] << 9;
+ tmp.bytes[BLKG_IOSTAT_WRITE] +=
+ cpu_dkstats->sectors[STAT_WRITE] << 9;
+ tmp.bytes[BLKG_IOSTAT_DISCARD] +=
+ cpu_dkstats->sectors[STAT_DISCARD] << 9;
+
+ u64_stats_update_begin(&blkg->iostat.sync);
+ blkg_iostat_set(&blkg->iostat.cur, &tmp);
+ u64_stats_update_end(&blkg->iostat.sync);
+ }
+ }
+}
+
static int blkcg_print_stat(struct seq_file *sf, void *v)
{
struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
struct blkcg_gq *blkg;
- cgroup_rstat_flush(blkcg->css.cgroup);
+ if (!seq_css(sf)->parent)
+ blkcg_fill_root_iostats();
+ else
+ cgroup_rstat_flush(blkcg->css.cgroup);
+
rcu_read_lock();
hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
@@ -833,7 +930,6 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
static struct cftype blkcg_files[] = {
{
.name = "stat",
- .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = blkcg_print_stat,
},
{ } /* terminate */
@@ -1025,7 +1121,7 @@ static int blkcg_css_online(struct cgroup_subsys_state *css)
* blkcg_init_queue - initialize blkcg part of request queue
* @q: request_queue to initialize
*
- * Called from __blk_alloc_queue(). Responsible for initializing blkcg
+ * Called from blk_alloc_queue(). Responsible for initializing blkcg
* part of new request_queue @q.
*
* RETURNS:
@@ -1114,77 +1210,6 @@ static int blkcg_can_attach(struct cgroup_taskset *tset)
return ret;
}
-static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
-{
- int i;
-
- for (i = 0; i < BLKG_IOSTAT_NR; i++) {
- dst->bytes[i] = src->bytes[i];
- dst->ios[i] = src->ios[i];
- }
-}
-
-static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
-{
- int i;
-
- for (i = 0; i < BLKG_IOSTAT_NR; i++) {
- dst->bytes[i] += src->bytes[i];
- dst->ios[i] += src->ios[i];
- }
-}
-
-static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
-{
- int i;
-
- for (i = 0; i < BLKG_IOSTAT_NR; i++) {
- dst->bytes[i] -= src->bytes[i];
- dst->ios[i] -= src->ios[i];
- }
-}
-
-static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
-{
- struct blkcg *blkcg = css_to_blkcg(css);
- struct blkcg_gq *blkg;
-
- rcu_read_lock();
-
- hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
- struct blkcg_gq *parent = blkg->parent;
- struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
- struct blkg_iostat cur, delta;
- unsigned seq;
-
- /* fetch the current per-cpu values */
- do {
- seq = u64_stats_fetch_begin(&bisc->sync);
- blkg_iostat_set(&cur, &bisc->cur);
- } while (u64_stats_fetch_retry(&bisc->sync, seq));
-
- /* propagate percpu delta to global */
- u64_stats_update_begin(&blkg->iostat.sync);
- blkg_iostat_set(&delta, &cur);
- blkg_iostat_sub(&delta, &bisc->last);
- blkg_iostat_add(&blkg->iostat.cur, &delta);
- blkg_iostat_add(&bisc->last, &delta);
- u64_stats_update_end(&blkg->iostat.sync);
-
- /* propagate global delta to parent */
- if (parent) {
- u64_stats_update_begin(&parent->iostat.sync);
- blkg_iostat_set(&delta, &blkg->iostat.cur);
- blkg_iostat_sub(&delta, &blkg->iostat.last);
- blkg_iostat_add(&parent->iostat.cur, &delta);
- blkg_iostat_add(&blkg->iostat.last, &delta);
- u64_stats_update_end(&parent->iostat.sync);
- }
- }
-
- rcu_read_unlock();
-}
-
static void blkcg_bind(struct cgroup_subsys_state *root_css)
{
int i;
@@ -1727,6 +1752,139 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
atomic64_add(delta, &blkg->delay_nsec);
}
+/**
+ * blkg_tryget_closest - try and get a blkg ref on the closet blkg
+ * @bio: target bio
+ * @css: target css
+ *
+ * As the failure mode here is to walk up the blkg tree, this ensure that the
+ * blkg->parent pointers are always valid. This returns the blkg that it ended
+ * up taking a reference on or %NULL if no reference was taken.
+ */
+static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
+ struct cgroup_subsys_state *css)
+{
+ struct blkcg_gq *blkg, *ret_blkg = NULL;
+
+ rcu_read_lock();
+ blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_disk->queue);
+ while (blkg) {
+ if (blkg_tryget(blkg)) {
+ ret_blkg = blkg;
+ break;
+ }
+ blkg = blkg->parent;
+ }
+ rcu_read_unlock();
+
+ return ret_blkg;
+}
+
+/**
+ * bio_associate_blkg_from_css - associate a bio with a specified css
+ * @bio: target bio
+ * @css: target css
+ *
+ * Associate @bio with the blkg found by combining the css's blkg and the
+ * request_queue of the @bio. An association failure is handled by walking up
+ * the blkg tree. Therefore, the blkg associated can be anything between @blkg
+ * and q->root_blkg. This situation only happens when a cgroup is dying and
+ * then the remaining bios will spill to the closest alive blkg.
+ *
+ * A reference will be taken on the blkg and will be released when @bio is
+ * freed.
+ */
+void bio_associate_blkg_from_css(struct bio *bio,
+ struct cgroup_subsys_state *css)
+{
+ if (bio->bi_blkg)
+ blkg_put(bio->bi_blkg);
+
+ if (css && css->parent) {
+ bio->bi_blkg = blkg_tryget_closest(bio, css);
+ } else {
+ blkg_get(bio->bi_disk->queue->root_blkg);
+ bio->bi_blkg = bio->bi_disk->queue->root_blkg;
+ }
+}
+EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
+
+/**
+ * bio_associate_blkg - associate a bio with a blkg
+ * @bio: target bio
+ *
+ * Associate @bio with the blkg found from the bio's css and request_queue.
+ * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is
+ * already associated, the css is reused and association redone as the
+ * request_queue may have changed.
+ */
+void bio_associate_blkg(struct bio *bio)
+{
+ struct cgroup_subsys_state *css;
+
+ rcu_read_lock();
+
+ if (bio->bi_blkg)
+ css = &bio_blkcg(bio)->css;
+ else
+ css = blkcg_css();
+
+ bio_associate_blkg_from_css(bio, css);
+
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(bio_associate_blkg);
+
+/**
+ * bio_clone_blkg_association - clone blkg association from src to dst bio
+ * @dst: destination bio
+ * @src: source bio
+ */
+void bio_clone_blkg_association(struct bio *dst, struct bio *src)
+{
+ if (src->bi_blkg) {
+ if (dst->bi_blkg)
+ blkg_put(dst->bi_blkg);
+ blkg_get(src->bi_blkg);
+ dst->bi_blkg = src->bi_blkg;
+ }
+}
+EXPORT_SYMBOL_GPL(bio_clone_blkg_association);
+
+static int blk_cgroup_io_type(struct bio *bio)
+{
+ if (op_is_discard(bio->bi_opf))
+ return BLKG_IOSTAT_DISCARD;
+ if (op_is_write(bio->bi_opf))
+ return BLKG_IOSTAT_WRITE;
+ return BLKG_IOSTAT_READ;
+}
+
+void blk_cgroup_bio_start(struct bio *bio)
+{
+ int rwd = blk_cgroup_io_type(bio), cpu;
+ struct blkg_iostat_set *bis;
+
+ cpu = get_cpu();
+ bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
+ u64_stats_update_begin(&bis->sync);
+
+ /*
+ * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
+ * bio and we would have already accounted for the size of the bio.
+ */
+ if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
+ bio_set_flag(bio, BIO_CGROUP_ACCT);
+ bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
+ }
+ bis->cur.ios[rwd]++;
+
+ u64_stats_update_end(&bis->sync);
+ if (cgroup_subsys_on_dfl(io_cgrp_subsys))
+ cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
+ put_cpu();
+}
+
static int __init blkcg_init(void)
{
blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
diff --git a/block/blk-core.c b/block/blk-core.c
index 03252af8c82c..93104c7470e8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -51,9 +51,7 @@
#include "blk-pm.h"
#include "blk-rq-qos.h"
-#ifdef CONFIG_DEBUG_FS
struct dentry *blk_debugfs_root;
-#endif
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
@@ -285,7 +283,7 @@ EXPORT_SYMBOL(blk_dump_rq_flags);
* A block device may call blk_sync_queue to ensure that any
* such activity is cancelled, thus allowing it to release resources
* that the callbacks might use. The caller must already have made sure
- * that its ->make_request_fn will not re-add plugging prior to calling
+ * that its ->submit_bio will not re-add plugging prior to calling
* this function.
*
* This function does not cancel any asynchronous activity arising
@@ -321,6 +319,16 @@ void blk_clear_pm_only(struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_clear_pm_only);
+/**
+ * blk_put_queue - decrement the request_queue refcount
+ * @q: the request_queue structure to decrement the refcount for
+ *
+ * Decrements the refcount of the request_queue kobject. When this reaches 0
+ * we'll have blk_release_queue() called.
+ *
+ * Context: Any context, but the last reference must not be dropped from
+ * atomic context.
+ */
void blk_put_queue(struct request_queue *q)
{
kobject_put(&q->kobj);
@@ -352,9 +360,14 @@ EXPORT_SYMBOL_GPL(blk_set_queue_dying);
*
* Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and
* put it. All future requests will be failed immediately with -ENODEV.
+ *
+ * Context: can sleep
*/
void blk_cleanup_queue(struct request_queue *q)
{
+ /* cannot be called from atomic context */
+ might_sleep();
+
WARN_ON_ONCE(blk_queue_registered(q));
/* mark @q DYING, no new request or merges will be allowed afterwards */
@@ -497,7 +510,7 @@ static void blk_timeout_work(struct work_struct *work)
{
}
-struct request_queue *__blk_alloc_queue(int node_id)
+struct request_queue *blk_alloc_queue(int node_id)
{
struct request_queue *q;
int ret;
@@ -540,9 +553,7 @@ struct request_queue *__blk_alloc_queue(int node_id)
kobject_init(&q->kobj, &blk_queue_ktype);
-#ifdef CONFIG_BLK_DEV_IO_TRACE
- mutex_init(&q->blk_trace_mutex);
-#endif
+ mutex_init(&q->debugfs_mutex);
mutex_init(&q->sysfs_lock);
mutex_init(&q->sysfs_dir_lock);
spin_lock_init(&q->queue_lock);
@@ -564,6 +575,7 @@ struct request_queue *__blk_alloc_queue(int node_id)
blk_queue_dma_alignment(q, 511);
blk_set_default_limits(&q->limits);
+ q->nr_requests = BLKDEV_MAX_RQ;
return q;
@@ -581,23 +593,16 @@ fail_q:
kmem_cache_free(blk_requestq_cachep, q);
return NULL;
}
-
-struct request_queue *blk_alloc_queue(make_request_fn make_request, int node_id)
-{
- struct request_queue *q;
-
- if (WARN_ON_ONCE(!make_request))
- return NULL;
-
- q = __blk_alloc_queue(node_id);
- if (!q)
- return NULL;
- q->make_request_fn = make_request;
- q->nr_requests = BLKDEV_MAX_RQ;
- return q;
-}
EXPORT_SYMBOL(blk_alloc_queue);
+/**
+ * blk_get_queue - increment the request_queue refcount
+ * @q: the request_queue structure to increment the refcount for
+ *
+ * Increment the refcount of the request_queue kobject.
+ *
+ * Context: Any context.
+ */
bool blk_get_queue(struct request_queue *q)
{
if (likely(!blk_queue_dying(q))) {
@@ -850,8 +855,7 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
return false;
WARN_ONCE(1,
- "generic_make_request: Trying to write "
- "to read-only block-device %s (partno %d)\n",
+ "Trying to write to read-only block-device %s (partno %d)\n",
bio_devname(bio, b), part->partno);
/* Older lvm-tools actually trigger this */
return false;
@@ -952,25 +956,13 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
return BLK_STS_OK;
}
-static noinline_for_stack bool
-generic_make_request_checks(struct bio *bio)
+static noinline_for_stack bool submit_bio_checks(struct bio *bio)
{
- struct request_queue *q;
- int nr_sectors = bio_sectors(bio);
+ struct request_queue *q = bio->bi_disk->queue;
blk_status_t status = BLK_STS_IOERR;
- char b[BDEVNAME_SIZE];
might_sleep();
- q = bio->bi_disk->queue;
- if (unlikely(!q)) {
- printk(KERN_ERR
- "generic_make_request: Trying to access "
- "nonexistent block-device %s (%Lu)\n",
- bio_devname(bio, b), (long long)bio->bi_iter.bi_sector);
- goto end_io;
- }
-
/*
* For a REQ_NOWAIT based request, return -EOPNOTSUPP
* if queue is not a request based queue.
@@ -992,14 +984,13 @@ generic_make_request_checks(struct bio *bio)
}
/*
- * Filter flush bio's early so that make_request based
- * drivers without flush support don't have to worry
- * about them.
+ * Filter flush bio's early so that bio based drivers without flush
+ * support don't have to worry about them.
*/
if (op_is_flush(bio->bi_opf) &&
!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
- if (!nr_sectors) {
+ if (!bio_sectors(bio)) {
status = BLK_STS_OK;
goto end_io;
}
@@ -1054,8 +1045,13 @@ generic_make_request_checks(struct bio *bio)
if (unlikely(!current->io_context))
create_task_io_context(current, GFP_ATOMIC, q->node);
- if (!blkcg_bio_issue_check(q, bio))
+ if (blk_throtl_bio(bio)) {
+ blkcg_bio_issue_init(bio);
return false;
+ }
+
+ blk_cgroup_bio_start(bio);
+ blkcg_bio_issue_init(bio);
if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
trace_block_bio_queue(q, bio);
@@ -1074,138 +1070,144 @@ end_io:
return false;
}
-static blk_qc_t do_make_request(struct bio *bio)
+static blk_qc_t __submit_bio(struct bio *bio)
{
- struct request_queue *q = bio->bi_disk->queue;
+ struct gendisk *disk = bio->bi_disk;
blk_qc_t ret = BLK_QC_T_NONE;
if (blk_crypto_bio_prep(&bio)) {
- if (!q->make_request_fn)
- return blk_mq_make_request(q, bio);
- ret = q->make_request_fn(q, bio);
+ if (!disk->fops->submit_bio)
+ return blk_mq_submit_bio(bio);
+ ret = disk->fops->submit_bio(bio);
}
- blk_queue_exit(q);
+ blk_queue_exit(disk->queue);
return ret;
}
-/**
- * generic_make_request - re-submit a bio to the block device layer for I/O
- * @bio: The bio describing the location in memory and on the device.
+/*
+ * The loop in this function may be a bit non-obvious, and so deserves some
+ * explanation:
*
- * This is a version of submit_bio() that shall only be used for I/O that is
- * resubmitted to lower level drivers by stacking block drivers. All file
- * systems and other upper level users of the block layer should use
- * submit_bio() instead.
+ * - Before entering the loop, bio->bi_next is NULL (as all callers ensure
+ * that), so we have a list with a single bio.
+ * - We pretend that we have just taken it off a longer list, so we assign
+ * bio_list to a pointer to the bio_list_on_stack, thus initialising the
+ * bio_list of new bios to be added. ->submit_bio() may indeed add some more
+ * bios through a recursive call to submit_bio_noacct. If it did, we find a
+ * non-NULL value in bio_list and re-enter the loop from the top.
+ * - In this case we really did just take the bio of the top of the list (no
+ * pretending) and so remove it from bio_list, and call into ->submit_bio()
+ * again.
+ *
+ * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio.
+ * bio_list_on_stack[1] contains bios that were submitted before the current
+ * ->submit_bio_bio, but that haven't been processed yet.
*/
-blk_qc_t generic_make_request(struct bio *bio)
+static blk_qc_t __submit_bio_noacct(struct bio *bio)
{
- /*
- * bio_list_on_stack[0] contains bios submitted by the current
- * make_request_fn.
- * bio_list_on_stack[1] contains bios that were submitted before
- * the current make_request_fn, but that haven't been processed
- * yet.
- */
struct bio_list bio_list_on_stack[2];
blk_qc_t ret = BLK_QC_T_NONE;
- if (!generic_make_request_checks(bio))
- goto out;
-
- /*
- * We only want one ->make_request_fn to be active at a time, else
- * stack usage with stacked devices could be a problem. So use
- * current->bio_list to keep a list of requests submited by a
- * make_request_fn function. current->bio_list is also used as a
- * flag to say if generic_make_request is currently active in this
- * task or not. If it is NULL, then no make_request is active. If
- * it is non-NULL, then a make_request is active, and new requests
- * should be added at the tail
- */
- if (current->bio_list) {
- bio_list_add(&current->bio_list[0], bio);
- goto out;
- }
-
- /* following loop may be a bit non-obvious, and so deserves some
- * explanation.
- * Before entering the loop, bio->bi_next is NULL (as all callers
- * ensure that) so we have a list with a single bio.
- * We pretend that we have just taken it off a longer list, so
- * we assign bio_list to a pointer to the bio_list_on_stack,
- * thus initialising the bio_list of new bios to be
- * added. ->make_request() may indeed add some more bios
- * through a recursive call to generic_make_request. If it
- * did, we find a non-NULL value in bio_list and re-enter the loop
- * from the top. In this case we really did just take the bio
- * of the top of the list (no pretending) and so remove it from
- * bio_list, and call into ->make_request() again.
- */
BUG_ON(bio->bi_next);
+
bio_list_init(&bio_list_on_stack[0]);
current->bio_list = bio_list_on_stack;
+
do {
struct request_queue *q = bio->bi_disk->queue;
+ struct bio_list lower, same;
+
+ if (unlikely(bio_queue_enter(bio) != 0))
+ continue;
- if (likely(bio_queue_enter(bio) == 0)) {
- struct bio_list lower, same;
+ /*
+ * Create a fresh bio_list for all subordinate requests.
+ */
+ bio_list_on_stack[1] = bio_list_on_stack[0];
+ bio_list_init(&bio_list_on_stack[0]);
- /* Create a fresh bio_list for all subordinate requests */
- bio_list_on_stack[1] = bio_list_on_stack[0];
- bio_list_init(&bio_list_on_stack[0]);
- ret = do_make_request(bio);
+ ret = __submit_bio(bio);
- /* sort new bios into those for a lower level
- * and those for the same level
- */
- bio_list_init(&lower);
- bio_list_init(&same);
- while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
- if (q == bio->bi_disk->queue)
- bio_list_add(&same, bio);
- else
- bio_list_add(&lower, bio);
- /* now assemble so we handle the lowest level first */
- bio_list_merge(&bio_list_on_stack[0], &lower);
- bio_list_merge(&bio_list_on_stack[0], &same);
- bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
+ /*
+ * Sort new bios into those for a lower level and those for the
+ * same level.
+ */
+ bio_list_init(&lower);
+ bio_list_init(&same);
+ while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
+ if (q == bio->bi_disk->queue)
+ bio_list_add(&same, bio);
+ else
+ bio_list_add(&lower, bio);
+
+ /*
+ * Now assemble so we handle the lowest level first.
+ */
+ bio_list_merge(&bio_list_on_stack[0], &lower);
+ bio_list_merge(&bio_list_on_stack[0], &same);
+ bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
+ } while ((bio = bio_list_pop(&bio_list_on_stack[0])));
+
+ current->bio_list = NULL;
+ return ret;
+}
+
+static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
+{
+ struct bio_list bio_list[2] = { };
+ blk_qc_t ret = BLK_QC_T_NONE;
+
+ current->bio_list = bio_list;
+
+ do {
+ struct gendisk *disk = bio->bi_disk;
+
+ if (unlikely(bio_queue_enter(bio) != 0))
+ continue;
+
+ if (!blk_crypto_bio_prep(&bio)) {
+ blk_queue_exit(disk->queue);
+ ret = BLK_QC_T_NONE;
+ continue;
}
- bio = bio_list_pop(&bio_list_on_stack[0]);
- } while (bio);
- current->bio_list = NULL; /* deactivate */
-out:
+ ret = blk_mq_submit_bio(bio);
+ } while ((bio = bio_list_pop(&bio_list[0])));
+
+ current->bio_list = NULL;
return ret;
}
-EXPORT_SYMBOL(generic_make_request);
/**
- * direct_make_request - hand a buffer directly to its device driver for I/O
+ * submit_bio_noacct - re-submit a bio to the block device layer for I/O
* @bio: The bio describing the location in memory and on the device.
*
- * This function behaves like generic_make_request(), but does not protect
- * against recursion. Must only be used if the called driver is known
- * to be blk-mq based.
+ * This is a version of submit_bio() that shall only be used for I/O that is
+ * resubmitted to lower level drivers by stacking block drivers. All file
+ * systems and other upper level users of the block layer should use
+ * submit_bio() instead.
*/
-blk_qc_t direct_make_request(struct bio *bio)
+blk_qc_t submit_bio_noacct(struct bio *bio)
{
- struct request_queue *q = bio->bi_disk->queue;
-
- if (WARN_ON_ONCE(q->make_request_fn)) {
- bio_io_error(bio);
- return BLK_QC_T_NONE;
- }
- if (!generic_make_request_checks(bio))
- return BLK_QC_T_NONE;
- if (unlikely(bio_queue_enter(bio)))
+ if (!submit_bio_checks(bio))
return BLK_QC_T_NONE;
- if (!blk_crypto_bio_prep(&bio)) {
- blk_queue_exit(q);
+
+ /*
+ * We only want one ->submit_bio to be active at a time, else stack
+ * usage with stacked devices could be a problem. Use current->bio_list
+ * to collect a list of requests submited by a ->submit_bio method while
+ * it is active, and then process them after it returned.
+ */
+ if (current->bio_list) {
+ bio_list_add(&current->bio_list[0], bio);
return BLK_QC_T_NONE;
}
- return blk_mq_make_request(q, bio);
+
+ if (!bio->bi_disk->fops->submit_bio)
+ return __submit_bio_noacct_mq(bio);
+ return __submit_bio_noacct(bio);
}
-EXPORT_SYMBOL_GPL(direct_make_request);
+EXPORT_SYMBOL(submit_bio_noacct);
/**
* submit_bio - submit a bio to the block device layer for I/O
@@ -1266,13 +1268,13 @@ blk_qc_t submit_bio(struct bio *bio)
blk_qc_t ret;
psi_memstall_enter(&pflags);
- ret = generic_make_request(bio);
+ ret = submit_bio_noacct(bio);
psi_memstall_leave(&pflags);
return ret;
}
- return generic_make_request(bio);
+ return submit_bio_noacct(bio);
}
EXPORT_SYMBOL(submit_bio);
@@ -1908,9 +1910,7 @@ int __init blk_dev_init(void)
blk_requestq_cachep = kmem_cache_create("request_queue",
sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
-#ifdef CONFIG_DEBUG_FS
blk_debugfs_root = debugfs_create_dir("block", NULL);
-#endif
return 0;
}
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index 6e49688a2d80..c162b754efbd 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -228,7 +228,7 @@ static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr)
return false;
}
bio_chain(split_bio, bio);
- generic_make_request(bio);
+ submit_bio_noacct(bio);
*bio_ptr = split_bio;
}
diff --git a/block/blk-crypto.c b/block/blk-crypto.c
index 6533c9b36ab8..2d5e60023b08 100644
--- a/block/blk-crypto.c
+++ b/block/blk-crypto.c
@@ -239,7 +239,7 @@ void __blk_crypto_free_request(struct request *rq)
* kernel crypto API. When the crypto API fallback is used for encryption,
* blk-crypto may choose to split the bio into 2 - the first one that will
* continue to be processed and the second one that will be resubmitted via
- * generic_make_request. A bounce bio will be allocated to encrypt the contents
+ * submit_bio_noacct. A bounce bio will be allocated to encrypt the contents
* of the aforementioned "first one", and *bio_ptr will be updated to this
* bounce bio.
*
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 15ae0155ec07..6e1543c10493 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -219,7 +219,6 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
struct request *rq, *n;
unsigned long flags = 0;
struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
- struct blk_mq_hw_ctx *hctx;
blk_account_io_flush(flush_rq);
@@ -235,13 +234,11 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
if (fq->rq_status != BLK_STS_OK)
error = fq->rq_status;
- hctx = flush_rq->mq_hctx;
if (!q->elevator) {
- blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
- flush_rq->tag = -1;
+ flush_rq->tag = BLK_MQ_NO_TAG;
} else {
blk_mq_put_driver_tag(flush_rq);
- flush_rq->internal_tag = -1;
+ flush_rq->internal_tag = BLK_MQ_NO_TAG;
}
running = &fq->flush_queue[fq->flush_running_idx];
@@ -286,13 +283,8 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending))
return;
- /* C2 and C3
- *
- * For blk-mq + scheduling, we can risk having all driver tags
- * assigned to empty flushes, and we deadlock if we are expecting
- * other requests to make progress. Don't defer for that case.
- */
- if (!list_empty(&fq->flush_data_in_flight) && q->elevator &&
+ /* C2 and C3 */
+ if (!list_empty(&fq->flush_data_in_flight) &&
time_before(jiffies,
fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
return;
@@ -316,13 +308,10 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
flush_rq->mq_ctx = first_rq->mq_ctx;
flush_rq->mq_hctx = first_rq->mq_hctx;
- if (!q->elevator) {
- fq->orig_rq = first_rq;
+ if (!q->elevator)
flush_rq->tag = first_rq->tag;
- blk_mq_tag_set_rq(flush_rq->mq_hctx, first_rq->tag, flush_rq);
- } else {
+ else
flush_rq->internal_tag = first_rq->internal_tag;
- }
flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK);
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 9df50fb507ca..57299f860d41 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -96,15 +96,7 @@ static void ioc_release_fn(struct work_struct *work)
{
struct io_context *ioc = container_of(work, struct io_context,
release_work);
- unsigned long flags;
-
- /*
- * Exiting icq may call into put_io_context() through elevator
- * which will trigger lockdep warning. The ioc's are guaranteed to
- * be different, use a different locking subclass here. Use
- * irqsave variant as there's no spin_lock_irq_nested().
- */
- spin_lock_irqsave_nested(&ioc->lock, flags, 1);
+ spin_lock_irq(&ioc->lock);
while (!hlist_empty(&ioc->icq_list)) {
struct io_cq *icq = hlist_entry(ioc->icq_list.first,
@@ -115,13 +107,27 @@ static void ioc_release_fn(struct work_struct *work)
ioc_destroy_icq(icq);
spin_unlock(&q->queue_lock);
} else {
- spin_unlock_irqrestore(&ioc->lock, flags);
- cpu_relax();
- spin_lock_irqsave_nested(&ioc->lock, flags, 1);
+ /* Make sure q and icq cannot be freed. */
+ rcu_read_lock();
+
+ /* Re-acquire the locks in the correct order. */
+ spin_unlock(&ioc->lock);
+ spin_lock(&q->queue_lock);
+ spin_lock(&ioc->lock);
+
+ /*
+ * The icq may have been destroyed when the ioc lock
+ * was released.
+ */
+ if (!(icq->flags & ICQ_DESTROYED))
+ ioc_destroy_icq(icq);
+
+ spin_unlock(&q->queue_lock);
+ rcu_read_unlock();
}
}
- spin_unlock_irqrestore(&ioc->lock, flags);
+ spin_unlock_irq(&ioc->lock);
kmem_cache_free(iocontext_cachep, ioc);
}
@@ -170,7 +176,6 @@ void put_io_context(struct io_context *ioc)
*/
void put_io_context_active(struct io_context *ioc)
{
- unsigned long flags;
struct io_cq *icq;
if (!atomic_dec_and_test(&ioc->active_ref)) {
@@ -178,19 +183,14 @@ void put_io_context_active(struct io_context *ioc)
return;
}
- /*
- * Need ioc lock to walk icq_list and q lock to exit icq. Perform
- * reverse double locking. Read comment in ioc_release_fn() for
- * explanation on the nested locking annotation.
- */
- spin_lock_irqsave_nested(&ioc->lock, flags, 1);
+ spin_lock_irq(&ioc->lock);
hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) {
if (icq->flags & ICQ_EXITED)
continue;
ioc_exit_icq(icq);
}
- spin_unlock_irqrestore(&ioc->lock, flags);
+ spin_unlock_irq(&ioc->lock);
put_io_context(ioc);
}
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 8ac4aad66ebc..cea5ee9be639 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2045,8 +2045,7 @@ static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
int levels = blkcg->css.cgroup->level + 1;
struct ioc_gq *iocg;
- iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]),
- gfp, q->node);
+ iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node);
if (!iocg)
return NULL;
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index c128d50cb410..f90429cf4edf 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -591,7 +591,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
struct rq_wait *rqw;
struct iolatency_grp *iolat;
u64 window_start;
- u64 now = ktime_to_ns(ktime_get());
+ u64 now;
bool issue_as_root = bio_issue_as_root_blkg(bio);
bool enabled = false;
int inflight = 0;
@@ -608,6 +608,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
if (!enabled)
return;
+ now = ktime_to_ns(ktime_get());
while (blkg && blkg->parent) {
iolat = blkg_to_lat(blkg);
if (!iolat) {
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 5f2c429d4378..019e09bb9c0e 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -29,7 +29,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
struct request_queue *q = bdev_get_queue(bdev);
struct bio *bio = *biop;
unsigned int op;
- sector_t bs_mask;
+ sector_t bs_mask, part_offset = 0;
if (!q)
return -ENXIO;
@@ -54,9 +54,34 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
if (!nr_sects)
return -EINVAL;
+ /* In case the discard request is in a partition */
+ if (bdev->bd_partno)
+ part_offset = bdev->bd_part->start_sect;
+
while (nr_sects) {
- sector_t req_sects = min_t(sector_t, nr_sects,
- bio_allowed_max_sectors(q));
+ sector_t granularity_aligned_lba, req_sects;
+ sector_t sector_mapped = sector + part_offset;
+
+ granularity_aligned_lba = round_up(sector_mapped,
+ q->limits.discard_granularity >> SECTOR_SHIFT);
+
+ /*
+ * Check whether the discard bio starts at a discard_granularity
+ * aligned LBA,
+ * - If no: set (granularity_aligned_lba - sector_mapped) to
+ * bi_size of the first split bio, then the second bio will
+ * start at a discard_granularity aligned LBA on the device.
+ * - If yes: use bio_aligned_discard_max_sectors() as the max
+ * possible bi_size of the first split bio. Then when this bio
+ * is split in device drive, the split ones are very probably
+ * to be aligned to discard_granularity of the device's queue.
+ */
+ if (granularity_aligned_lba == sector_mapped)
+ req_sects = min_t(sector_t, nr_sects,
+ bio_aligned_discard_max_sectors(q));
+ else
+ req_sects = min_t(sector_t, nr_sects,
+ granularity_aligned_lba - sector_mapped);
WARN_ON_ONCE((req_sects << 9) > UINT_MAX);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index f0b0bae075a0..5196dc145270 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -283,20 +283,20 @@ split:
/**
* __blk_queue_split - split a bio and submit the second half
- * @q: [in] request queue pointer
* @bio: [in, out] bio to be split
* @nr_segs: [out] number of segments in the first bio
*
* Split a bio into two bios, chain the two bios, submit the second half and
* store a pointer to the first half in *@bio. If the second bio is still too
* big it will be split by a recursive call to this function. Since this
- * function may allocate a new bio from @q->bio_split, it is the responsibility
- * of the caller to ensure that @q is only released after processing of the
+ * function may allocate a new bio from @bio->bi_disk->queue->bio_split, it is
+ * the responsibility of the caller to ensure that
+ * @bio->bi_disk->queue->bio_split is only released after processing of the
* split bio has finished.
*/
-void __blk_queue_split(struct request_queue *q, struct bio **bio,
- unsigned int *nr_segs)
+void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
{
+ struct request_queue *q = (*bio)->bi_disk->queue;
struct bio *split = NULL;
switch (bio_op(*bio)) {
@@ -338,27 +338,26 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio,
bio_chain(split, *bio);
trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
- generic_make_request(*bio);
+ submit_bio_noacct(*bio);
*bio = split;
}
}
/**
* blk_queue_split - split a bio and submit the second half
- * @q: [in] request queue pointer
* @bio: [in, out] bio to be split
*
* Split a bio into two bios, chains the two bios, submit the second half and
* store a pointer to the first half in *@bio. Since this function may allocate
- * a new bio from @q->bio_split, it is the responsibility of the caller to
- * ensure that @q is only released after processing of the split bio has
- * finished.
+ * a new bio from @bio->bi_disk->queue->bio_split, it is the responsibility of
+ * the caller to ensure that @bio->bi_disk->queue->bio_split is only released
+ * after processing of the split bio has finished.
*/
-void blk_queue_split(struct request_queue *q, struct bio **bio)
+void blk_queue_split(struct bio **bio)
{
unsigned int nr_segs;
- __blk_queue_split(q, bio, &nr_segs);
+ __blk_queue_split(bio, &nr_segs);
}
EXPORT_SYMBOL(blk_queue_split);
@@ -793,6 +792,8 @@ static struct request *attempt_merge(struct request_queue *q,
*/
blk_account_io_merge_request(next);
+ trace_block_rq_merge(q, next);
+
/*
* ownership of bio passed from next to req, return 'next' for
* the caller to free
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index e0b2bc131bf5..3f09bcb8a6fd 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -404,8 +404,7 @@ static bool hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
const struct show_busy_params *params = data;
if (rq->mq_hctx == params->hctx)
- __blk_mq_debugfs_rq_show(params->m,
- list_entry_rq(&rq->queuelist));
+ __blk_mq_debugfs_rq_show(params->m, rq);
return true;
}
@@ -827,9 +826,6 @@ void blk_mq_debugfs_register(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;
- q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent),
- blk_debugfs_root);
-
debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);
/*
@@ -860,9 +856,7 @@ void blk_mq_debugfs_register(struct request_queue *q)
void blk_mq_debugfs_unregister(struct request_queue *q)
{
- debugfs_remove_recursive(q->debugfs_dir);
q->sched_debugfs_dir = NULL;
- q->debugfs_dir = NULL;
}
static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx,
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index fdcc2c1dd178..b8db72cf1043 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -7,6 +7,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
+#include <linux/list_sort.h>
#include <trace/events/block.h>
@@ -80,6 +81,35 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
blk_mq_run_hw_queue(hctx, true);
}
+static int sched_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct request *rqa = container_of(a, struct request, queuelist);
+ struct request *rqb = container_of(b, struct request, queuelist);
+
+ return rqa->mq_hctx > rqb->mq_hctx;
+}
+
+static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
+{
+ struct blk_mq_hw_ctx *hctx =
+ list_first_entry(rq_list, struct request, queuelist)->mq_hctx;
+ struct request *rq;
+ LIST_HEAD(hctx_list);
+ unsigned int count = 0;
+
+ list_for_each_entry(rq, rq_list, queuelist) {
+ if (rq->mq_hctx != hctx) {
+ list_cut_before(&hctx_list, rq_list, &rq->queuelist);
+ goto dispatch;
+ }
+ count++;
+ }
+ list_splice_tail_init(rq_list, &hctx_list);
+
+dispatch:
+ return blk_mq_dispatch_rq_list(hctx, &hctx_list, count);
+}
+
#define BLK_MQ_BUDGET_DELAY 3 /* ms units */
/*
@@ -90,12 +120,20 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
* Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
* be run again. This is necessary to avoid starving flushes.
*/
-static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
+ bool multi_hctxs = false, run_queue = false;
+ bool dispatched = false, busy = false;
+ unsigned int max_dispatch;
LIST_HEAD(rq_list);
- int ret = 0;
+ int count = 0;
+
+ if (hctx->dispatch_busy)
+ max_dispatch = 1;
+ else
+ max_dispatch = hctx->queue->nr_requests;
do {
struct request *rq;
@@ -104,16 +142,16 @@ static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
break;
if (!list_empty_careful(&hctx->dispatch)) {
- ret = -EAGAIN;
+ busy = true;
break;
}
- if (!blk_mq_get_dispatch_budget(hctx))
+ if (!blk_mq_get_dispatch_budget(q))
break;
rq = e->type->ops.dispatch_request(hctx);
if (!rq) {
- blk_mq_put_dispatch_budget(hctx);
+ blk_mq_put_dispatch_budget(q);
/*
* We're releasing without dispatching. Holding the
* budget could have blocked any "hctx"s with the
@@ -121,7 +159,7 @@ static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
* no guarantee anyone will kick the queue. Kick it
* ourselves.
*/
- blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
+ run_queue = true;
break;
}
@@ -130,8 +168,42 @@ static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
* if this rq won't be queued to driver via .queue_rq()
* in blk_mq_dispatch_rq_list().
*/
- list_add(&rq->queuelist, &rq_list);
- } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+ list_add_tail(&rq->queuelist, &rq_list);
+ if (rq->mq_hctx != hctx)
+ multi_hctxs = true;
+ } while (++count < max_dispatch);
+
+ if (!count) {
+ if (run_queue)
+ blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
+ } else if (multi_hctxs) {
+ /*
+ * Requests from different hctx may be dequeued from some
+ * schedulers, such as bfq and deadline.
+ *
+ * Sort the requests in the list according to their hctx,
+ * dispatch batching requests from same hctx at a time.
+ */
+ list_sort(NULL, &rq_list, sched_rq_cmp);
+ do {
+ dispatched |= blk_mq_dispatch_hctx_list(&rq_list);
+ } while (!list_empty(&rq_list));
+ } else {
+ dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count);
+ }
+
+ if (busy)
+ return -EAGAIN;
+ return !!dispatched;
+}
+
+static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+{
+ int ret;
+
+ do {
+ ret = __blk_mq_do_dispatch_sched(hctx);
+ } while (ret == 1);
return ret;
}
@@ -161,10 +233,9 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
LIST_HEAD(rq_list);
struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
int ret = 0;
+ struct request *rq;
do {
- struct request *rq;
-
if (!list_empty_careful(&hctx->dispatch)) {
ret = -EAGAIN;
break;
@@ -173,12 +244,12 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
if (!sbitmap_any_bit_set(&hctx->ctx_map))
break;
- if (!blk_mq_get_dispatch_budget(hctx))
+ if (!blk_mq_get_dispatch_budget(q))
break;
rq = blk_mq_dequeue_from_ctx(hctx, ctx);
if (!rq) {
- blk_mq_put_dispatch_budget(hctx);
+ blk_mq_put_dispatch_budget(q);
/*
* We're releasing without dispatching. Holding the
* budget could have blocked any "hctx"s with the
@@ -200,7 +271,7 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
/* round robin for fair dispatch */
ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
- } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+ } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1));
WRITE_ONCE(hctx->dispatch_from, ctx);
return ret;
@@ -240,7 +311,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
*/
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
- if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
+ if (blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) {
if (has_sched_dispatch)
ret = blk_mq_do_dispatch_sched(hctx);
else
@@ -253,7 +324,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
ret = blk_mq_do_dispatch_ctx(hctx);
} else {
blk_mq_flush_busy_ctxs(hctx, &rq_list);
- blk_mq_dispatch_rq_list(q, &rq_list, false);
+ blk_mq_dispatch_rq_list(hctx, &rq_list, 0);
}
return ret;
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index ae722f8b13fb..32d82e23b095 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -56,43 +56,12 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
blk_mq_tag_wakeup_all(tags, false);
}
-/*
- * For shared tag users, we track the number of currently active users
- * and attempt to provide a fair share of the tag depth for each of them.
- */
-static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
- struct sbitmap_queue *bt)
-{
- unsigned int depth, users;
-
- if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
- return true;
- if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
- return true;
-
- /*
- * Don't try dividing an ant
- */
- if (bt->sb.depth == 1)
- return true;
-
- users = atomic_read(&hctx->tags->active_queues);
- if (!users)
- return true;
-
- /*
- * Allow at least some tags
- */
- depth = max((bt->sb.depth + users - 1) / users, 4U);
- return atomic_read(&hctx->nr_active) < depth;
-}
-
static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
struct sbitmap_queue *bt)
{
- if (!(data->flags & BLK_MQ_REQ_INTERNAL) &&
- !hctx_may_queue(data->hctx, bt))
+ if (!data->q->elevator && !hctx_may_queue(data->hctx, bt))
return BLK_MQ_NO_TAG;
+
if (data->shallow_depth)
return __sbitmap_queue_get_shallow(bt, data->shallow_depth);
else
@@ -191,33 +160,6 @@ found_tag:
return tag + tag_offset;
}
-bool __blk_mq_get_driver_tag(struct request *rq)
-{
- struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
- unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
- bool shared = blk_mq_tag_busy(rq->mq_hctx);
- int tag;
-
- if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
- bt = &rq->mq_hctx->tags->breserved_tags;
- tag_offset = 0;
- }
-
- if (!hctx_may_queue(rq->mq_hctx, bt))
- return false;
- tag = __sbitmap_queue_get(bt);
- if (tag == BLK_MQ_NO_TAG)
- return false;
-
- rq->tag = tag + tag_offset;
- if (shared) {
- rq->rq_flags |= RQF_MQ_INFLIGHT;
- atomic_inc(&rq->mq_hctx->nr_active);
- }
- rq->mq_hctx->tags->rqs[rq->tag] = rq;
- return true;
-}
-
void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
unsigned int tag)
{
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 2e4ef51cdb32..b1acac518c4e 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -51,14 +51,6 @@ enum {
BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1,
};
-bool __blk_mq_get_driver_tag(struct request *rq);
-static inline bool blk_mq_get_driver_tag(struct request *rq)
-{
- if (rq->tag != BLK_MQ_NO_TAG)
- return true;
- return __blk_mq_get_driver_tag(rq);
-}
-
extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
@@ -79,15 +71,34 @@ static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
}
/*
- * This helper should only be used for flush request to share tag
- * with the request cloned from, and both the two requests can't be
- * in flight at the same time. The caller has to make sure the tag
- * can't be freed.
+ * For shared tag users, we track the number of currently active users
+ * and attempt to provide a fair share of the tag depth for each of them.
*/
-static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx,
- unsigned int tag, struct request *rq)
+static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
+ struct sbitmap_queue *bt)
{
- hctx->tags->rqs[tag] = rq;
+ unsigned int depth, users;
+
+ if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
+ return true;
+ if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+ return true;
+
+ /*
+ * Don't try dividing an ant
+ */
+ if (bt->sb.depth == 1)
+ return true;
+
+ users = atomic_read(&hctx->tags->active_queues);
+ if (!users)
+ return true;
+
+ /*
+ * Allow at least some tags
+ */
+ depth = max((bt->sb.depth + users - 1) / users, 4U);
+ return atomic_read(&hctx->nr_active) < depth;
}
static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4e0d173beaa3..667155f752f7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -41,6 +41,8 @@
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
+static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
+
static void blk_mq_poll_stats_start(struct request_queue *q);
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
@@ -275,26 +277,20 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
{
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
struct request *rq = tags->static_rqs[tag];
- req_flags_t rq_flags = 0;
- if (data->flags & BLK_MQ_REQ_INTERNAL) {
+ if (data->q->elevator) {
rq->tag = BLK_MQ_NO_TAG;
rq->internal_tag = tag;
} else {
- if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
- rq_flags = RQF_MQ_INFLIGHT;
- atomic_inc(&data->hctx->nr_active);
- }
rq->tag = tag;
rq->internal_tag = BLK_MQ_NO_TAG;
- data->hctx->tags->rqs[rq->tag] = rq;
}
/* csd/requeue_work/fifo_time is initialized before use */
rq->q = data->q;
rq->mq_ctx = data->ctx;
rq->mq_hctx = data->hctx;
- rq->rq_flags = rq_flags;
+ rq->rq_flags = 0;
rq->cmd_flags = data->cmd_flags;
if (data->flags & BLK_MQ_REQ_PREEMPT)
rq->rq_flags |= RQF_PREEMPT;
@@ -362,8 +358,6 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
data->flags |= BLK_MQ_REQ_NOWAIT;
if (e) {
- data->flags |= BLK_MQ_REQ_INTERNAL;
-
/*
* Flush requests are special and go directly to the
* dispatch list. Don't include reserved tags in the
@@ -378,7 +372,7 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
retry:
data->ctx = blk_mq_get_ctx(q);
data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
- if (!(data->flags & BLK_MQ_REQ_INTERNAL))
+ if (!e)
blk_mq_tag_busy(data->hctx);
/*
@@ -474,9 +468,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
data.ctx = __blk_mq_get_ctx(q, cpu);
- if (q->elevator)
- data.flags |= BLK_MQ_REQ_INTERNAL;
- else
+ if (!q->elevator)
blk_mq_tag_busy(data.hctx);
ret = -EWOULDBLOCK;
@@ -552,8 +544,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
blk_stat_add(rq, now);
}
- if (rq->internal_tag != BLK_MQ_NO_TAG)
- blk_mq_sched_completed_request(rq, now);
+ blk_mq_sched_completed_request(rq, now);
blk_account_io_done(rq, now);
@@ -574,71 +565,139 @@ void blk_mq_end_request(struct request *rq, blk_status_t error)
}
EXPORT_SYMBOL(blk_mq_end_request);
-static void __blk_mq_complete_request_remote(void *data)
+/*
+ * Softirq action handler - move entries to local list and loop over them
+ * while passing them to the queue registered handler.
+ */
+static __latent_entropy void blk_done_softirq(struct softirq_action *h)
{
- struct request *rq = data;
- struct request_queue *q = rq->q;
+ struct list_head *cpu_list, local_list;
- q->mq_ops->complete(rq);
+ local_irq_disable();
+ cpu_list = this_cpu_ptr(&blk_cpu_done);
+ list_replace_init(cpu_list, &local_list);
+ local_irq_enable();
+
+ while (!list_empty(&local_list)) {
+ struct request *rq;
+
+ rq = list_entry(local_list.next, struct request, ipi_list);
+ list_del_init(&rq->ipi_list);
+ rq->q->mq_ops->complete(rq);
+ }
}
-/**
- * blk_mq_force_complete_rq() - Force complete the request, bypassing any error
- * injection that could drop the completion.
- * @rq: Request to be force completed
- *
- * Drivers should use blk_mq_complete_request() to complete requests in their
- * normal IO path. For timeout error recovery, drivers may call this forced
- * completion routine after they've reclaimed timed out requests to bypass
- * potentially subsequent fake timeouts.
- */
-void blk_mq_force_complete_rq(struct request *rq)
+static void blk_mq_trigger_softirq(struct request *rq)
{
- struct blk_mq_ctx *ctx = rq->mq_ctx;
- struct request_queue *q = rq->q;
- bool shared = false;
- int cpu;
+ struct list_head *list;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ list = this_cpu_ptr(&blk_cpu_done);
+ list_add_tail(&rq->ipi_list, list);
+
+ /*
+ * If the list only contains our just added request, signal a raise of
+ * the softirq. If there are already entries there, someone already
+ * raised the irq but it hasn't run yet.
+ */
+ if (list->next == &rq->ipi_list)
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
+ local_irq_restore(flags);
+}
+
+static int blk_softirq_cpu_dead(unsigned int cpu)
+{
+ /*
+ * If a CPU goes away, splice its entries to the current CPU
+ * and trigger a run of the softirq
+ */
+ local_irq_disable();
+ list_splice_init(&per_cpu(blk_cpu_done, cpu),
+ this_cpu_ptr(&blk_cpu_done));
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
+ local_irq_enable();
+
+ return 0;
+}
+
+
+static void __blk_mq_complete_request_remote(void *data)
+{
+ struct request *rq = data;
- WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
/*
- * Most of single queue controllers, there is only one irq vector
- * for handling IO completion, and the only irq's affinity is set
- * as all possible CPUs. On most of ARCHs, this affinity means the
- * irq is handled on one specific CPU.
+ * For most of single queue controllers, there is only one irq vector
+ * for handling I/O completion, and the only irq's affinity is set
+ * to all possible CPUs. On most of ARCHs, this affinity means the irq
+ * is handled on one specific CPU.
*
- * So complete IO reqeust in softirq context in case of single queue
- * for not degrading IO performance by irqsoff latency.
+ * So complete I/O requests in softirq context in case of single queue
+ * devices to avoid degrading I/O performance due to irqsoff latency.
*/
- if (q->nr_hw_queues == 1) {
- __blk_complete_request(rq);
- return;
- }
+ if (rq->q->nr_hw_queues == 1)
+ blk_mq_trigger_softirq(rq);
+ else
+ rq->q->mq_ops->complete(rq);
+}
+
+static inline bool blk_mq_complete_need_ipi(struct request *rq)
+{
+ int cpu = raw_smp_processor_id();
+
+ if (!IS_ENABLED(CONFIG_SMP) ||
+ !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
+ return false;
+
+ /* same CPU or cache domain? Complete locally */
+ if (cpu == rq->mq_ctx->cpu ||
+ (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
+ cpus_share_cache(cpu, rq->mq_ctx->cpu)))
+ return false;
+
+ /* don't try to IPI to an offline CPU */
+ return cpu_online(rq->mq_ctx->cpu);
+}
+
+bool blk_mq_complete_request_remote(struct request *rq)
+{
+ WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
/*
* For a polled request, always complete locallly, it's pointless
* to redirect the completion.
*/
- if ((rq->cmd_flags & REQ_HIPRI) ||
- !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) {
- q->mq_ops->complete(rq);
- return;
- }
-
- cpu = get_cpu();
- if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
- shared = cpus_share_cache(cpu, ctx->cpu);
+ if (rq->cmd_flags & REQ_HIPRI)
+ return false;
- if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
+ if (blk_mq_complete_need_ipi(rq)) {
rq->csd.func = __blk_mq_complete_request_remote;
rq->csd.info = rq;
rq->csd.flags = 0;
- smp_call_function_single_async(ctx->cpu, &rq->csd);
+ smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd);
} else {
- q->mq_ops->complete(rq);
+ if (rq->q->nr_hw_queues > 1)
+ return false;
+ blk_mq_trigger_softirq(rq);
}
- put_cpu();
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);
+
+/**
+ * blk_mq_complete_request - end I/O on a request
+ * @rq: the request being processed
+ *
+ * Description:
+ * Complete a request by scheduling the ->complete_rq operation.
+ **/
+void blk_mq_complete_request(struct request *rq)
+{
+ if (!blk_mq_complete_request_remote(rq))
+ rq->q->mq_ops->complete(rq);
}
-EXPORT_SYMBOL_GPL(blk_mq_force_complete_rq);
+EXPORT_SYMBOL(blk_mq_complete_request);
static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
__releases(hctx->srcu)
@@ -661,23 +720,6 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
}
/**
- * blk_mq_complete_request - end I/O on a request
- * @rq: the request being processed
- *
- * Description:
- * Ends all I/O on a request. It does not handle partial completions.
- * The actual completion happens out-of-order, through a IPI handler.
- **/
-bool blk_mq_complete_request(struct request *rq)
-{
- if (unlikely(blk_should_fake_timeout(rq->q)))
- return false;
- blk_mq_force_complete_rq(rq);
- return true;
-}
-EXPORT_SYMBOL(blk_mq_complete_request);
-
-/**
* blk_mq_start_request - Start processing a request
* @rq: Pointer to request to be started
*
@@ -1052,6 +1094,45 @@ static inline unsigned int queued_to_index(unsigned int queued)
return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
}
+static bool __blk_mq_get_driver_tag(struct request *rq)
+{
+ struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
+ unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
+ int tag;
+
+ blk_mq_tag_busy(rq->mq_hctx);
+
+ if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
+ bt = &rq->mq_hctx->tags->breserved_tags;
+ tag_offset = 0;
+ }
+
+ if (!hctx_may_queue(rq->mq_hctx, bt))
+ return false;
+ tag = __sbitmap_queue_get(bt);
+ if (tag == BLK_MQ_NO_TAG)
+ return false;
+
+ rq->tag = tag + tag_offset;
+ return true;
+}
+
+static bool blk_mq_get_driver_tag(struct request *rq)
+{
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+
+ if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
+ return false;
+
+ if ((hctx->flags & BLK_MQ_F_TAG_SHARED) &&
+ !(rq->rq_flags & RQF_MQ_INFLIGHT)) {
+ rq->rq_flags |= RQF_MQ_INFLIGHT;
+ atomic_inc(&hctx->nr_active);
+ }
+ hctx->tags->rqs[rq->tag] = rq;
+ return true;
+}
+
static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
int flags, void *key)
{
@@ -1204,25 +1285,70 @@ static void blk_mq_handle_zone_resource(struct request *rq,
__blk_mq_requeue_request(rq);
}
+enum prep_dispatch {
+ PREP_DISPATCH_OK,
+ PREP_DISPATCH_NO_TAG,
+ PREP_DISPATCH_NO_BUDGET,
+};
+
+static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
+ bool need_budget)
+{
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+
+ if (need_budget && !blk_mq_get_dispatch_budget(rq->q)) {
+ blk_mq_put_driver_tag(rq);
+ return PREP_DISPATCH_NO_BUDGET;
+ }
+
+ if (!blk_mq_get_driver_tag(rq)) {
+ /*
+ * The initial allocation attempt failed, so we need to
+ * rerun the hardware queue when a tag is freed. The
+ * waitqueue takes care of that. If the queue is run
+ * before we add this entry back on the dispatch list,
+ * we'll re-run it below.
+ */
+ if (!blk_mq_mark_tag_wait(hctx, rq)) {
+ /*
+ * All budgets not got from this function will be put
+ * together during handling partial dispatch
+ */
+ if (need_budget)
+ blk_mq_put_dispatch_budget(rq->q);
+ return PREP_DISPATCH_NO_TAG;
+ }
+ }
+
+ return PREP_DISPATCH_OK;
+}
+
+/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
+static void blk_mq_release_budgets(struct request_queue *q,
+ unsigned int nr_budgets)
+{
+ int i;
+
+ for (i = 0; i < nr_budgets; i++)
+ blk_mq_put_dispatch_budget(q);
+}
+
/*
* Returns true if we did some work AND can potentially do more.
*/
-bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
- bool got_budget)
+bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
+ unsigned int nr_budgets)
{
- struct blk_mq_hw_ctx *hctx;
+ enum prep_dispatch prep;
+ struct request_queue *q = hctx->queue;
struct request *rq, *nxt;
- bool no_tag = false;
int errors, queued;
blk_status_t ret = BLK_STS_OK;
- bool no_budget_avail = false;
LIST_HEAD(zone_list);
if (list_empty(list))
return false;
- WARN_ON(!list_is_singular(list) && got_budget);
-
/*
* Now process all the entries, sending them to the driver.
*/
@@ -1232,32 +1358,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
rq = list_first_entry(list, struct request, queuelist);
- hctx = rq->mq_hctx;
- if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
- blk_mq_put_driver_tag(rq);
- no_budget_avail = true;
+ WARN_ON_ONCE(hctx != rq->mq_hctx);
+ prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
+ if (prep != PREP_DISPATCH_OK)
break;
- }
-
- if (!blk_mq_get_driver_tag(rq)) {
- /*
- * The initial allocation attempt failed, so we need to
- * rerun the hardware queue when a tag is freed. The
- * waitqueue takes care of that. If the queue is run
- * before we add this entry back on the dispatch list,
- * we'll re-run it below.
- */
- if (!blk_mq_mark_tag_wait(hctx, rq)) {
- blk_mq_put_dispatch_budget(hctx);
- /*
- * For non-shared tags, the RESTART check
- * will suffice.
- */
- if (hctx->flags & BLK_MQ_F_TAG_SHARED)
- no_tag = true;
- break;
- }
- }
list_del_init(&rq->queuelist);
@@ -1274,31 +1378,35 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
bd.last = !blk_mq_get_driver_tag(nxt);
}
+ /*
+ * once the request is queued to lld, no need to cover the
+ * budget any more
+ */
+ if (nr_budgets)
+ nr_budgets--;
ret = q->mq_ops->queue_rq(hctx, &bd);
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
- blk_mq_handle_dev_resource(rq, list);
+ switch (ret) {
+ case BLK_STS_OK:
+ queued++;
break;
- } else if (ret == BLK_STS_ZONE_RESOURCE) {
+ case BLK_STS_RESOURCE:
+ case BLK_STS_DEV_RESOURCE:
+ blk_mq_handle_dev_resource(rq, list);
+ goto out;
+ case BLK_STS_ZONE_RESOURCE:
/*
* Move the request to zone_list and keep going through
* the dispatch list to find more requests the drive can
* accept.
*/
blk_mq_handle_zone_resource(rq, &zone_list);
- if (list_empty(list))
- break;
- continue;
- }
-
- if (unlikely(ret != BLK_STS_OK)) {
+ break;
+ default:
errors++;
blk_mq_end_request(rq, BLK_STS_IOERR);
- continue;
}
-
- queued++;
} while (!list_empty(list));
-
+out:
if (!list_empty(&zone_list))
list_splice_tail_init(&zone_list, list);
@@ -1310,6 +1418,12 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
*/
if (!list_empty(list)) {
bool needs_restart;
+ /* For non-shared tags, the RESTART check will suffice */
+ bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
+ (hctx->flags & BLK_MQ_F_TAG_SHARED);
+ bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET;
+
+ blk_mq_release_budgets(q, nr_budgets);
/*
* If we didn't flush the entire list, we could have told
@@ -1361,13 +1475,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
} else
blk_mq_update_dispatch_busy(hctx, false);
- /*
- * If the host/device is unable to accept more work, inform the
- * caller of that.
- */
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
- return false;
-
return (queued + errors) != 0;
}
@@ -1896,11 +2003,11 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
if (q->elevator && !bypass_insert)
goto insert;
- if (!blk_mq_get_dispatch_budget(hctx))
+ if (!blk_mq_get_dispatch_budget(q))
goto insert;
if (!blk_mq_get_driver_tag(rq)) {
- blk_mq_put_dispatch_budget(hctx);
+ blk_mq_put_dispatch_budget(q);
goto insert;
}
@@ -2005,8 +2112,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
}
/**
- * blk_mq_make_request - Create and send a request to block device.
- * @q: Request queue pointer.
+ * blk_mq_submit_bio - Create and send a request to block device.
* @bio: Bio pointer.
*
* Builds up a request structure from @q and @bio and send to the device. The
@@ -2020,8 +2126,9 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
*
* Returns: Request queue cookie.
*/
-blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
+blk_qc_t blk_mq_submit_bio(struct bio *bio)
{
+ struct request_queue *q = bio->bi_disk->queue;
const int is_sync = op_is_sync(bio->bi_opf);
const int is_flush_fua = op_is_flush(bio->bi_opf);
struct blk_mq_alloc_data data = {
@@ -2035,7 +2142,7 @@ blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_status_t ret;
blk_queue_bounce(q, &bio);
- __blk_queue_split(q, &bio, &nr_segs);
+ __blk_queue_split(&bio, &nr_segs);
if (!bio_integrity_prep(bio))
goto queue_exit;
@@ -2146,7 +2253,7 @@ queue_exit:
blk_queue_exit(q);
return BLK_QC_T_NONE;
}
-EXPORT_SYMBOL_GPL(blk_mq_make_request); /* only for request based dm */
+EXPORT_SYMBOL_GPL(blk_mq_submit_bio); /* only for request based dm */
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx)
@@ -2886,7 +2993,7 @@ struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
{
struct request_queue *uninit_q, *q;
- uninit_q = __blk_alloc_queue(set->numa_node);
+ uninit_q = blk_alloc_queue(set->numa_node);
if (!uninit_q)
return ERR_PTR(-ENOMEM);
uninit_q->queuedata = queuedata;
@@ -3760,6 +3867,15 @@ EXPORT_SYMBOL(blk_mq_rq_cpu);
static int __init blk_mq_init(void)
{
+ int i;
+
+ for_each_possible_cpu(i)
+ INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
+ open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
+
+ cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
+ "block/softirq:dead", NULL,
+ blk_softirq_cpu_dead);
cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
blk_mq_hctx_notify_dead);
cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
diff --git a/block/blk-mq.h b/block/blk-mq.h
index b3ce0f3a2ad2..863a2f3346d4 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -40,7 +40,8 @@ struct blk_mq_ctx {
void blk_mq_exit_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
-bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool);
+bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *,
+ unsigned int);
void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
bool kick_requeue_list);
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
@@ -159,7 +160,7 @@ struct blk_mq_alloc_data {
static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
{
- if (data->flags & BLK_MQ_REQ_INTERNAL)
+ if (data->q->elevator)
return data->hctx->sched_tags;
return data->hctx->tags;
@@ -179,20 +180,16 @@ unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part);
void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
unsigned int inflight[2]);
-static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx)
+static inline void blk_mq_put_dispatch_budget(struct request_queue *q)
{
- struct request_queue *q = hctx->queue;
-
if (q->mq_ops->put_budget)
- q->mq_ops->put_budget(hctx);
+ q->mq_ops->put_budget(q);
}
-static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx)
+static inline bool blk_mq_get_dispatch_budget(struct request_queue *q)
{
- struct request_queue *q = hctx->queue;
-
if (q->mq_ops->get_budget)
- return q->mq_ops->get_budget(hctx);
+ return q->mq_ops->get_budget(q);
return true;
}
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
deleted file mode 100644
index 6e7ec87d49fa..000000000000
--- a/block/blk-softirq.c
+++ /dev/null
@@ -1,156 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Functions related to softirq rq completions
- */
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/interrupt.h>
-#include <linux/cpu.h>
-#include <linux/sched.h>
-#include <linux/sched/topology.h>
-
-#include "blk.h"
-
-static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
-
-/*
- * Softirq action handler - move entries to local list and loop over them
- * while passing them to the queue registered handler.
- */
-static __latent_entropy void blk_done_softirq(struct softirq_action *h)
-{
- struct list_head *cpu_list, local_list;
-
- local_irq_disable();
- cpu_list = this_cpu_ptr(&blk_cpu_done);
- list_replace_init(cpu_list, &local_list);
- local_irq_enable();
-
- while (!list_empty(&local_list)) {
- struct request *rq;
-
- rq = list_entry(local_list.next, struct request, ipi_list);
- list_del_init(&rq->ipi_list);
- rq->q->mq_ops->complete(rq);
- }
-}
-
-#ifdef CONFIG_SMP
-static void trigger_softirq(void *data)
-{
- struct request *rq = data;
- struct list_head *list;
-
- list = this_cpu_ptr(&blk_cpu_done);
- list_add_tail(&rq->ipi_list, list);
-
- if (list->next == &rq->ipi_list)
- raise_softirq_irqoff(BLOCK_SOFTIRQ);
-}
-
-/*
- * Setup and invoke a run of 'trigger_softirq' on the given cpu.
- */
-static int raise_blk_irq(int cpu, struct request *rq)
-{
- if (cpu_online(cpu)) {
- call_single_data_t *data = &rq->csd;
-
- data->func = trigger_softirq;
- data->info = rq;
- data->flags = 0;
-
- smp_call_function_single_async(cpu, data);
- return 0;
- }
-
- return 1;
-}
-#else /* CONFIG_SMP */
-static int raise_blk_irq(int cpu, struct request *rq)
-{
- return 1;
-}
-#endif
-
-static int blk_softirq_cpu_dead(unsigned int cpu)
-{
- /*
- * If a CPU goes away, splice its entries to the current CPU
- * and trigger a run of the softirq
- */
- local_irq_disable();
- list_splice_init(&per_cpu(blk_cpu_done, cpu),
- this_cpu_ptr(&blk_cpu_done));
- raise_softirq_irqoff(BLOCK_SOFTIRQ);
- local_irq_enable();
-
- return 0;
-}
-
-void __blk_complete_request(struct request *req)
-{
- struct request_queue *q = req->q;
- int cpu, ccpu = req->mq_ctx->cpu;
- unsigned long flags;
- bool shared = false;
-
- BUG_ON(!q->mq_ops->complete);
-
- local_irq_save(flags);
- cpu = smp_processor_id();
-
- /*
- * Select completion CPU
- */
- if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && ccpu != -1) {
- if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
- shared = cpus_share_cache(cpu, ccpu);
- } else
- ccpu = cpu;
-
- /*
- * If current CPU and requested CPU share a cache, run the softirq on
- * the current CPU. One might concern this is just like
- * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is
- * running in interrupt handler, and currently I/O controller doesn't
- * support multiple interrupts, so current CPU is unique actually. This
- * avoids IPI sending from current CPU to the first CPU of a group.
- */
- if (ccpu == cpu || shared) {
- struct list_head *list;
-do_local:
- list = this_cpu_ptr(&blk_cpu_done);
- list_add_tail(&req->ipi_list, list);
-
- /*
- * if the list only contains our just added request,
- * signal a raise of the softirq. If there are already
- * entries there, someone already raised the irq but it
- * hasn't run yet.
- */
- if (list->next == &req->ipi_list)
- raise_softirq_irqoff(BLOCK_SOFTIRQ);
- } else if (raise_blk_irq(ccpu, req))
- goto do_local;
-
- local_irq_restore(flags);
-}
-
-static __init int blk_softirq_init(void)
-{
- int i;
-
- for_each_possible_cpu(i)
- INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
-
- open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
- cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
- "block/softirq:dead", NULL,
- blk_softirq_cpu_dead);
- return 0;
-}
-subsys_initcall(blk_softirq_init);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 02643e149d5e..be67952e7be2 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -11,6 +11,7 @@
#include <linux/blktrace_api.h>
#include <linux/blk-mq.h>
#include <linux/blk-cgroup.h>
+#include <linux/debugfs.h>
#include "blk.h"
#include "blk-mq.h"
@@ -873,22 +874,32 @@ static void blk_exit_queue(struct request_queue *q)
bdi_put(q->backing_dev_info);
}
-
/**
- * __blk_release_queue - release a request queue
- * @work: pointer to the release_work member of the request queue to be released
+ * blk_release_queue - releases all allocated resources of the request_queue
+ * @kobj: pointer to a kobject, whose container is a request_queue
+ *
+ * This function releases all allocated resources of the request queue.
+ *
+ * The struct request_queue refcount is incremented with blk_get_queue() and
+ * decremented with blk_put_queue(). Once the refcount reaches 0 this function
+ * is called.
+ *
+ * For drivers that have a request_queue on a gendisk and added with
+ * __device_add_disk() the refcount to request_queue will reach 0 with
+ * the last put_disk() called by the driver. For drivers which don't use
+ * __device_add_disk() this happens with blk_cleanup_queue().
+ *
+ * Drivers exist which depend on the release of the request_queue to be
+ * synchronous, it should not be deferred.
*
- * Description:
- * This function is called when a block device is being unregistered. The
- * process of releasing a request queue starts with blk_cleanup_queue, which
- * set the appropriate flags and then calls blk_put_queue, that decrements
- * the reference counter of the request queue. Once the reference counter
- * of the request queue reaches zero, blk_release_queue is called to release
- * all allocated resources of the request queue.
+ * Context: can sleep
*/
-static void __blk_release_queue(struct work_struct *work)
+static void blk_release_queue(struct kobject *kobj)
{
- struct request_queue *q = container_of(work, typeof(*q), release_work);
+ struct request_queue *q =
+ container_of(kobj, struct request_queue, kobj);
+
+ might_sleep();
if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
blk_stat_remove_callback(q, q->poll_cb);
@@ -907,6 +918,9 @@ static void __blk_release_queue(struct work_struct *work)
blk_mq_release(q);
blk_trace_shutdown(q);
+ mutex_lock(&q->debugfs_mutex);
+ debugfs_remove_recursive(q->debugfs_dir);
+ mutex_unlock(&q->debugfs_mutex);
if (queue_is_mq(q))
blk_mq_debugfs_unregister(q);
@@ -917,15 +931,6 @@ static void __blk_release_queue(struct work_struct *work)
call_rcu(&q->rcu_head, blk_free_queue_rcu);
}
-static void blk_release_queue(struct kobject *kobj)
-{
- struct request_queue *q =
- container_of(kobj, struct request_queue, kobj);
-
- INIT_WORK(&q->release_work, __blk_release_queue);
- schedule_work(&q->release_work);
-}
-
static const struct sysfs_ops queue_sysfs_ops = {
.show = queue_attr_show,
.store = queue_attr_store,
@@ -988,6 +993,11 @@ int blk_register_queue(struct gendisk *disk)
goto unlock;
}
+ mutex_lock(&q->debugfs_mutex);
+ q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent),
+ blk_debugfs_root);
+ mutex_unlock(&q->debugfs_mutex);
+
if (queue_is_mq(q)) {
__blk_mq_register_dev(dev, q);
blk_mq_debugfs_register(q);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 209fdd8939fb..fee3325edf27 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1339,8 +1339,8 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
if (!bio_list_empty(&bio_list_on_stack)) {
blk_start_plug(&plug);
- while((bio = bio_list_pop(&bio_list_on_stack)))
- generic_make_request(bio);
+ while ((bio = bio_list_pop(&bio_list_on_stack)))
+ submit_bio_noacct(bio);
blk_finish_plug(&plug);
}
}
@@ -2158,17 +2158,18 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
}
#endif
-bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
- struct bio *bio)
+bool blk_throtl_bio(struct bio *bio)
{
+ struct request_queue *q = bio->bi_disk->queue;
+ struct blkcg_gq *blkg = bio->bi_blkg;
struct throtl_qnode *qn = NULL;
- struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
+ struct throtl_grp *tg = blkg_to_tg(blkg);
struct throtl_service_queue *sq;
bool rw = bio_data_dir(bio);
bool throttled = false;
struct throtl_data *td = tg->td;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ rcu_read_lock();
/* see throtl_charge_bio() */
if (bio_flagged(bio, BIO_THROTTLED))
@@ -2273,6 +2274,7 @@ out:
if (throttled || !td->track_bio_latency)
bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY;
#endif
+ rcu_read_unlock();
return throttled;
}
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 8aa68fae96ad..4c1fc3417460 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -20,13 +20,11 @@ static int __init setup_fail_io_timeout(char *str)
}
__setup("fail_io_timeout=", setup_fail_io_timeout);
-int blk_should_fake_timeout(struct request_queue *q)
+bool __blk_should_fake_timeout(struct request_queue *q)
{
- if (!test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
- return 0;
-
return should_fail(&fail_io_timeout, 1);
}
+EXPORT_SYMBOL_GPL(__blk_should_fake_timeout);
static int __init fail_io_timeout_debugfs(void)
{
@@ -90,11 +88,29 @@ void blk_abort_request(struct request *req)
}
EXPORT_SYMBOL_GPL(blk_abort_request);
+static unsigned long blk_timeout_mask __read_mostly;
+
+static int __init blk_timeout_init(void)
+{
+ blk_timeout_mask = roundup_pow_of_two(HZ) - 1;
+ return 0;
+}
+
+late_initcall(blk_timeout_init);
+
+/*
+ * Just a rough estimate, we don't care about specific values for timeouts.
+ */
+static inline unsigned long blk_round_jiffies(unsigned long j)
+{
+ return (j + blk_timeout_mask) + 1;
+}
+
unsigned long blk_rq_timeout(unsigned long timeout)
{
unsigned long maxt;
- maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT);
+ maxt = blk_round_jiffies(jiffies + BLK_MAX_TIMEOUT);
if (time_after(timeout, maxt))
timeout = maxt;
@@ -131,7 +147,7 @@ void blk_add_timer(struct request *req)
* than an existing one, modify the timer. Round up to next nearest
* second.
*/
- expiry = blk_rq_timeout(round_jiffies_up(expiry));
+ expiry = blk_rq_timeout(blk_round_jiffies(expiry));
if (!timer_pending(&q->timeout) ||
time_before(expiry, q->timeout.expires)) {
diff --git a/block/blk.h b/block/blk.h
index b5d1f0fc6547..49e2928a1632 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -14,9 +14,7 @@
/* Max future timer expiry for timeouts */
#define BLK_MAX_TIMEOUT (5 * HZ)
-#ifdef CONFIG_DEBUG_FS
extern struct dentry *blk_debugfs_root;
-#endif
struct blk_flush_queue {
unsigned int flush_pending_idx:1;
@@ -27,11 +25,6 @@ struct blk_flush_queue {
struct list_head flush_data_in_flight;
struct request *flush_rq;
- /*
- * flush_rq shares tag with this rq, both can't be active
- * at the same time
- */
- struct request *orig_rq;
struct lock_class_key key;
spinlock_t mq_flush_lock;
};
@@ -223,21 +216,11 @@ ssize_t part_fail_show(struct device *dev, struct device_attribute *attr,
char *buf);
ssize_t part_fail_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count);
-
-#ifdef CONFIG_FAIL_IO_TIMEOUT
-int blk_should_fake_timeout(struct request_queue *);
ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
ssize_t part_timeout_store(struct device *, struct device_attribute *,
const char *, size_t);
-#else
-static inline int blk_should_fake_timeout(struct request_queue *q)
-{
- return 0;
-}
-#endif
-void __blk_queue_split(struct request_queue *q, struct bio **bio,
- unsigned int *nr_segs);
+void __blk_queue_split(struct bio **bio, unsigned int *nr_segs);
int ll_back_merge_fn(struct request *req, struct bio *bio,
unsigned int nr_segs);
int ll_front_merge_fn(struct request *req, struct bio *bio,
@@ -282,6 +265,20 @@ static inline unsigned int bio_allowed_max_sectors(struct request_queue *q)
}
/*
+ * The max bio size which is aligned to q->limits.discard_granularity. This
+ * is a hint to split large discard bio in generic block layer, then if device
+ * driver needs to split the discard bio into smaller ones, their bi_size can
+ * be very probably and easily aligned to discard_granularity of the device's
+ * queue.
+ */
+static inline unsigned int bio_aligned_discard_max_sectors(
+ struct request_queue *q)
+{
+ return round_down(UINT_MAX, q->limits.discard_granularity) >>
+ SECTOR_SHIFT;
+}
+
+/*
* Internal io_context interface
*/
void get_io_context(struct io_context *ioc);
@@ -299,10 +296,12 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
extern int blk_throtl_init(struct request_queue *q);
extern void blk_throtl_exit(struct request_queue *q);
extern void blk_throtl_register_queue(struct request_queue *q);
+bool blk_throtl_bio(struct bio *bio);
#else /* CONFIG_BLK_DEV_THROTTLING */
static inline int blk_throtl_init(struct request_queue *q) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { }
static inline void blk_throtl_register_queue(struct request_queue *q) { }
+static inline bool blk_throtl_bio(struct bio *bio) { return false; }
#endif /* CONFIG_BLK_DEV_THROTTLING */
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
@@ -434,8 +433,6 @@ static inline void part_nr_sects_write(struct hd_struct *part, sector_t size)
#endif
}
-struct request_queue *__blk_alloc_queue(int node_id);
-
int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
unsigned int max_sectors, bool *same_page);
diff --git a/block/bounce.c b/block/bounce.c
index c3aaed070124..431be88a0240 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -309,7 +309,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
if (!passthrough && sectors < bio_sectors(*bio_orig)) {
bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
bio_chain(bio, *bio_orig);
- generic_make_request(*bio_orig);
+ submit_bio_noacct(*bio_orig);
*bio_orig = bio;
}
bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL :
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 6cbb7926534c..fb7b347f8010 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -181,9 +181,12 @@ EXPORT_SYMBOL_GPL(bsg_job_get);
void bsg_job_done(struct bsg_job *job, int result,
unsigned int reply_payload_rcv_len)
{
+ struct request *rq = blk_mq_rq_from_pdu(job);
+
job->result = result;
job->reply_payload_rcv_len = reply_payload_rcv_len;
- blk_mq_complete_request(blk_mq_rq_from_pdu(job));
+ if (likely(!blk_should_fake_timeout(rq->q)))
+ blk_mq_complete_request(rq);
}
EXPORT_SYMBOL_GPL(bsg_job_done);
diff --git a/block/genhd.c b/block/genhd.c
index 1a7659327664..8b1e9f48957c 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -38,8 +38,6 @@ static struct kobject *block_depr;
static DEFINE_SPINLOCK(ext_devt_lock);
static DEFINE_IDR(ext_devt_idr);
-static const struct device_type disk_type;
-
static void disk_check_events(struct disk_events *ev,
unsigned int *clearing_ptr);
static void disk_alloc_events(struct gendisk *disk);
@@ -876,11 +874,32 @@ static void invalidate_partition(struct gendisk *disk, int partno)
bdput(bdev);
}
+/**
+ * del_gendisk - remove the gendisk
+ * @disk: the struct gendisk to remove
+ *
+ * Removes the gendisk and all its associated resources. This deletes the
+ * partitions associated with the gendisk, and unregisters the associated
+ * request_queue.
+ *
+ * This is the counter to the respective __device_add_disk() call.
+ *
+ * The final removal of the struct gendisk happens when its refcount reaches 0
+ * with put_disk(), which should be called after del_gendisk(), if
+ * __device_add_disk() was used.
+ *
+ * Drivers exist which depend on the release of the gendisk to be synchronous,
+ * it should not be deferred.
+ *
+ * Context: can sleep
+ */
void del_gendisk(struct gendisk *disk)
{
struct disk_part_iter piter;
struct hd_struct *part;
+ might_sleep();
+
blk_integrity_del(disk);
disk_del_events(disk);
@@ -971,11 +990,15 @@ static ssize_t disk_badblocks_store(struct device *dev,
*
* This function gets the structure containing partitioning
* information for the given device @devt.
+ *
+ * Context: can sleep
*/
struct gendisk *get_gendisk(dev_t devt, int *partno)
{
struct gendisk *disk = NULL;
+ might_sleep();
+
if (MAJOR(devt) != BLOCK_EXT_MAJOR) {
struct kobject *kobj;
@@ -1514,10 +1537,31 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno)
return 0;
}
+/**
+ * disk_release - releases all allocated resources of the gendisk
+ * @dev: the device representing this disk
+ *
+ * This function releases all allocated resources of the gendisk.
+ *
+ * The struct gendisk refcount is incremented with get_gendisk() or
+ * get_disk_and_module(), and its refcount is decremented with
+ * put_disk_and_module() or put_disk(). Once the refcount reaches 0 this
+ * function is called.
+ *
+ * Drivers which used __device_add_disk() have a gendisk with a request_queue
+ * assigned. Since the request_queue sits on top of the gendisk for these
+ * drivers we also call blk_put_queue() for them, and we expect the
+ * request_queue refcount to reach 0 at this point, and so the request_queue
+ * will also be freed prior to the disk.
+ *
+ * Context: can sleep
+ */
static void disk_release(struct device *dev)
{
struct gendisk *disk = dev_to_disk(dev);
+ might_sleep();
+
blk_free_devt(dev->devt);
disk_release_events(disk);
kfree(disk->random);
@@ -1541,7 +1585,7 @@ static char *block_devnode(struct device *dev, umode_t *mode,
return NULL;
}
-static const struct device_type disk_type = {
+const struct device_type disk_type = {
.name = "disk",
.groups = disk_attr_groups,
.release = disk_release,
@@ -1727,6 +1771,15 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
}
EXPORT_SYMBOL(__alloc_disk_node);
+/**
+ * get_disk_and_module - increments the gendisk and gendisk fops module refcount
+ * @disk: the struct gendisk to to increment the refcount for
+ *
+ * This increments the refcount for the struct gendisk, and the gendisk's
+ * fops module owner.
+ *
+ * Context: Any context.
+ */
struct kobject *get_disk_and_module(struct gendisk *disk)
{
struct module *owner;
@@ -1747,6 +1800,16 @@ struct kobject *get_disk_and_module(struct gendisk *disk)
}
EXPORT_SYMBOL(get_disk_and_module);
+/**
+ * put_disk - decrements the gendisk refcount
+ * @disk: the struct gendisk to to decrement the refcount for
+ *
+ * This decrements the refcount for the struct gendisk. When this reaches 0
+ * we'll have disk_release() called.
+ *
+ * Context: Any context, but the last reference must not be dropped from
+ * atomic context.
+ */
void put_disk(struct gendisk *disk)
{
if (disk)
@@ -1754,9 +1817,15 @@ void put_disk(struct gendisk *disk)
}
EXPORT_SYMBOL(put_disk);
-/*
+/**
+ * put_disk_and_module - decrements the module and gendisk refcount
+ * @disk: the struct gendisk to to decrement the refcount for
+ *
* This is a counterpart of get_disk_and_module() and thus also of
* get_gendisk().
+ *
+ * Context: Any context, but the last reference must not be dropped from
+ * atomic context.
*/
void put_disk_and_module(struct gendisk *disk)
{
@@ -1985,18 +2054,12 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask)
*/
unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
{
- const struct block_device_operations *bdops = disk->fops;
struct disk_events *ev = disk->ev;
unsigned int pending;
unsigned int clearing = mask;
- if (!ev) {
- /* for drivers still using the old ->media_changed method */
- if ((mask & DISK_EVENT_MEDIA_CHANGE) &&
- bdops->media_changed && bdops->media_changed(disk))
- return DISK_EVENT_MEDIA_CHANGE;
+ if (!ev)
return 0;
- }
disk_block_events(disk);
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 78951e33b2d7..e62a98a8eeb7 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -619,8 +619,6 @@ int blk_drop_partitions(struct block_device *bdev)
struct disk_part_iter piter;
struct hd_struct *part;
- if (!disk_part_scan_enabled(bdev->bd_disk))
- return 0;
if (bdev->bd_part_count)
return -EBUSY;