Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig                 3
-rw-r--r--  block/Kconfig.iosched         1
-rw-r--r--  block/Makefile                2
-rw-r--r--  block/bdev.c                 34
-rw-r--r--  block/bfq-iosched.c         304
-rw-r--r--  block/bfq-iosched.h          35
-rw-r--r--  block/bio.c                  35
-rw-r--r--  block/blk-cgroup.c           10
-rw-r--r--  block/blk-core.c            396
-rw-r--r--  block/blk-crypto-profile.c    5
-rw-r--r--  block/blk-exec.c            116
-rw-r--r--  block/blk-flush.c            30
-rw-r--r--  block/blk-integrity.c         2
-rw-r--r--  block/blk-ioc.c             318
-rw-r--r--  block/blk-iocost.c            9
-rw-r--r--  block/blk-ioprio.c           13
-rw-r--r--  block/blk-merge.c            18
-rw-r--r--  block/blk-mq-debugfs.c        5
-rw-r--r--  block/blk-mq-sched.c         29
-rw-r--r--  block/blk-mq-sched.h          2
-rw-r--r--  block/blk-mq-sysfs.c          2
-rw-r--r--  block/blk-mq-tag.c           67
-rw-r--r--  block/blk-mq-tag.h            2
-rw-r--r--  block/blk-mq.c              990
-rw-r--r--  block/blk-mq.h               24
-rw-r--r--  block/blk-stat.c             39
-rw-r--r--  block/blk-stat.h              2
-rw-r--r--  block/blk-sysfs.c            23
-rw-r--r--  block/blk-throttle.c          1
-rw-r--r--  block/blk.h                 117
-rw-r--r--  block/bsg-lib.c               2
-rw-r--r--  block/elevator.c             20
-rw-r--r--  block/fops.c                 41
-rw-r--r--  block/genhd.c                62
-rw-r--r--  block/ioctl.c                31
-rw-r--r--  block/ioprio.c               44
-rw-r--r--  block/kyber-iosched.c         1
-rw-r--r--  block/partitions/core.c      24
38 files changed, 1478 insertions, 1381 deletions
diff --git a/block/Kconfig b/block/Kconfig
index c6ce41a5e5b2..d5d4197b7ed2 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -35,6 +35,9 @@ config BLK_CGROUP_RWSTAT
config BLK_DEV_BSG_COMMON
tristate
+config BLK_ICQ
+ bool
+
config BLK_DEV_BSGLIB
bool "Block layer SG support v4 helper lib"
select BLK_DEV_BSG_COMMON
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 885fee86dfca..615516146086 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -18,6 +18,7 @@ config MQ_IOSCHED_KYBER
config IOSCHED_BFQ
tristate "BFQ I/O scheduler"
+ select BLK_ICQ
help
BFQ I/O scheduler for BLK-MQ. BFQ distributes the bandwidth of
the device among all processes according to their weights,
diff --git a/block/Makefile b/block/Makefile
index 44df57e562bf..f38eaa612929 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
- blk-exec.o blk-merge.o blk-timeout.o \
+ blk-merge.o blk-timeout.o \
blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
diff --git a/block/bdev.c b/block/bdev.c
index b4dab2fb6a74..8bf93a19041b 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -665,7 +665,7 @@ static void blkdev_flush_mapping(struct block_device *bdev)
static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
{
struct gendisk *disk = bdev->bd_disk;
- int ret = 0;
+ int ret;
if (disk->fops->open) {
ret = disk->fops->open(bdev, mode);
@@ -750,21 +750,11 @@ struct block_device *blkdev_get_no_open(dev_t dev)
if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
bdev = NULL;
iput(inode);
-
- if (!bdev)
- return NULL;
- if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) ||
- !try_module_get(bdev->bd_disk->fops->owner)) {
- put_device(&bdev->bd_device);
- return NULL;
- }
-
return bdev;
}
void blkdev_put_no_open(struct block_device *bdev)
{
- module_put(bdev->bd_disk->fops->owner);
put_device(&bdev->bd_device);
}
@@ -820,12 +810,14 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
ret = -ENXIO;
if (!disk_live(disk))
goto abort_claiming;
+ if (!try_module_get(disk->fops->owner))
+ goto abort_claiming;
if (bdev_is_partition(bdev))
ret = blkdev_get_part(bdev, mode);
else
ret = blkdev_get_whole(bdev, mode);
if (ret)
- goto abort_claiming;
+ goto put_module;
if (mode & FMODE_EXCL) {
bd_finish_claiming(bdev, holder);
@@ -837,7 +829,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
* used in blkdev_get/put().
*/
if ((mode & FMODE_WRITE) && !bdev->bd_write_holder &&
- (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
+ (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) {
bdev->bd_write_holder = true;
unblock_events = false;
}
@@ -847,7 +839,8 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
if (unblock_events)
disk_unblock_events(disk);
return bdev;
-
+put_module:
+ module_put(disk->fops->owner);
abort_claiming:
if (mode & FMODE_EXCL)
bd_abort_claiming(bdev, holder);
@@ -956,20 +949,21 @@ void blkdev_put(struct block_device *bdev, fmode_t mode)
blkdev_put_whole(bdev, mode);
mutex_unlock(&disk->open_mutex);
+ module_put(disk->fops->owner);
blkdev_put_no_open(bdev);
}
EXPORT_SYMBOL(blkdev_put);
/**
- * lookup_bdev - lookup a struct block_device by name
- * @pathname: special file representing the block device
- * @dev: return value of the block device's dev_t
+ * lookup_bdev() - Look up a struct block_device by name.
+ * @pathname: Name of the block device in the filesystem.
+ * @dev: Pointer to the block device's dev_t, if found.
*
* Lookup the block device's dev_t at @pathname in the current
- * namespace if possible and return it by @dev.
+ * namespace if possible and return it in @dev.
*
- * RETURNS:
- * 0 if succeeded, errno otherwise.
+ * Context: May sleep.
+ * Return: 0 if succeeded, negative errno otherwise.
*/
int lookup_bdev(const char *pathname, dev_t *dev)
{
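
As a usage note for the reworked kerneldoc, here is a minimal sketch of a caller following that contract; the helper name and error handling are invented for illustration and it assumes <linux/blkdev.h>:

/* Illustrative only: a hypothetical helper that follows the contract
 * documented above. Not part of this patch. */
static int example_resolve_bdev(const char *path)
{
	dev_t dev;
	int ret;

	ret = lookup_bdev(path, &dev);	/* may sleep; 0 or negative errno */
	if (ret)
		return ret;

	pr_info("%s is device %u:%u\n", path, MAJOR(dev), MINOR(dev));
	return 0;
}
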
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index fec18118dc30..0c612a911696 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -433,26 +433,21 @@ static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
/**
* bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
- * @bfqd: the lookup key.
- * @ioc: the io_context of the process doing I/O.
* @q: the request queue.
*/
-static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
- struct io_context *ioc,
- struct request_queue *q)
+static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q)
{
- if (ioc) {
- unsigned long flags;
- struct bfq_io_cq *icq;
+ struct bfq_io_cq *icq;
+ unsigned long flags;
- spin_lock_irqsave(&q->queue_lock, flags);
- icq = icq_to_bic(ioc_lookup_icq(ioc, q));
- spin_unlock_irqrestore(&q->queue_lock, flags);
+ if (!current->io_context)
+ return NULL;
- return icq;
- }
+ spin_lock_irqsave(&q->queue_lock, flags);
+ icq = icq_to_bic(ioc_lookup_icq(q));
+ spin_unlock_irqrestore(&q->queue_lock, flags);
- return NULL;
+ return icq;
}
/*
@@ -565,26 +560,134 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
}
}
+#define BFQ_LIMIT_INLINE_DEPTH 16
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
+{
+ struct bfq_data *bfqd = bfqq->bfqd;
+ struct bfq_entity *entity = &bfqq->entity;
+ struct bfq_entity *inline_entities[BFQ_LIMIT_INLINE_DEPTH];
+ struct bfq_entity **entities = inline_entities;
+ int depth, level;
+ int class_idx = bfqq->ioprio_class - 1;
+ struct bfq_sched_data *sched_data;
+ unsigned long wsum;
+ bool ret = false;
+
+ if (!entity->on_st_or_in_serv)
+ return false;
+
+ /* +1 for bfqq entity, root cgroup not included */
+ depth = bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css.cgroup->level + 1;
+ if (depth > BFQ_LIMIT_INLINE_DEPTH) {
+ entities = kmalloc_array(depth, sizeof(*entities), GFP_NOIO);
+ if (!entities)
+ return false;
+ }
+
+ spin_lock_irq(&bfqd->lock);
+ sched_data = entity->sched_data;
+ /* Gather our ancestors as we need to traverse them in reverse order */
+ level = 0;
+ for_each_entity(entity) {
+ /*
+ * If at some level entity is not even active, allow request
+ * queueing so that BFQ knows there's work to do and activate
+ * entities.
+ */
+ if (!entity->on_st_or_in_serv)
+ goto out;
+ /* Uh, more parents than cgroup subsystem thinks? */
+ if (WARN_ON_ONCE(level >= depth))
+ break;
+ entities[level++] = entity;
+ }
+ WARN_ON_ONCE(level != depth);
+ for (level--; level >= 0; level--) {
+ entity = entities[level];
+ if (level > 0) {
+ wsum = bfq_entity_service_tree(entity)->wsum;
+ } else {
+ int i;
+ /*
+ * For bfqq itself we take into account service trees
+ * of all higher priority classes and multiply their
+ * weights so that low prio queue from higher class
+ * gets more requests than high prio queue from lower
+ * class.
+ */
+ wsum = 0;
+ for (i = 0; i <= class_idx; i++) {
+ wsum = wsum * IOPRIO_BE_NR +
+ sched_data->service_tree[i].wsum;
+ }
+ }
+ limit = DIV_ROUND_CLOSEST(limit * entity->weight, wsum);
+ if (entity->allocated >= limit) {
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "too many requests: allocated %d limit %d level %d",
+ entity->allocated, limit, level);
+ ret = true;
+ break;
+ }
+ }
+out:
+ spin_unlock_irq(&bfqd->lock);
+ if (entities != inline_entities)
+ kfree(entities);
+ return ret;
+}
+#else
+static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
+{
+ return false;
+}
+#endif
+
/*
* Async I/O can easily starve sync I/O (both sync reads and sync
* writes), by consuming all tags. Similarly, storms of sync writes,
* such as those that sync(2) may trigger, can starve sync reads.
* Limit depths of async I/O and sync writes so as to counter both
* problems.
+ *
+ * Also if a bfq queue or its parent cgroup consume more tags than would be
+ * appropriate for their weight, we trim the available tag depth to 1. This
+ * avoids a situation where one cgroup can starve another cgroup from tags and
+ * thus block service differentiation among cgroups. Note that because the
+ * queue / cgroup already has many requests allocated and queued, this does not
+ * significantly affect service guarantees coming from the BFQ scheduling
+ * algorithm.
*/
static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
{
struct bfq_data *bfqd = data->q->elevator->elevator_data;
+ struct bfq_io_cq *bic = bfq_bic_lookup(data->q);
+ struct bfq_queue *bfqq = bic ? bic_to_bfqq(bic, op_is_sync(op)) : NULL;
+ int depth;
+ unsigned limit = data->q->nr_requests;
+
+ /* Sync reads have full depth available */
+ if (op_is_sync(op) && !op_is_write(op)) {
+ depth = 0;
+ } else {
+ depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
+ limit = (limit * depth) >> bfqd->full_depth_shift;
+ }
- if (op_is_sync(op) && !op_is_write(op))
- return;
-
- data->shallow_depth =
- bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
+ /*
+ * Does queue (or any parent entity) exceed number of requests that
+ * should be available to it? Heavily limit depth so that it cannot
+ * consume more available requests and thus starve other entities.
+ */
+ if (bfqq && bfqq_request_over_limit(bfqq, limit))
+ depth = 1;
bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
- __func__, bfqd->wr_busy_queues, op_is_sync(op),
- data->shallow_depth);
+ __func__, bfqd->wr_busy_queues, op_is_sync(op), depth);
+ if (depth)
+ data->shallow_depth = depth;
}
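
The per-level trimming above is plain integer arithmetic. A standalone sketch with made-up weights (none of the numbers come from the patch) shows how a queue's share of nr_requests shrinks as the scaling is applied at each level; once entity->allocated reaches the final value, bfq_limit_depth() drops the shallow depth to 1:

/* Standalone illustration of the per-level scaling; weights, wsum and
 * nr_requests are made up, only the arithmetic mirrors the patch. */
#include <stdio.h>

#define DIV_ROUND_CLOSEST(x, d)	(((x) + (d) / 2) / (d))

int main(void)
{
	unsigned int limit = 256;			/* q->nr_requests */
	unsigned int weight[] = { 100, 300 };		/* parent cgroup, then bfqq */
	unsigned int wsum[]   = { 500, 600 };		/* total weight at each level */
	int level;

	for (level = 0; level < 2; level++) {
		limit = DIV_ROUND_CLOSEST(limit * weight[level], wsum[level]);
		printf("level %d: limit %u\n", level, limit);	/* 51, then 26 */
	}
	return 0;
}
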
static struct bfq_queue *
@@ -1113,7 +1216,8 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
static int bfqq_process_refs(struct bfq_queue *bfqq)
{
- return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv -
+ return bfqq->ref - bfqq->entity.allocated -
+ bfqq->entity.on_st_or_in_serv -
(bfqq->weight_counter != NULL) - bfqq->stable_ref;
}
@@ -1982,20 +2086,19 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns)
* aspect, see the comments on the choice of the queue for injection
* in bfq_select_queue().
*
- * Turning back to the detection of a waker queue, a queue Q is deemed
- * as a waker queue for bfqq if, for three consecutive times, bfqq
- * happens to become non empty right after a request of Q has been
- * completed. In this respect, even if bfqq is empty, we do not check
- * for a waker if it still has some in-flight I/O. In fact, in this
- * case bfqq is actually still being served by the drive, and may
- * receive new I/O on the completion of some of the in-flight
- * requests. In particular, on the first time, Q is tentatively set as
- * a candidate waker queue, while on the third consecutive time that Q
- * is detected, the field waker_bfqq is set to Q, to confirm that Q is
- * a waker queue for bfqq. These detection steps are performed only if
- * bfqq has a long think time, so as to make it more likely that
- * bfqq's I/O is actually being blocked by a synchronization. This
- * last filter, plus the above three-times requirement, make false
+ * Turning back to the detection of a waker queue, a queue Q is deemed as a
+ * waker queue for bfqq if, for three consecutive times, bfqq happens to become
+ * non empty right after a request of Q has been completed within given
+ * timeout. In this respect, even if bfqq is empty, we do not check for a waker
+ * if it still has some in-flight I/O. In fact, in this case bfqq is actually
+ * still being served by the drive, and may receive new I/O on the completion
+ * of some of the in-flight requests. In particular, on the first time, Q is
+ * tentatively set as a candidate waker queue, while on the third consecutive
+ * time that Q is detected, the field waker_bfqq is set to Q, to confirm that Q
+ * is a waker queue for bfqq. These detection steps are performed only if bfqq
+ * has a long think time, so as to make it more likely that bfqq's I/O is
+ * actually being blocked by a synchronization. This last filter, plus the
+ * above three-times requirement and time limit for detection, make false
* positives less likely.
*
* NOTE
@@ -2019,6 +2122,8 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns)
static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
u64 now_ns)
{
+ char waker_name[MAX_BFQQ_NAME_LENGTH];
+
if (!bfqd->last_completed_rq_bfqq ||
bfqd->last_completed_rq_bfqq == bfqq ||
bfq_bfqq_has_short_ttime(bfqq) ||
@@ -2027,8 +2132,16 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq)
return;
+ /*
+ * We reset waker detection logic also if too much time has passed
+ * since the first detection. If wakeups are rare, pointless idling
+ * doesn't hurt throughput that much. The condition below makes sure
+ * we do not uselessly idle blocking waker in more than 1/64 cases.
+ */
if (bfqd->last_completed_rq_bfqq !=
- bfqq->tentative_waker_bfqq) {
+ bfqq->tentative_waker_bfqq ||
+ now_ns > bfqq->waker_detection_started +
+ 128 * (u64)bfqd->bfq_slice_idle) {
/*
* First synchronization detected with a
* candidate waker queue, or with a different
@@ -2037,12 +2150,19 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqq->tentative_waker_bfqq =
bfqd->last_completed_rq_bfqq;
bfqq->num_waker_detections = 1;
+ bfqq->waker_detection_started = now_ns;
+ bfq_bfqq_name(bfqq->tentative_waker_bfqq, waker_name,
+ MAX_BFQQ_NAME_LENGTH);
+ bfq_log_bfqq(bfqd, bfqq, "set tentative waker %s", waker_name);
} else /* Same tentative waker queue detected again */
bfqq->num_waker_detections++;
if (bfqq->num_waker_detections == 3) {
bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
bfqq->tentative_waker_bfqq = NULL;
+ bfq_bfqq_name(bfqq->waker_bfqq, waker_name,
+ MAX_BFQQ_NAME_LENGTH);
+ bfq_log_bfqq(bfqd, bfqq, "set waker %s", waker_name);
/*
* If the waker queue disappears, then
@@ -2332,7 +2452,7 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
* returned by bfq_bic_lookup does not go away before
* bfqd->lock is taken.
*/
- struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q);
+ struct bfq_io_cq *bic = bfq_bic_lookup(q);
bool ret;
spin_lock_irq(&bfqd->lock);
@@ -5878,6 +5998,22 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
}
}
+static void bfqq_request_allocated(struct bfq_queue *bfqq)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ for_each_entity(entity)
+ entity->allocated++;
+}
+
+static void bfqq_request_freed(struct bfq_queue *bfqq)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ for_each_entity(entity)
+ entity->allocated--;
+}
+
/* returns true if it causes the idle timer to be disabled */
static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
{
@@ -5891,8 +6027,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
* Release the request's reference to the old bfqq
* and make sure one is taken to the shared queue.
*/
- new_bfqq->allocated++;
- bfqq->allocated--;
+ bfqq_request_allocated(new_bfqq);
+ bfqq_request_freed(bfqq);
new_bfqq->ref++;
/*
* If the bic associated with the process
@@ -5991,48 +6127,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
spin_lock_irq(&bfqd->lock);
bfqq = bfq_init_rq(rq);
-
- /*
- * Reqs with at_head or passthrough flags set are to be put
- * directly into dispatch list. Additional case for putting rq
- * directly into the dispatch queue: the only active
- * bfq_queues are bfqq and either its waker bfq_queue or one
- * of its woken bfq_queues. The rationale behind this
- * additional condition is as follows:
- * - consider a bfq_queue, say Q1, detected as a waker of
- * another bfq_queue, say Q2
- * - by definition of a waker, Q1 blocks the I/O of Q2, i.e.,
- * some I/O of Q1 needs to be completed for new I/O of Q2
- * to arrive. A notable example of waker is journald
- * - so, Q1 and Q2 are in any respect the queues of two
- * cooperating processes (or of two cooperating sets of
- * processes): the goal of Q1's I/O is doing what needs to
- * be done so that new Q2's I/O can finally be
- * issued. Therefore, if the service of Q1's I/O is delayed,
- * then Q2's I/O is delayed too. Conversely, if Q2's I/O is
- * delayed, the goal of Q1's I/O is hindered.
- * - as a consequence, if some I/O of Q1/Q2 arrives while
- * Q2/Q1 is the only queue in service, there is absolutely
- * no point in delaying the service of such an I/O. The
- * only possible result is a throughput loss
- * - so, when the above condition holds, the best option is to
- * have the new I/O dispatched as soon as possible
- * - the most effective and efficient way to attain the above
- * goal is to put the new I/O directly in the dispatch
- * list
- * - as an additional restriction, Q1 and Q2 must be the only
- * busy queues for this commit to put the I/O of Q2/Q1 in
- * the dispatch list. This is necessary, because, if also
- * other queues are waiting for service, then putting new
- * I/O directly in the dispatch list may evidently cause a
- * violation of service guarantees for the other queues
- */
- if (!bfqq ||
- (bfqq != bfqd->in_service_queue &&
- bfqd->in_service_queue != NULL &&
- bfq_tot_busy_queues(bfqd) == 1 + bfq_bfqq_busy(bfqq) &&
- (bfqq->waker_bfqq == bfqd->in_service_queue ||
- bfqd->in_service_queue->waker_bfqq == bfqq)) || at_head) {
+ if (!bfqq || at_head) {
if (at_head)
list_add(&rq->queuelist, &bfqd->dispatch);
else
@@ -6059,7 +6154,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
* merge).
*/
cmd_flags = rq->cmd_flags;
-
spin_unlock_irq(&bfqd->lock);
bfq_update_insert_stats(q, bfqq, idle_timer_disabled,
@@ -6251,8 +6345,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq)
{
- bfqq->allocated--;
-
+ bfqq_request_freed(bfqq);
bfq_put_queue(bfqq);
}
@@ -6476,6 +6569,16 @@ static void bfq_finish_requeue_request(struct request *rq)
rq->elv.priv[1] = NULL;
}
+static void bfq_finish_request(struct request *rq)
+{
+ bfq_finish_requeue_request(rq);
+
+ if (rq->elv.icq) {
+ put_io_context(rq->elv.icq->ioc);
+ rq->elv.icq = NULL;
+ }
+}
+
/*
* Removes the association between the current task and bfqq, assuming
* that bic points to the bfq iocontext of the task.
@@ -6573,6 +6676,8 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
*/
static void bfq_prepare_request(struct request *rq)
{
+ rq->elv.icq = ioc_find_get_icq(rq->q);
+
/*
* Regardless of whether we have an icq attached, we have to
* clear the scheduler pointers, as they might point to
@@ -6672,7 +6777,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
}
}
- bfqq->allocated++;
+ bfqq_request_allocated(bfqq);
bfqq->ref++;
bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
rq, bfqq, bfqq->ref);
@@ -6835,11 +6940,11 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
* See the comments on bfq_limit_depth for the purpose of
* the depths set in the function. Return minimum shallow depth we'll use.
*/
-static unsigned int bfq_update_depths(struct bfq_data *bfqd,
- struct sbitmap_queue *bt)
+static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
{
- unsigned int i, j, min_shallow = UINT_MAX;
+ unsigned int depth = 1U << bt->sb.shift;
+ bfqd->full_depth_shift = bt->sb.shift;
/*
* In-word depths if no bfq_queue is being weight-raised:
* leaving 25% of tags only for sync reads.
@@ -6851,13 +6956,13 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd,
* limit 'something'.
*/
/* no more than 50% of tags for async I/O */
- bfqd->word_depths[0][0] = max((1U << bt->sb.shift) >> 1, 1U);
+ bfqd->word_depths[0][0] = max(depth >> 1, 1U);
/*
* no more than 75% of tags for sync writes (25% extra tags
* w.r.t. async I/O, to prevent async I/O from starving sync
* writes)
*/
- bfqd->word_depths[0][1] = max(((1U << bt->sb.shift) * 3) >> 2, 1U);
+ bfqd->word_depths[0][1] = max((depth * 3) >> 2, 1U);
/*
* In-word depths in case some bfq_queue is being weight-
@@ -6867,25 +6972,18 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd,
* shortage.
*/
/* no more than ~18% of tags for async I/O */
- bfqd->word_depths[1][0] = max(((1U << bt->sb.shift) * 3) >> 4, 1U);
+ bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U);
/* no more than ~37% of tags for sync writes (~20% extra tags) */
- bfqd->word_depths[1][1] = max(((1U << bt->sb.shift) * 6) >> 4, 1U);
-
- for (i = 0; i < 2; i++)
- for (j = 0; j < 2; j++)
- min_shallow = min(min_shallow, bfqd->word_depths[i][j]);
-
- return min_shallow;
+ bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U);
}
static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
{
struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
struct blk_mq_tags *tags = hctx->sched_tags;
- unsigned int min_shallow;
- min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags);
- sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow);
+ bfq_update_depths(bfqd, &tags->bitmap_tags);
+ sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1);
}
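
As a quick check of the percentages quoted in the comments, a standalone computation assuming a sbitmap word shift of 6 (64 tags per word; the shift value is made up for illustration):

/* Standalone check of the depth fractions above, assuming sb.shift == 6. */
#include <stdio.h>

#define max(a, b)	((a) > (b) ? (a) : (b))

int main(void)
{
	unsigned int depth = 1U << 6;
	unsigned int word_depths[2][2];
	int wr, sync;

	word_depths[0][0] = max(depth >> 1, 1U);	/* 50% for async       -> 32 */
	word_depths[0][1] = max((depth * 3) >> 2, 1U);	/* 75% for sync writes -> 48 */
	word_depths[1][0] = max((depth * 3) >> 4, 1U);	/* ~18% when wr busy   -> 12 */
	word_depths[1][1] = max((depth * 6) >> 4, 1U);	/* ~37% when wr busy   -> 24 */

	for (wr = 0; wr < 2; wr++)
		for (sync = 0; sync < 2; sync++)
			printf("word_depths[%d][%d] = %u\n", wr, sync,
			       word_depths[wr][sync]);
	return 0;
}
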
static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
@@ -7300,7 +7398,7 @@ static struct elevator_type iosched_bfq_mq = {
.limit_depth = bfq_limit_depth,
.prepare_request = bfq_prepare_request,
.requeue_request = bfq_finish_requeue_request,
- .finish_request = bfq_finish_requeue_request,
+ .finish_request = bfq_finish_request,
.exit_icq = bfq_exit_icq,
.insert_requests = bfq_insert_requests,
.dispatch_request = bfq_dispatch_request,
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index a73488eec8a4..07288b9da389 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -25,7 +25,7 @@
#define BFQ_DEFAULT_GRP_IOPRIO 0
#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
-#define MAX_PID_STR_LENGTH 12
+#define MAX_BFQQ_NAME_LENGTH 16
/*
* Soft real-time applications are extremely more latency sensitive
@@ -170,6 +170,9 @@ struct bfq_entity {
/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
int budget;
+ /* Number of requests allocated in the subtree of this entity */
+ int allocated;
+
/* device weight, if non-zero, it overrides the default weight of
* bfq_group_data */
int dev_weight;
@@ -266,8 +269,6 @@ struct bfq_queue {
struct request *next_rq;
/* number of sync and async requests queued */
int queued[2];
- /* number of requests currently allocated */
- int allocated;
/* number of pending metadata requests */
int meta_pending;
/* fifo list of requests in sort_list */
@@ -387,6 +388,8 @@ struct bfq_queue {
struct bfq_queue *tentative_waker_bfqq;
/* number of times the same tentative waker has been detected */
unsigned int num_waker_detections;
+ /* time when we started considering this waker */
+ u64 waker_detection_started;
/* node for woken_list, see below */
struct hlist_node woken_list_node;
@@ -768,6 +771,7 @@ struct bfq_data {
* function)
*/
unsigned int word_depths[2][2];
+ unsigned int full_depth_shift;
};
enum bfqq_state_flags {
@@ -1079,26 +1083,27 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq);
/* --------------- end of interface of B-WF2Q+ ---------------- */
/* Logging facilities. */
-static inline void bfq_pid_to_str(int pid, char *str, int len)
+static inline void bfq_bfqq_name(struct bfq_queue *bfqq, char *str, int len)
{
- if (pid != -1)
- snprintf(str, len, "%d", pid);
+ char type = bfq_bfqq_sync(bfqq) ? 'S' : 'A';
+
+ if (bfqq->pid != -1)
+ snprintf(str, len, "bfq%d%c", bfqq->pid, type);
else
- snprintf(str, len, "SHARED-");
+ snprintf(str, len, "bfqSHARED-%c", type);
}
#ifdef CONFIG_BFQ_GROUP_IOSCHED
struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
- char pid_str[MAX_PID_STR_LENGTH]; \
+ char pid_str[MAX_BFQQ_NAME_LENGTH]; \
if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \
break; \
- bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \
+ bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \
blk_add_cgroup_trace_msg((bfqd)->queue, \
bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \
- "bfq%s%c " fmt, pid_str, \
- bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args); \
+ "%s " fmt, pid_str, ##args); \
} while (0)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
@@ -1109,13 +1114,11 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
#else /* CONFIG_BFQ_GROUP_IOSCHED */
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
- char pid_str[MAX_PID_STR_LENGTH]; \
+ char pid_str[MAX_BFQQ_NAME_LENGTH]; \
if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \
break; \
- bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \
- blk_add_trace_msg((bfqd)->queue, "bfq%s%c " fmt, pid_str, \
- bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
- ##args); \
+ bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \
+ blk_add_trace_msg((bfqd)->queue, "%s " fmt, pid_str, ##args); \
} while (0)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
diff --git a/block/bio.c b/block/bio.c
index 15ab0d6d1c06..0d400ba2dbd1 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -26,7 +26,7 @@
#include "blk-rq-qos.h"
struct bio_alloc_cache {
- struct bio_list free_list;
+ struct bio *free_list;
unsigned int nr;
};
@@ -630,7 +630,8 @@ static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
unsigned int i = 0;
struct bio *bio;
- while ((bio = bio_list_pop(&cache->free_list)) != NULL) {
+ while ((bio = cache->free_list) != NULL) {
+ cache->free_list = bio->bi_next;
cache->nr--;
bio_free(bio);
if (++i == nr)
@@ -689,7 +690,8 @@ void bio_put(struct bio *bio)
bio_uninit(bio);
cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
- bio_list_add_head(&cache->free_list, bio);
+ bio->bi_next = cache->free_list;
+ cache->free_list = bio;
if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK)
bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK);
put_cpu();
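
The per-cpu cache now threads free bios through bi_next instead of keeping a struct bio_list. A userspace sketch of that single-pointer push/pop pattern follows; the type and field names are invented, only the list handling mirrors the hunks above:

/* Userspace sketch of the single-pointer free list the cache now uses. */
#include <stdio.h>

struct fake_bio {
	struct fake_bio *bi_next;
	int id;
};

struct alloc_cache {
	struct fake_bio *free_list;	/* single pointer replaces bio_list */
	unsigned int nr;
};

static void cache_put(struct alloc_cache *c, struct fake_bio *bio)
{
	bio->bi_next = c->free_list;	/* push on head, O(1), no tail pointer */
	c->free_list = bio;
	c->nr++;
}

static struct fake_bio *cache_get(struct alloc_cache *c)
{
	struct fake_bio *bio = c->free_list;

	if (bio) {			/* pop from head */
		c->free_list = bio->bi_next;
		c->nr--;
	}
	return bio;
}

int main(void)
{
	struct alloc_cache cache = { 0 };
	struct fake_bio a = { .id = 1 }, b = { .id = 2 };
	struct fake_bio *first, *second;

	cache_put(&cache, &a);
	cache_put(&cache, &b);
	first = cache_get(&cache);
	second = cache_get(&cache);
	printf("popped id %d, then %d\n", first->id, second->id);	/* LIFO: 2, 1 */
	return 0;
}
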
@@ -1033,6 +1035,28 @@ int bio_add_page(struct bio *bio, struct page *page,
}
EXPORT_SYMBOL(bio_add_page);
+/**
+ * bio_add_folio - Attempt to add part of a folio to a bio.
+ * @bio: BIO to add to.
+ * @folio: Folio to add.
+ * @len: How many bytes from the folio to add.
+ * @off: First byte in this folio to add.
+ *
+ * Filesystems that use folios can call this function instead of calling
+ * bio_add_page() for each page in the folio. If @off is bigger than
+ * PAGE_SIZE, this function can create a bio_vec that starts in a page
+ * after the bv_page. BIOs do not support folios that are 4GiB or larger.
+ *
+ * Return: Whether the addition was successful.
+ */
+bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
+ size_t off)
+{
+ if (len > UINT_MAX || off > UINT_MAX)
+ return 0;
+ return bio_add_page(bio, &folio->page, len, off) > 0;
+}
+
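
A hedged sketch of how a caller might drive the new helper; the function name, the loop and the full-bio policy are invented, only the bio_add_folio() call and its boolean return come from the hunk above. Assumes <linux/bio.h>:

/* Illustrative caller of bio_add_folio(); not part of this patch. */
static void example_add_folios(struct bio *bio, struct folio **folios,
			       unsigned int nr, size_t len)
{
	unsigned int i;

	for (i = 0; i < nr; i++) {
		if (!bio_add_folio(bio, folios[i], len, 0)) {
			/* bio full (or len/off above UINT_MAX): submit what we
			 * have; a real caller would allocate a fresh bio and
			 * retry the remaining folios. */
			submit_bio(bio);
			return;
		}
	}
}
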
void __bio_release_pages(struct bio *bio, bool mark_dirty)
{
struct bvec_iter_all iter_all;
@@ -1704,8 +1728,9 @@ struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs,
return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs);
cache = per_cpu_ptr(bs->cache, get_cpu());
- bio = bio_list_pop(&cache->free_list);
- if (bio) {
+ if (cache->free_list) {
+ bio = cache->free_list;
+ cache->free_list = bio->bi_next;
cache->nr--;
put_cpu();
bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 88b1fce90520..650f7e27989f 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -30,6 +30,7 @@
#include <linux/blk-cgroup.h>
#include <linux/tracehook.h>
#include <linux/psi.h>
+#include <linux/part_stat.h>
#include "blk.h"
#include "blk-ioprio.h"
#include "blk-throttle.h"
@@ -640,7 +641,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
*/
ret = blk_queue_enter(q, 0);
if (ret)
- return ret;
+ goto fail;
rcu_read_lock();
spin_lock_irq(&q->queue_lock);
@@ -676,13 +677,13 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
if (unlikely(!new_blkg)) {
ret = -ENOMEM;
- goto fail;
+ goto fail_exit_queue;
}
if (radix_tree_preload(GFP_KERNEL)) {
blkg_free(new_blkg);
ret = -ENOMEM;
- goto fail;
+ goto fail_exit_queue;
}
rcu_read_lock();
@@ -722,9 +723,10 @@ fail_preloaded:
fail_unlock:
spin_unlock_irq(&q->queue_lock);
rcu_read_unlock();
+fail_exit_queue:
+ blk_queue_exit(q);
fail:
blkdev_put_no_open(bdev);
- blk_queue_exit(q);
/*
* If queue was bypassing, we should retry. Do so after a
* short msleep(). It isn't strictly necessary but queue
diff --git a/block/blk-core.c b/block/blk-core.c
index 9ee32f85d74e..97f8bc8d3a79 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -16,7 +16,6 @@
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
-#include <linux/blk-mq.h>
#include <linux/blk-pm.h>
#include <linux/blk-integrity.h>
#include <linux/highmem.h>
@@ -40,6 +39,7 @@
#include <linux/debugfs.h>
#include <linux/bpf.h>
#include <linux/psi.h>
+#include <linux/part_stat.h>
#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h>
@@ -47,7 +47,6 @@
#include <trace/events/block.h>
#include "blk.h"
-#include "blk-mq.h"
#include "blk-mq-sched.h"
#include "blk-pm.h"
#include "blk-throttle.h"
@@ -67,6 +66,7 @@ DEFINE_IDA(blk_queue_ida);
* For queue allocation
*/
struct kmem_cache *blk_requestq_cachep;
+struct kmem_cache *blk_requestq_srcu_cachep;
/*
* Controlling structure to kblockd
@@ -109,23 +109,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);
-void blk_rq_init(struct request_queue *q, struct request *rq)
-{
- memset(rq, 0, sizeof(*rq));
-
- INIT_LIST_HEAD(&rq->queuelist);
- rq->q = q;
- rq->__sector = (sector_t) -1;
- INIT_HLIST_NODE(&rq->hash);
- RB_CLEAR_NODE(&rq->rb_node);
- rq->tag = BLK_MQ_NO_TAG;
- rq->internal_tag = BLK_MQ_NO_TAG;
- rq->start_time_ns = ktime_get_ns();
- rq->part = NULL;
- blk_crypto_rq_set_defaults(rq);
-}
-EXPORT_SYMBOL(blk_rq_init);
-
#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
static const char *const blk_op_name[] = {
REQ_OP_NAME(READ),
@@ -216,38 +199,15 @@ int blk_status_to_errno(blk_status_t status)
}
EXPORT_SYMBOL_GPL(blk_status_to_errno);
-void blk_print_req_error(struct request *req, blk_status_t status)
+const char *blk_status_to_str(blk_status_t status)
{
int idx = (__force int)status;
if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
- return;
-
- printk_ratelimited(KERN_ERR
- "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
- "phys_seg %u prio class %u\n",
- blk_errors[idx].name,
- req->rq_disk ? req->rq_disk->disk_name : "?",
- blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
- req->cmd_flags & ~REQ_OP_MASK,
- req->nr_phys_segments,
- IOPRIO_PRIO_CLASS(req->ioprio));
+ return "<null>";
+ return blk_errors[idx].name;
}
-void blk_dump_rq_flags(struct request *rq, char *msg)
-{
- printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
- rq->rq_disk ? rq->rq_disk->disk_name : "?",
- (unsigned long long) rq->cmd_flags);
-
- printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
- (unsigned long long)blk_rq_pos(rq),
- blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
- printk(KERN_INFO " bio %p, biotail %p, len %u\n",
- rq->bio, rq->biotail, blk_rq_bytes(rq));
-}
-EXPORT_SYMBOL(blk_dump_rq_flags);
-
/**
* blk_sync_queue - cancel any pending callbacks on a queue
* @q: the queue
@@ -363,8 +323,10 @@ void blk_cleanup_queue(struct request_queue *q)
blk_queue_flag_set(QUEUE_FLAG_DEAD, q);
blk_sync_queue(q);
- if (queue_is_mq(q))
+ if (queue_is_mq(q)) {
+ blk_mq_cancel_work_sync(q);
blk_mq_exit_queue(q);
+ }
/*
* In theory, request pool of sched_tags belongs to request queue.
@@ -476,21 +438,27 @@ static void blk_timeout_work(struct work_struct *work)
{
}
-struct request_queue *blk_alloc_queue(int node_id)
+struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
{
struct request_queue *q;
int ret;
- q = kmem_cache_alloc_node(blk_requestq_cachep,
- GFP_KERNEL | __GFP_ZERO, node_id);
+ q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
+ GFP_KERNEL | __GFP_ZERO, node_id);
if (!q)
return NULL;
+ if (alloc_srcu) {
+ blk_queue_flag_set(QUEUE_FLAG_HAS_SRCU, q);
+ if (init_srcu_struct(q->srcu) != 0)
+ goto fail_q;
+ }
+
q->last_merge = NULL;
q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
if (q->id < 0)
- goto fail_q;
+ goto fail_srcu;
ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0);
if (ret)
@@ -547,8 +515,11 @@ fail_split:
bioset_exit(&q->bio_split);
fail_id:
ida_simple_remove(&blk_queue_ida, q->id);
+fail_srcu:
+ if (alloc_srcu)
+ cleanup_srcu_struct(q->srcu);
fail_q:
- kmem_cache_free(blk_requestq_cachep, q);
+ kmem_cache_free(blk_get_queue_kmem_cache(alloc_srcu), q);
return NULL;
}
@@ -592,7 +563,7 @@ static int __init setup_fail_make_request(char *str)
}
__setup("fail_make_request=", setup_fail_make_request);
-static bool should_fail_request(struct block_device *part, unsigned int bytes)
+bool should_fail_request(struct block_device *part, unsigned int bytes)
{
return part->bd_make_it_fail && should_fail(&fail_make_request, bytes);
}
@@ -606,15 +577,6 @@ static int __init fail_make_request_debugfs(void)
}
late_initcall(fail_make_request_debugfs);
-
-#else /* CONFIG_FAIL_MAKE_REQUEST */
-
-static inline bool should_fail_request(struct block_device *part,
- unsigned int bytes)
-{
- return false;
-}
-
#endif /* CONFIG_FAIL_MAKE_REQUEST */
static inline bool bio_check_ro(struct bio *bio)
@@ -800,15 +762,6 @@ noinline_for_stack bool submit_bio_checks(struct bio *bio)
break;
}
- /*
- * Various block parts want %current->io_context, so allocate it up
- * front rather than dealing with lots of pain to allocate it only
- * where needed. This may fail and the block layer knows how to live
- * with it.
- */
- if (unlikely(!current->io_context))
- create_task_io_context(current, GFP_ATOMIC, q->node);
-
if (blk_throtl_bio(bio))
return false;
@@ -834,17 +787,21 @@ end_io:
static void __submit_bio_fops(struct gendisk *disk, struct bio *bio)
{
- if (unlikely(bio_queue_enter(bio) != 0))
- return;
- if (submit_bio_checks(bio) && blk_crypto_bio_prep(&bio))
- disk->fops->submit_bio(bio);
- blk_queue_exit(disk->queue);
+ if (blk_crypto_bio_prep(&bio)) {
+ if (likely(bio_queue_enter(bio) == 0)) {
+ disk->fops->submit_bio(bio);
+ blk_queue_exit(disk->queue);
+ }
+ }
}
static void __submit_bio(struct bio *bio)
{
struct gendisk *disk = bio->bi_bdev->bd_disk;
+ if (unlikely(!submit_bio_checks(bio)))
+ return;
+
if (!disk->fops->submit_bio)
blk_mq_submit_bio(bio);
else
@@ -1015,6 +972,7 @@ EXPORT_SYMBOL(submit_bio);
/**
* bio_poll - poll for BIO completions
* @bio: bio to poll for
+ * @iob: batches of IO
* @flags: BLK_POLL_* flags that control the behavior
*
* Poll for completions on queue associated with the bio. Returns number of
@@ -1087,135 +1045,7 @@ int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
}
EXPORT_SYMBOL_GPL(iocb_bio_iopoll);
-/**
- * blk_cloned_rq_check_limits - Helper function to check a cloned request
- * for the new queue limits
- * @q: the queue
- * @rq: the request being checked
- *
- * Description:
- * @rq may have been made based on weaker limitations of upper-level queues
- * in request stacking drivers, and it may violate the limitation of @q.
- * Since the block layer and the underlying device driver trust @rq
- * after it is inserted to @q, it should be checked against @q before
- * the insertion using this generic function.
- *
- * Request stacking drivers like request-based dm may change the queue
- * limits when retrying requests on other queues. Those requests need
- * to be checked against the new queue limits again during dispatch.
- */
-static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q,
- struct request *rq)
-{
- unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
-
- if (blk_rq_sectors(rq) > max_sectors) {
- /*
- * SCSI device does not have a good way to return if
- * Write Same/Zero is actually supported. If a device rejects
- * a non-read/write command (discard, write same,etc.) the
- * low-level device driver will set the relevant queue limit to
- * 0 to prevent blk-lib from issuing more of the offending
- * operations. Commands queued prior to the queue limit being
- * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O
- * errors being propagated to upper layers.
- */
- if (max_sectors == 0)
- return BLK_STS_NOTSUPP;
-
- printk(KERN_ERR "%s: over max size limit. (%u > %u)\n",
- __func__, blk_rq_sectors(rq), max_sectors);
- return BLK_STS_IOERR;
- }
-
- /*
- * The queue settings related to segment counting may differ from the
- * original queue.
- */
- rq->nr_phys_segments = blk_recalc_rq_segments(rq);
- if (rq->nr_phys_segments > queue_max_segments(q)) {
- printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n",
- __func__, rq->nr_phys_segments, queue_max_segments(q));
- return BLK_STS_IOERR;
- }
-
- return BLK_STS_OK;
-}
-
-/**
- * blk_insert_cloned_request - Helper for stacking drivers to submit a request
- * @q: the queue to submit the request
- * @rq: the request being queued
- */
-blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
-{
- blk_status_t ret;
-
- ret = blk_cloned_rq_check_limits(q, rq);
- if (ret != BLK_STS_OK)
- return ret;
-
- if (rq->rq_disk &&
- should_fail_request(rq->rq_disk->part0, blk_rq_bytes(rq)))
- return BLK_STS_IOERR;
-
- if (blk_crypto_insert_cloned_request(rq))
- return BLK_STS_IOERR;
-
- blk_account_io_start(rq);
-
- /*
- * Since we have a scheduler attached on the top device,
- * bypass a potential scheduler on the bottom device for
- * insert.
- */
- return blk_mq_request_issue_directly(rq, true);
-}
-EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
-
-/**
- * blk_rq_err_bytes - determine number of bytes till the next failure boundary
- * @rq: request to examine
- *
- * Description:
- * A request could be merge of IOs which require different failure
- * handling. This function determines the number of bytes which
- * can be failed from the beginning of the request without
- * crossing into area which need to be retried further.
- *
- * Return:
- * The number of bytes to fail.
- */
-unsigned int blk_rq_err_bytes(const struct request *rq)
-{
- unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
- unsigned int bytes = 0;
- struct bio *bio;
-
- if (!(rq->rq_flags & RQF_MIXED_MERGE))
- return blk_rq_bytes(rq);
-
- /*
- * Currently the only 'mixing' which can happen is between
- * different fastfail types. We can safely fail portions
- * which have all the failfast bits that the first one has -
- * the ones which are at least as eager to fail as the first
- * one.
- */
- for (bio = rq->bio; bio; bio = bio->bi_next) {
- if ((bio->bi_opf & ff) != ff)
- break;
- bytes += bio->bi_iter.bi_size;
- }
-
- /* this could lead to infinite loop */
- BUG_ON(blk_rq_bytes(rq) && !bytes);
- return bytes;
-}
-EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
-
-static void update_io_ticks(struct block_device *part, unsigned long now,
- bool end)
+void update_io_ticks(struct block_device *part, unsigned long now, bool end)
{
unsigned long stamp;
again:
@@ -1230,30 +1060,6 @@ again:
}
}
-void __blk_account_io_done(struct request *req, u64 now)
-{
- const int sgrp = op_stat_group(req_op(req));
-
- part_stat_lock();
- update_io_ticks(req->part, jiffies, true);
- part_stat_inc(req->part, ios[sgrp]);
- part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
- part_stat_unlock();
-}
-
-void __blk_account_io_start(struct request *rq)
-{
- /* passthrough requests can hold bios that do not have ->bi_bdev set */
- if (rq->bio && rq->bio->bi_bdev)
- rq->part = rq->bio->bi_bdev;
- else
- rq->part = rq->rq_disk->part0;
-
- part_stat_lock();
- update_io_ticks(rq->part, jiffies, false);
- part_stat_unlock();
-}
-
static unsigned long __part_start_io_acct(struct block_device *part,
unsigned int sectors, unsigned int op)
{
@@ -1317,46 +1123,6 @@ void disk_end_io_acct(struct gendisk *disk, unsigned int op,
}
EXPORT_SYMBOL(disk_end_io_acct);
-/*
- * Steal bios from a request and add them to a bio list.
- * The request must not have been partially completed before.
- */
-void blk_steal_bios(struct bio_list *list, struct request *rq)
-{
- if (rq->bio) {
- if (list->tail)
- list->tail->bi_next = rq->bio;
- else
- list->head = rq->bio;
- list->tail = rq->biotail;
-
- rq->bio = NULL;
- rq->biotail = NULL;
- }
-
- rq->__data_len = 0;
-}
-EXPORT_SYMBOL_GPL(blk_steal_bios);
-
-#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
-/**
- * rq_flush_dcache_pages - Helper function to flush all pages in a request
- * @rq: the request to be flushed
- *
- * Description:
- * Flush all pages in @rq.
- */
-void rq_flush_dcache_pages(struct request *rq)
-{
- struct req_iterator iter;
- struct bio_vec bvec;
-
- rq_for_each_segment(bvec, rq, iter)
- flush_dcache_page(bvec.bv_page);
-}
-EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
-#endif
-
/**
* blk_lld_busy - Check if underlying low-level drivers of a device are busy
* @q : the queue of the device being checked
@@ -1385,93 +1151,6 @@ int blk_lld_busy(struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_lld_busy);
-/**
- * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
- * @rq: the clone request to be cleaned up
- *
- * Description:
- * Free all bios in @rq for a cloned request.
- */
-void blk_rq_unprep_clone(struct request *rq)
-{
- struct bio *bio;
-
- while ((bio = rq->bio) != NULL) {
- rq->bio = bio->bi_next;
-
- bio_put(bio);
- }
-}
-EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
-
-/**
- * blk_rq_prep_clone - Helper function to setup clone request
- * @rq: the request to be setup
- * @rq_src: original request to be cloned
- * @bs: bio_set that bios for clone are allocated from
- * @gfp_mask: memory allocation mask for bio
- * @bio_ctr: setup function to be called for each clone bio.
- * Returns %0 for success, non %0 for failure.
- * @data: private data to be passed to @bio_ctr
- *
- * Description:
- * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
- * Also, pages which the original bios are pointing to are not copied
- * and the cloned bios just point same pages.
- * So cloned bios must be completed before original bios, which means
- * the caller must complete @rq before @rq_src.
- */
-int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
- struct bio_set *bs, gfp_t gfp_mask,
- int (*bio_ctr)(struct bio *, struct bio *, void *),
- void *data)
-{
- struct bio *bio, *bio_src;
-
- if (!bs)
- bs = &fs_bio_set;
-
- __rq_for_each_bio(bio_src, rq_src) {
- bio = bio_clone_fast(bio_src, gfp_mask, bs);
- if (!bio)
- goto free_and_out;
-
- if (bio_ctr && bio_ctr(bio, bio_src, data))
- goto free_and_out;
-
- if (rq->bio) {
- rq->biotail->bi_next = bio;
- rq->biotail = bio;
- } else {
- rq->bio = rq->biotail = bio;
- }
- bio = NULL;
- }
-
- /* Copy attributes of the original request to the clone request. */
- rq->__sector = blk_rq_pos(rq_src);
- rq->__data_len = blk_rq_bytes(rq_src);
- if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) {
- rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
- rq->special_vec = rq_src->special_vec;
- }
- rq->nr_phys_segments = rq_src->nr_phys_segments;
- rq->ioprio = rq_src->ioprio;
-
- if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
- goto free_and_out;
-
- return 0;
-
-free_and_out:
- if (bio)
- bio_put(bio);
- blk_rq_unprep_clone(rq);
-
- return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
-
int kblockd_schedule_work(struct work_struct *work)
{
return queue_work(kblockd_workqueue, work);
@@ -1636,6 +1315,9 @@ int __init blk_dev_init(void)
sizeof_field(struct request, cmd_flags));
BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
sizeof_field(struct bio, bi_opf));
+ BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu),
+ __alignof__(struct request_queue)) !=
+ sizeof(struct request_queue));
/* used for unplugging and affects IO latency/throughput - HIGHPRI */
kblockd_workqueue = alloc_workqueue("kblockd",
@@ -1646,6 +1328,10 @@ int __init blk_dev_init(void)
blk_requestq_cachep = kmem_cache_create("request_queue",
sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
+ blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu",
+ sizeof(struct request_queue) +
+ sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL);
+
blk_debugfs_root = debugfs_create_dir("block", NULL);
return 0;
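
The new request_queue_srcu cache together with the BUILD_BUG_ON added above relies on srcu being the very last member of struct request_queue, so the SRCU state can be appended only for queues that need it. A standalone sketch of that optional-trailing-member trick (struct and field names invented; the kernel check additionally rounds offsetof() up to the struct's alignment before comparing with sizeof()):

/* Standalone sketch of the optional-trailing-member allocation trick. */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct srcu_state { long data[4]; };	/* stand-in for struct srcu_struct */

struct queue {
	int id;
	unsigned long flags;
	struct srcu_state srcu[];	/* must remain the last member */
};

static_assert(offsetof(struct queue, srcu) == sizeof(struct queue),
	      "srcu must be the trailing member");

static struct queue *alloc_queue(int want_srcu)
{
	/* Pay for the SRCU state only when the caller asks for it. */
	return calloc(1, sizeof(struct queue) +
			 (want_srcu ? sizeof(struct srcu_state) : 0));
}

int main(void)
{
	struct queue *q = alloc_queue(1);

	printf("base %zu bytes, with srcu tail %zu bytes\n",
	       sizeof(struct queue),
	       sizeof(struct queue) + sizeof(struct srcu_state));
	free(q);
	return 0;
}
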
diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c
index 605ba0626a5c..96c511967386 100644
--- a/block/blk-crypto-profile.c
+++ b/block/blk-crypto-profile.c
@@ -463,11 +463,6 @@ bool blk_crypto_register(struct blk_crypto_profile *profile,
}
EXPORT_SYMBOL_GPL(blk_crypto_register);
-void blk_crypto_unregister(struct request_queue *q)
-{
- q->crypto_profile = NULL;
-}
-
/**
* blk_crypto_intersect_capabilities() - restrict supported crypto capabilities
* by child device
diff --git a/block/blk-exec.c b/block/blk-exec.c
deleted file mode 100644
index 1b8b47f6e79b..000000000000
--- a/block/blk-exec.c
+++ /dev/null
@@ -1,116 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Functions related to setting various queue properties from drivers
- */
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/blk-mq.h>
-#include <linux/sched/sysctl.h>
-
-#include "blk.h"
-#include "blk-mq-sched.h"
-
-/**
- * blk_end_sync_rq - executes a completion event on a request
- * @rq: request to complete
- * @error: end I/O status of the request
- */
-static void blk_end_sync_rq(struct request *rq, blk_status_t error)
-{
- struct completion *waiting = rq->end_io_data;
-
- rq->end_io_data = (void *)(uintptr_t)error;
-
- /*
- * complete last, if this is a stack request the process (and thus
- * the rq pointer) could be invalid right after this complete()
- */
- complete(waiting);
-}
-
-/**
- * blk_execute_rq_nowait - insert a request to I/O scheduler for execution
- * @bd_disk: matching gendisk
- * @rq: request to insert
- * @at_head: insert request at head or tail of queue
- * @done: I/O completion handler
- *
- * Description:
- * Insert a fully prepared request at the back of the I/O scheduler queue
- * for execution. Don't wait for completion.
- *
- * Note:
- * This function will invoke @done directly if the queue is dead.
- */
-void blk_execute_rq_nowait(struct gendisk *bd_disk, struct request *rq,
- int at_head, rq_end_io_fn *done)
-{
- WARN_ON(irqs_disabled());
- WARN_ON(!blk_rq_is_passthrough(rq));
-
- rq->rq_disk = bd_disk;
- rq->end_io = done;
-
- blk_account_io_start(rq);
-
- /*
- * don't check dying flag for MQ because the request won't
- * be reused after dying flag is set
- */
- blk_mq_sched_insert_request(rq, at_head, true, false);
-}
-EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
-
-static bool blk_rq_is_poll(struct request *rq)
-{
- if (!rq->mq_hctx)
- return false;
- if (rq->mq_hctx->type != HCTX_TYPE_POLL)
- return false;
- if (WARN_ON_ONCE(!rq->bio))
- return false;
- return true;
-}
-
-static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
-{
- do {
- bio_poll(rq->bio, NULL, 0);
- cond_resched();
- } while (!completion_done(wait));
-}
-
-/**
- * blk_execute_rq - insert a request into queue for execution
- * @bd_disk: matching gendisk
- * @rq: request to insert
- * @at_head: insert request at head or tail of queue
- *
- * Description:
- * Insert a fully prepared request at the back of the I/O scheduler queue
- * for execution and wait for completion.
- * Return: The blk_status_t result provided to blk_mq_end_request().
- */
-blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq, int at_head)
-{
- DECLARE_COMPLETION_ONSTACK(wait);
- unsigned long hang_check;
-
- rq->end_io_data = &wait;
- blk_execute_rq_nowait(bd_disk, rq, at_head, blk_end_sync_rq);
-
- /* Prevent hang_check timer from firing at us during very long I/O */
- hang_check = sysctl_hung_task_timeout_secs;
-
- if (blk_rq_is_poll(rq))
- blk_rq_poll_completion(rq, &wait);
- else if (hang_check)
- while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2)));
- else
- wait_for_completion_io(&wait);
-
- return (blk_status_t)(uintptr_t)rq->end_io_data;
-}
-EXPORT_SYMBOL(blk_execute_rq);
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 8e364bda5166..e4df894189ce 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -69,6 +69,7 @@
#include <linux/blkdev.h>
#include <linux/gfp.h>
#include <linux/blk-mq.h>
+#include <linux/part_stat.h>
#include "blk.h"
#include "blk-mq.h"
@@ -95,6 +96,12 @@ enum {
static void blk_kick_flush(struct request_queue *q,
struct blk_flush_queue *fq, unsigned int flags);
+static inline struct blk_flush_queue *
+blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
+{
+ return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
+}
+
static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
{
unsigned int policy = 0;
@@ -138,7 +145,7 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front)
static void blk_account_io_flush(struct request *rq)
{
- struct block_device *part = rq->rq_disk->part0;
+ struct block_device *part = rq->q->disk->part0;
part_stat_lock();
part_stat_inc(part, ios[STAT_FLUSH]);
@@ -222,7 +229,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
/* release the tag's ownership to the req cloned from */
spin_lock_irqsave(&fq->mq_flush_lock, flags);
- if (!refcount_dec_and_test(&flush_rq->ref)) {
+ if (!req_ref_put_and_test(flush_rq)) {
fq->rq_status = error;
spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
return;
@@ -235,8 +242,10 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
* avoiding use-after-free.
*/
WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE);
- if (fq->rq_status != BLK_STS_OK)
+ if (fq->rq_status != BLK_STS_OK) {
error = fq->rq_status;
+ fq->rq_status = BLK_STS_OK;
+ }
if (!q->elevator) {
flush_rq->tag = BLK_MQ_NO_TAG;
@@ -332,7 +341,6 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK);
flush_rq->rq_flags |= RQF_FLUSH_SEQ;
- flush_rq->rq_disk = first_rq->rq_disk;
flush_rq->end_io = flush_end_io;
/*
* Order WRITE ->end_io and WRITE rq->ref, and its pair is the one
@@ -341,7 +349,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
* and READ flush_rq->end_io
*/
smp_wmb();
- refcount_set(&flush_rq->ref, 1);
+ req_ref_set(flush_rq, 1);
blk_flush_queue_rq(flush_rq, false);
}
@@ -379,7 +387,7 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
* @rq is being submitted. Analyze what needs to be done and put it on the
* right queue.
*/
-bool blk_insert_flush(struct request *rq)
+void blk_insert_flush(struct request *rq)
{
struct request_queue *q = rq->q;
unsigned long fflags = q->queue_flags; /* may change, cache */
@@ -409,7 +417,7 @@ bool blk_insert_flush(struct request *rq)
*/
if (!policy) {
blk_mq_end_request(rq, 0);
- return true;
+ return;
}
BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */
@@ -420,8 +428,10 @@ bool blk_insert_flush(struct request *rq)
* for normal execution.
*/
if ((policy & REQ_FSEQ_DATA) &&
- !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH)))
- return false;
+ !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
+ blk_mq_request_bypass_insert(rq, false, true);
+ return;
+ }
/*
* @rq should go through flush machinery. Mark it part of flush
@@ -437,8 +447,6 @@ bool blk_insert_flush(struct request *rq)
spin_lock_irq(&fq->mq_flush_lock);
blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
spin_unlock_irq(&fq->mq_flush_lock);
-
- return true;
}
/**
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index d670d54e5f7a..69eed260a823 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -411,7 +411,7 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
if (disk->queue->crypto_profile) {
pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n");
- blk_crypto_unregister(disk->queue);
+ disk->queue->crypto_profile = NULL;
}
#endif
}
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 57299f860d41..11f49f78db32 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -8,22 +8,25 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
+#include <linux/security.h>
#include <linux/sched/task.h>
#include "blk.h"
+#include "blk-mq-sched.h"
/*
* For io context allocations
*/
static struct kmem_cache *iocontext_cachep;
+#ifdef CONFIG_BLK_ICQ
/**
* get_io_context - increment reference count to io_context
* @ioc: io_context to get
*
* Increment reference count to @ioc.
*/
-void get_io_context(struct io_context *ioc)
+static void get_io_context(struct io_context *ioc)
{
BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
atomic_long_inc(&ioc->refcount);
@@ -53,6 +56,16 @@ static void ioc_exit_icq(struct io_cq *icq)
icq->flags |= ICQ_EXITED;
}
+static void ioc_exit_icqs(struct io_context *ioc)
+{
+ struct io_cq *icq;
+
+ spin_lock_irq(&ioc->lock);
+ hlist_for_each_entry(icq, &ioc->icq_list, ioc_node)
+ ioc_exit_icq(icq);
+ spin_unlock_irq(&ioc->lock);
+}
+
/*
* Release an icq. Called with ioc locked for blk-mq, and with both ioc
* and queue locked for legacy.
@@ -132,68 +145,74 @@ static void ioc_release_fn(struct work_struct *work)
kmem_cache_free(iocontext_cachep, ioc);
}
-/**
- * put_io_context - put a reference of io_context
- * @ioc: io_context to put
- *
- * Decrement reference count of @ioc and release it if the count reaches
- * zero.
+/*
+ * Releasing icqs requires reverse order double locking and we may already be
+ * holding a queue_lock. Do it asynchronously from a workqueue.
*/
-void put_io_context(struct io_context *ioc)
+static bool ioc_delay_free(struct io_context *ioc)
{
unsigned long flags;
- bool free_ioc = false;
- if (ioc == NULL)
- return;
-
- BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
-
- /*
- * Releasing ioc requires reverse order double locking and we may
- * already be holding a queue_lock. Do it asynchronously from wq.
- */
- if (atomic_long_dec_and_test(&ioc->refcount)) {
- spin_lock_irqsave(&ioc->lock, flags);
- if (!hlist_empty(&ioc->icq_list))
- queue_work(system_power_efficient_wq,
- &ioc->release_work);
- else
- free_ioc = true;
+ spin_lock_irqsave(&ioc->lock, flags);
+ if (!hlist_empty(&ioc->icq_list)) {
+ queue_work(system_power_efficient_wq, &ioc->release_work);
spin_unlock_irqrestore(&ioc->lock, flags);
+ return true;
}
-
- if (free_ioc)
- kmem_cache_free(iocontext_cachep, ioc);
+ spin_unlock_irqrestore(&ioc->lock, flags);
+ return false;
}
/**
- * put_io_context_active - put active reference on ioc
- * @ioc: ioc of interest
+ * ioc_clear_queue - break any ioc association with the specified queue
+ * @q: request_queue being cleared
*
- * Undo get_io_context_active(). If active reference reaches zero after
- * put, @ioc can never issue further IOs and ioscheds are notified.
+ * Walk @q->icq_list and exit all io_cq's.
*/
-void put_io_context_active(struct io_context *ioc)
+void ioc_clear_queue(struct request_queue *q)
{
- struct io_cq *icq;
+ LIST_HEAD(icq_list);
- if (!atomic_dec_and_test(&ioc->active_ref)) {
- put_io_context(ioc);
- return;
- }
+ spin_lock_irq(&q->queue_lock);
+ list_splice_init(&q->icq_list, &icq_list);
+ spin_unlock_irq(&q->queue_lock);
- spin_lock_irq(&ioc->lock);
- hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) {
- if (icq->flags & ICQ_EXITED)
- continue;
+ rcu_read_lock();
+ while (!list_empty(&icq_list)) {
+ struct io_cq *icq =
+ list_entry(icq_list.next, struct io_cq, q_node);
- ioc_exit_icq(icq);
+ spin_lock_irq(&icq->ioc->lock);
+ if (!(icq->flags & ICQ_DESTROYED))
+ ioc_destroy_icq(icq);
+ spin_unlock_irq(&icq->ioc->lock);
}
- spin_unlock_irq(&ioc->lock);
+ rcu_read_unlock();
+}
+#else /* CONFIG_BLK_ICQ */
+static inline void ioc_exit_icqs(struct io_context *ioc)
+{
+}
+static inline bool ioc_delay_free(struct io_context *ioc)
+{
+ return false;
+}
+#endif /* CONFIG_BLK_ICQ */
- put_io_context(ioc);
+/**
+ * put_io_context - put a reference of io_context
+ * @ioc: io_context to put
+ *
+ * Decrement reference count of @ioc and release it if the count reaches
+ * zero.
+ */
+void put_io_context(struct io_context *ioc)
+{
+ BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
+ if (atomic_long_dec_and_test(&ioc->refcount) && !ioc_delay_free(ioc))
+ kmem_cache_free(iocontext_cachep, ioc);
}
+EXPORT_SYMBOL_GPL(put_io_context);
/* Called by the exiting task */
void exit_io_context(struct task_struct *task)
@@ -205,132 +224,109 @@ void exit_io_context(struct task_struct *task)
task->io_context = NULL;
task_unlock(task);
- atomic_dec(&ioc->nr_tasks);
- put_io_context_active(ioc);
-}
-
-static void __ioc_clear_queue(struct list_head *icq_list)
-{
- unsigned long flags;
-
- rcu_read_lock();
- while (!list_empty(icq_list)) {
- struct io_cq *icq = list_entry(icq_list->next,
- struct io_cq, q_node);
- struct io_context *ioc = icq->ioc;
-
- spin_lock_irqsave(&ioc->lock, flags);
- if (icq->flags & ICQ_DESTROYED) {
- spin_unlock_irqrestore(&ioc->lock, flags);
- continue;
- }
- ioc_destroy_icq(icq);
- spin_unlock_irqrestore(&ioc->lock, flags);
+ if (atomic_dec_and_test(&ioc->active_ref)) {
+ ioc_exit_icqs(ioc);
+ put_io_context(ioc);
}
- rcu_read_unlock();
}
-/**
- * ioc_clear_queue - break any ioc association with the specified queue
- * @q: request_queue being cleared
- *
- * Walk @q->icq_list and exit all io_cq's.
- */
-void ioc_clear_queue(struct request_queue *q)
-{
- LIST_HEAD(icq_list);
-
- spin_lock_irq(&q->queue_lock);
- list_splice_init(&q->icq_list, &icq_list);
- spin_unlock_irq(&q->queue_lock);
-
- __ioc_clear_queue(&icq_list);
-}
-
-int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
+static struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
{
struct io_context *ioc;
- int ret;
ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
node);
if (unlikely(!ioc))
- return -ENOMEM;
+ return NULL;
- /* initialize */
atomic_long_set(&ioc->refcount, 1);
- atomic_set(&ioc->nr_tasks, 1);
atomic_set(&ioc->active_ref, 1);
+#ifdef CONFIG_BLK_ICQ
spin_lock_init(&ioc->lock);
INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC);
INIT_HLIST_HEAD(&ioc->icq_list);
INIT_WORK(&ioc->release_work, ioc_release_fn);
+#endif
+ return ioc;
+}
- /*
- * Try to install. ioc shouldn't be installed if someone else
- * already did or @task, which isn't %current, is exiting. Note
- * that we need to allow ioc creation on exiting %current as exit
- * path may issue IOs from e.g. exit_files(). The exit path is
- * responsible for not issuing IO after exit_io_context().
- */
- task_lock(task);
- if (!task->io_context &&
- (task == current || !(task->flags & PF_EXITING)))
- task->io_context = ioc;
- else
- kmem_cache_free(iocontext_cachep, ioc);
+int set_task_ioprio(struct task_struct *task, int ioprio)
+{
+ int err;
+ const struct cred *cred = current_cred(), *tcred;
- ret = task->io_context ? 0 : -EBUSY;
+ rcu_read_lock();
+ tcred = __task_cred(task);
+ if (!uid_eq(tcred->uid, cred->euid) &&
+ !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) {
+ rcu_read_unlock();
+ return -EPERM;
+ }
+ rcu_read_unlock();
- task_unlock(task);
+ err = security_task_setioprio(task, ioprio);
+ if (err)
+ return err;
- return ret;
-}
+ task_lock(task);
+ if (unlikely(!task->io_context)) {
+ struct io_context *ioc;
-/**
- * get_task_io_context - get io_context of a task
- * @task: task of interest
- * @gfp_flags: allocation flags, used if allocation is necessary
- * @node: allocation node, used if allocation is necessary
- *
- * Return io_context of @task. If it doesn't exist, it is created with
- * @gfp_flags and @node. The returned io_context has its reference count
- * incremented.
- *
- * This function always goes through task_lock() and it's better to use
- * %current->io_context + get_io_context() for %current.
- */
-struct io_context *get_task_io_context(struct task_struct *task,
- gfp_t gfp_flags, int node)
-{
- struct io_context *ioc;
+ task_unlock(task);
- might_sleep_if(gfpflags_allow_blocking(gfp_flags));
+ ioc = alloc_io_context(GFP_ATOMIC, NUMA_NO_NODE);
+ if (!ioc)
+ return -ENOMEM;
- do {
task_lock(task);
- ioc = task->io_context;
- if (likely(ioc)) {
- get_io_context(ioc);
- task_unlock(task);
- return ioc;
+ if (task->flags & PF_EXITING) {
+ err = -ESRCH;
+ kmem_cache_free(iocontext_cachep, ioc);
+ goto out;
}
- task_unlock(task);
- } while (!create_task_io_context(task, gfp_flags, node));
+ if (task->io_context)
+ kmem_cache_free(iocontext_cachep, ioc);
+ else
+ task->io_context = ioc;
+ }
+ task->io_context->ioprio = ioprio;
+out:
+ task_unlock(task);
+ return err;
+}
+EXPORT_SYMBOL_GPL(set_task_ioprio);
- return NULL;
+int __copy_io(unsigned long clone_flags, struct task_struct *tsk)
+{
+ struct io_context *ioc = current->io_context;
+
+ /*
+ * Share io context with parent, if CLONE_IO is set
+ */
+ if (clone_flags & CLONE_IO) {
+ atomic_inc(&ioc->active_ref);
+ tsk->io_context = ioc;
+ } else if (ioprio_valid(ioc->ioprio)) {
+ tsk->io_context = alloc_io_context(GFP_KERNEL, NUMA_NO_NODE);
+ if (!tsk->io_context)
+ return -ENOMEM;
+ tsk->io_context->ioprio = ioc->ioprio;
+ }
+
+ return 0;
}
+#ifdef CONFIG_BLK_ICQ
/**
* ioc_lookup_icq - lookup io_cq from ioc
- * @ioc: the associated io_context
* @q: the associated request_queue
*
* Look up io_cq associated with @ioc - @q pair from @ioc. Must be called
* with @q->queue_lock held.
*/
-struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q)
+struct io_cq *ioc_lookup_icq(struct request_queue *q)
{
+ struct io_context *ioc = current->io_context;
struct io_cq *icq;
lockdep_assert_held(&q->queue_lock);
@@ -359,9 +355,7 @@ EXPORT_SYMBOL(ioc_lookup_icq);
/**
* ioc_create_icq - create and link io_cq
- * @ioc: io_context of interest
* @q: request_queue of interest
- * @gfp_mask: allocation mask
*
* Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they
* will be created using @gfp_mask.
@@ -369,19 +363,19 @@ EXPORT_SYMBOL(ioc_lookup_icq);
* The caller is responsible for ensuring @ioc won't go away and @q is
* alive and will stay alive until this function returns.
*/
-struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
- gfp_t gfp_mask)
+static struct io_cq *ioc_create_icq(struct request_queue *q)
{
+ struct io_context *ioc = current->io_context;
struct elevator_type *et = q->elevator->type;
struct io_cq *icq;
/* allocate stuff */
- icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
+ icq = kmem_cache_alloc_node(et->icq_cache, GFP_ATOMIC | __GFP_ZERO,
q->node);
if (!icq)
return NULL;
- if (radix_tree_maybe_preload(gfp_mask) < 0) {
+ if (radix_tree_maybe_preload(GFP_ATOMIC) < 0) {
kmem_cache_free(et->icq_cache, icq);
return NULL;
}
@@ -402,7 +396,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
et->ops.init_icq(icq);
} else {
kmem_cache_free(et->icq_cache, icq);
- icq = ioc_lookup_icq(ioc, q);
+ icq = ioc_lookup_icq(q);
if (!icq)
printk(KERN_ERR "cfq: icq link failed!\n");
}
@@ -413,6 +407,46 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
return icq;
}
+struct io_cq *ioc_find_get_icq(struct request_queue *q)
+{
+ struct io_context *ioc = current->io_context;
+ struct io_cq *icq = NULL;
+
+ if (unlikely(!ioc)) {
+ ioc = alloc_io_context(GFP_ATOMIC, q->node);
+ if (!ioc)
+ return NULL;
+
+ task_lock(current);
+ if (current->io_context) {
+ kmem_cache_free(iocontext_cachep, ioc);
+ ioc = current->io_context;
+ } else {
+ current->io_context = ioc;
+ }
+
+ get_io_context(ioc);
+ task_unlock(current);
+ } else {
+ get_io_context(ioc);
+
+ spin_lock_irq(&q->queue_lock);
+ icq = ioc_lookup_icq(q);
+ spin_unlock_irq(&q->queue_lock);
+ }
+
+ if (!icq) {
+ icq = ioc_create_icq(q);
+ if (!icq) {
+ put_io_context(ioc);
+ return NULL;
+ }
+ }
+ return icq;
+}
+EXPORT_SYMBOL_GPL(ioc_find_get_icq);
+#endif /* CONFIG_BLK_ICQ */
+
static int __init blk_ioc_init(void)
{
iocontext_cachep = kmem_cache_create("blkdev_ioc",
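With icq management consolidated in blk-ioc.c, an elevator that wants per-task state asks for it with ioc_find_get_icq() from its own request-preparation hook instead of relying on the core calling blk_mq_sched_assign_ioc(). A minimal sketch of such a call site follows; the function name is illustrative, and the scheduler (not blk-mq) is assumed to drop the io_context reference again, e.g. with put_io_context(), once it is done with the request.

        /*
         * Sketch only: an elevator ->prepare_request() attaching per-task
         * state through the new helper.  ioc_find_get_icq() allocates
         * current->io_context and the icq on demand and returns with an
         * io_context reference held for this request.
         */
        static void example_prepare_request(struct request *rq)
        {
                rq->elv.icq = ioc_find_get_icq(rq->q);
        }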
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index a5b37cc65b17..769b64394298 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2311,7 +2311,14 @@ static void ioc_timer_fn(struct timer_list *timer)
hwm = current_hweight_max(iocg);
new_hwi = hweight_after_donation(iocg, old_hwi, hwm,
usage, &now);
- if (new_hwi < hwm) {
+ /*
+ * Donation calculation assumes hweight_after_donation
+ * to be positive, a condition that a donor w/ hwa < 2
+ * can't meet. Don't bother with donation if hwa is
+ * below 2. It's not gonna make a meaningful difference
+ * anyway.
+ */
+ if (new_hwi < hwm && hwa >= 2) {
iocg->hweight_donating = hwa;
iocg->hweight_after_donation = new_hwi;
list_add(&iocg->surplus_list, &surpluses);
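For scale: iocost hweights are fixed point, with WEIGHT_ONE defined as 1 << 16 elsewhere in blk-iocost.c, so the new hwa >= 2 test only excludes donors holding less than 2/65536 (roughly 0.003%) of the device. A rough worked note on why such a donor cannot satisfy the assumption stated in the comment above:

        /*
         * hwa == 1  =>  active share = 1/65536 of the device (~0.0015%)
         * hweight_after_donation() is expected to come back strictly
         * positive and below hwm; with at most one unit of hweight to give
         * away that cannot hold, so the group is simply skipped as a donor.
         */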
diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c
index 332a07761bf8..2e7f10e1c03f 100644
--- a/block/blk-ioprio.c
+++ b/block/blk-ioprio.c
@@ -62,6 +62,7 @@ struct ioprio_blkg {
struct ioprio_blkcg {
struct blkcg_policy_data cpd;
enum prio_policy prio_policy;
+ bool prio_set;
};
static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd)
@@ -112,7 +113,7 @@ static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf,
if (ret < 0)
return ret;
blkcg->prio_policy = ret;
-
+ blkcg->prio_set = true;
return nbytes;
}
@@ -190,6 +191,10 @@ static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq,
struct bio *bio)
{
struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio);
+ u16 prio;
+
+ if (!blkcg->prio_set)
+ return;
/*
* Except for IOPRIO_CLASS_NONE, higher I/O priority numbers
@@ -199,8 +204,10 @@ static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq,
* bio I/O priority is not modified. If the bio I/O priority equals
* IOPRIO_CLASS_NONE, the cgroup I/O priority is assigned to the bio.
*/
- bio->bi_ioprio = max_t(u16, bio->bi_ioprio,
- IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0));
+ prio = max_t(u16, bio->bi_ioprio,
+ IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0));
+ if (prio > bio->bi_ioprio)
+ bio->bi_ioprio = prio;
}
static void blkcg_ioprio_exit(struct rq_qos *rqos)
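The max_t() above works because ioprio values order inversely to priority: IOPRIO_PRIO_VALUE(class, data) packs the class into the high bits (class << IOPRIO_CLASS_SHIFT, with IOPRIO_CLASS_SHIFT being 13 in the uapi header), so a numerically larger value means a more restricted class. A quick worked comparison, taking data = 0:

        IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0) ==     0
        IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT,   0) ==  8192
        IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,   0) == 16384
        IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0) == 24576

Taking the maximum therefore applies whichever of the bio's own priority and the cgroup policy is more restrictive, and the new prio_set flag keeps a cgroup whose policy was never written from touching bio priorities at all.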
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 893c1a60b701..4de34a332c9f 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -8,10 +8,12 @@
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/scatterlist.h>
+#include <linux/part_stat.h>
#include <trace/events/block.h>
#include "blk.h"
+#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
#include "blk-throttle.h"
@@ -775,8 +777,7 @@ static struct request *attempt_merge(struct request_queue *q,
if (req_op(req) != req_op(next))
return NULL;
- if (rq_data_dir(req) != rq_data_dir(next)
- || req->rq_disk != next->rq_disk)
+ if (rq_data_dir(req) != rq_data_dir(next))
return NULL;
if (req_op(req) == REQ_OP_WRITE_SAME &&
@@ -903,10 +904,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (bio_data_dir(bio) != rq_data_dir(rq))
return false;
- /* must be same device */
- if (rq->rq_disk != bio->bi_bdev->bd_disk)
- return false;
-
/* only merge integrity protected bio into ditto rq */
if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
return false;
@@ -1067,7 +1064,6 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
* @q: request_queue new bio is being queued at
* @bio: new bio being queued
* @nr_segs: number of segments in @bio
- * @same_queue_rq: output value, will be true if there's an existing request
* from the passed in @q already in the plug list
*
* Determine whether @bio being queued on @q can be merged with the previous
@@ -1084,7 +1080,7 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
* Caller must ensure !blk_queue_nomerges(q) beforehand.
*/
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
- unsigned int nr_segs, bool *same_queue_rq)
+ unsigned int nr_segs)
{
struct blk_plug *plug;
struct request *rq;
@@ -1096,12 +1092,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
/* check the previously added entry for a quick merge attempt */
rq = rq_list_peek(&plug->mq_list);
if (rq->q == q) {
- /*
- * Only blk-mq multiple hardware queues case checks the rq in
- * the same queue, there should be only one such rq in a queue
- */
- *same_queue_rq = true;
-
if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
BIO_MERGE_OK)
return true;
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 4f2cf8399f3d..3a790eb4995c 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -11,6 +11,7 @@
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
+#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-rq-qos.h"
@@ -29,6 +30,9 @@ static int queue_poll_stat_show(void *data, struct seq_file *m)
struct request_queue *q = data;
int bucket;
+ if (!q->poll_stat)
+ return 0;
+
for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) {
seq_printf(m, "read (%d Bytes): ", 1 << (9 + bucket));
print_stat(m, &q->poll_stat[2 * bucket]);
@@ -122,7 +126,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(FUA),
QUEUE_FLAG_NAME(DAX),
QUEUE_FLAG_NAME(STATS),
- QUEUE_FLAG_NAME(POLL_STATS),
QUEUE_FLAG_NAME(REGISTERED),
QUEUE_FLAG_NAME(QUIESCED),
QUEUE_FLAG_NAME(PCI_P2PDMA),
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index ba21449439cc..55488ba97823 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -18,32 +18,6 @@
#include "blk-mq-tag.h"
#include "blk-wbt.h"
-void blk_mq_sched_assign_ioc(struct request *rq)
-{
- struct request_queue *q = rq->q;
- struct io_context *ioc;
- struct io_cq *icq;
-
- /*
- * May not have an IO context if it's a passthrough request
- */
- ioc = current->io_context;
- if (!ioc)
- return;
-
- spin_lock_irq(&q->queue_lock);
- icq = ioc_lookup_icq(ioc, q);
- spin_unlock_irq(&q->queue_lock);
-
- if (!icq) {
- icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
- if (!icq)
- return;
- }
- get_io_context(icq->ioc);
- rq->elv.icq = icq;
-}
-
/*
* Mark a hardware queue as needing a restart. For shared queues, maintain
* a count of how many hardware queues are marked for restart.
@@ -501,7 +475,8 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
* us one extra enqueue & dequeue to sw queue.
*/
if (!hctx->dispatch_busy && !run_queue_async) {
- blk_mq_try_issue_list_directly(hctx, list);
+ blk_mq_run_dispatch_ops(hctx->queue,
+ blk_mq_try_issue_list_directly(hctx, list));
if (list_empty(list))
goto out;
}
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 25d1034952b6..025013972453 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -8,8 +8,6 @@
#define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ)
-void blk_mq_sched_assign_ioc(struct request *rq);
-
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **merged_request);
bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 253c857cba47..674786574075 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -36,8 +36,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx,
kobj);
- if (hctx->flags & BLK_MQ_F_BLOCKING)
- cleanup_srcu_struct(hctx->srcu);
blk_free_flush_queue(hctx->fq);
sbitmap_free(&hctx->ctx_map);
free_cpumask_var(hctx->cpumask);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 995336abee33..e55a6834c9a6 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -215,7 +215,8 @@ void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags)
struct bt_iter_data {
struct blk_mq_hw_ctx *hctx;
- busy_iter_fn *fn;
+ struct request_queue *q;
+ busy_tag_iter_fn *fn;
void *data;
bool reserved;
};
@@ -228,7 +229,7 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
spin_lock_irqsave(&tags->lock, flags);
rq = tags->rqs[bitnr];
- if (!rq || rq->tag != bitnr || !refcount_inc_not_zero(&rq->ref))
+ if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq))
rq = NULL;
spin_unlock_irqrestore(&tags->lock, flags);
return rq;
@@ -238,11 +239,18 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
struct bt_iter_data *iter_data = data;
struct blk_mq_hw_ctx *hctx = iter_data->hctx;
- struct blk_mq_tags *tags = hctx->tags;
+ struct request_queue *q = iter_data->q;
+ struct blk_mq_tag_set *set = q->tag_set;
bool reserved = iter_data->reserved;
+ struct blk_mq_tags *tags;
struct request *rq;
bool ret = true;
+ if (blk_mq_is_shared_tags(set->flags))
+ tags = set->shared_tags;
+ else
+ tags = hctx->tags;
+
if (!reserved)
bitnr += tags->nr_reserved_tags;
/*
@@ -253,8 +261,8 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
if (!rq)
return true;
- if (rq->q == hctx->queue && rq->mq_hctx == hctx)
- ret = iter_data->fn(hctx, rq, iter_data->data, reserved);
+ if (rq->q == q && (!hctx || rq->mq_hctx == hctx))
+ ret = iter_data->fn(rq, iter_data->data, reserved);
blk_mq_put_rq_ref(rq);
return ret;
}
@@ -262,6 +270,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
/**
* bt_for_each - iterate over the requests associated with a hardware queue
* @hctx: Hardware queue to examine.
+ * @q: Request queue to examine.
* @bt: sbitmap to examine. This is either the breserved_tags member
* or the bitmap_tags member of struct blk_mq_tags.
* @fn: Pointer to the function that will be called for each request
@@ -273,14 +282,16 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
* @reserved: Indicates whether @bt is the breserved_tags member or the
* bitmap_tags member of struct blk_mq_tags.
*/
-static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
- busy_iter_fn *fn, void *data, bool reserved)
+static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct request_queue *q,
+ struct sbitmap_queue *bt, busy_tag_iter_fn *fn,
+ void *data, bool reserved)
{
struct bt_iter_data iter_data = {
.hctx = hctx,
.fn = fn,
.data = data,
.reserved = reserved,
+ .q = q,
};
sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
@@ -457,12 +468,9 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);
* called for all requests on all queues that share that tag set and not only
* for requests associated with @q.
*/
-void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
+void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
void *priv)
{
- struct blk_mq_hw_ctx *hctx;
- int i;
-
/*
* __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx
* while the queue is frozen. So we can use q_usage_counter to avoid
@@ -471,19 +479,34 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
if (!percpu_ref_tryget(&q->q_usage_counter))
return;
- queue_for_each_hw_ctx(q, hctx, i) {
- struct blk_mq_tags *tags = hctx->tags;
-
- /*
- * If no software queues are currently mapped to this
- * hardware queue, there's nothing to check
- */
- if (!blk_mq_hw_queue_mapped(hctx))
- continue;
+ if (blk_mq_is_shared_tags(q->tag_set->flags)) {
+ struct blk_mq_tags *tags = q->tag_set->shared_tags;
+ struct sbitmap_queue *bresv = &tags->breserved_tags;
+ struct sbitmap_queue *btags = &tags->bitmap_tags;
if (tags->nr_reserved_tags)
- bt_for_each(hctx, &tags->breserved_tags, fn, priv, true);
- bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false);
+ bt_for_each(NULL, q, bresv, fn, priv, true);
+ bt_for_each(NULL, q, btags, fn, priv, false);
+ } else {
+ struct blk_mq_hw_ctx *hctx;
+ int i;
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ struct blk_mq_tags *tags = hctx->tags;
+ struct sbitmap_queue *bresv = &tags->breserved_tags;
+ struct sbitmap_queue *btags = &tags->bitmap_tags;
+
+ /*
+ * If no software queues are currently mapped to this
+ * hardware queue, there's nothing to check
+ */
+ if (!blk_mq_hw_queue_mapped(hctx))
+ continue;
+
+ if (tags->nr_reserved_tags)
+ bt_for_each(hctx, q, bresv, fn, priv, true);
+ bt_for_each(hctx, q, btags, fn, priv, false);
+ }
}
blk_queue_exit(q);
}
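After this change blk_mq_queue_tag_busy_iter() takes a plain busy_tag_iter_fn, so the callback sees only the request, the private cookie and the reserved flag; the queue/hctx filtering happens inside bt_iter(). A minimal sketch of an in-block-layer caller under the new signature (names are illustrative, not part of this patch):

        /* Sketch only: count started requests with the new callback type. */
        static bool example_count_started(struct request *rq, void *priv,
                                          bool reserved)
        {
                unsigned int *count = priv;

                if (blk_mq_request_started(rq))
                        (*count)++;
                return true;    /* keep iterating */
        }

        static unsigned int example_count_busy(struct request_queue *q)
        {
                unsigned int count = 0;

                blk_mq_queue_tag_busy_iter(q, example_count_started, &count);
                return count;
        }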
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index df787b5a23bd..5668e28be0b7 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -28,7 +28,7 @@ extern void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set,
extern void blk_mq_tag_update_sched_shared_tags(struct request_queue *q);
extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
-void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
+void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
void *priv);
void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
void *priv);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3ab34c4f20da..a6d4780580fc 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -28,6 +28,7 @@
#include <linux/crash_dump.h>
#include <linux/prefetch.h>
#include <linux/blk-crypto.h>
+#include <linux/part_stat.h>
#include <trace/events/block.h>
@@ -126,8 +127,7 @@ struct mq_inflight {
unsigned int inflight[2];
};
-static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
- struct request *rq, void *priv,
+static bool blk_mq_check_inflight(struct request *rq, void *priv,
bool reserved)
{
struct mq_inflight *mi = priv;
@@ -259,17 +259,9 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
*/
void blk_mq_wait_quiesce_done(struct request_queue *q)
{
- struct blk_mq_hw_ctx *hctx;
- unsigned int i;
- bool rcu = false;
-
- queue_for_each_hw_ctx(q, hctx, i) {
- if (hctx->flags & BLK_MQ_F_BLOCKING)
- synchronize_srcu(hctx->srcu);
- else
- rcu = true;
- }
- if (rcu)
+ if (blk_queue_has_srcu(q))
+ synchronize_srcu(q->srcu);
+ else
synchronize_rcu();
}
EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
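Quiescing is unchanged from a driver's point of view; only the wait now goes through the single per-queue SRCU (or plain RCU) instead of walking per-hctx srcu_structs. A sketch of the usual pattern, with the reconfiguration body left as a placeholder:

        /* Sketch: the common quiesce pattern around a configuration change. */
        static void example_reconfigure_under_quiesce(struct request_queue *q)
        {
                blk_mq_quiesce_queue(q);        /* stops new dispatch, waits via q->srcu or RCU */
                /* ... driver-specific reconfiguration goes here ... */
                blk_mq_unquiesce_queue(q);
        }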
@@ -327,6 +319,23 @@ void blk_mq_wake_waiters(struct request_queue *q)
blk_mq_tag_wakeup_all(hctx->tags, true);
}
+void blk_rq_init(struct request_queue *q, struct request *rq)
+{
+ memset(rq, 0, sizeof(*rq));
+
+ INIT_LIST_HEAD(&rq->queuelist);
+ rq->q = q;
+ rq->__sector = (sector_t) -1;
+ INIT_HLIST_NODE(&rq->hash);
+ RB_CLEAR_NODE(&rq->rb_node);
+ rq->tag = BLK_MQ_NO_TAG;
+ rq->internal_tag = BLK_MQ_NO_TAG;
+ rq->start_time_ns = ktime_get_ns();
+ rq->part = NULL;
+ blk_crypto_rq_set_defaults(rq);
+}
+EXPORT_SYMBOL(blk_rq_init);
+
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
struct blk_mq_tags *tags, unsigned int tag, u64 alloc_time_ns)
{
@@ -359,7 +368,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
rq->start_time_ns = ktime_get_ns();
else
rq->start_time_ns = 0;
- rq->rq_disk = NULL;
rq->part = NULL;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
rq->alloc_time_ns = alloc_time_ns;
@@ -377,20 +385,16 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
INIT_LIST_HEAD(&rq->queuelist);
/* tag was already set */
WRITE_ONCE(rq->deadline, 0);
- refcount_set(&rq->ref, 1);
+ req_ref_set(rq, 1);
if (rq->rq_flags & RQF_ELV) {
struct elevator_queue *e = data->q->elevator;
- rq->elv.icq = NULL;
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
if (!op_is_flush(data->cmd_flags) &&
e->type->ops.prepare_request) {
- if (e->type->icq_cache)
- blk_mq_sched_assign_ioc(rq);
-
e->type->ops.prepare_request(rq);
rq->rq_flags |= RQF_ELVPRIV;
}
@@ -616,16 +620,9 @@ void blk_mq_free_request(struct request *rq)
struct request_queue *q = rq->q;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
- if (rq->rq_flags & RQF_ELVPRIV) {
- struct elevator_queue *e = q->elevator;
-
- if (e->type->ops.finish_request)
- e->type->ops.finish_request(rq);
- if (rq->elv.icq) {
- put_io_context(rq->elv.icq->ioc);
- rq->elv.icq = NULL;
- }
- }
+ if ((rq->rq_flags & RQF_ELVPRIV) &&
+ q->elevator->type->ops.finish_request)
+ q->elevator->type->ops.finish_request(rq);
if (rq->rq_flags & RQF_MQ_INFLIGHT)
__blk_mq_dec_active_requests(hctx);
@@ -636,7 +633,7 @@ void blk_mq_free_request(struct request *rq)
rq_qos_done(q, rq);
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
- if (refcount_dec_and_test(&rq->ref))
+ if (req_ref_put_and_test(rq))
__blk_mq_free_request(rq);
}
EXPORT_SYMBOL_GPL(blk_mq_free_request);
@@ -649,6 +646,20 @@ void blk_mq_free_plug_rqs(struct blk_plug *plug)
blk_mq_free_request(rq);
}
+void blk_dump_rq_flags(struct request *rq, char *msg)
+{
+ printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
+ rq->q->disk ? rq->q->disk->disk_name : "?",
+ (unsigned long long) rq->cmd_flags);
+
+ printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
+ (unsigned long long)blk_rq_pos(rq),
+ blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
+ printk(KERN_INFO " bio %p, biotail %p, len %u\n",
+ rq->bio, rq->biotail, blk_rq_bytes(rq));
+}
+EXPORT_SYMBOL(blk_dump_rq_flags);
+
static void req_bio_endio(struct request *rq, struct bio *bio,
unsigned int nbytes, blk_status_t error)
{
@@ -685,6 +696,60 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
}
}
+static void blk_print_req_error(struct request *req, blk_status_t status)
+{
+ printk_ratelimited(KERN_ERR
+ "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
+ "phys_seg %u prio class %u\n",
+ blk_status_to_str(status),
+ req->q->disk ? req->q->disk->disk_name : "?",
+ blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
+ req->cmd_flags & ~REQ_OP_MASK,
+ req->nr_phys_segments,
+ IOPRIO_PRIO_CLASS(req->ioprio));
+}
+
+/*
+ * Fully end IO on a request. Does not support partial completions, or
+ * errors.
+ */
+static void blk_complete_request(struct request *req)
+{
+ const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0;
+ int total_bytes = blk_rq_bytes(req);
+ struct bio *bio = req->bio;
+
+ trace_block_rq_complete(req, BLK_STS_OK, total_bytes);
+
+ if (!bio)
+ return;
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+ if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ)
+ req->q->integrity.profile->complete_fn(req, total_bytes);
+#endif
+
+ blk_account_io_completion(req, total_bytes);
+
+ do {
+ struct bio *next = bio->bi_next;
+
+ /* Completion has already been traced */
+ bio_clear_flag(bio, BIO_TRACE_COMPLETION);
+ if (!is_flush)
+ bio_endio(bio);
+ bio = next;
+ } while (bio);
+
+ /*
+ * Reset counters so that the request stacking driver
+ * can find how many bytes remain in the request
+ * later.
+ */
+ req->bio = NULL;
+ req->__data_len = 0;
+}
+
/**
* blk_update_request - Complete multiple bytes without completing the request
* @req: the request being processed
@@ -791,6 +856,48 @@ bool blk_update_request(struct request *req, blk_status_t error,
}
EXPORT_SYMBOL_GPL(blk_update_request);
+static void __blk_account_io_done(struct request *req, u64 now)
+{
+ const int sgrp = op_stat_group(req_op(req));
+
+ part_stat_lock();
+ update_io_ticks(req->part, jiffies, true);
+ part_stat_inc(req->part, ios[sgrp]);
+ part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
+ part_stat_unlock();
+}
+
+static inline void blk_account_io_done(struct request *req, u64 now)
+{
+ /*
+ * Account IO completion. flush_rq isn't accounted as a
+ * normal IO on queueing nor completion. Accounting the
+ * containing request is enough.
+ */
+ if (blk_do_io_stat(req) && req->part &&
+ !(req->rq_flags & RQF_FLUSH_SEQ))
+ __blk_account_io_done(req, now);
+}
+
+static void __blk_account_io_start(struct request *rq)
+{
+ /* passthrough requests can hold bios that do not have ->bi_bdev set */
+ if (rq->bio && rq->bio->bi_bdev)
+ rq->part = rq->bio->bi_bdev;
+ else if (rq->q->disk)
+ rq->part = rq->q->disk->part0;
+
+ part_stat_lock();
+ update_io_ticks(rq->part, jiffies, false);
+ part_stat_unlock();
+}
+
+static inline void blk_account_io_start(struct request *req)
+{
+ if (blk_do_io_stat(req))
+ __blk_account_io_start(req);
+}
+
static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
{
if (rq->rq_flags & RQF_STATS) {
@@ -856,17 +963,18 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)
prefetch(rq->bio);
prefetch(rq->rq_next);
- blk_update_request(rq, BLK_STS_OK, blk_rq_bytes(rq));
+ blk_complete_request(rq);
if (iob->need_ts)
__blk_mq_end_request_acct(rq, now);
+ rq_qos_done(rq->q, rq);
+
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
- if (!refcount_dec_and_test(&rq->ref))
+ if (!req_ref_put_and_test(rq))
continue;
blk_crypto_free_request(rq);
blk_pm_mark_last_busy(rq);
- rq_qos_done(rq->q, rq);
if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) {
if (cur_hctx)
@@ -995,26 +1103,6 @@ void blk_mq_complete_request(struct request *rq)
}
EXPORT_SYMBOL(blk_mq_complete_request);
-static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
- __releases(hctx->srcu)
-{
- if (!(hctx->flags & BLK_MQ_F_BLOCKING))
- rcu_read_unlock();
- else
- srcu_read_unlock(hctx->srcu, srcu_idx);
-}
-
-static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
- __acquires(hctx->srcu)
-{
- if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
- /* shut up gcc false positive */
- *srcu_idx = 0;
- rcu_read_lock();
- } else
- *srcu_idx = srcu_read_lock(hctx->srcu);
-}
-
/**
* blk_mq_start_request - Start processing a request
* @rq: Pointer to request to be started
@@ -1057,6 +1145,107 @@ void blk_mq_start_request(struct request *rq)
}
EXPORT_SYMBOL(blk_mq_start_request);
+/**
+ * blk_end_sync_rq - executes a completion event on a request
+ * @rq: request to complete
+ * @error: end I/O status of the request
+ */
+static void blk_end_sync_rq(struct request *rq, blk_status_t error)
+{
+ struct completion *waiting = rq->end_io_data;
+
+ rq->end_io_data = (void *)(uintptr_t)error;
+
+ /*
+ * complete last, if this is a stack request the process (and thus
+ * the rq pointer) could be invalid right after this complete()
+ */
+ complete(waiting);
+}
+
+/**
+ * blk_execute_rq_nowait - insert a request to I/O scheduler for execution
+ * @rq: request to insert
+ * @at_head: insert request at head or tail of queue
+ * @done: I/O completion handler
+ *
+ * Description:
+ * Insert a fully prepared request at the back of the I/O scheduler queue
+ * for execution. Don't wait for completion.
+ *
+ * Note:
+ * This function will invoke @done directly if the queue is dead.
+ */
+void blk_execute_rq_nowait(struct request *rq, bool at_head, rq_end_io_fn *done)
+{
+ WARN_ON(irqs_disabled());
+ WARN_ON(!blk_rq_is_passthrough(rq));
+
+ rq->end_io = done;
+
+ blk_account_io_start(rq);
+
+ /*
+ * don't check dying flag for MQ because the request won't
+ * be reused after dying flag is set
+ */
+ blk_mq_sched_insert_request(rq, at_head, true, false);
+}
+EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
+
+static bool blk_rq_is_poll(struct request *rq)
+{
+ if (!rq->mq_hctx)
+ return false;
+ if (rq->mq_hctx->type != HCTX_TYPE_POLL)
+ return false;
+ if (WARN_ON_ONCE(!rq->bio))
+ return false;
+ return true;
+}
+
+static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
+{
+ do {
+ bio_poll(rq->bio, NULL, 0);
+ cond_resched();
+ } while (!completion_done(wait));
+}
+
+/**
+ * blk_execute_rq - insert a request into queue for execution
+ * @rq: request to insert
+ * @at_head: insert request at head or tail of queue
+ *
+ * Description:
+ * Insert a fully prepared request at the back of the I/O scheduler queue
+ * for execution and wait for completion.
+ * Return: The blk_status_t result provided to blk_mq_end_request().
+ */
+blk_status_t blk_execute_rq(struct request *rq, bool at_head)
+{
+ DECLARE_COMPLETION_ONSTACK(wait);
+ unsigned long hang_check;
+
+ rq->end_io_data = &wait;
+ blk_execute_rq_nowait(rq, at_head, blk_end_sync_rq);
+
+ /* Prevent hang_check timer from firing at us during very long I/O */
+ hang_check = sysctl_hung_task_timeout_secs;
+
+ if (blk_rq_is_poll(rq))
+ blk_rq_poll_completion(rq, &wait);
+ else if (hang_check)
+ while (!wait_for_completion_io_timeout(&wait,
+ hang_check * (HZ/2)))
+ ;
+ else
+ wait_for_completion_io(&wait);
+
+ return (blk_status_t)(uintptr_t)rq->end_io_data;
+}
+EXPORT_SYMBOL(blk_execute_rq);
+
static void __blk_mq_requeue_request(struct request *rq)
{
struct request_queue *q = rq->q;
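The former blk-exec.c helpers are folded in above. For reference, a minimal sketch of the synchronous passthrough pattern they serve; the allocation flags and the elided command setup are illustrative only, not taken from this patch:

        /* Sketch: synchronous passthrough execution with the relocated helpers. */
        static blk_status_t example_sync_passthrough(struct request_queue *q)
        {
                struct request *rq;
                blk_status_t status;

                rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
                if (IS_ERR(rq))
                        return BLK_STS_RESOURCE;

                /* ... fill in the driver-specific command here ... */

                status = blk_execute_rq(rq, false);     /* insert at tail, wait for completion */
                blk_mq_free_request(rq);
                return status;
        }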
@@ -1159,14 +1348,15 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q,
}
EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
-static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
- void *priv, bool reserved)
+static bool blk_mq_rq_inflight(struct request *rq, void *priv,
+ bool reserved)
{
/*
- * If we find a request that isn't idle and the queue matches,
- * we know the queue is busy. Return false to stop the iteration.
+ * If we find a request that isn't idle we know the queue is busy
+ * as it's checked in the iter.
+ * Return false to stop the iteration.
*/
- if (blk_mq_request_started(rq) && rq->q == hctx->queue) {
+ if (blk_mq_request_started(rq)) {
bool *busy = priv;
*busy = true;
@@ -1224,12 +1414,11 @@ void blk_mq_put_rq_ref(struct request *rq)
{
if (is_flush_rq(rq))
rq->end_io(rq, 0);
- else if (refcount_dec_and_test(&rq->ref))
+ else if (req_ref_put_and_test(rq))
__blk_mq_free_request(rq);
}
-static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
- struct request *rq, void *priv, bool reserved)
+static bool blk_mq_check_expired(struct request *rq, void *priv, bool reserved)
{
unsigned long *next = priv;
@@ -1770,19 +1959,14 @@ out:
*/
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
{
- int srcu_idx;
-
/*
* We can't run the queue inline with ints disabled. Ensure that
* we catch bad users of this early.
*/
WARN_ON_ONCE(in_interrupt());
- might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
-
- hctx_lock(hctx, &srcu_idx);
- blk_mq_sched_dispatch_requests(hctx);
- hctx_unlock(hctx, srcu_idx);
+ blk_mq_run_dispatch_ops(hctx->queue,
+ blk_mq_sched_dispatch_requests(hctx));
}
static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
@@ -1894,7 +2078,6 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
*/
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
- int srcu_idx;
bool need_run;
/*
@@ -1905,10 +2088,9 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
* And queue will be rerun in blk_mq_unquiesce_queue() if it is
* quiesced.
*/
- hctx_lock(hctx, &srcu_idx);
- need_run = !blk_queue_quiesced(hctx->queue) &&
- blk_mq_hctx_has_pending(hctx);
- hctx_unlock(hctx, srcu_idx);
+ __blk_mq_run_dispatch_ops(hctx->queue, false,
+ need_run = !blk_queue_quiesced(hctx->queue) &&
+ blk_mq_hctx_has_pending(hctx));
if (need_run)
__blk_mq_delay_run_hw_queue(hctx, async, 0);
@@ -2201,98 +2383,6 @@ static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int *queued,
*queued = 0;
}
-static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
-{
- struct blk_mq_hw_ctx *hctx = NULL;
- struct request *rq;
- int queued = 0;
- int errors = 0;
-
- while ((rq = rq_list_pop(&plug->mq_list))) {
- bool last = rq_list_empty(plug->mq_list);
- blk_status_t ret;
-
- if (hctx != rq->mq_hctx) {
- if (hctx)
- blk_mq_commit_rqs(hctx, &queued, from_schedule);
- hctx = rq->mq_hctx;
- }
-
- ret = blk_mq_request_issue_directly(rq, last);
- switch (ret) {
- case BLK_STS_OK:
- queued++;
- break;
- case BLK_STS_RESOURCE:
- case BLK_STS_DEV_RESOURCE:
- blk_mq_request_bypass_insert(rq, false, last);
- blk_mq_commit_rqs(hctx, &queued, from_schedule);
- return;
- default:
- blk_mq_end_request(rq, ret);
- errors++;
- break;
- }
- }
-
- /*
- * If we didn't flush the entire list, we could have told the driver
- * there was more coming, but that turned out to be a lie.
- */
- if (errors)
- blk_mq_commit_rqs(hctx, &queued, from_schedule);
-}
-
-void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
-{
- struct blk_mq_hw_ctx *this_hctx;
- struct blk_mq_ctx *this_ctx;
- unsigned int depth;
- LIST_HEAD(list);
-
- if (rq_list_empty(plug->mq_list))
- return;
- plug->rq_count = 0;
-
- if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
- blk_mq_plug_issue_direct(plug, false);
- if (rq_list_empty(plug->mq_list))
- return;
- }
-
- this_hctx = NULL;
- this_ctx = NULL;
- depth = 0;
- do {
- struct request *rq;
-
- rq = rq_list_pop(&plug->mq_list);
-
- if (!this_hctx) {
- this_hctx = rq->mq_hctx;
- this_ctx = rq->mq_ctx;
- } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) {
- trace_block_unplug(this_hctx->queue, depth,
- !from_schedule);
- blk_mq_sched_insert_requests(this_hctx, this_ctx,
- &list, from_schedule);
- depth = 0;
- this_hctx = rq->mq_hctx;
- this_ctx = rq->mq_ctx;
-
- }
-
- list_add(&rq->queuelist, &list);
- depth++;
- } while (!rq_list_empty(plug->mq_list));
-
- if (!list_empty(&list)) {
- trace_block_unplug(this_hctx->queue, depth, !from_schedule);
- blk_mq_sched_insert_requests(this_hctx, this_ctx, &list,
- from_schedule);
- }
-}
-
static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
unsigned int nr_segs)
{
@@ -2403,33 +2493,141 @@ insert:
static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
- blk_status_t ret;
- int srcu_idx;
-
- might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
-
- hctx_lock(hctx, &srcu_idx);
+ blk_status_t ret =
+ __blk_mq_try_issue_directly(hctx, rq, false, true);
- ret = __blk_mq_try_issue_directly(hctx, rq, false, true);
if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
blk_mq_request_bypass_insert(rq, false, true);
else if (ret != BLK_STS_OK)
blk_mq_end_request(rq, ret);
+}
- hctx_unlock(hctx, srcu_idx);
+static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
+{
+ return __blk_mq_try_issue_directly(rq->mq_hctx, rq, true, last);
}
-blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
+static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
{
- blk_status_t ret;
- int srcu_idx;
- struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+ struct blk_mq_hw_ctx *hctx = NULL;
+ struct request *rq;
+ int queued = 0;
+ int errors = 0;
- hctx_lock(hctx, &srcu_idx);
- ret = __blk_mq_try_issue_directly(hctx, rq, true, last);
- hctx_unlock(hctx, srcu_idx);
+ while ((rq = rq_list_pop(&plug->mq_list))) {
+ bool last = rq_list_empty(plug->mq_list);
+ blk_status_t ret;
- return ret;
+ if (hctx != rq->mq_hctx) {
+ if (hctx)
+ blk_mq_commit_rqs(hctx, &queued, from_schedule);
+ hctx = rq->mq_hctx;
+ }
+
+ ret = blk_mq_request_issue_directly(rq, last);
+ switch (ret) {
+ case BLK_STS_OK:
+ queued++;
+ break;
+ case BLK_STS_RESOURCE:
+ case BLK_STS_DEV_RESOURCE:
+ blk_mq_request_bypass_insert(rq, false, last);
+ blk_mq_commit_rqs(hctx, &queued, from_schedule);
+ return;
+ default:
+ blk_mq_end_request(rq, ret);
+ errors++;
+ break;
+ }
+ }
+
+ /*
+ * If we didn't flush the entire list, we could have told the driver
+ * there was more coming, but that turned out to be a lie.
+ */
+ if (errors)
+ blk_mq_commit_rqs(hctx, &queued, from_schedule);
+}
+
+static void __blk_mq_flush_plug_list(struct request_queue *q,
+ struct blk_plug *plug)
+{
+ if (blk_queue_quiesced(q))
+ return;
+ q->mq_ops->queue_rqs(&plug->mq_list);
+}
+
+void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
+{
+ struct blk_mq_hw_ctx *this_hctx;
+ struct blk_mq_ctx *this_ctx;
+ struct request *rq;
+ unsigned int depth;
+ LIST_HEAD(list);
+
+ if (rq_list_empty(plug->mq_list))
+ return;
+ plug->rq_count = 0;
+
+ if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
+ struct request_queue *q;
+
+ rq = rq_list_peek(&plug->mq_list);
+ q = rq->q;
+
+ /*
+ * Peek first request and see if we have a ->queue_rqs() hook.
+ * If we do, we can dispatch the whole plug list in one go. We
+ * already know at this point that all requests belong to the
+ * same queue, caller must ensure that's the case.
+ *
+ * Since we pass off the full list to the driver at this point,
+ * we do not increment the active request count for the queue.
+ * Bypass shared tags for now because of that.
+ */
+ if (q->mq_ops->queue_rqs &&
+ !(rq->mq_hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
+ blk_mq_run_dispatch_ops(q,
+ __blk_mq_flush_plug_list(q, plug));
+ if (rq_list_empty(plug->mq_list))
+ return;
+ }
+
+ blk_mq_run_dispatch_ops(q,
+ blk_mq_plug_issue_direct(plug, false));
+ if (rq_list_empty(plug->mq_list))
+ return;
+ }
+
+ this_hctx = NULL;
+ this_ctx = NULL;
+ depth = 0;
+ do {
+ rq = rq_list_pop(&plug->mq_list);
+
+ if (!this_hctx) {
+ this_hctx = rq->mq_hctx;
+ this_ctx = rq->mq_ctx;
+ } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) {
+ trace_block_unplug(this_hctx->queue, depth,
+ !from_schedule);
+ blk_mq_sched_insert_requests(this_hctx, this_ctx,
+ &list, from_schedule);
+ depth = 0;
+ this_hctx = rq->mq_hctx;
+ this_ctx = rq->mq_ctx;
+
+ }
+
+ list_add(&rq->queuelist, &list);
+ depth++;
+ } while (!rq_list_empty(plug->mq_list));
+
+ if (!list_empty(&list)) {
+ trace_block_unplug(this_hctx->queue, depth, !from_schedule);
+ blk_mq_sched_insert_requests(this_hctx, this_ctx, &list,
+ from_schedule);
+ }
}
void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
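A driver opts into the batched path by providing ->queue_rqs() in its blk_mq_ops. A rough sketch of the expected shape: requests the driver cannot take are pushed back onto the list, and the caller above falls back to issuing those individually, which is why the plug list is re-checked after the call. The submission helper here is a placeholder, not a real driver interface.

        /* Stands in for the driver's actual ring submission. */
        static bool example_try_submit(struct request *rq);

        /* Sketch of a ->queue_rqs() implementation: consume the singly
         * linked rq_list, leaving anything not submitted for the caller. */
        static void example_queue_rqs(struct request **rqlist)
        {
                struct request *requeue_list = NULL;
                struct request *rq;

                while ((rq = rq_list_pop(rqlist))) {
                        if (!example_try_submit(rq))
                                rq_list_add(&requeue_list, rq);
                }
                *rqlist = requeue_list;
        }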
@@ -2468,21 +2666,6 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
hctx->queue->mq_ops->commit_rqs(hctx);
}
-static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
-{
- if (!plug->multiple_queues) {
- struct request *nxt = rq_list_peek(&plug->mq_list);
-
- if (nxt && nxt->q != rq->q)
- plug->multiple_queues = true;
- }
- if (!plug->has_elevator && (rq->rq_flags & RQF_ELV))
- plug->has_elevator = true;
- rq->rq_next = NULL;
- rq_list_add(&plug->mq_list, rq);
- plug->rq_count++;
-}
-
/*
* Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
* queues. This is important for md arrays to benefit from merging
@@ -2495,12 +2678,33 @@ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
return BLK_MAX_REQUEST_COUNT;
}
+static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
+{
+ struct request *last = rq_list_peek(&plug->mq_list);
+
+ if (!plug->rq_count) {
+ trace_block_plug(rq->q);
+ } else if (plug->rq_count >= blk_plug_max_rq_count(plug) ||
+ (!blk_queue_nomerges(rq->q) &&
+ blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
+ blk_mq_flush_plug_list(plug, false);
+ trace_block_plug(rq->q);
+ }
+
+ if (!plug->multiple_queues && last && last->q != rq->q)
+ plug->multiple_queues = true;
+ if (!plug->has_elevator && (rq->rq_flags & RQF_ELV))
+ plug->has_elevator = true;
+ rq->rq_next = NULL;
+ rq_list_add(&plug->mq_list, rq);
+ plug->rq_count++;
+}
+
static bool blk_mq_attempt_bio_merge(struct request_queue *q,
- struct bio *bio, unsigned int nr_segs,
- bool *same_queue_rq)
+ struct bio *bio, unsigned int nr_segs)
{
if (!blk_queue_nomerges(q) && bio_mergeable(bio)) {
- if (blk_attempt_plug_merge(q, bio, nr_segs, same_queue_rq))
+ if (blk_attempt_plug_merge(q, bio, nr_segs))
return true;
if (blk_mq_sched_bio_merge(q, bio, nr_segs))
return true;
@@ -2510,9 +2714,7 @@ static bool blk_mq_attempt_bio_merge(struct request_queue *q,
static struct request *blk_mq_get_new_requests(struct request_queue *q,
struct blk_plug *plug,
- struct bio *bio,
- unsigned int nsegs,
- bool *same_queue_rq)
+ struct bio *bio)
{
struct blk_mq_alloc_data data = {
.q = q,
@@ -2521,11 +2723,9 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
};
struct request *rq;
- if (blk_mq_attempt_bio_merge(q, bio, nsegs, same_queue_rq))
+ if (unlikely(bio_queue_enter(bio)))
return NULL;
- rq_qos_throttle(q, bio);
-
if (plug) {
data.nr_tags = plug->nr_ios;
plug->nr_ios = 1;
@@ -2535,63 +2735,32 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
rq = __blk_mq_alloc_requests(&data);
if (rq)
return rq;
-
rq_qos_cleanup(q, bio);
if (bio->bi_opf & REQ_NOWAIT)
bio_wouldblock_error(bio);
-
+ blk_queue_exit(q);
return NULL;
}
-static inline bool blk_mq_can_use_cached_rq(struct request *rq,
- struct bio *bio)
-{
- if (blk_mq_get_hctx_type(bio->bi_opf) != rq->mq_hctx->type)
- return false;
-
- if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf))
- return false;
-
- return true;
-}
-
-static inline struct request *blk_mq_get_request(struct request_queue *q,
- struct blk_plug *plug,
- struct bio *bio,
- unsigned int nsegs,
- bool *same_queue_rq)
+static inline struct request *blk_mq_get_cached_request(struct request_queue *q,
+ struct blk_plug *plug, struct bio *bio)
{
struct request *rq;
- bool checked = false;
-
- if (plug) {
- rq = rq_list_peek(&plug->cached_rq);
- if (rq && rq->q == q) {
- if (unlikely(!submit_bio_checks(bio)))
- return NULL;
- if (blk_mq_attempt_bio_merge(q, bio, nsegs,
- same_queue_rq))
- return NULL;
- checked = true;
- if (!blk_mq_can_use_cached_rq(rq, bio))
- goto fallback;
- rq->cmd_flags = bio->bi_opf;
- plug->cached_rq = rq_list_next(rq);
- INIT_LIST_HEAD(&rq->queuelist);
- rq_qos_throttle(q, bio);
- return rq;
- }
- }
+ if (!plug)
+ return NULL;
+ rq = rq_list_peek(&plug->cached_rq);
+ if (!rq || rq->q != q)
+ return NULL;
-fallback:
- if (unlikely(bio_queue_enter(bio)))
+ if (blk_mq_get_hctx_type(bio->bi_opf) != rq->mq_hctx->type)
return NULL;
- if (!checked && !submit_bio_checks(bio))
+ if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf))
return NULL;
- rq = blk_mq_get_new_requests(q, plug, bio, nsegs, same_queue_rq);
- if (!rq)
- blk_queue_exit(q);
+
+ rq->cmd_flags = bio->bi_opf;
+ plug->cached_rq = rq_list_next(rq);
+ INIT_LIST_HEAD(&rq->queuelist);
return rq;
}
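The cached-request path above consumes requests pre-allocated into the plug. A sketch of how a batching submitter might seed that cache, assuming the nr_ios-aware plug start helper; unused cached requests are released when the plug is finished:

        /* Sketch: pre-size the plug so blk_mq_submit_bio() can take requests
         * from plug->cached_rq instead of allocating one per bio. */
        static void example_submit_batch(struct bio **bios, unsigned short nr_bios)
        {
                struct blk_plug plug;
                unsigned short i;

                blk_start_plug_nr_ios(&plug, nr_bios);  /* hint: expect nr_bios submissions */
                for (i = 0; i < nr_bios; i++)
                        submit_bio(bios[i]);
                blk_finish_plug(&plug);         /* leftover cached requests are freed here */
        }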
@@ -2611,10 +2780,9 @@ fallback:
void blk_mq_submit_bio(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ struct blk_plug *plug = blk_mq_plug(q, bio);
const int is_sync = op_is_sync(bio->bi_opf);
struct request *rq;
- struct blk_plug *plug;
- bool same_queue_rq = false;
unsigned int nr_segs = 1;
blk_status_t ret;
@@ -2628,11 +2796,18 @@ void blk_mq_submit_bio(struct bio *bio)
if (!bio_integrity_prep(bio))
return;
- plug = blk_mq_plug(q, bio);
- rq = blk_mq_get_request(q, plug, bio, nr_segs, &same_queue_rq);
- if (unlikely(!rq))
+ if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
return;
+ rq_qos_throttle(q, bio);
+
+ rq = blk_mq_get_cached_request(q, plug, bio);
+ if (!rq) {
+ rq = blk_mq_get_new_requests(q, plug, bio);
+ if (unlikely(!rq))
+ return;
+ }
+
trace_block_getrq(bio);
rq_qos_track(q, rq, bio);
@@ -2647,72 +2822,217 @@ void blk_mq_submit_bio(struct bio *bio)
return;
}
- if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
+ if (op_is_flush(bio->bi_opf)) {
+ blk_insert_flush(rq);
return;
+ }
- if (plug && (q->nr_hw_queues == 1 ||
- blk_mq_is_shared_tags(rq->mq_hctx->flags) ||
- q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) {
- /*
- * Use plugging if we have a ->commit_rqs() hook as well, as
- * we know the driver uses bd->last in a smart fashion.
- *
- * Use normal plugging if this disk is slow HDD, as sequential
- * IO may benefit a lot from plug merging.
- */
- unsigned int request_count = plug->rq_count;
- struct request *last = NULL;
-
- if (!request_count) {
- trace_block_plug(q);
- } else if (!blk_queue_nomerges(q)) {
- last = rq_list_peek(&plug->mq_list);
- if (blk_rq_bytes(last) < BLK_PLUG_FLUSH_SIZE)
- last = NULL;
- }
-
- if (request_count >= blk_plug_max_rq_count(plug) || last) {
- blk_mq_flush_plug_list(plug, false);
- trace_block_plug(q);
- }
-
+ if (plug)
blk_add_rq_to_plug(plug, rq);
- } else if (rq->rq_flags & RQF_ELV) {
- /* Insert the request at the IO scheduler queue */
+ else if ((rq->rq_flags & RQF_ELV) ||
+ (rq->mq_hctx->dispatch_busy &&
+ (q->nr_hw_queues == 1 || !is_sync)))
blk_mq_sched_insert_request(rq, false, true, true);
- } else if (plug && !blk_queue_nomerges(q)) {
- struct request *next_rq = NULL;
+ else
+ blk_mq_run_dispatch_ops(rq->q,
+ blk_mq_try_issue_directly(rq->mq_hctx, rq));
+}
+
+/**
+ * blk_cloned_rq_check_limits - Helper function to check a cloned request
+ * for the new queue limits
+ * @q: the queue
+ * @rq: the request being checked
+ *
+ * Description:
+ * @rq may have been made based on weaker limitations of upper-level queues
+ * in request stacking drivers, and it may violate the limitation of @q.
+ * Since the block layer and the underlying device driver trust @rq
+ * after it is inserted to @q, it should be checked against @q before
+ * the insertion using this generic function.
+ *
+ * Request stacking drivers like request-based dm may change the queue
+ * limits when retrying requests on other queues. Those requests need
+ * to be checked against the new queue limits again during dispatch.
+ */
+static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q,
+ struct request *rq)
+{
+ unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
+ if (blk_rq_sectors(rq) > max_sectors) {
/*
- * We do limited plugging. If the bio can be merged, do that.
- * Otherwise the existing request in the plug list will be
- * issued. So the plug list will have one request at most
- * The plug list might get flushed before this. If that happens,
- * the plug list is empty, and same_queue_rq is invalid.
+ * SCSI device does not have a good way to return if
+ * Write Same/Zero is actually supported. If a device rejects
+ * a non-read/write command (discard, write same,etc.) the
+ * low-level device driver will set the relevant queue limit to
+ * 0 to prevent blk-lib from issuing more of the offending
+ * operations. Commands queued prior to the queue limit being
+ * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O
+ * errors being propagated to upper layers.
*/
- if (same_queue_rq) {
- next_rq = rq_list_pop(&plug->mq_list);
- plug->rq_count--;
- }
- blk_add_rq_to_plug(plug, rq);
- trace_block_plug(q);
+ if (max_sectors == 0)
+ return BLK_STS_NOTSUPP;
+
+ printk(KERN_ERR "%s: over max size limit. (%u > %u)\n",
+ __func__, blk_rq_sectors(rq), max_sectors);
+ return BLK_STS_IOERR;
+ }
+
+ /*
+ * The queue settings related to segment counting may differ from the
+ * original queue.
+ */
+ rq->nr_phys_segments = blk_recalc_rq_segments(rq);
+ if (rq->nr_phys_segments > queue_max_segments(q)) {
+ printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n",
+ __func__, rq->nr_phys_segments, queue_max_segments(q));
+ return BLK_STS_IOERR;
+ }
+
+ return BLK_STS_OK;
+}
+
+/**
+ * blk_insert_cloned_request - Helper for stacking drivers to submit a request
+ * @q: the queue to submit the request
+ * @rq: the request being queued
+ */
+blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
+{
+ blk_status_t ret;
+
+ ret = blk_cloned_rq_check_limits(q, rq);
+ if (ret != BLK_STS_OK)
+ return ret;
+
+ if (rq->q->disk &&
+ should_fail_request(rq->q->disk->part0, blk_rq_bytes(rq)))
+ return BLK_STS_IOERR;
+
+ if (blk_crypto_insert_cloned_request(rq))
+ return BLK_STS_IOERR;
+
+ blk_account_io_start(rq);
+
+ /*
+ * Since we have a scheduler attached on the top device,
+ * bypass a potential scheduler on the bottom device for
+ * insert.
+ */
+ blk_mq_run_dispatch_ops(rq->q,
+ ret = blk_mq_request_issue_directly(rq, true));
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
+
+/**
+ * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
+ * @rq: the clone request to be cleaned up
+ *
+ * Description:
+ * Free all bios in @rq for a cloned request.
+ */
+void blk_rq_unprep_clone(struct request *rq)
+{
+ struct bio *bio;
+
+ while ((bio = rq->bio) != NULL) {
+ rq->bio = bio->bi_next;
+
+ bio_put(bio);
+ }
+}
+EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
+
+/**
+ * blk_rq_prep_clone - Helper function to setup clone request
+ * @rq: the request to be setup
+ * @rq_src: original request to be cloned
+ * @bs: bio_set that bios for clone are allocated from
+ * @gfp_mask: memory allocation mask for bio
+ * @bio_ctr: setup function to be called for each clone bio.
+ * Returns %0 for success, non %0 for failure.
+ * @data: private data to be passed to @bio_ctr
+ *
+ * Description:
+ * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
+ * Also, pages which the original bios are pointing to are not copied
+ * and the cloned bios just point same pages.
+ * So cloned bios must be completed before original bios, which means
+ * the caller must complete @rq before @rq_src.
+ */
+int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
+ struct bio_set *bs, gfp_t gfp_mask,
+ int (*bio_ctr)(struct bio *, struct bio *, void *),
+ void *data)
+{
+ struct bio *bio, *bio_src;
+
+ if (!bs)
+ bs = &fs_bio_set;
+
+ __rq_for_each_bio(bio_src, rq_src) {
+ bio = bio_clone_fast(bio_src, gfp_mask, bs);
+ if (!bio)
+ goto free_and_out;
- if (next_rq) {
- trace_block_unplug(q, 1, true);
- blk_mq_try_issue_directly(next_rq->mq_hctx, next_rq);
+ if (bio_ctr && bio_ctr(bio, bio_src, data))
+ goto free_and_out;
+
+ if (rq->bio) {
+ rq->biotail->bi_next = bio;
+ rq->biotail = bio;
+ } else {
+ rq->bio = rq->biotail = bio;
}
- } else if ((q->nr_hw_queues > 1 && is_sync) ||
- !rq->mq_hctx->dispatch_busy) {
- /*
- * There is no scheduler and we can try to send directly
- * to the hardware.
- */
- blk_mq_try_issue_directly(rq->mq_hctx, rq);
- } else {
- /* Default case. */
- blk_mq_sched_insert_request(rq, false, true, true);
+ bio = NULL;
+ }
+
+ /* Copy attributes of the original request to the clone request. */
+ rq->__sector = blk_rq_pos(rq_src);
+ rq->__data_len = blk_rq_bytes(rq_src);
+ if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) {
+ rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
+ rq->special_vec = rq_src->special_vec;
+ }
+ rq->nr_phys_segments = rq_src->nr_phys_segments;
+ rq->ioprio = rq_src->ioprio;
+
+ if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
+ goto free_and_out;
+
+ return 0;
+
+free_and_out:
+ if (bio)
+ bio_put(bio);
+ blk_rq_unprep_clone(rq);
+
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
+
+/*
+ * Steal bios from a request and add them to a bio list.
+ * The request must not have been partially completed before.
+ */
+void blk_steal_bios(struct bio_list *list, struct request *rq)
+{
+ if (rq->bio) {
+ if (list->tail)
+ list->tail->bi_next = rq->bio;
+ else
+ list->head = rq->bio;
+ list->tail = rq->biotail;
+
+ rq->bio = NULL;
+ rq->biotail = NULL;
}
+
+ rq->__data_len = 0;
}
+EXPORT_SYMBOL_GPL(blk_steal_bios);
static size_t order_to_size(unsigned int order)
{
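blk_rq_prep_clone() and blk_insert_cloned_request() keep their request-stacking semantics; they merely change files. A compressed sketch of the stacking-driver pattern they implement, assuming the clone is a request already allocated against the lower queue and with error handling trimmed:

        /* Sketch: clone an upper-level request and issue it on the lower device. */
        static blk_status_t example_stack_and_issue(struct request *rq_src,
                                                    struct request *clone,
                                                    struct request_queue *lower_q)
        {
                /* copy the bios and attributes of rq_src into the clone */
                if (blk_rq_prep_clone(clone, rq_src, NULL, GFP_NOIO, NULL, NULL))
                        return BLK_STS_RESOURCE;

                /* limits are re-checked against lower_q, then the clone is
                 * issued directly, bypassing any scheduler on the lower device */
                return blk_insert_cloned_request(lower_q, clone);
        }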
@@ -2740,7 +3060,7 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
unsigned long rq_addr = (unsigned long)rq;
if (rq_addr >= start && rq_addr < end) {
- WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
+ WARN_ON_ONCE(req_ref_read(rq) != 0);
cmpxchg(&drv_tags->rqs[i], rq, NULL);
}
}
@@ -3074,7 +3394,7 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
if (!tags)
return;
- WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0);
+ WARN_ON_ONCE(req_ref_read(flush_rq) != 0);
for (i = 0; i < queue_depth; i++)
cmpxchg(&tags->rqs[i], flush_rq, NULL);
@@ -3128,20 +3448,6 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
}
}
-static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
-{
- int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
-
- BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
- __alignof__(struct blk_mq_hw_ctx)) !=
- sizeof(struct blk_mq_hw_ctx));
-
- if (tag_set->flags & BLK_MQ_F_BLOCKING)
- hw_ctx_size += sizeof(struct srcu_struct);
-
- return hw_ctx_size;
-}
-
static int blk_mq_init_hctx(struct request_queue *q,
struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
@@ -3179,7 +3485,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx;
gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
- hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
+ hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node);
if (!hctx)
goto fail_alloc_hctx;
@@ -3221,8 +3527,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
if (!hctx->fq)
goto free_bitmap;
- if (hctx->flags & BLK_MQ_F_BLOCKING)
- init_srcu_struct(hctx->srcu);
blk_mq_hctx_kobj_init(hctx);
return hctx;
@@ -3558,7 +3862,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
struct request_queue *q;
int ret;
- q = blk_alloc_queue(set->numa_node);
+ q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING);
if (!q)
return ERR_PTR(-ENOMEM);
q->queuedata = queuedata;
@@ -3707,6 +4011,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
struct request_queue *q)
{
+ WARN_ON_ONCE(blk_queue_has_srcu(q) !=
+ !!(set->flags & BLK_MQ_F_BLOCKING));
+
/* mark the queue as mq asap */
q->mq_ops = set->ops;
@@ -4243,11 +4550,10 @@ EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
/* Enable polling stats and return whether they were already enabled. */
static bool blk_poll_stats_enable(struct request_queue *q)
{
- if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
- blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q))
+ if (q->poll_stat)
return true;
- blk_stat_add_callback(q, q->poll_cb);
- return false;
+
+ return blk_stats_alloc_enable(q);
}
static void blk_mq_poll_stats_start(struct request_queue *q)
@@ -4256,8 +4562,7 @@ static void blk_mq_poll_stats_start(struct request_queue *q)
* We don't arm the callback if polling stats are not enabled or the
* callback is already active.
*/
- if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
- blk_stat_is_active(q->poll_cb))
+ if (!q->poll_stat || blk_stat_is_active(q->poll_cb))
return;
blk_stat_activate_msecs(q->poll_cb, 100);
@@ -4417,6 +4722,19 @@ unsigned int blk_mq_rq_cpu(struct request *rq)
}
EXPORT_SYMBOL(blk_mq_rq_cpu);
+void blk_mq_cancel_work_sync(struct request_queue *q)
+{
+ if (queue_is_mq(q)) {
+ struct blk_mq_hw_ctx *hctx;
+ int i;
+
+ cancel_delayed_work_sync(&q->requeue_work);
+
+ queue_for_each_hw_ctx(q, hctx, i)
+ cancel_delayed_work_sync(&hctx->run_work);
+ }
+}
+
static int __init blk_mq_init(void)
{
int i;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 8acfa650f575..948791ea2a3e 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -65,9 +65,6 @@ void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
bool run_queue);
void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
struct list_head *list);
-
-/* Used by blk_insert_cloned_request() to issue request directly */
-blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last);
void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
struct list_head *list);
@@ -128,6 +125,8 @@ extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
void blk_mq_free_plug_rqs(struct blk_plug *plug);
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
+void blk_mq_cancel_work_sync(struct request_queue *q);
+
void blk_mq_release(struct request_queue *q);
static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
@@ -375,5 +374,24 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
return __blk_mq_active_requests(hctx) < depth;
}
+/* run the code block in @dispatch_ops with rcu/srcu read lock held */
+#define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \
+do { \
+ if (!blk_queue_has_srcu(q)) { \
+ rcu_read_lock(); \
+ (dispatch_ops); \
+ rcu_read_unlock(); \
+ } else { \
+ int srcu_idx; \
+ \
+ might_sleep_if(check_sleep); \
+ srcu_idx = srcu_read_lock((q)->srcu); \
+ (dispatch_ops); \
+ srcu_read_unlock((q)->srcu, srcu_idx); \
+ } \
+} while (0)
+
+#define blk_mq_run_dispatch_ops(q, dispatch_ops) \
+ __blk_mq_run_dispatch_ops(q, true, dispatch_ops) \
#endif
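The macro picks plain RCU for non-blocking queues and SRCU (which tolerates sleeping) for BLK_MQ_F_BLOCKING queues, wrapping the caller's statement in the matching read-side critical section. A rough userspace analogue of that pattern, assuming nothing beyond pthreads (struct queue and run_dispatch_ops are invented for the sketch, not kernel APIs):

	/* Illustrative only: run a statement under one of two read-side
	 * protections chosen at runtime, mirroring rcu vs. srcu. */
	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct queue {
		bool blocking;              /* stands in for BLK_MQ_F_BLOCKING */
		pthread_mutex_t sleep_lock; /* stands in for srcu */
		pthread_rwlock_t fast_lock; /* stands in for rcu */
	};

	#define run_dispatch_ops(q, dispatch_ops)                 \
	do {                                                      \
		if (!(q)->blocking) {                             \
			pthread_rwlock_rdlock(&(q)->fast_lock);   \
			(dispatch_ops);                           \
			pthread_rwlock_unlock(&(q)->fast_lock);   \
		} else {                                          \
			pthread_mutex_lock(&(q)->sleep_lock);     \
			(dispatch_ops);                           \
			pthread_mutex_unlock(&(q)->sleep_lock);   \
		}                                                 \
	} while (0)

	int main(void)
	{
		struct queue q = {
			.blocking = false,
			.sleep_lock = PTHREAD_MUTEX_INITIALIZER,
			.fast_lock = PTHREAD_RWLOCK_INITIALIZER,
		};

		run_dispatch_ops(&q, printf("dispatch under the fast path\n"));
		q.blocking = true;
		run_dispatch_ops(&q, printf("dispatch under the sleepable path\n"));
		return 0;
	}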
diff --git a/block/blk-stat.c b/block/blk-stat.c
index ae3dd1fb8e61..2ea01b5c1aca 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -15,7 +15,7 @@
struct blk_queue_stats {
struct list_head callbacks;
spinlock_t lock;
- bool enable_accounting;
+ int accounting;
};
void blk_rq_stat_init(struct blk_rq_stat *stat)
@@ -161,7 +161,7 @@ void blk_stat_remove_callback(struct request_queue *q,
spin_lock_irqsave(&q->stats->lock, flags);
list_del_rcu(&cb->list);
- if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting)
+ if (list_empty(&q->stats->callbacks) && !q->stats->accounting)
blk_queue_flag_clear(QUEUE_FLAG_STATS, q);
spin_unlock_irqrestore(&q->stats->lock, flags);
@@ -184,13 +184,24 @@ void blk_stat_free_callback(struct blk_stat_callback *cb)
call_rcu(&cb->rcu, blk_stat_free_callback_rcu);
}
+void blk_stat_disable_accounting(struct request_queue *q)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->stats->lock, flags);
+ if (!--q->stats->accounting)
+ blk_queue_flag_clear(QUEUE_FLAG_STATS, q);
+ spin_unlock_irqrestore(&q->stats->lock, flags);
+}
+EXPORT_SYMBOL_GPL(blk_stat_disable_accounting);
+
void blk_stat_enable_accounting(struct request_queue *q)
{
unsigned long flags;
spin_lock_irqsave(&q->stats->lock, flags);
- q->stats->enable_accounting = true;
- blk_queue_flag_set(QUEUE_FLAG_STATS, q);
+ if (!q->stats->accounting++)
+ blk_queue_flag_set(QUEUE_FLAG_STATS, q);
spin_unlock_irqrestore(&q->stats->lock, flags);
}
EXPORT_SYMBOL_GPL(blk_stat_enable_accounting);
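enable_accounting has become a nesting count so that multiple users can turn stats on and off independently (kyber-iosched calls the new disable helper later in this diff): the queue flag is set by the first enable and cleared by the last disable. A small standalone C sketch of that counter pattern, with invented names:

	/* Illustrative only: a nested enable/disable count under a lock. */
	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static int accounting;
	static bool stats_flag;

	static void stats_enable(void)
	{
		pthread_mutex_lock(&lock);
		if (!accounting++)
			stats_flag = true;	/* first user turns stats on */
		pthread_mutex_unlock(&lock);
	}

	static void stats_disable(void)
	{
		pthread_mutex_lock(&lock);
		if (!--accounting)
			stats_flag = false;	/* last user turns stats off */
		pthread_mutex_unlock(&lock);
	}

	int main(void)
	{
		stats_enable();
		stats_enable();
		stats_disable();
		printf("stats still on after one disable: %d\n", stats_flag);
		stats_disable();
		printf("stats on after last disable: %d\n", stats_flag);
		return 0;
	}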
@@ -205,7 +216,7 @@ struct blk_queue_stats *blk_alloc_queue_stats(void)
INIT_LIST_HEAD(&stats->callbacks);
spin_lock_init(&stats->lock);
- stats->enable_accounting = false;
+ stats->accounting = 0;
return stats;
}
@@ -219,3 +230,21 @@ void blk_free_queue_stats(struct blk_queue_stats *stats)
kfree(stats);
}
+
+bool blk_stats_alloc_enable(struct request_queue *q)
+{
+ struct blk_rq_stat *poll_stat;
+
+ poll_stat = kcalloc(BLK_MQ_POLL_STATS_BKTS, sizeof(*poll_stat),
+ GFP_ATOMIC);
+ if (!poll_stat)
+ return false;
+
+ if (cmpxchg(&q->poll_stat, NULL, poll_stat) != NULL) {
+ kfree(poll_stat);
+ return true;
+ }
+
+ blk_stat_add_callback(q, q->poll_cb);
+ return false;
+}
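blk_stats_alloc_enable() publishes the poll_stat array with cmpxchg() so that concurrent callers race benignly: the loser frees its allocation and reports that stats were already enabled. The same allocate-once idiom in standalone C11, with invented names (get_shared_buf is not a kernel API):

	/* Illustrative only: allocate a buffer, publish it with a CAS
	 * against NULL, and free the local copy if another thread won. */
	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>

	static _Atomic(int *) shared_buf;

	static int *get_shared_buf(size_t n)
	{
		int *expected = NULL;
		int *buf = calloc(n, sizeof(*buf));

		if (!buf)
			return NULL;

		/* Publish our buffer only if nobody beat us to it. */
		if (!atomic_compare_exchange_strong(&shared_buf, &expected, buf)) {
			free(buf);		/* lost the race, use the winner's */
			return expected;
		}
		return buf;
	}

	int main(void)
	{
		int *a = get_shared_buf(16);
		int *b = get_shared_buf(16);

		printf("same buffer both times: %d\n", a == b);
		return 0;
	}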
diff --git a/block/blk-stat.h b/block/blk-stat.h
index 17b47a86eefb..17e1eb4ec7e2 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -64,11 +64,13 @@ struct blk_stat_callback {
struct blk_queue_stats *blk_alloc_queue_stats(void);
void blk_free_queue_stats(struct blk_queue_stats *);
+bool blk_stats_alloc_enable(struct request_queue *q);
void blk_stat_add(struct request *rq, u64 now);
/* record time/size info in request but not add a callback */
void blk_stat_enable_accounting(struct request_queue *q);
+void blk_stat_disable_accounting(struct request_queue *q);
/**
* blk_stat_alloc_callback() - Allocate a block statistics callback.
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index cef1f713370b..e20eadfcf5c8 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -16,6 +16,7 @@
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
+#include "blk-mq-sched.h"
#include "blk-wbt.h"
#include "blk-throttle.h"
@@ -734,7 +735,8 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
{
struct request_queue *q = container_of(rcu_head, struct request_queue,
rcu_head);
- kmem_cache_free(blk_requestq_cachep, q);
+
+ kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q);
}
/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
@@ -747,7 +749,7 @@ static void blk_exit_queue(struct request_queue *q)
*/
if (q->elevator) {
ioc_clear_queue(q);
- __elevator_exit(q, q->elevator);
+ elevator_exit(q);
}
/*
@@ -785,24 +787,15 @@ static void blk_release_queue(struct kobject *kobj)
might_sleep();
- if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
+ if (q->poll_stat)
blk_stat_remove_callback(q, q->poll_cb);
blk_stat_free_callback(q->poll_cb);
- blk_free_queue_stats(q->stats);
-
- if (queue_is_mq(q)) {
- struct blk_mq_hw_ctx *hctx;
- int i;
-
- cancel_delayed_work_sync(&q->requeue_work);
-
- queue_for_each_hw_ctx(q, hctx, i)
- cancel_delayed_work_sync(&hctx->run_work);
- }
-
blk_exit_queue(q);
+ blk_free_queue_stats(q->stats);
+ kfree(q->poll_stat);
+
blk_queue_free_zone_bitmaps(q);
if (queue_is_mq(q))
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 39bb6e68a9a2..7c462c006b26 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -13,6 +13,7 @@
#include <linux/blk-cgroup.h>
#include "blk.h"
#include "blk-cgroup-rwstat.h"
+#include "blk-stat.h"
#include "blk-throttle.h"
/* Max dispatch from a group in 1 round */
diff --git a/block/blk.h b/block/blk.h
index b4fed2033e48..8bd43b3ad33d 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -2,15 +2,10 @@
#ifndef BLK_INTERNAL_H
#define BLK_INTERNAL_H
-#include <linux/idr.h>
-#include <linux/blk-mq.h>
-#include <linux/part_stat.h>
#include <linux/blk-crypto.h>
#include <linux/memblock.h> /* for max_pfn/max_low_pfn */
#include <xen/xen.h>
#include "blk-crypto-internal.h"
-#include "blk-mq.h"
-#include "blk-mq-sched.h"
struct elevator_type;
@@ -32,15 +27,10 @@ struct blk_flush_queue {
};
extern struct kmem_cache *blk_requestq_cachep;
+extern struct kmem_cache *blk_requestq_srcu_cachep;
extern struct kobj_type blk_queue_ktype;
extern struct ida blk_queue_ida;
-static inline struct blk_flush_queue *
-blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
-{
- return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
-}
-
static inline void __blk_get_queue(struct request_queue *q)
{
kobject_get(&q->kobj);
@@ -250,16 +240,13 @@ static inline void blk_integrity_del(struct gendisk *disk)
unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req);
-void blk_print_req_error(struct request *req, blk_status_t status);
+const char *blk_status_to_str(blk_status_t status);
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
- unsigned int nr_segs, bool *same_queue_rq);
+ unsigned int nr_segs);
bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
struct bio *bio, unsigned int nr_segs);
-void __blk_account_io_start(struct request *req);
-void __blk_account_io_done(struct request *req, u64 now);
-
/*
* Plug flush limits
*/
@@ -271,23 +258,14 @@ void __blk_account_io_done(struct request *req, u64 now);
*/
#define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED)
-bool blk_insert_flush(struct request *rq);
+void blk_insert_flush(struct request *rq);
int elevator_switch_mq(struct request_queue *q,
struct elevator_type *new_e);
-void __elevator_exit(struct request_queue *, struct elevator_queue *);
+void elevator_exit(struct request_queue *q);
int elv_register_queue(struct request_queue *q, bool uevent);
void elv_unregister_queue(struct request_queue *q);
-static inline void elevator_exit(struct request_queue *q,
- struct elevator_queue *e)
-{
- lockdep_assert_held(&q->sysfs_lock);
-
- blk_mq_sched_free_rqs(q);
- __elevator_exit(q, e);
-}
-
ssize_t part_size_show(struct device *dev, struct device_attribute *attr,
char *buf);
ssize_t part_stat_show(struct device *dev, struct device_attribute *attr,
@@ -347,26 +325,10 @@ int blk_dev_init(void);
*/
static inline bool blk_do_io_stat(struct request *rq)
{
- return (rq->rq_flags & RQF_IO_STAT) && rq->rq_disk;
-}
-
-static inline void blk_account_io_done(struct request *req, u64 now)
-{
- /*
- * Account IO completion. flush_rq isn't accounted as a
- * normal IO on queueing nor completion. Accounting the
- * containing request is enough.
- */
- if (blk_do_io_stat(req) && req->part &&
- !(req->rq_flags & RQF_FLUSH_SEQ))
- __blk_account_io_done(req, now);
+ return (rq->rq_flags & RQF_IO_STAT) && rq->q->disk;
}
-static inline void blk_account_io_start(struct request *req)
-{
- if (blk_do_io_stat(req))
- __blk_account_io_start(req);
-}
+void update_io_ticks(struct block_device *part, unsigned long now, bool end);
static inline void req_set_nomerge(struct request_queue *q, struct request *req)
{
@@ -402,13 +364,15 @@ static inline unsigned int bio_aligned_discard_max_sectors(
/*
* Internal io_context interface
*/
-void get_io_context(struct io_context *ioc);
-struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q);
-struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
- gfp_t gfp_mask);
+struct io_cq *ioc_find_get_icq(struct request_queue *q);
+struct io_cq *ioc_lookup_icq(struct request_queue *q);
+#ifdef CONFIG_BLK_ICQ
void ioc_clear_queue(struct request_queue *q);
-
-int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
+#else
+static inline void ioc_clear_queue(struct request_queue *q)
+{
+}
+#endif /* CONFIG_BLK_ICQ */
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
@@ -467,7 +431,15 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
unsigned int max_sectors, bool *same_page);
-struct request_queue *blk_alloc_queue(int node_id);
+static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu)
+{
+ if (srcu)
+ return blk_requestq_srcu_cachep;
+ return blk_requestq_cachep;
+}
+struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu);
+
+int disk_scan_partitions(struct gendisk *disk, fmode_t mode);
int disk_alloc_events(struct gendisk *disk);
void disk_add_events(struct gendisk *disk);
@@ -493,4 +465,45 @@ int disk_register_independent_access_ranges(struct gendisk *disk,
struct blk_independent_access_ranges *new_iars);
void disk_unregister_independent_access_ranges(struct gendisk *disk);
+#ifdef CONFIG_FAIL_MAKE_REQUEST
+bool should_fail_request(struct block_device *part, unsigned int bytes);
+#else /* CONFIG_FAIL_MAKE_REQUEST */
+static inline bool should_fail_request(struct block_device *part,
+ unsigned int bytes)
+{
+ return false;
+}
+#endif /* CONFIG_FAIL_MAKE_REQUEST */
+
+/*
+ * Optimized request reference counting. Ideally we'd make timeouts be more
+ * clever, as that's the only reason we need references at all... But until
+ * this happens, this is faster than using refcount_t. Also see:
+ *
+ * abc54d634334 ("io_uring: switch to atomic_t for io_kiocb reference count")
+ */
+#define req_ref_zero_or_close_to_overflow(req) \
+ ((unsigned int) atomic_read(&(req->ref)) + 127u <= 127u)
+
+static inline bool req_ref_inc_not_zero(struct request *req)
+{
+ return atomic_inc_not_zero(&req->ref);
+}
+
+static inline bool req_ref_put_and_test(struct request *req)
+{
+ WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
+ return atomic_dec_and_test(&req->ref);
+}
+
+static inline void req_ref_set(struct request *req, int value)
+{
+ atomic_set(&req->ref, value);
+}
+
+static inline int req_ref_read(struct request *req)
+{
+ return atomic_read(&req->ref);
+}
+
#endif /* BLK_INTERNAL_H */
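As the comment above notes, these helpers trade refcount_t's saturation handling for raw atomic_t speed, keeping only a cheap warning when a reference is dropped at or near zero. A standalone C11 sketch of the same inc-not-zero / put-and-test pattern (the obj_ref_* names are invented for illustration, not kernel APIs):

	/* Illustrative only: a plain atomic counter with inc-not-zero and
	 * put-and-test helpers, plus the "zero or close to overflow" check. */
	#include <assert.h>
	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct obj {
		atomic_int ref;
	};

	static bool obj_ref_inc_not_zero(struct obj *o)
	{
		int old = atomic_load(&o->ref);

		/* Only take a reference while the object is still live. */
		while (old != 0) {
			if (atomic_compare_exchange_weak(&o->ref, &old, old + 1))
				return true;
		}
		return false;
	}

	static bool obj_ref_put_and_test(struct obj *o)
	{
		/* Dropping a ref that is already 0 (or wrapped) is a bug. */
		assert((unsigned int)atomic_load(&o->ref) + 127u > 127u);
		return atomic_fetch_sub(&o->ref, 1) == 1;
	}

	int main(void)
	{
		struct obj o = { .ref = 1 };

		if (obj_ref_inc_not_zero(&o))
			printf("took a second reference\n");
		printf("last put? %s\n", obj_ref_put_and_test(&o) ? "yes" : "no");
		printf("last put? %s\n", obj_ref_put_and_test(&o) ? "yes" : "no");
		return 0;
	}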
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 10aa378702fa..acfe1357bf6c 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -92,7 +92,7 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
goto out_unmap_bidi_rq;
bio = rq->bio;
- blk_execute_rq(NULL, rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL));
+ blk_execute_rq(rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL));
/*
* The assignments below don't make much sense, but are kept for
diff --git a/block/elevator.c b/block/elevator.c
index 1f39f6e8ebb9..ec98aed39c4f 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -188,8 +188,10 @@ static void elevator_release(struct kobject *kobj)
kfree(e);
}
-void __elevator_exit(struct request_queue *q, struct elevator_queue *e)
+void elevator_exit(struct request_queue *q)
{
+ struct elevator_queue *e = q->elevator;
+
mutex_lock(&e->sysfs_lock);
blk_mq_exit_sched(q, e);
mutex_unlock(&e->sysfs_lock);
@@ -595,7 +597,8 @@ int elevator_switch_mq(struct request_queue *q,
elv_unregister_queue(q);
ioc_clear_queue(q);
- elevator_exit(q, q->elevator);
+ blk_mq_sched_free_rqs(q);
+ elevator_exit(q);
}
ret = blk_mq_init_sched(q, new_e);
@@ -605,7 +608,8 @@ int elevator_switch_mq(struct request_queue *q,
if (new_e) {
ret = elv_register_queue(q, true);
if (ret) {
- elevator_exit(q, q->elevator);
+ blk_mq_sched_free_rqs(q);
+ elevator_exit(q);
goto out;
}
}
@@ -694,12 +698,18 @@ void elevator_init_mq(struct request_queue *q)
if (!e)
return;
+ /*
+ * We are called before the disk is added, when there is no FS I/O,
+ * so freezing the queue plus canceling dispatch work is enough to
+ * drain any dispatch activity originating from passthrough
+ * requests, so there is no need to quiesce the queue, which could
+ * add long boot latency, especially when lots of disks are involved.
+ */
blk_mq_freeze_queue(q);
- blk_mq_quiesce_queue(q);
+ blk_mq_cancel_work_sync(q);
err = blk_mq_init_sched(q, e);
- blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);
if (err) {
diff --git a/block/fops.c b/block/fops.c
index ad732a36f9b3..26bf15c770d2 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -15,6 +15,7 @@
#include <linux/falloc.h>
#include <linux/suspend.h>
#include <linux/fs.h>
+#include <linux/module.h>
#include "blk.h"
static inline struct inode *bdev_file_inode(struct file *file)
@@ -340,8 +341,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
} else {
ret = bio_iov_iter_get_pages(bio, iter);
if (unlikely(ret)) {
- bio->bi_status = BLK_STS_IOERR;
- bio_endio(bio);
+ bio_put(bio);
return ret;
}
}
@@ -566,21 +566,48 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct block_device *bdev = iocb->ki_filp->private_data;
loff_t size = bdev_nr_bytes(bdev);
+ size_t count = iov_iter_count(to);
loff_t pos = iocb->ki_pos;
size_t shorted = 0;
- ssize_t ret;
+ ssize_t ret = 0;
- if (unlikely(pos + iov_iter_count(to) > size)) {
+ if (unlikely(pos + count > size)) {
if (pos >= size)
return 0;
size -= pos;
- if (iov_iter_count(to) > size) {
- shorted = iov_iter_count(to) - size;
+ if (count > size) {
+ shorted = count - size;
iov_iter_truncate(to, size);
}
}
- ret = generic_file_read_iter(iocb, to);
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
+ iocb->ki_pos + count - 1))
+ return -EAGAIN;
+ } else {
+ ret = filemap_write_and_wait_range(mapping,
+ iocb->ki_pos,
+ iocb->ki_pos + count - 1);
+ if (ret < 0)
+ return ret;
+ }
+
+ file_accessed(iocb->ki_filp);
+
+ ret = blkdev_direct_IO(iocb, to);
+ if (ret >= 0) {
+ iocb->ki_pos += ret;
+ count -= ret;
+ }
+ if (ret < 0 || !count)
+ return ret;
+ }
+
+ ret = filemap_read(iocb, to, ret);
if (unlikely(shorted))
iov_iter_reexpand(to, iov_iter_count(to) + shorted);
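blkdev_read_iter() now handles IOCB_DIRECT itself: it syncs dirty page cache (or returns -EAGAIN for IOCB_NOWAIT), issues the direct read, and only falls through to filemap_read() for whatever the direct path did not cover. A toy userspace sketch of that fast-path-then-fallback shape, with made-up fast_read/slow_read stand-ins:

	/* Illustrative only: try a fast path first and finish any
	 * remainder with the slow path. */
	#include <stdio.h>
	#include <string.h>

	static long fast_read(char *dst, long len) { long n = len / 2; memset(dst, 'F', n); return n; }
	static long slow_read(char *dst, long len) { memset(dst, 'S', len); return len; }

	static long read_iter(char *dst, long count)
	{
		long done;
		long ret = fast_read(dst, count);

		if (ret < 0)
			return ret;
		done = ret;
		count -= ret;
		if (count)			/* fast path was short, fall back */
			done += slow_read(dst + done, count);
		return done;
	}

	int main(void)
	{
		char buf[9] = { 0 };
		long n = read_iter(buf, 8);

		printf("read %ld bytes: %.8s\n", n, buf);
		return 0;
	}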
diff --git a/block/genhd.c b/block/genhd.c
index c5392cc24d37..626c8406f21a 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -25,8 +25,10 @@
#include <linux/log2.h>
#include <linux/pm_runtime.h>
#include <linux/badblocks.h>
+#include <linux/part_stat.h>
#include "blk.h"
+#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
static struct kobject *block_depr;
@@ -372,17 +374,21 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action)
}
EXPORT_SYMBOL_GPL(disk_uevent);
-static void disk_scan_partitions(struct gendisk *disk)
+int disk_scan_partitions(struct gendisk *disk, fmode_t mode)
{
struct block_device *bdev;
- if (!get_capacity(disk) || !disk_part_scan_enabled(disk))
- return;
+ if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN))
+ return -EINVAL;
+ if (disk->open_partitions)
+ return -EBUSY;
set_bit(GD_NEED_PART_SCAN, &disk->state);
- bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL);
- if (!IS_ERR(bdev))
- blkdev_put(bdev, FMODE_READ);
+ bdev = blkdev_get_by_dev(disk_devt(disk), mode, NULL);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
+ blkdev_put(bdev, mode);
+ return 0;
}
/**
@@ -425,6 +431,8 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
DISK_MAX_PARTS);
disk->minors = DISK_MAX_PARTS;
}
+ if (disk->first_minor + disk->minors > MINORMASK + 1)
+ return -EINVAL;
} else {
if (WARN_ON(disk->minors))
return -EINVAL;
@@ -434,13 +442,8 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
return ret;
disk->major = BLOCK_EXT_MAJOR;
disk->first_minor = ret;
- disk->flags |= GENHD_FL_EXT_DEVT;
}
- ret = disk_alloc_events(disk);
- if (ret)
- goto out_free_ext_minor;
-
/* delay uevents, until we scanned partition table */
dev_set_uevent_suppress(ddev, 1);
@@ -451,7 +454,12 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
ddev->devt = MKDEV(disk->major, disk->first_minor);
ret = device_add(ddev);
if (ret)
- goto out_disk_release_events;
+ goto out_free_ext_minor;
+
+ ret = disk_alloc_events(disk);
+ if (ret)
+ goto out_device_del;
+
if (!sysfs_deprecated) {
ret = sysfs_create_link(block_depr, &ddev->kobj,
kobject_name(&ddev->kobj));
@@ -490,14 +498,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
if (ret)
goto out_put_slave_dir;
- if (disk->flags & GENHD_FL_HIDDEN) {
- /*
- * Don't let hidden disks show up in /proc/partitions,
- * and don't bother scanning for partitions either.
- */
- disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
- disk->flags |= GENHD_FL_NO_PART_SCAN;
- } else {
+ if (!(disk->flags & GENHD_FL_HIDDEN)) {
ret = bdi_register(disk->bdi, "%u:%u",
disk->major, disk->first_minor);
if (ret)
@@ -509,7 +510,8 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
goto out_unregister_bdi;
bdev_add(disk->part0, ddev->devt);
- disk_scan_partitions(disk);
+ if (get_capacity(disk))
+ disk_scan_partitions(disk, FMODE_READ);
/*
* Announce the disk and partitions after all partitions are
@@ -539,8 +541,6 @@ out_del_block_link:
sysfs_remove_link(block_depr, dev_name(ddev));
out_device_del:
device_del(ddev);
-out_disk_release_events:
- disk_release_events(disk);
out_free_ext_minor:
if (disk->major == BLOCK_EXT_MAJOR)
blk_free_ext_minor(disk->first_minor);
@@ -720,8 +720,7 @@ void __init printk_all_partitions(void)
* Don't show empty devices or things that have been
* suppressed
*/
- if (get_capacity(disk) == 0 ||
- (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
+ if (get_capacity(disk) == 0 || (disk->flags & GENHD_FL_HIDDEN))
continue;
/*
@@ -814,11 +813,7 @@ static int show_partition(struct seq_file *seqf, void *v)
struct block_device *part;
unsigned long idx;
- /* Don't show non-partitionable removeable devices or empty devices */
- if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
- (sgp->flags & GENHD_FL_REMOVABLE)))
- return 0;
- if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
+ if (!get_capacity(sgp) || (sgp->flags & GENHD_FL_HIDDEN))
return 0;
rcu_read_lock();
@@ -874,7 +869,8 @@ static ssize_t disk_ext_range_show(struct device *dev,
{
struct gendisk *disk = dev_to_disk(dev);
- return sprintf(buf, "%d\n", disk_max_parts(disk));
+ return sprintf(buf, "%d\n",
+ (disk->flags & GENHD_FL_NO_PART) ? 1 : DISK_MAX_PARTS);
}
static ssize_t disk_removable_show(struct device *dev,
@@ -1111,6 +1107,8 @@ static void disk_release(struct device *dev)
might_sleep();
WARN_ON_ONCE(disk_live(disk));
+ blk_mq_cancel_work_sync(disk->queue);
+
disk_release_events(disk);
kfree(disk->random);
xa_destroy(&disk->part_tbl);
@@ -1341,7 +1339,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
struct request_queue *q;
struct gendisk *disk;
- q = blk_alloc_queue(node);
+ q = blk_alloc_queue(node, false);
if (!q)
return NULL;
diff --git a/block/ioctl.c b/block/ioctl.c
index 0a1d10ac2e1a..4a86340133e4 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -82,31 +82,6 @@ static int compat_blkpg_ioctl(struct block_device *bdev,
}
#endif
-static int blkdev_reread_part(struct block_device *bdev, fmode_t mode)
-{
- struct block_device *tmp;
-
- if (!disk_part_scan_enabled(bdev->bd_disk) || bdev_is_partition(bdev))
- return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
- if (bdev->bd_disk->open_partitions)
- return -EBUSY;
-
- /*
- * Reopen the device to revalidate the driver state and force a
- * partition rescan.
- */
- mode &= ~FMODE_EXCL;
- set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
-
- tmp = blkdev_get_by_dev(bdev->bd_dev, mode, NULL);
- if (IS_ERR(tmp))
- return PTR_ERR(tmp);
- blkdev_put(tmp, mode);
- return 0;
-}
-
static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
unsigned long arg, unsigned long flags)
{
@@ -522,7 +497,11 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
bdev->bd_disk->bdi->ra_pages = (arg * 512) / PAGE_SIZE;
return 0;
case BLKRRPART:
- return blkdev_reread_part(bdev, mode);
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ if (bdev_is_partition(bdev))
+ return -EINVAL;
+ return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL);
case BLKTRACESTART:
case BLKTRACESTOP:
case BLKTRACETEARDOWN:
diff --git a/block/ioprio.c b/block/ioprio.c
index 0e4ff245f2bf..2fe068fcaad5 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -22,46 +22,14 @@
*/
#include <linux/gfp.h>
#include <linux/kernel.h>
-#include <linux/export.h>
#include <linux/ioprio.h>
#include <linux/cred.h>
#include <linux/blkdev.h>
#include <linux/capability.h>
-#include <linux/sched/user.h>
-#include <linux/sched/task.h>
#include <linux/syscalls.h>
#include <linux/security.h>
#include <linux/pid_namespace.h>
-int set_task_ioprio(struct task_struct *task, int ioprio)
-{
- int err;
- struct io_context *ioc;
- const struct cred *cred = current_cred(), *tcred;
-
- rcu_read_lock();
- tcred = __task_cred(task);
- if (!uid_eq(tcred->uid, cred->euid) &&
- !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) {
- rcu_read_unlock();
- return -EPERM;
- }
- rcu_read_unlock();
-
- err = security_task_setioprio(task, ioprio);
- if (err)
- return err;
-
- ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
- if (ioc) {
- ioc->ioprio = ioprio;
- put_io_context(ioc);
- }
-
- return err;
-}
-EXPORT_SYMBOL_GPL(set_task_ioprio);
-
int ioprio_check_cap(int ioprio)
{
int class = IOPRIO_PRIO_CLASS(ioprio);
@@ -69,7 +37,14 @@ int ioprio_check_cap(int ioprio)
switch (class) {
case IOPRIO_CLASS_RT:
- if (!capable(CAP_SYS_NICE) && !capable(CAP_SYS_ADMIN))
+ /*
+ * Originally this only checked for CAP_SYS_ADMIN,
+ * which was implicitly allowed for pid 0 by security
+ * modules such as SELinux. Make sure we check
+ * CAP_SYS_ADMIN first to avoid a denial/avc for
+ * possibly missing CAP_SYS_NICE permission.
+ */
+ if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
return -EPERM;
fallthrough;
/* rt has prio field too */
@@ -213,6 +188,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
pgrp = task_pgrp(current);
else
pgrp = find_vpid(who);
+ read_lock(&tasklist_lock);
do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
tmpio = get_task_ioprio(p);
if (tmpio < 0)
@@ -222,6 +198,8 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
else
ret = ioprio_best(ret, tmpio);
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
+ read_unlock(&tasklist_lock);
+
break;
case IOPRIO_WHO_USER:
uid = make_kuid(current_user_ns(), who);
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index fdd74a4df56f..70ff2a599ef6 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -433,6 +433,7 @@ static void kyber_exit_sched(struct elevator_queue *e)
int i;
del_timer_sync(&kqd->timer);
+ blk_stat_disable_accounting(kqd->q);
for (i = 0; i < KYBER_NUM_DOMAINS; i++)
sbitmap_queue_free(&kqd->domain_tokens[i]);
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 334b72ef1d73..c2a1635922b1 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -98,13 +98,12 @@ static void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
static struct parsed_partitions *allocate_partitions(struct gendisk *hd)
{
struct parsed_partitions *state;
- int nr;
+ int nr = DISK_MAX_PARTS;
state = kzalloc(sizeof(*state), GFP_KERNEL);
if (!state)
return NULL;
- nr = disk_max_parts(hd);
state->parts = vzalloc(array_size(nr, sizeof(state->parts[0])));
if (!state->parts) {
kfree(state);
@@ -326,7 +325,7 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
lockdep_assert_held(&disk->open_mutex);
- if (partno >= disk_max_parts(disk))
+ if (partno >= DISK_MAX_PARTS)
return ERR_PTR(-EINVAL);
/*
@@ -527,18 +526,15 @@ out_unlock:
static bool disk_unlock_native_capacity(struct gendisk *disk)
{
- const struct block_device_operations *bdops = disk->fops;
-
- if (bdops->unlock_native_capacity &&
- !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
- printk(KERN_CONT "enabling native capacity\n");
- bdops->unlock_native_capacity(disk);
- disk->flags |= GENHD_FL_NATIVE_CAPACITY;
- return true;
- } else {
+ if (!disk->fops->unlock_native_capacity ||
+ test_and_set_bit(GD_NATIVE_CAPACITY, &disk->state)) {
printk(KERN_CONT "truncated\n");
return false;
}
+
+ printk(KERN_CONT "enabling native capacity\n");
+ disk->fops->unlock_native_capacity(disk);
+ return true;
}
void blk_drop_partitions(struct gendisk *disk)
@@ -607,7 +603,7 @@ static int blk_add_partitions(struct gendisk *disk)
struct parsed_partitions *state;
int ret = -EAGAIN, p;
- if (!disk_part_scan_enabled(disk))
+ if (disk->flags & GENHD_FL_NO_PART)
return 0;
state = check_partition(disk);
@@ -690,7 +686,7 @@ rescan:
* userspace for this particular setup.
*/
if (invalidate) {
- if (disk_part_scan_enabled(disk) ||
+ if (!(disk->flags & GENHD_FL_NO_PART) ||
!(disk->flags & GENHD_FL_REMOVABLE))
set_capacity(disk, 0);
}