diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-04-28 23:27:12 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-04-28 23:27:12 +0200 |
commit | 6c0029211382011af508273c4fc98a732f841d95 (patch) | |
tree | 63b47915c50542b28d1de48d0e50318afdb32dbb /block | |
parent | Merge tag 'sched-core-2021-04-28' of git://git.kernel.org/pub/scm/linux/kerne... (diff) | |
parent | blk-iocost: don't ignore vrate_min on QD contention (diff) | |
download | linux-6c0029211382011af508273c4fc98a732f841d95.tar.xz linux-6c0029211382011af508273c4fc98a732f841d95.zip |
Merge tag 'for-5.13/block-2021-04-27' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
"Pretty quiet round this time, which is nice. In detail:
- Series revamping bounce buffer support (Christoph)
- Dead code removal (Christoph, Bart)
- Partition iteration revamp, now using xarray (Christoph)
- Passthrough request scheduler improvements (Lin)
- Series of BFQ improvements (Paolo)
- Fix ioprio task iteration (Peter)
- Various little tweaks and fixes (Tejun, Saravanan, Bhaskar, Max,
Nikolay)"
* tag 'for-5.13/block-2021-04-27' of git://git.kernel.dk/linux-block: (41 commits)
blk-iocost: don't ignore vrate_min on QD contention
blk-mq: Fix spurious debugfs directory creation during initialization
bfq/mq-deadline: remove redundant check for passthrough request
blk-mq: bypass IO scheduler's limit_depth for passthrough request
block: Remove an obsolete comment from sg_io()
block: move bio_list_copy_data to pktcdvd
block: remove zero_fill_bio_iter
block: add queue_to_disk() to get gendisk from request_queue
block: remove an incorrect check from blk_rq_append_bio
block: initialize ret in bdev_disk_changed
block: Fix sys_ioprio_set(.which=IOPRIO_WHO_PGRP) task iteration
block: remove disk_part_iter
block: simplify diskstats_show
block: simplify show_partition
block: simplify printk_all_partitions
block: simplify partition_overlaps
block: simplify partition removal
block: take bd_mutex around delete_partitions in del_gendisk
block: refactor blk_drop_partitions
block: move more syncing and invalidation to delete_partition
...
Diffstat (limited to 'block')
-rw-r--r-- | block/bfq-cgroup.c | 2 | ||||
-rw-r--r-- | block/bfq-iosched.c | 398 | ||||
-rw-r--r-- | block/bfq-iosched.h | 15 | ||||
-rw-r--r-- | block/bfq-wf2q.c | 8 | ||||
-rw-r--r-- | block/bio-integrity.c | 3 | ||||
-rw-r--r-- | block/bio.c | 43 | ||||
-rw-r--r-- | block/blk-core.c | 6 | ||||
-rw-r--r-- | block/blk-iocost.c | 4 | ||||
-rw-r--r-- | block/blk-map.c | 119 | ||||
-rw-r--r-- | block/blk-mq-debugfs.c | 8 | ||||
-rw-r--r-- | block/blk-mq-tag.c | 8 | ||||
-rw-r--r-- | block/blk-mq.c | 3 | ||||
-rw-r--r-- | block/blk-settings.c | 53 | ||||
-rw-r--r-- | block/blk-sysfs.c | 9 | ||||
-rw-r--r-- | block/blk-zoned.c | 8 | ||||
-rw-r--r-- | block/blk.h | 18 | ||||
-rw-r--r-- | block/bounce.c | 138 | ||||
-rw-r--r-- | block/elevator.c | 3 | ||||
-rw-r--r-- | block/genhd.c | 183 | ||||
-rw-r--r-- | block/ioprio.c | 11 | ||||
-rw-r--r-- | block/mq-deadline.c | 7 | ||||
-rw-r--r-- | block/partitions/core.c | 54 | ||||
-rw-r--r-- | block/scsi_ioctl.c | 6 |
23 files changed, 585 insertions, 522 deletions
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index b791e2041e49..e2f14508f2d6 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -547,6 +547,8 @@ static void bfq_pd_init(struct blkg_policy_data *pd) entity->orig_weight = entity->weight = entity->new_weight = d->weight; entity->my_sched_data = &bfqg->sched_data; + entity->last_bfqq_created = NULL; + bfqg->my_entity = entity; /* * the root_group's will be set to NULL * in bfq_init_queue() diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 95586137194e..0270cd7ca165 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1012,7 +1012,7 @@ static void bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, struct bfq_io_cq *bic, bool bfq_already_existing) { - unsigned int old_wr_coeff = bfqq->wr_coeff; + unsigned int old_wr_coeff = 1; bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); if (bic->saved_has_short_ttime) @@ -1033,7 +1033,13 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, bfqq->ttime = bic->saved_ttime; bfqq->io_start_time = bic->saved_io_start_time; bfqq->tot_idle_time = bic->saved_tot_idle_time; - bfqq->wr_coeff = bic->saved_wr_coeff; + /* + * Restore weight coefficient only if low_latency is on + */ + if (bfqd->low_latency) { + old_wr_coeff = bfqq->wr_coeff; + bfqq->wr_coeff = bic->saved_wr_coeff; + } bfqq->service_from_wr = bic->saved_service_from_wr; bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; @@ -1069,7 +1075,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, static int bfqq_process_refs(struct bfq_queue *bfqq) { return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv - - (bfqq->weight_counter != NULL); + (bfqq->weight_counter != NULL) - bfqq->stable_ref; } /* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */ @@ -2622,6 +2628,11 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, return true; } +static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, + struct bfq_queue *bfqq); + +static void bfq_put_stable_ref(struct bfq_queue *bfqq); + /* * Attempt to schedule a merge of bfqq with the currently in-service * queue or with a close queue among the scheduled queues. Return @@ -2644,11 +2655,50 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, */ static struct bfq_queue * bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - void *io_struct, bool request) + void *io_struct, bool request, struct bfq_io_cq *bic) { struct bfq_queue *in_service_bfqq, *new_bfqq; /* + * Check delayed stable merge for rotational or non-queueing + * devs. For this branch to be executed, bfqq must not be + * currently merged with some other queue (i.e., bfqq->bic + * must be non null). If we considered also merged queues, + * then we should also check whether bfqq has already been + * merged with bic->stable_merge_bfqq. But this would be + * costly and complicated. + */ + if (unlikely(!bfqd->nonrot_with_queueing)) { + if (bic->stable_merge_bfqq && + !bfq_bfqq_just_created(bfqq) && + time_is_after_jiffies(bfqq->split_time + + msecs_to_jiffies(200))) { + struct bfq_queue *stable_merge_bfqq = + bic->stable_merge_bfqq; + int proc_ref = min(bfqq_process_refs(bfqq), + bfqq_process_refs(stable_merge_bfqq)); + + /* deschedule stable merge, because done or aborted here */ + bfq_put_stable_ref(stable_merge_bfqq); + + bic->stable_merge_bfqq = NULL; + + if (!idling_boosts_thr_without_issues(bfqd, bfqq) && + proc_ref > 0) { + /* next function will take at least one ref */ + struct bfq_queue *new_bfqq = + bfq_setup_merge(bfqq, stable_merge_bfqq); + + bic->stably_merged = true; + if (new_bfqq && new_bfqq->bic) + new_bfqq->bic->stably_merged = true; + return new_bfqq; + } else + return NULL; + } + } + + /* * Do not perform queue merging if the device is non * rotational and performs internal queueing. In fact, such a * device reaches a high speed through internal parallelism @@ -2789,6 +2839,17 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) } } + +static void +bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, struct bfq_queue *new_bfqq) +{ + if (cur_bfqq->entity.parent && + cur_bfqq->entity.parent->last_bfqq_created == cur_bfqq) + cur_bfqq->entity.parent->last_bfqq_created = new_bfqq; + else if (cur_bfqq->bfqd && cur_bfqq->bfqd->last_bfqq_created == cur_bfqq) + cur_bfqq->bfqd->last_bfqq_created = new_bfqq; +} + void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) { /* @@ -2806,6 +2867,8 @@ void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfqq != bfqd->in_service_queue) bfq_del_bfqq_busy(bfqd, bfqq, false); + bfq_reassign_last_bfqq(bfqq, NULL); + bfq_put_queue(bfqq); } @@ -2823,6 +2886,29 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, bfq_clear_bfqq_IO_bound(bfqq); /* + * The processes associated with bfqq are cooperators of the + * processes associated with new_bfqq. So, if bfqq has a + * waker, then assume that all these processes will be happy + * to let bfqq's waker freely inject I/O when they have no + * I/O. + */ + if (bfqq->waker_bfqq && !new_bfqq->waker_bfqq && + bfqq->waker_bfqq != new_bfqq) { + new_bfqq->waker_bfqq = bfqq->waker_bfqq; + new_bfqq->tentative_waker_bfqq = NULL; + + /* + * If the waker queue disappears, then + * new_bfqq->waker_bfqq must be reset. So insert + * new_bfqq into the woken_list of the waker. See + * bfq_check_waker for details. + */ + hlist_add_head(&new_bfqq->woken_list_node, + &new_bfqq->waker_bfqq->woken_list); + + } + + /* * If bfqq is weight-raised, then let new_bfqq inherit * weight-raising. To reduce false positives, neglect the case * where bfqq has just been created, but has not yet made it @@ -2879,6 +2965,9 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, */ new_bfqq->pid = -1; bfqq->bic = NULL; + + bfq_reassign_last_bfqq(bfqq, new_bfqq); + bfq_release_process_ref(bfqd, bfqq); } @@ -2906,7 +2995,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, * We take advantage of this function to perform an early merge * of the queues of possible cooperating processes. */ - new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); + new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false, bfqd->bio_bic); if (new_bfqq) { /* * bic still points to bfqq, then it has not yet been @@ -4491,9 +4580,15 @@ check_queue: bfq_bfqq_busy(bfqq->bic->bfqq[0]) && bfqq->bic->bfqq[0]->next_rq ? bfqq->bic->bfqq[0] : NULL; + struct bfq_queue *blocked_bfqq = + !hlist_empty(&bfqq->woken_list) ? + container_of(bfqq->woken_list.first, + struct bfq_queue, + woken_list_node) + : NULL; /* - * The next three mutually-exclusive ifs decide + * The next four mutually-exclusive ifs decide * whether to try injection, and choose the queue to * pick an I/O request from. * @@ -4526,7 +4621,15 @@ check_queue: * next bfqq's I/O is brought forward dramatically, * for it is not blocked for milliseconds. * - * The third if checks whether bfqq is a queue for + * The third if checks whether there is a queue woken + * by bfqq, and currently with pending I/O. Such a + * woken queue does not steal bandwidth from bfqq, + * because it remains soon without I/O if bfqq is not + * served. So there is virtually no risk of loss of + * bandwidth for bfqq if this woken queue has I/O + * dispatched while bfqq is waiting for new I/O. + * + * The fourth if checks whether bfqq is a queue for * which it is better to avoid injection. It is so if * bfqq delivers more throughput when served without * any further I/O from other queues in the middle, or @@ -4546,11 +4649,11 @@ check_queue: * bfq_update_has_short_ttime(), it is rather likely * that, if I/O is being plugged for bfqq and the * waker queue has pending I/O requests that are - * blocking bfqq's I/O, then the third alternative + * blocking bfqq's I/O, then the fourth alternative * above lets the waker queue get served before the * I/O-plugging timeout fires. So one may deem the * second alternative superfluous. It is not, because - * the third alternative may be way less effective in + * the fourth alternative may be way less effective in * case of a synchronization. For two main * reasons. First, throughput may be low because the * inject limit may be too low to guarantee the same @@ -4559,7 +4662,7 @@ check_queue: * guarantees (the second alternative unconditionally * injects a pending I/O request of the waker queue * for each bfq_dispatch_request()). Second, with the - * third alternative, the duration of the plugging, + * fourth alternative, the duration of the plugging, * i.e., the time before bfqq finally receives new I/O, * may not be minimized, because the waker queue may * happen to be served only after other queues. @@ -4577,6 +4680,14 @@ check_queue: bfq_bfqq_budget_left(bfqq->waker_bfqq) ) bfqq = bfqq->waker_bfqq; + else if (blocked_bfqq && + bfq_bfqq_busy(blocked_bfqq) && + blocked_bfqq->next_rq && + bfq_serv_to_charge(blocked_bfqq->next_rq, + blocked_bfqq) <= + bfq_bfqq_budget_left(blocked_bfqq) + ) + bfqq = blocked_bfqq; else if (!idling_boosts_thr_without_issues(bfqd, bfqq) && (bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 || !bfq_bfqq_has_short_ttime(bfqq))) @@ -4983,6 +5094,12 @@ void bfq_put_queue(struct bfq_queue *bfqq) bfqg_and_blkg_put(bfqg); } +static void bfq_put_stable_ref(struct bfq_queue *bfqq) +{ + bfqq->stable_ref--; + bfq_put_queue(bfqq); +} + static void bfq_put_cooperator(struct bfq_queue *bfqq) { struct bfq_queue *__bfqq, *next; @@ -5039,6 +5156,24 @@ static void bfq_exit_icq(struct io_cq *icq) { struct bfq_io_cq *bic = icq_to_bic(icq); + if (bic->stable_merge_bfqq) { + struct bfq_data *bfqd = bic->stable_merge_bfqq->bfqd; + + /* + * bfqd is NULL if scheduler already exited, and in + * that case this is the last time bfqq is accessed. + */ + if (bfqd) { + unsigned long flags; + + spin_lock_irqsave(&bfqd->lock, flags); + bfq_put_stable_ref(bic->stable_merge_bfqq); + spin_unlock_irqrestore(&bfqd->lock, flags); + } else { + bfq_put_stable_ref(bic->stable_merge_bfqq); + } + } + bfq_exit_icq_bfqq(bic, true); bfq_exit_icq_bfqq(bic, false); } @@ -5099,7 +5234,8 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, struct bio *bio, bool is_sync, - struct bfq_io_cq *bic); + struct bfq_io_cq *bic, + bool respawn); static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) { @@ -5119,7 +5255,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) bfqq = bic_to_bfqq(bic, false); if (bfqq) { bfq_release_process_ref(bfqd, bfqq); - bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); + bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, true); bic_set_bfqq(bic, bfqq, false); } @@ -5162,6 +5298,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, /* set end request to minus infinity from now */ bfqq->ttime.last_end_request = now_ns + 1; + bfqq->creation_time = jiffies; + bfqq->io_start_time = now_ns; bfq_mark_bfqq_IO_bound(bfqq); @@ -5211,9 +5349,156 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, } } +static struct bfq_queue * +bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_io_cq *bic, + struct bfq_queue *last_bfqq_created) +{ + struct bfq_queue *new_bfqq = + bfq_setup_merge(bfqq, last_bfqq_created); + + if (!new_bfqq) + return bfqq; + + if (new_bfqq->bic) + new_bfqq->bic->stably_merged = true; + bic->stably_merged = true; + + /* + * Reusing merge functions. This implies that + * bfqq->bic must be set too, for + * bfq_merge_bfqqs to correctly save bfqq's + * state before killing it. + */ + bfqq->bic = bic; + bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); + + return new_bfqq; +} + +/* + * Many throughput-sensitive workloads are made of several parallel + * I/O flows, with all flows generated by the same application, or + * more generically by the same task (e.g., system boot). The most + * counterproductive action with these workloads is plugging I/O + * dispatch when one of the bfq_queues associated with these flows + * remains temporarily empty. + * + * To avoid this plugging, BFQ has been using a burst-handling + * mechanism for years now. This mechanism has proven effective for + * throughput, and not detrimental for service guarantees. The + * following function pushes this mechanism a little bit further, + * basing on the following two facts. + * + * First, all the I/O flows of a the same application or task + * contribute to the execution/completion of that common application + * or task. So the performance figures that matter are total + * throughput of the flows and task-wide I/O latency. In particular, + * these flows do not need to be protected from each other, in terms + * of individual bandwidth or latency. + * + * Second, the above fact holds regardless of the number of flows. + * + * Putting these two facts together, this commits merges stably the + * bfq_queues associated with these I/O flows, i.e., with the + * processes that generate these IO/ flows, regardless of how many the + * involved processes are. + * + * To decide whether a set of bfq_queues is actually associated with + * the I/O flows of a common application or task, and to merge these + * queues stably, this function operates as follows: given a bfq_queue, + * say Q2, currently being created, and the last bfq_queue, say Q1, + * created before Q2, Q2 is merged stably with Q1 if + * - very little time has elapsed since when Q1 was created + * - Q2 has the same ioprio as Q1 + * - Q2 belongs to the same group as Q1 + * + * Merging bfq_queues also reduces scheduling overhead. A fio test + * with ten random readers on /dev/nullb shows a throughput boost of + * 40%, with a quadcore. Since BFQ's execution time amounts to ~50% of + * the total per-request processing time, the above throughput boost + * implies that BFQ's overhead is reduced by more than 50%. + * + * This new mechanism most certainly obsoletes the current + * burst-handling heuristics. We keep those heuristics for the moment. + */ +static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct bfq_io_cq *bic) +{ + struct bfq_queue **source_bfqq = bfqq->entity.parent ? + &bfqq->entity.parent->last_bfqq_created : + &bfqd->last_bfqq_created; + + struct bfq_queue *last_bfqq_created = *source_bfqq; + + /* + * If last_bfqq_created has not been set yet, then init it. If + * it has been set already, but too long ago, then move it + * forward to bfqq. Finally, move also if bfqq belongs to a + * different group than last_bfqq_created, or if bfqq has a + * different ioprio or ioprio_class. If none of these + * conditions holds true, then try an early stable merge or + * schedule a delayed stable merge. + * + * A delayed merge is scheduled (instead of performing an + * early merge), in case bfqq might soon prove to be more + * throughput-beneficial if not merged. Currently this is + * possible only if bfqd is rotational with no queueing. For + * such a drive, not merging bfqq is better for throughput if + * bfqq happens to contain sequential I/O. So, we wait a + * little bit for enough I/O to flow through bfqq. After that, + * if such an I/O is sequential, then the merge is + * canceled. Otherwise the merge is finally performed. + */ + if (!last_bfqq_created || + time_before(last_bfqq_created->creation_time + + bfqd->bfq_burst_interval, + bfqq->creation_time) || + bfqq->entity.parent != last_bfqq_created->entity.parent || + bfqq->ioprio != last_bfqq_created->ioprio || + bfqq->ioprio_class != last_bfqq_created->ioprio_class) + *source_bfqq = bfqq; + else if (time_after_eq(last_bfqq_created->creation_time + + bfqd->bfq_burst_interval, + bfqq->creation_time)) { + if (likely(bfqd->nonrot_with_queueing)) + /* + * With this type of drive, leaving + * bfqq alone may provide no + * throughput benefits compared with + * merging bfqq. So merge bfqq now. + */ + bfqq = bfq_do_early_stable_merge(bfqd, bfqq, + bic, + last_bfqq_created); + else { /* schedule tentative stable merge */ + /* + * get reference on last_bfqq_created, + * to prevent it from being freed, + * until we decide whether to merge + */ + last_bfqq_created->ref++; + /* + * need to keep track of stable refs, to + * compute process refs correctly + */ + last_bfqq_created->stable_ref++; + /* + * Record the bfqq to merge to. + */ + bic->stable_merge_bfqq = last_bfqq_created; + } + } + + return bfqq; +} + + static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, struct bio *bio, bool is_sync, - struct bfq_io_cq *bic) + struct bfq_io_cq *bic, + bool respawn) { const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); @@ -5271,7 +5556,10 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, out: bfqq->ref++; /* get a process reference to this queue */ - bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); + + if (bfqq != &bfqd->oom_bfqq && is_sync && !respawn) + bfqq = bfq_do_or_sched_stable_merge(bfqd, bfqq, bic); + rcu_read_unlock(); return bfqq; } @@ -5521,7 +5809,8 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq), - *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); + *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true, + RQ_BIC(rq)); bool waiting, idle_timer_disabled = false; if (new_bfqq) { @@ -5627,7 +5916,48 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, spin_lock_irq(&bfqd->lock); bfqq = bfq_init_rq(rq); - if (!bfqq || at_head || blk_rq_is_passthrough(rq)) { + + /* + * Reqs with at_head or passthrough flags set are to be put + * directly into dispatch list. Additional case for putting rq + * directly into the dispatch queue: the only active + * bfq_queues are bfqq and either its waker bfq_queue or one + * of its woken bfq_queues. The rationale behind this + * additional condition is as follows: + * - consider a bfq_queue, say Q1, detected as a waker of + * another bfq_queue, say Q2 + * - by definition of a waker, Q1 blocks the I/O of Q2, i.e., + * some I/O of Q1 needs to be completed for new I/O of Q2 + * to arrive. A notable example of waker is journald + * - so, Q1 and Q2 are in any respect the queues of two + * cooperating processes (or of two cooperating sets of + * processes): the goal of Q1's I/O is doing what needs to + * be done so that new Q2's I/O can finally be + * issued. Therefore, if the service of Q1's I/O is delayed, + * then Q2's I/O is delayed too. Conversely, if Q2's I/O is + * delayed, the goal of Q1's I/O is hindered. + * - as a consequence, if some I/O of Q1/Q2 arrives while + * Q2/Q1 is the only queue in service, there is absolutely + * no point in delaying the service of such an I/O. The + * only possible result is a throughput loss + * - so, when the above condition holds, the best option is to + * have the new I/O dispatched as soon as possible + * - the most effective and efficient way to attain the above + * goal is to put the new I/O directly in the dispatch + * list + * - as an additional restriction, Q1 and Q2 must be the only + * busy queues for this commit to put the I/O of Q2/Q1 in + * the dispatch list. This is necessary, because, if also + * other queues are waiting for service, then putting new + * I/O directly in the dispatch list may evidently cause a + * violation of service guarantees for the other queues + */ + if (!bfqq || + (bfqq != bfqd->in_service_queue && + bfqd->in_service_queue != NULL && + bfq_tot_busy_queues(bfqd) == 1 + bfq_bfqq_busy(bfqq) && + (bfqq->waker_bfqq == bfqd->in_service_queue || + bfqd->in_service_queue->waker_bfqq == bfqq)) || at_head) { if (at_head) list_add(&rq->queuelist, &bfqd->dispatch); else @@ -5767,7 +6097,17 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) 1UL<<(BFQ_RATE_SHIFT - 10)) bfq_update_rate_reset(bfqd, NULL); bfqd->last_completion = now_ns; - bfqd->last_completed_rq_bfqq = bfqq; + /* + * Shared queues are likely to receive I/O at a high + * rate. This may deceptively let them be considered as wakers + * of other queues. But a false waker will unjustly steal + * bandwidth to its supposedly woken queue. So considering + * also shared queues in the waking mechanism may cause more + * control troubles than throughput benefits. Then do not set + * last_completed_rq_bfqq to bfqq if bfqq is a shared queue. + */ + if (!bfq_bfqq_coop(bfqq)) + bfqd->last_completed_rq_bfqq = bfqq; /* * If we are waiting to discover whether the request pattern @@ -6124,7 +6464,7 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, if (bfqq) bfq_put_queue(bfqq); - bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); + bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, split); bic_set_bfqq(bic, bfqq, is_sync); if (split && is_sync) { @@ -6245,8 +6585,9 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) if (likely(!new_queue)) { /* If the queue was seeky for too long, break it apart. */ - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) && + !bic->stably_merged) { + struct bfq_queue *old_bfqq = bfqq; /* Update bic before losing reference to bfqq */ if (bfq_bfqq_in_large_burst(bfqq)) @@ -6255,11 +6596,24 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) bfqq = bfq_split_bfqq(bic, bfqq); split = true; - if (!bfqq) + if (!bfqq) { bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, true, is_sync, NULL); - else + bfqq->waker_bfqq = old_bfqq->waker_bfqq; + bfqq->tentative_waker_bfqq = NULL; + + /* + * If the waker queue disappears, then + * new_bfqq->waker_bfqq must be + * reset. So insert new_bfqq into the + * woken_list of the waker. See + * bfq_check_waker for details. + */ + if (bfqq->waker_bfqq) + hlist_add_head(&bfqq->woken_list_node, + &bfqq->waker_bfqq->woken_list); + } else bfqq_already_existing = true; } } diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index b8e793c34ff1..99c2a3cb081e 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -197,6 +197,9 @@ struct bfq_entity { /* flag, set if the entity is counted in groups_with_pending_reqs */ bool in_groups_with_pending_reqs; + + /* last child queue of entity created (for non-leaf entities) */ + struct bfq_queue *last_bfqq_created; }; struct bfq_group; @@ -230,6 +233,8 @@ struct bfq_ttime { struct bfq_queue { /* reference counter */ int ref; + /* counter of references from other queues for delayed stable merge */ + int stable_ref; /* parent bfq_data */ struct bfq_data *bfqd; @@ -365,6 +370,8 @@ struct bfq_queue { unsigned long first_IO_time; /* time of first I/O for this queue */ + unsigned long creation_time; /* when this queue is created */ + /* max service rate measured so far */ u32 max_service_rate; @@ -454,6 +461,11 @@ struct bfq_io_cq { u64 saved_last_serv_time_ns; unsigned int saved_inject_limit; unsigned long saved_decrease_time_jif; + + /* candidate queue for a stable merge (due to close creation time) */ + struct bfq_queue *stable_merge_bfqq; + + bool stably_merged; /* non splittable if true */ }; /** @@ -578,6 +590,9 @@ struct bfq_data { /* bfqq owning the last completed rq */ struct bfq_queue *last_completed_rq_bfqq; + /* last bfqq created, among those in the root group */ + struct bfq_queue *last_bfqq_created; + /* time of last transition from empty to non-empty (ns) */ u64 last_empty_occupied_ns; diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 070e34a7feb1..7a462df71f68 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -1706,4 +1706,12 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) if (bfqq->wr_coeff > 1) bfqd->wr_busy_queues++; + + /* Move bfqq to the head of the woken list of its waker */ + if (!hlist_unhashed(&bfqq->woken_list_node) && + &bfqq->woken_list_node != bfqq->waker_bfqq->woken_list.first) { + hlist_del_init(&bfqq->woken_list_node); + hlist_add_head(&bfqq->woken_list_node, + &bfqq->waker_bfqq->woken_list); + } } diff --git a/block/bio-integrity.c b/block/bio-integrity.c index dfa652122a2d..4b4eb8964a6f 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -204,7 +204,6 @@ bool bio_integrity_prep(struct bio *bio) { struct bio_integrity_payload *bip; struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - struct request_queue *q = bio->bi_bdev->bd_disk->queue; void *buf; unsigned long start, end; unsigned int len, nr_pages; @@ -238,7 +237,7 @@ bool bio_integrity_prep(struct bio *bio) /* Allocate kernel buffer for protection data */ len = intervals * bi->tuple_size; - buf = kmalloc(len, GFP_NOIO | q->bounce_gfp); + buf = kmalloc(len, GFP_NOIO); status = BLK_STS_RESOURCE; if (unlikely(buf == NULL)) { printk(KERN_ERR "could not allocate integrity buffer\n"); diff --git a/block/bio.c b/block/bio.c index 50e579088aca..44205dfb6b60 100644 --- a/block/bio.c +++ b/block/bio.c @@ -493,20 +493,20 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs) } EXPORT_SYMBOL(bio_kmalloc); -void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) +void zero_fill_bio(struct bio *bio) { unsigned long flags; struct bio_vec bv; struct bvec_iter iter; - __bio_for_each_segment(bv, bio, iter, start) { + bio_for_each_segment(bv, bio, iter) { char *data = bvec_kmap_irq(&bv, &flags); memset(data, 0, bv.bv_len); flush_dcache_page(bv.bv_page); bvec_kunmap_irq(data, &flags); } } -EXPORT_SYMBOL(zero_fill_bio_iter); +EXPORT_SYMBOL(zero_fill_bio); /** * bio_truncate - truncate the bio to small size of @new_size @@ -1236,43 +1236,6 @@ void bio_copy_data(struct bio *dst, struct bio *src) } EXPORT_SYMBOL(bio_copy_data); -/** - * bio_list_copy_data - copy contents of data buffers from one chain of bios to - * another - * @src: source bio list - * @dst: destination bio list - * - * Stops when it reaches the end of either the @src list or @dst list - that is, - * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of - * bios). - */ -void bio_list_copy_data(struct bio *dst, struct bio *src) -{ - struct bvec_iter src_iter = src->bi_iter; - struct bvec_iter dst_iter = dst->bi_iter; - - while (1) { - if (!src_iter.bi_size) { - src = src->bi_next; - if (!src) - break; - - src_iter = src->bi_iter; - } - - if (!dst_iter.bi_size) { - dst = dst->bi_next; - if (!dst) - break; - - dst_iter = dst->bi_iter; - } - - bio_copy_data_iter(dst, &dst_iter, src, &src_iter); - } -} -EXPORT_SYMBOL(bio_list_copy_data); - void bio_free_pages(struct bio *bio) { struct bio_vec *bvec; diff --git a/block/blk-core.c b/block/blk-core.c index fc60ff208497..9bcdae93f6d4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1161,10 +1161,8 @@ static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, } /* - * queue's settings related to segment counting like q->bounce_pfn - * may differ from that of other stacking queues. - * Recalculate it to check the request correctly on this queue's - * limitation. + * The queue settings related to segment counting may differ from the + * original queue. */ rq->nr_phys_segments = blk_recalc_rq_segments(rq); if (rq->nr_phys_segments > queue_max_segments(q)) { diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 98d656bdb42b..e0c4baa01857 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -987,10 +987,6 @@ static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct, return; } - /* rq_wait signal is always reliable, ignore user vrate_min */ - if (rq_wait_pct > RQ_WAIT_BUSY_PCT) - vrate_min = VRATE_MIN; - /* * If vrate is out of bounds, apply clamp gradually as the * bounds can change abruptly. Otherwise, apply busy_level diff --git a/block/blk-map.c b/block/blk-map.c index 1ffef782fcf2..3743158ddaeb 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -123,7 +123,6 @@ static int bio_uncopy_user(struct bio *bio) bio_free_pages(bio); } kfree(bmd); - bio_put(bio); return ret; } @@ -132,7 +131,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, { struct bio_map_data *bmd; struct page *page; - struct bio *bio, *bounce_bio; + struct bio *bio; int i = 0, ret; int nr_pages; unsigned int len = iter->count; @@ -181,7 +180,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, i++; } else { - page = alloc_page(rq->q->bounce_gfp | gfp_mask); + page = alloc_page(GFP_NOIO | gfp_mask); if (!page) { ret = -ENOMEM; goto cleanup; @@ -218,16 +217,9 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, bio->bi_private = bmd; - bounce_bio = bio; - ret = blk_rq_append_bio(rq, &bounce_bio); + ret = blk_rq_append_bio(rq, bio); if (ret) goto cleanup; - - /* - * We link the bounce buffer in and could have to traverse it later, so - * we have to get a ref to prevent it from being freed - */ - bio_get(bounce_bio); return 0; cleanup: if (!map_data) @@ -242,7 +234,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, gfp_t gfp_mask) { unsigned int max_sectors = queue_max_hw_sectors(rq->q); - struct bio *bio, *bounce_bio; + struct bio *bio; int ret; int j; @@ -304,49 +296,17 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, break; } - /* - * Subtle: if we end up needing to bounce a bio, it would normally - * disappear when its bi_end_io is run. However, we need the original - * bio for the unmap, so grab an extra reference to it - */ - bio_get(bio); - - bounce_bio = bio; - ret = blk_rq_append_bio(rq, &bounce_bio); + ret = blk_rq_append_bio(rq, bio); if (ret) - goto out_put_orig; - - /* - * We link the bounce buffer in and could have to traverse it - * later, so we have to get a ref to prevent it from being freed - */ - bio_get(bounce_bio); + goto out_unmap; return 0; - out_put_orig: - bio_put(bio); out_unmap: bio_release_pages(bio, false); bio_put(bio); return ret; } -/** - * bio_unmap_user - unmap a bio - * @bio: the bio being unmapped - * - * Unmap a bio previously mapped by bio_map_user_iov(). Must be called from - * process context. - * - * bio_unmap_user() may sleep. - */ -static void bio_unmap_user(struct bio *bio) -{ - bio_release_pages(bio, bio_data_dir(bio) == READ); - bio_put(bio); - bio_put(bio); -} - static void bio_invalidate_vmalloc_pages(struct bio *bio) { #ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE @@ -486,7 +446,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, if (bytes > len) bytes = len; - page = alloc_page(q->bounce_gfp | gfp_mask); + page = alloc_page(GFP_NOIO | gfp_mask); if (!page) goto cleanup; @@ -519,33 +479,24 @@ cleanup: * Append a bio to a passthrough request. Only works if the bio can be merged * into the request based on the driver constraints. */ -int blk_rq_append_bio(struct request *rq, struct bio **bio) +int blk_rq_append_bio(struct request *rq, struct bio *bio) { - struct bio *orig_bio = *bio; struct bvec_iter iter; struct bio_vec bv; unsigned int nr_segs = 0; - blk_queue_bounce(rq->q, bio); - - bio_for_each_bvec(bv, *bio, iter) + bio_for_each_bvec(bv, bio, iter) nr_segs++; if (!rq->bio) { - blk_rq_bio_prep(rq, *bio, nr_segs); + blk_rq_bio_prep(rq, bio, nr_segs); } else { - if (!ll_back_merge_fn(rq, *bio, nr_segs)) { - if (orig_bio != *bio) { - bio_put(*bio); - *bio = orig_bio; - } + if (!ll_back_merge_fn(rq, bio, nr_segs)) return -EINVAL; - } - - rq->biotail->bi_next = *bio; - rq->biotail = *bio; - rq->__data_len += (*bio)->bi_iter.bi_size; - bio_crypt_free_ctx(*bio); + rq->biotail->bi_next = bio; + rq->biotail = bio; + rq->__data_len += (bio)->bi_iter.bi_size; + bio_crypt_free_ctx(bio); } return 0; @@ -566,12 +517,6 @@ EXPORT_SYMBOL(blk_rq_append_bio); * * A matching blk_rq_unmap_user() must be issued at the end of I/O, while * still in process context. - * - * Note: The mapped bio may need to be bounced through blk_queue_bounce() - * before being submitted to the device, as pages mapped may be out of - * reach. It's the callers responsibility to make sure this happens. The - * original bio must be passed back in to blk_rq_unmap_user() for proper - * unmapping. */ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, @@ -588,6 +533,8 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, if (map_data) copy = true; + else if (blk_queue_may_bounce(q)) + copy = true; else if (iov_iter_alignment(iter) & align) copy = true; else if (queue_virt_boundary(q)) @@ -641,25 +588,21 @@ EXPORT_SYMBOL(blk_rq_map_user); */ int blk_rq_unmap_user(struct bio *bio) { - struct bio *mapped_bio; + struct bio *next_bio; int ret = 0, ret2; while (bio) { - mapped_bio = bio; - if (unlikely(bio_flagged(bio, BIO_BOUNCED))) - mapped_bio = bio->bi_private; - if (bio->bi_private) { - ret2 = bio_uncopy_user(mapped_bio); + ret2 = bio_uncopy_user(bio); if (ret2 && !ret) ret = ret2; } else { - bio_unmap_user(mapped_bio); + bio_release_pages(bio, bio_data_dir(bio) == READ); } - mapped_bio = bio; + next_bio = bio; bio = bio->bi_next; - bio_put(mapped_bio); + bio_put(next_bio); } return ret; @@ -684,7 +627,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, { int reading = rq_data_dir(rq) == READ; unsigned long addr = (unsigned long) kbuf; - struct bio *bio, *orig_bio; + struct bio *bio; int ret; if (len > (queue_max_hw_sectors(q) << 9)) @@ -692,7 +635,8 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, if (!len || !kbuf) return -EINVAL; - if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf)) + if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf) || + blk_queue_may_bounce(q)) bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); else bio = bio_map_kern(q, kbuf, len, gfp_mask); @@ -703,14 +647,9 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, bio->bi_opf &= ~REQ_OP_MASK; bio->bi_opf |= req_op(rq); - orig_bio = bio; - ret = blk_rq_append_bio(rq, &bio); - if (unlikely(ret)) { - /* request is too big */ - bio_put(orig_bio); - return ret; - } - - return 0; + ret = blk_rq_append_bio(rq, bio); + if (unlikely(ret)) + bio_put(bio); + return ret; } EXPORT_SYMBOL(blk_rq_map_kern); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 271f6596435b..2a75bc7401df 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -972,6 +972,14 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, { struct elevator_type *e = q->elevator->type; + /* + * If the parent debugfs directory has not been created yet, return; + * We will be called again later on with appropriate parent debugfs + * directory from blk_register_queue() + */ + if (!hctx->debugfs_dir) + return; + if (!e->hctx_debugfs_attrs) return; diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 9c92053e704d..2a37731e8244 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -373,8 +373,8 @@ static bool blk_mq_tagset_count_completed_rqs(struct request *rq, } /** - * blk_mq_tagset_wait_completed_request - wait until all completed req's - * complete funtion is run + * blk_mq_tagset_wait_completed_request - Wait until all scheduled request + * completions have finished. * @tagset: Tag set to drain completed request * * Note: This function has to be run after all IO queues are shutdown @@ -517,7 +517,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; - if (flags & BLK_MQ_F_TAG_HCTX_SHARED) + if (blk_mq_is_sbitmap_shared(flags)) return tags; if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) { @@ -529,7 +529,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags) { - if (!(flags & BLK_MQ_F_TAG_HCTX_SHARED)) { + if (!blk_mq_is_sbitmap_shared(flags)) { sbitmap_queue_free(tags->bitmap_tags); sbitmap_queue_free(tags->breserved_tags); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 4e3a70ab5be1..cd4ad58c88a5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -361,11 +361,12 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data) if (e) { /* - * Flush requests are special and go directly to the + * Flush/passthrough requests are special and go directly to the * dispatch list. Don't include reserved tags in the * limiting, as it isn't useful. */ if (!op_is_flush(data->cmd_flags) && + !blk_op_is_passthrough(data->cmd_flags) && e->type->ops.limit_depth && !(data->flags & BLK_MQ_REQ_RESERVED)) e->type->ops.limit_depth(data->cmd_flags, data); diff --git a/block/blk-settings.c b/block/blk-settings.c index b4aa2f37fab6..9c009090c4b5 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -7,7 +7,6 @@ #include <linux/init.h> #include <linux/bio.h> #include <linux/blkdev.h> -#include <linux/memblock.h> /* for max_pfn/max_low_pfn */ #include <linux/gcd.h> #include <linux/lcm.h> #include <linux/jiffies.h> @@ -17,11 +16,6 @@ #include "blk.h" #include "blk-wbt.h" -unsigned long blk_max_low_pfn; -EXPORT_SYMBOL(blk_max_low_pfn); - -unsigned long blk_max_pfn; - void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) { q->rq_timeout = timeout; @@ -55,7 +49,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->discard_alignment = 0; lim->discard_misaligned = 0; lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; - lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); + lim->bounce = BLK_BOUNCE_NONE; lim->alignment_offset = 0; lim->io_opt = 0; lim->misaligned = 0; @@ -92,39 +86,16 @@ EXPORT_SYMBOL(blk_set_stacking_limits); /** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device - * @max_addr: the maximum address the device can handle + * @bounce: bounce limit to enforce * * Description: - * Different hardware can have different requirements as to what pages - * it can do I/O directly to. A low level driver can call - * blk_queue_bounce_limit to have lower memory pages allocated as bounce - * buffers for doing I/O to pages residing above @max_addr. + * Force bouncing for ISA DMA ranges or highmem. + * + * DEPRECATED, don't use in new code. **/ -void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr) +void blk_queue_bounce_limit(struct request_queue *q, enum blk_bounce bounce) { - unsigned long b_pfn = max_addr >> PAGE_SHIFT; - int dma = 0; - - q->bounce_gfp = GFP_NOIO; -#if BITS_PER_LONG == 64 - /* - * Assume anything <= 4GB can be handled by IOMMU. Actually - * some IOMMUs can handle everything, but I don't know of a - * way to test this here. - */ - if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) - dma = 1; - q->limits.bounce_pfn = max(max_low_pfn, b_pfn); -#else - if (b_pfn < blk_max_low_pfn) - dma = 1; - q->limits.bounce_pfn = b_pfn; -#endif - if (dma) { - init_emergency_isa_pool(); - q->bounce_gfp = GFP_NOIO | GFP_DMA; - q->limits.bounce_pfn = b_pfn; - } + q->limits.bounce = bounce; } EXPORT_SYMBOL(blk_queue_bounce_limit); @@ -547,7 +518,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->max_write_zeroes_sectors); t->max_zone_append_sectors = min(t->max_zone_append_sectors, b->max_zone_append_sectors); - t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); + t->bounce = max(t->bounce, b->bounce); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); @@ -927,11 +898,3 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) } } EXPORT_SYMBOL_GPL(blk_queue_set_zoned); - -static int __init blk_settings_init(void) -{ - blk_max_low_pfn = max_low_pfn - 1; - blk_max_pfn = max_pfn - 1; - return 0; -} -subsys_initcall(blk_settings_init); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 0f4f0c8a7825..e03bedf180ab 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -60,7 +60,7 @@ static ssize_t queue_var_store64(s64 *var, const char *page) static ssize_t queue_requests_show(struct request_queue *q, char *page) { - return queue_var_show(q->nr_requests, (page)); + return queue_var_show(q->nr_requests, page); } static ssize_t @@ -264,6 +264,11 @@ static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) return queue_var_show(max_hw_sectors_kb, (page)); } +static ssize_t queue_virt_boundary_mask_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->limits.virt_boundary_mask, (page)); +} + #define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \ static ssize_t \ queue_##name##_show(struct request_queue *q, char *page) \ @@ -610,6 +615,7 @@ QUEUE_RO_ENTRY(queue_fua, "fua"); QUEUE_RO_ENTRY(queue_dax, "dax"); QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); +QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); #ifdef CONFIG_BLK_DEV_THROTTLING_LOW QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time"); @@ -670,6 +676,7 @@ static struct attribute *queue_attrs[] = { #ifdef CONFIG_BLK_DEV_THROTTLING_LOW &blk_throtl_sample_time_entry.attr, #endif + &queue_virt_boundary_mask_entry.attr, NULL, }; diff --git a/block/blk-zoned.c b/block/blk-zoned.c index c0276b42d9fb..250cb76ee615 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -52,14 +52,6 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) } EXPORT_SYMBOL_GPL(blk_zone_cond_str); -static inline sector_t blk_zone_start(struct request_queue *q, - sector_t sector) -{ - sector_t zone_mask = blk_queue_zone_sectors(q) - 1; - - return sector & ~zone_mask; -} - /* * Return true if a request is a write requests that needs zone write locking. */ diff --git a/block/blk.h b/block/blk.h index 3b53e44b967e..8b3591aee0a5 100644 --- a/block/blk.h +++ b/block/blk.h @@ -6,6 +6,7 @@ #include <linux/blk-mq.h> #include <linux/part_stat.h> #include <linux/blk-crypto.h> +#include <linux/memblock.h> /* for max_pfn/max_low_pfn */ #include <xen/xen.h> #include "blk-crypto-internal.h" #include "blk-mq.h" @@ -311,18 +312,20 @@ static inline void blk_throtl_bio_endio(struct bio *bio) { } static inline void blk_throtl_stat_add(struct request *rq, u64 time) { } #endif -#ifdef CONFIG_BOUNCE -extern int init_emergency_isa_pool(void); -extern void blk_queue_bounce(struct request_queue *q, struct bio **bio); -#else -static inline int init_emergency_isa_pool(void) +void __blk_queue_bounce(struct request_queue *q, struct bio **bio); + +static inline bool blk_queue_may_bounce(struct request_queue *q) { - return 0; + return IS_ENABLED(CONFIG_BOUNCE) && + q->limits.bounce == BLK_BOUNCE_HIGH && + max_low_pfn >= max_pfn; } + static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) { + if (unlikely(blk_queue_may_bounce(q) && bio_has_data(*bio))) + __blk_queue_bounce(q, bio); } -#endif /* CONFIG_BOUNCE */ #ifdef CONFIG_BLK_CGROUP_IOLATENCY extern int blk_iolatency_init(struct request_queue *q); @@ -346,7 +349,6 @@ char *disk_name(struct gendisk *hd, int partno, char *buf); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 -void delete_partition(struct block_device *part); int bdev_add_partition(struct block_device *bdev, int partno, sector_t start, sector_t length); int bdev_del_partition(struct block_device *bdev, int partno); diff --git a/block/bounce.c b/block/bounce.c index 6c441f4f1cd4..94081e013c58 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -18,7 +18,6 @@ #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> -#include <linux/memblock.h> #include <linux/printk.h> #include <asm/tlbflush.h> @@ -29,7 +28,7 @@ #define ISA_POOL_SIZE 16 static struct bio_set bounce_bio_set, bounce_bio_split; -static mempool_t page_pool, isa_page_pool; +static mempool_t page_pool; static void init_bounce_bioset(void) { @@ -49,11 +48,11 @@ static void init_bounce_bioset(void) bounce_bs_setup = true; } -#if defined(CONFIG_HIGHMEM) static __init int init_emergency_pool(void) { int ret; -#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG) + +#ifndef CONFIG_MEMORY_HOTPLUG if (max_pfn <= max_low_pfn) return 0; #endif @@ -67,9 +66,7 @@ static __init int init_emergency_pool(void) } __initcall(init_emergency_pool); -#endif -#ifdef CONFIG_HIGHMEM /* * highmem version, map in to vec */ @@ -82,48 +79,6 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) kunmap_atomic(vto); } -#else /* CONFIG_HIGHMEM */ - -#define bounce_copy_vec(to, vfrom) \ - memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) - -#endif /* CONFIG_HIGHMEM */ - -/* - * allocate pages in the DMA region for the ISA pool - */ -static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) -{ - return mempool_alloc_pages(gfp_mask | GFP_DMA, data); -} - -static DEFINE_MUTEX(isa_mutex); - -/* - * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA - * as the max address, so check if the pool has already been created. - */ -int init_emergency_isa_pool(void) -{ - int ret; - - mutex_lock(&isa_mutex); - - if (mempool_initialized(&isa_page_pool)) { - mutex_unlock(&isa_mutex); - return 0; - } - - ret = mempool_init(&isa_page_pool, ISA_POOL_SIZE, mempool_alloc_pages_isa, - mempool_free_pages, (void *) 0); - BUG_ON(ret); - - pr_info("isa pool size: %d pages\n", ISA_POOL_SIZE); - init_bounce_bioset(); - mutex_unlock(&isa_mutex); - return 0; -} - /* * Simple bounce buffer support for highmem pages. Depending on the * queue gfp mask set, *to may or may not be a highmem page. kmap it @@ -159,7 +114,7 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) } } -static void bounce_end_io(struct bio *bio, mempool_t *pool) +static void bounce_end_io(struct bio *bio) { struct bio *bio_orig = bio->bi_private; struct bio_vec *bvec, orig_vec; @@ -173,7 +128,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool) orig_vec = bio_iter_iovec(bio_orig, orig_iter); if (bvec->bv_page != orig_vec.bv_page) { dec_zone_page_state(bvec->bv_page, NR_BOUNCE); - mempool_free(bvec->bv_page, pool); + mempool_free(bvec->bv_page, &page_pool); } bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len); } @@ -185,33 +140,17 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool) static void bounce_end_io_write(struct bio *bio) { - bounce_end_io(bio, &page_pool); + bounce_end_io(bio); } -static void bounce_end_io_write_isa(struct bio *bio) -{ - - bounce_end_io(bio, &isa_page_pool); -} - -static void __bounce_end_io_read(struct bio *bio, mempool_t *pool) +static void bounce_end_io_read(struct bio *bio) { struct bio *bio_orig = bio->bi_private; if (!bio->bi_status) copy_to_high_bio_irq(bio_orig, bio); - bounce_end_io(bio, pool); -} - -static void bounce_end_io_read(struct bio *bio) -{ - __bounce_end_io_read(bio, &page_pool); -} - -static void bounce_end_io_read_isa(struct bio *bio) -{ - __bounce_end_io_read(bio, &isa_page_pool); + bounce_end_io(bio); } static struct bio *bounce_clone_bio(struct bio *bio_src) @@ -241,12 +180,8 @@ static struct bio *bounce_clone_bio(struct bio *bio_src) * asking for trouble and would force extra work on * __bio_clone_fast() anyways. */ - if (bio_is_passthrough(bio_src)) - bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, - bio_segments(bio_src)); - else - bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src), - &bounce_bio_set); + bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src), + &bounce_bio_set); bio->bi_bdev = bio_src->bi_bdev; if (bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); @@ -287,8 +222,7 @@ err_put: return NULL; } -static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, - mempool_t *pool) +void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) { struct bio *bio; int rw = bio_data_dir(*bio_orig); @@ -301,14 +235,13 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, bio_for_each_segment(from, *bio_orig, iter) { if (i++ < BIO_MAX_VECS) sectors += from.bv_len >> 9; - if (page_to_pfn(from.bv_page) > q->limits.bounce_pfn) + if (PageHighMem(from.bv_page)) bounce = true; } if (!bounce) return; - if (!bio_is_passthrough(*bio_orig) && - sectors < bio_sectors(*bio_orig)) { + if (sectors < bio_sectors(*bio_orig)) { bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split); bio_chain(bio, *bio_orig); submit_bio_noacct(*bio_orig); @@ -324,10 +257,10 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) { struct page *page = to->bv_page; - if (page_to_pfn(page) <= q->limits.bounce_pfn) + if (!PageHighMem(page)) continue; - to->bv_page = mempool_alloc(pool, q->bounce_gfp); + to->bv_page = mempool_alloc(&page_pool, GFP_NOIO); inc_zone_page_state(to->bv_page, NR_BOUNCE); if (rw == WRITE) { @@ -346,46 +279,11 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, bio->bi_flags |= (1 << BIO_BOUNCED); - if (pool == &page_pool) { + if (rw == READ) + bio->bi_end_io = bounce_end_io_read; + else bio->bi_end_io = bounce_end_io_write; - if (rw == READ) - bio->bi_end_io = bounce_end_io_read; - } else { - bio->bi_end_io = bounce_end_io_write_isa; - if (rw == READ) - bio->bi_end_io = bounce_end_io_read_isa; - } bio->bi_private = *bio_orig; *bio_orig = bio; } - -void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) -{ - mempool_t *pool; - - /* - * Data-less bio, nothing to bounce - */ - if (!bio_has_data(*bio_orig)) - return; - - /* - * for non-isa bounce case, just check if the bounce pfn is equal - * to or bigger than the highest pfn in the system -- in that case, - * don't waste time iterating over bio segments - */ - if (!(q->bounce_gfp & GFP_DMA)) { - if (q->limits.bounce_pfn >= blk_max_pfn) - return; - pool = &page_pool; - } else { - BUG_ON(!mempool_initialized(&isa_page_pool)); - pool = &isa_page_pool; - } - - /* - * slow path - */ - __blk_queue_bounce(q, bio_orig, pool); -} diff --git a/block/elevator.c b/block/elevator.c index 293c5c81397a..440699c28119 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -621,7 +621,8 @@ static inline bool elv_support_iosched(struct request_queue *q) */ static struct elevator_type *elevator_get_default(struct request_queue *q) { - if (q->nr_hw_queues != 1) + if (q->nr_hw_queues != 1 && + !blk_mq_is_sbitmap_shared(q->tag_set->flags)) return NULL; return elevator_get(q, "mq-deadline", false); diff --git a/block/genhd.c b/block/genhd.c index 8c8f543572e6..39ca97b0edc6 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -161,81 +161,6 @@ static void part_in_flight_rw(struct block_device *part, inflight[1] = 0; } -/** - * disk_part_iter_init - initialize partition iterator - * @piter: iterator to initialize - * @disk: disk to iterate over - * @flags: DISK_PITER_* flags - * - * Initialize @piter so that it iterates over partitions of @disk. - * - * CONTEXT: - * Don't care. - */ -void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, - unsigned int flags) -{ - piter->disk = disk; - piter->part = NULL; - if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0)) - piter->idx = 0; - else - piter->idx = 1; - piter->flags = flags; -} - -/** - * disk_part_iter_next - proceed iterator to the next partition and return it - * @piter: iterator of interest - * - * Proceed @piter to the next partition and return it. - * - * CONTEXT: - * Don't care. - */ -struct block_device *disk_part_iter_next(struct disk_part_iter *piter) -{ - struct block_device *part; - unsigned long idx; - - /* put the last partition */ - disk_part_iter_exit(piter); - - rcu_read_lock(); - xa_for_each_start(&piter->disk->part_tbl, idx, part, piter->idx) { - if (!bdev_nr_sectors(part) && - !(piter->flags & DISK_PITER_INCL_EMPTY) && - !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && - piter->idx == 0)) - continue; - - piter->part = bdgrab(part); - if (!piter->part) - continue; - piter->idx = idx + 1; - break; - } - rcu_read_unlock(); - - return piter->part; -} - -/** - * disk_part_iter_exit - finish up partition iteration - * @piter: iter of interest - * - * Called when iteration is over. Cleans up @piter. - * - * CONTEXT: - * Don't care. - */ -void disk_part_iter_exit(struct disk_part_iter *piter) -{ - if (piter->part) - bdput(piter->part); - piter->part = NULL; -} - /* * Can be deleted altogether. Later. * @@ -472,13 +397,22 @@ static char *bdevt_str(dev_t devt, char *buf) void disk_uevent(struct gendisk *disk, enum kobject_action action) { - struct disk_part_iter piter; struct block_device *part; + unsigned long idx; + + rcu_read_lock(); + xa_for_each(&disk->part_tbl, idx, part) { + if (bdev_is_partition(part) && !bdev_nr_sectors(part)) + continue; + if (!bdgrab(part)) + continue; - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY_PART0); - while ((part = disk_part_iter_next(&piter))) + rcu_read_unlock(); kobject_uevent(bdev_kobj(part), action); - disk_part_iter_exit(&piter); + bdput(part); + rcu_read_lock(); + } + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(disk_uevent); @@ -646,18 +580,6 @@ void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk) } EXPORT_SYMBOL(device_add_disk_no_queue_reg); -static void invalidate_partition(struct block_device *bdev) -{ - fsync_bdev(bdev); - __invalidate_device(bdev, true); - - /* - * Unhash the bdev inode for this device so that it can't be looked - * up any more even if openers still hold references to it. - */ - remove_inode_hash(bdev->bd_inode); -} - /** * del_gendisk - remove the gendisk * @disk: the struct gendisk to remove @@ -679,9 +601,6 @@ static void invalidate_partition(struct block_device *bdev) */ void del_gendisk(struct gendisk *disk) { - struct disk_part_iter piter; - struct block_device *part; - might_sleep(); if (WARN_ON_ONCE(!disk->queue)) @@ -696,15 +615,19 @@ void del_gendisk(struct gendisk *disk) */ down_write(&bdev_lookup_sem); - /* invalidate stuff */ - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); - while ((part = disk_part_iter_next(&piter))) { - invalidate_partition(part); - delete_partition(part); - } - disk_part_iter_exit(&piter); + mutex_lock(&disk->part0->bd_mutex); + blk_drop_partitions(disk); + mutex_unlock(&disk->part0->bd_mutex); + + fsync_bdev(disk->part0); + __invalidate_device(disk->part0, true); + + /* + * Unhash the bdev inode for this device so that it can't be looked + * up any more even if openers still hold references to it. + */ + remove_inode_hash(disk->part0->bd_inode); - invalidate_partition(disk->part0); set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; up_write(&bdev_lookup_sem); @@ -817,10 +740,10 @@ void __init printk_all_partitions(void) class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct gendisk *disk = dev_to_disk(dev); - struct disk_part_iter piter; struct block_device *part; char name_buf[BDEVNAME_SIZE]; char devt_buf[BDEVT_SIZE]; + unsigned long idx; /* * Don't show empty devices or things that have been @@ -831,30 +754,29 @@ void __init printk_all_partitions(void) continue; /* - * Note, unlike /proc/partitions, I am showing the - * numbers in hex - the same format as the root= - * option takes. + * Note, unlike /proc/partitions, I am showing the numbers in + * hex - the same format as the root= option takes. */ - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); - while ((part = disk_part_iter_next(&piter))) { - bool is_part0 = part == disk->part0; - - printk("%s%s %10llu %s %s", is_part0 ? "" : " ", + rcu_read_lock(); + xa_for_each(&disk->part_tbl, idx, part) { + if (!bdev_nr_sectors(part)) + continue; + printk("%s%s %10llu %s %s", + bdev_is_partition(part) ? " " : "", bdevt_str(part->bd_dev, devt_buf), bdev_nr_sectors(part) >> 1, disk_name(disk, part->bd_partno, name_buf), part->bd_meta_info ? part->bd_meta_info->uuid : ""); - if (is_part0) { - if (dev->parent && dev->parent->driver) - printk(" driver: %s\n", - dev->parent->driver->name); - else - printk(" (driver?)\n"); - } else + if (bdev_is_partition(part)) printk("\n"); + else if (dev->parent && dev->parent->driver) + printk(" driver: %s\n", + dev->parent->driver->name); + else + printk(" (driver?)\n"); } - disk_part_iter_exit(&piter); + rcu_read_unlock(); } class_dev_iter_exit(&iter); } @@ -919,8 +841,8 @@ static void *show_partition_start(struct seq_file *seqf, loff_t *pos) static int show_partition(struct seq_file *seqf, void *v) { struct gendisk *sgp = v; - struct disk_part_iter piter; struct block_device *part; + unsigned long idx; char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ @@ -930,15 +852,16 @@ static int show_partition(struct seq_file *seqf, void *v) if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) return 0; - /* show the full disk and all non-0 size partitions of it */ - disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0); - while ((part = disk_part_iter_next(&piter))) + rcu_read_lock(); + xa_for_each(&sgp->part_tbl, idx, part) { + if (!bdev_nr_sectors(part)) + continue; seq_printf(seqf, "%4d %7d %10llu %s\n", MAJOR(part->bd_dev), MINOR(part->bd_dev), bdev_nr_sectors(part) >> 1, disk_name(sgp, part->bd_partno, buf)); - disk_part_iter_exit(&piter); - + } + rcu_read_unlock(); return 0; } @@ -1247,11 +1170,11 @@ const struct device_type disk_type = { static int diskstats_show(struct seq_file *seqf, void *v) { struct gendisk *gp = v; - struct disk_part_iter piter; struct block_device *hd; char buf[BDEVNAME_SIZE]; unsigned int inflight; struct disk_stats stat; + unsigned long idx; /* if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) @@ -1261,8 +1184,10 @@ static int diskstats_show(struct seq_file *seqf, void *v) "\n\n"); */ - disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); - while ((hd = disk_part_iter_next(&piter))) { + rcu_read_lock(); + xa_for_each(&gp->part_tbl, idx, hd) { + if (bdev_is_partition(hd) && !bdev_nr_sectors(hd)) + continue; part_stat_read_all(hd, &stat); if (queue_is_mq(gp->queue)) inflight = blk_mq_in_flight(gp->queue, hd); @@ -1305,7 +1230,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) NSEC_PER_MSEC) ); } - disk_part_iter_exit(&piter); + rcu_read_unlock(); return 0; } diff --git a/block/ioprio.c b/block/ioprio.c index 364d2294ba90..bee628f9f1b2 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -119,11 +119,17 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) pgrp = task_pgrp(current); else pgrp = find_vpid(who); + + read_lock(&tasklist_lock); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { ret = set_task_ioprio(p, ioprio); - if (ret) - break; + if (ret) { + read_unlock(&tasklist_lock); + goto out; + } } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + read_unlock(&tasklist_lock); + break; case IOPRIO_WHO_USER: uid = make_kuid(current_user_ns(), who); @@ -153,6 +159,7 @@ free_uid: ret = -EINVAL; } +out: rcu_read_unlock(); return ret; } diff --git a/block/mq-deadline.c b/block/mq-deadline.c index f3631a287466..04aded71ead2 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -500,11 +500,8 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, trace_block_rq_insert(rq); - if (at_head || blk_rq_is_passthrough(rq)) { - if (at_head) - list_add(&rq->queuelist, &dd->dispatch); - else - list_add_tail(&rq->queuelist, &dd->dispatch); + if (at_head) { + list_add(&rq->queuelist, &dd->dispatch); } else { deadline_add_rq_rb(dd, rq); diff --git a/block/partitions/core.c b/block/partitions/core.c index 46f055bc7ecb..dc60ecf46fe6 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -285,8 +285,11 @@ struct device_type part_type = { * Must be called either with bd_mutex held, before a disk can be opened or * after all disk users are gone. */ -void delete_partition(struct block_device *part) +static void delete_partition(struct block_device *part) { + fsync_bdev(part); + __invalidate_device(part, true); + xa_erase(&part->bd_disk->part_tbl, part->bd_partno); kobject_put(part->bd_holder_dir); device_del(&part->bd_device); @@ -424,21 +427,21 @@ out_put: static bool partition_overlaps(struct gendisk *disk, sector_t start, sector_t length, int skip_partno) { - struct disk_part_iter piter; struct block_device *part; bool overlap = false; - - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); - while ((part = disk_part_iter_next(&piter))) { - if (part->bd_partno == skip_partno || - start >= part->bd_start_sect + bdev_nr_sectors(part) || - start + length <= part->bd_start_sect) - continue; - overlap = true; - break; + unsigned long idx; + + rcu_read_lock(); + xa_for_each_start(&disk->part_tbl, idx, part, 1) { + if (part->bd_partno != skip_partno && + start < part->bd_start_sect + bdev_nr_sectors(part) && + start + length > part->bd_start_sect) { + overlap = true; + break; + } } + rcu_read_unlock(); - disk_part_iter_exit(&piter); return overlap; } @@ -475,9 +478,6 @@ int bdev_del_partition(struct block_device *bdev, int partno) if (part->bd_openers) goto out_unlock; - sync_blockdev(part); - invalidate_bdev(part); - delete_partition(part); ret = 0; out_unlock: @@ -533,28 +533,20 @@ static bool disk_unlock_native_capacity(struct gendisk *disk) } } -int blk_drop_partitions(struct block_device *bdev) +void blk_drop_partitions(struct gendisk *disk) { - struct disk_part_iter piter; struct block_device *part; + unsigned long idx; - if (bdev->bd_part_count) - return -EBUSY; + lockdep_assert_held(&disk->part0->bd_mutex); - sync_blockdev(bdev); - invalidate_bdev(bdev); - - disk_part_iter_init(&piter, bdev->bd_disk, DISK_PITER_INCL_EMPTY); - while ((part = disk_part_iter_next(&piter))) + xa_for_each_start(&disk->part_tbl, idx, part, 1) { + if (!bdgrab(part)) + continue; delete_partition(part); - disk_part_iter_exit(&piter); - - return 0; + bdput(part); + } } -#ifdef CONFIG_S390 -/* for historic reasons in the DASD driver */ -EXPORT_SYMBOL_GPL(blk_drop_partitions); -#endif static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, struct parsed_partitions *state, int p) diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 6599bac0a78c..1b3fe99b83a6 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -353,10 +353,6 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, start_time = jiffies; - /* ignore return value. All information is passed back to caller - * (if he doesn't check that is his problem). - * N.B. a non-zero SCSI status is _not_ necessarily an error. - */ blk_execute_rq(bd_disk, rq, at_head); hdr->duration = jiffies_to_msecs(jiffies - start_time); @@ -431,7 +427,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, bytes = max(in_len, out_len); if (bytes) { - buffer = kzalloc(bytes, q->bounce_gfp | GFP_USER| __GFP_NOWARN); + buffer = kzalloc(bytes, GFP_NOIO | GFP_USER | __GFP_NOWARN); if (!buffer) return -ENOMEM; |