Diffstat (limited to 'block')
-rw-r--r--  block/bfq-cgroup.c         | 116
-rw-r--r--  block/bfq-iosched.c        |   2
-rw-r--r--  block/bfq-iosched.h        |  23
-rw-r--r--  block/bio-integrity.c      |   3
-rw-r--r--  block/blk-cgroup.c         |   2
-rw-r--r--  block/blk-core.c           |  10
-rw-r--r--  block/blk-mq-sched.c       |  58
-rw-r--r--  block/blk-mq-sched.h       |   9
-rw-r--r--  block/blk-mq.c             |  70
-rw-r--r--  block/blk-sysfs.c          |  42
-rw-r--r--  block/blk-throttle.c       | 184
-rw-r--r--  block/blk.h                |   2
-rw-r--r--  block/cfq-iosched.c        |  17
-rw-r--r--  block/partition-generic.c  |   4
-rw-r--r--  block/partitions/msdos.c   |   2
15 files changed, 371 insertions, 173 deletions
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index c8a32fb345cf..78b2e0db4fb2 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -52,7 +52,7 @@ BFQG_FLAG_FNS(idling) BFQG_FLAG_FNS(empty) #undef BFQG_FLAG_FNS -/* This should be called with the queue_lock held. */ +/* This should be called with the scheduler lock held. */ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) { unsigned long long now; @@ -67,7 +67,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) bfqg_stats_clear_waiting(stats); } -/* This should be called with the queue_lock held. */ +/* This should be called with the scheduler lock held. */ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, struct bfq_group *curr_bfqg) { @@ -81,7 +81,7 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, bfqg_stats_mark_waiting(stats); } -/* This should be called with the queue_lock held. */ +/* This should be called with the scheduler lock held. */ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { unsigned long long now; @@ -203,12 +203,30 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq) static void bfqg_get(struct bfq_group *bfqg) { - return blkg_get(bfqg_to_blkg(bfqg)); + bfqg->ref++; } void bfqg_put(struct bfq_group *bfqg) { - return blkg_put(bfqg_to_blkg(bfqg)); + bfqg->ref--; + + if (bfqg->ref == 0) + kfree(bfqg); +} + +static void bfqg_and_blkg_get(struct bfq_group *bfqg) +{ + /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ + bfqg_get(bfqg); + + blkg_get(bfqg_to_blkg(bfqg)); +} + +void bfqg_and_blkg_put(struct bfq_group *bfqg) +{ + bfqg_put(bfqg); + + blkg_put(bfqg_to_blkg(bfqg)); } void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, @@ -312,7 +330,11 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) if (bfqq) { bfqq->ioprio = bfqq->new_ioprio; bfqq->ioprio_class = bfqq->new_ioprio_class; - bfqg_get(bfqg); + /* + * Make sure that bfqg and its associated blkg do not + * disappear before entity. + */ + bfqg_and_blkg_get(bfqg); } entity->parent = bfqg->my_entity; /* NULL for root group */ entity->sched_data = &bfqg->sched_data; @@ -399,6 +421,8 @@ struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) return NULL; } + /* see comments in bfq_bic_update_cgroup for why refcounting */ + bfqg_get(bfqg); return &bfqg->pd; } @@ -426,7 +450,7 @@ void bfq_pd_free(struct blkg_policy_data *pd) struct bfq_group *bfqg = pd_to_bfqg(pd); bfqg_stats_exit(&bfqg->stats); - return kfree(bfqg); + bfqg_put(bfqg); } void bfq_pd_reset_stats(struct blkg_policy_data *pd) @@ -496,9 +520,10 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, * Move @bfqq to @bfqg, deactivating it from its old group and reactivating * it on the new one. Avoid putting the entity on the old group idle tree. * - * Must be called under the queue lock; the cgroup owning @bfqg must - * not disappear (by now this just means that we are called under - * rcu_read_lock()). + * Must be called under the scheduler lock, to make sure that the blkg + * owning @bfqg does not disappear (see comments in + * bfq_bic_update_cgroup on guaranteeing the consistency of blkg + * objects). 
*/ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_group *bfqg) @@ -519,16 +544,12 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_deactivate_bfqq(bfqd, bfqq, false, false); else if (entity->on_st) bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); - bfqg_put(bfqq_group(bfqq)); + bfqg_and_blkg_put(bfqq_group(bfqq)); - /* - * Here we use a reference to bfqg. We don't need a refcounter - * as the cgroup reference will not be dropped, so that its - * destroy() callback will not be invoked. - */ entity->parent = bfqg->my_entity; entity->sched_data = &bfqg->sched_data; - bfqg_get(bfqg); + /* pin down bfqg and its associated blkg */ + bfqg_and_blkg_get(bfqg); if (bfq_bfqq_busy(bfqq)) { bfq_pos_tree_add_move(bfqd, bfqq); @@ -545,8 +566,9 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, * @bic: the bic to move. * @blkcg: the blk-cgroup to move to. * - * Move bic to blkcg, assuming that bfqd->queue is locked; the caller - * has to make sure that the reference to cgroup is valid across the call. + * Move bic to blkcg, assuming that bfqd->lock is held; which makes + * sure that the reference to cgroup is valid across the call (see + * comments in bfq_bic_update_cgroup on this issue) * * NOTE: an alternative approach might have been to store the current * cgroup in bfqq and getting a reference to it, reducing the lookup @@ -604,6 +626,57 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) goto out; bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); + /* + * Update blkg_path for bfq_log_* functions. We cache this + * path, and update it here, for the following + * reasons. Operations on blkg objects in blk-cgroup are + * protected with the request_queue lock, and not with the + * lock that protects the instances of this scheduler + * (bfqd->lock). This exposes BFQ to the following sort of + * race. + * + * The blkg_lookup performed in bfq_get_queue, protected + * through rcu, may happen to return the address of a copy of + * the original blkg. If this is the case, then the + * bfqg_and_blkg_get performed in bfq_get_queue, to pin down + * the blkg, is useless: it does not prevent blk-cgroup code + * from destroying both the original blkg and all objects + * directly or indirectly referred by the copy of the + * blkg. + * + * On the bright side, destroy operations on a blkg invoke, as + * a first step, hooks of the scheduler associated with the + * blkg. And these hooks are executed with bfqd->lock held for + * BFQ. As a consequence, for any blkg associated with the + * request queue this instance of the scheduler is attached + * to, we are guaranteed that such a blkg is not destroyed, and + * that all the pointers it contains are consistent, while we + * are holding bfqd->lock. A blkg_lookup performed with + * bfqd->lock held then returns a fully consistent blkg, which + * remains consistent until this lock is held. + * + * Thanks to the last fact, and to the fact that: (1) bfqg has + * been obtained through a blkg_lookup in the above + * assignment, and (2) bfqd->lock is being held, here we can + * safely use the policy data for the involved blkg (i.e., the + * field bfqg->pd) to get to the blkg associated with bfqg, + * and then we can safely use any field of blkg. After we + * release bfqd->lock, even just getting blkg through this + * bfqg may cause dangling references to be traversed, as + * bfqg->pd may not exist any more. 
+ * + * In view of the above facts, here we cache, in the bfqg, any + * blkg data we may need for this bic, and for its associated + * bfq_queue. As of now, we need to cache only the path of the + * blkg, which is used in the bfq_log_* functions. + * + * Finally, note that bfqg itself needs to be protected from + * destruction on the blkg_free of the original blkg (which + * invokes bfq_pd_free). We use an additional private + * refcounter for bfqg, to let it disappear only after no + * bfq_queue refers to it any longer. + */ + blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path)); bic->blkcg_serial_nr = serial_nr; out: rcu_read_unlock(); @@ -640,8 +713,6 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, * @bfqd: the device data structure with the root group. * @bfqg: the group to move from. * @st: the service tree with the entities. - * - * Needs queue_lock to be taken and reference to be valid over the call. */ static void bfq_reparent_active_entities(struct bfq_data *bfqd, struct bfq_group *bfqg, @@ -692,8 +763,7 @@ void bfq_pd_offline(struct blkg_policy_data *pd) /* * The idle tree may still contain bfq_queues belonging * to exited task because they never migrated to a different - * cgroup from the one being destroyed now. No one else - * can access them so it's safe to act without any lock. + * cgroup from the one being destroyed now. */ bfq_flush_idle_tree(st); diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 08ce45096350..ed93da2462ab 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3665,7 +3665,7 @@ void bfq_put_queue(struct bfq_queue *bfqq) kmem_cache_free(bfq_pool, bfqq); #ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_put(bfqg); + bfqg_and_blkg_put(bfqg); #endif } diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index ae783c06dfd9..5c3bf9861492 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -759,6 +759,12 @@ struct bfq_group { /* must be the first member */ struct blkg_policy_data pd; + /* cached path for this blkg (see comments in bfq_bic_update_cgroup) */ + char blkg_path[128]; + + /* reference counter (see comments in bfq_bic_update_cgroup) */ + int ref; + struct bfq_entity entity; struct bfq_sched_data sched_data; @@ -838,7 +844,7 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); struct bfq_group *bfqq_group(struct bfq_queue *bfqq); struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node); -void bfqg_put(struct bfq_group *bfqg); +void bfqg_and_blkg_put(struct bfq_group *bfqg); #ifdef CONFIG_BFQ_GROUP_IOSCHED extern struct cftype bfq_blkcg_legacy_files[]; @@ -910,20 +916,13 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq); struct bfq_group *bfqq_group(struct bfq_queue *bfqq); #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ - char __pbuf[128]; \ - \ - blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \ + blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid,\ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ - __pbuf, ##args); \ + bfqq_group(bfqq)->blkg_path, ##args); \ } while (0) -#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ - char __pbuf[128]; \ - \ - blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ -} while (0) +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) 
\ + blk_add_trace_msg((bfqd)->queue, "%s " fmt, (bfqg)->blkg_path, ##args) #else /* CONFIG_BFQ_GROUP_IOSCHED */ diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 5384713d48bc..b5009a896a7f 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -175,6 +175,9 @@ bool bio_integrity_enabled(struct bio *bio) if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE) return false; + if (!bio_sectors(bio)) + return false; + /* Already protected? */ if (bio_integrity(bio)) return false; diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 7c2947128f58..0480892e97e5 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -74,7 +74,7 @@ static void blkg_free(struct blkcg_gq *blkg) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); if (blkg->blkcg != &blkcg_root) - blk_exit_rl(&blkg->rl); + blk_exit_rl(blkg->q, &blkg->rl); blkg_rwstat_exit(&blkg->stat_ios); blkg_rwstat_exit(&blkg->stat_bytes); diff --git a/block/blk-core.c b/block/blk-core.c index c7068520794b..a7421b772d0e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -648,13 +648,19 @@ int blk_init_rl(struct request_list *rl, struct request_queue *q, if (!rl->rq_pool) return -ENOMEM; + if (rl != &q->root_rl) + WARN_ON_ONCE(!blk_get_queue(q)); + return 0; } -void blk_exit_rl(struct request_list *rl) +void blk_exit_rl(struct request_queue *q, struct request_list *rl) { - if (rl->rq_pool) + if (rl->rq_pool) { mempool_destroy(rl->rq_pool); + if (rl != &q->root_rl) + blk_put_queue(q); + } } struct request_queue *blk_alloc_queue(gfp_t gfp_mask) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 1f5b692526ae..0ded5e846335 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -68,6 +68,45 @@ static void blk_mq_sched_assign_ioc(struct request_queue *q, __blk_mq_sched_assign_ioc(q, rq, bio, ioc); } +/* + * Mark a hardware queue as needing a restart. For shared queues, maintain + * a count of how many hardware queues are marked for restart. + */ +static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) +{ + if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + return; + + if (hctx->flags & BLK_MQ_F_TAG_SHARED) { + struct request_queue *q = hctx->queue; + + if (!test_and_set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + atomic_inc(&q->shared_hctx_restart); + } else + set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); +} + +static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) +{ + if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + return false; + + if (hctx->flags & BLK_MQ_F_TAG_SHARED) { + struct request_queue *q = hctx->queue; + + if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + atomic_dec(&q->shared_hctx_restart); + } else + clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + + if (blk_mq_hctx_has_pending(hctx)) { + blk_mq_run_hw_queue(hctx, true); + return true; + } + + return false; +} + struct request *blk_mq_sched_get_request(struct request_queue *q, struct bio *bio, unsigned int op, @@ -266,18 +305,6 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, return true; } -static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) -{ - if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) { - clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); - if (blk_mq_hctx_has_pending(hctx)) { - blk_mq_run_hw_queue(hctx, true); - return true; - } - } - return false; -} - /** * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list * @pos: loop cursor. 
@@ -309,6 +336,13 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx) unsigned int i, j; if (set->flags & BLK_MQ_F_TAG_SHARED) { + /* + * If this is 0, then we know that no hardware queues + * have RESTART marked. We're done. + */ + if (!atomic_read(&queue->shared_hctx_restart)) + return; + rcu_read_lock(); list_for_each_entry_rcu_rr(q, queue, &set->tag_list, tag_set_list) { diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index edafb5383b7b..5007edece51a 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -115,15 +115,6 @@ static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) return false; } -/* - * Mark a hardware queue as needing a restart. - */ -static inline void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) -{ - if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); -} - static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) { return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); diff --git a/block/blk-mq.c b/block/blk-mq.c index a69ad122ed66..958cedaff8b8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -628,25 +628,6 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q, } EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); -void blk_mq_abort_requeue_list(struct request_queue *q) -{ - unsigned long flags; - LIST_HEAD(rq_list); - - spin_lock_irqsave(&q->requeue_lock, flags); - list_splice_init(&q->requeue_list, &rq_list); - spin_unlock_irqrestore(&q->requeue_lock, flags); - - while (!list_empty(&rq_list)) { - struct request *rq; - - rq = list_first_entry(&rq_list, struct request, queuelist); - list_del_init(&rq->queuelist); - blk_mq_end_request(rq, -EIO); - } -} -EXPORT_SYMBOL(blk_mq_abort_requeue_list); - struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { if (tag < tags->nr_tags) { @@ -1480,22 +1461,28 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); } -static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie, - bool may_sleep) +static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, + struct request *rq, + blk_qc_t *cookie, bool may_sleep) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { .rq = rq, .last = true, }; - struct blk_mq_hw_ctx *hctx; blk_qc_t new_cookie; int ret; + bool run_queue = true; + + if (blk_mq_hctx_stopped(hctx)) { + run_queue = false; + goto insert; + } if (q->elevator) goto insert; - if (!blk_mq_get_driver_tag(rq, &hctx, false)) + if (!blk_mq_get_driver_tag(rq, NULL, false)) goto insert; new_cookie = request_to_qc_t(hctx, rq); @@ -1519,7 +1506,7 @@ static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie, __blk_mq_requeue_request(rq); insert: - blk_mq_sched_insert_request(rq, false, true, false, may_sleep); + blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep); } static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, @@ -1527,7 +1514,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, { if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { rcu_read_lock(); - __blk_mq_try_issue_directly(rq, cookie, false); + __blk_mq_try_issue_directly(hctx, rq, cookie, false); rcu_read_unlock(); } else { unsigned int srcu_idx; @@ -1535,7 +1522,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, might_sleep(); srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); - __blk_mq_try_issue_directly(rq, cookie, 
true); + __blk_mq_try_issue_directly(hctx, rq, cookie, true); srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); } } @@ -1638,9 +1625,12 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_put_ctx(data.ctx); - if (same_queue_rq) + if (same_queue_rq) { + data.hctx = blk_mq_map_queue(q, + same_queue_rq->mq_ctx->cpu); blk_mq_try_issue_directly(data.hctx, same_queue_rq, &cookie); + } } else if (q->nr_hw_queues > 1 && is_sync) { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); @@ -2113,20 +2103,30 @@ static void blk_mq_map_swqueue(struct request_queue *q, } } +/* + * Caller needs to ensure that we're either frozen/quiesced, or that + * the queue isn't live yet. + */ static void queue_set_hctx_shared(struct request_queue *q, bool shared) { struct blk_mq_hw_ctx *hctx; int i; queue_for_each_hw_ctx(q, hctx, i) { - if (shared) + if (shared) { + if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + atomic_inc(&q->shared_hctx_restart); hctx->flags |= BLK_MQ_F_TAG_SHARED; - else + } else { + if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + atomic_dec(&q->shared_hctx_restart); hctx->flags &= ~BLK_MQ_F_TAG_SHARED; + } } } -static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, bool shared) +static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, + bool shared) { struct request_queue *q; @@ -2660,7 +2660,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) return ret; } -void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) +static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, + int nr_hw_queues) { struct request_queue *q; @@ -2684,6 +2685,13 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_unfreeze_queue(q); } + +void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) +{ + mutex_lock(&set->tag_list_lock); + __blk_mq_update_nr_hw_queues(set, nr_hw_queues); + mutex_unlock(&set->tag_list_lock); +} EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); /* Enable polling stats and return whether they were already enabled. */ diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 504fee940052..27aceab1cc31 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -777,24 +777,25 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) } /** - * blk_release_queue: - release a &struct request_queue when it is no longer needed - * @kobj: the kobj belonging to the request queue to be released + * __blk_release_queue - release a request queue when it is no longer needed + * @work: pointer to the release_work member of the request queue to be released * * Description: - * blk_release_queue is the pair to blk_init_queue() or - * blk_queue_make_request(). It should be called when a request queue is - * being released; typically when a block device is being de-registered. - * Currently, its primary task it to free all the &struct request - * structures that were allocated to the queue and the queue itself. + * blk_release_queue is the counterpart of blk_init_queue(). It should be + * called when a request queue is being released; typically when a block + * device is being de-registered. Its primary task it to free the queue + * itself. * - * Note: + * Notes: * The low level driver must have finished any outstanding requests first * via blk_cleanup_queue(). 
- **/ -static void blk_release_queue(struct kobject *kobj) + * + * Although blk_release_queue() may be called with preemption disabled, + * __blk_release_queue() may sleep. + */ +static void __blk_release_queue(struct work_struct *work) { - struct request_queue *q = - container_of(kobj, struct request_queue, kobj); + struct request_queue *q = container_of(work, typeof(*q), release_work); if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags)) blk_stat_remove_callback(q, q->poll_cb); @@ -809,7 +810,7 @@ static void blk_release_queue(struct kobject *kobj) blk_free_queue_stats(q->stats); - blk_exit_rl(&q->root_rl); + blk_exit_rl(q, &q->root_rl); if (q->queue_tags) __blk_queue_free_tags(q); @@ -834,6 +835,15 @@ static void blk_release_queue(struct kobject *kobj) call_rcu(&q->rcu_head, blk_free_queue_rcu); } +static void blk_release_queue(struct kobject *kobj) +{ + struct request_queue *q = + container_of(kobj, struct request_queue, kobj); + + INIT_WORK(&q->release_work, __blk_release_queue); + schedule_work(&q->release_work); +} + static const struct sysfs_ops queue_sysfs_ops = { .show = queue_attr_show, .store = queue_attr_store, @@ -887,10 +897,10 @@ int blk_register_queue(struct gendisk *disk) goto unlock; } - if (q->mq_ops) + if (q->mq_ops) { __blk_mq_register_dev(dev, q); - - blk_mq_debugfs_register(q); + blk_mq_debugfs_register(q); + } kobject_uevent(&q->kobj, KOBJ_ADD); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index b78db2e5fdff..a7285bf2831c 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -22,11 +22,18 @@ static int throtl_quantum = 32; #define DFL_THROTL_SLICE_HD (HZ / 10) #define DFL_THROTL_SLICE_SSD (HZ / 50) #define MAX_THROTL_SLICE (HZ) -#define DFL_IDLE_THRESHOLD_SSD (1000L) /* 1 ms */ -#define DFL_IDLE_THRESHOLD_HD (100L * 1000) /* 100 ms */ #define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */ -/* default latency target is 0, eg, guarantee IO latency by default */ -#define DFL_LATENCY_TARGET (0) +#define MIN_THROTL_BPS (320 * 1024) +#define MIN_THROTL_IOPS (10) +#define DFL_LATENCY_TARGET (-1L) +#define DFL_IDLE_THRESHOLD (0) +#define DFL_HD_BASELINE_LATENCY (4000L) /* 4ms */ +#define LATENCY_FILTERED_SSD (0) +/* + * For HD, very small latency comes from sequential IO. 
Such IO is helpless to + * help determine if its IO is impacted by others, hence we ignore the IO + */ +#define LATENCY_FILTERED_HD (1000L) /* 1ms */ #define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT) @@ -157,6 +164,7 @@ struct throtl_grp { unsigned long last_check_time; unsigned long latency_target; /* us */ + unsigned long latency_target_conf; /* us */ /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; @@ -165,6 +173,7 @@ struct throtl_grp { unsigned long checked_last_finish_time; /* ns / 1024 */ unsigned long avg_idletime; /* ns / 1024 */ unsigned long idletime_threshold; /* us */ + unsigned long idletime_threshold_conf; /* us */ unsigned int bio_cnt; /* total bios */ unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ @@ -201,8 +210,6 @@ struct throtl_data unsigned int limit_index; bool limit_valid[LIMIT_CNT]; - unsigned long dft_idletime_threshold; /* us */ - unsigned long low_upgrade_time; unsigned long low_downgrade_time; @@ -212,6 +219,7 @@ struct throtl_data struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE]; struct latency_bucket __percpu *latency_buckets; unsigned long last_calculate_time; + unsigned long filtered_latency; bool track_bio_latency; }; @@ -294,8 +302,14 @@ static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw) td = tg->td; ret = tg->bps[rw][td->limit_index]; - if (ret == 0 && td->limit_index == LIMIT_LOW) - return tg->bps[rw][LIMIT_MAX]; + if (ret == 0 && td->limit_index == LIMIT_LOW) { + /* intermediate node or iops isn't 0 */ + if (!list_empty(&blkg->blkcg->css.children) || + tg->iops[rw][td->limit_index]) + return U64_MAX; + else + return MIN_THROTL_BPS; + } if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] && tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) { @@ -315,10 +329,17 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) return UINT_MAX; + td = tg->td; ret = tg->iops[rw][td->limit_index]; - if (ret == 0 && tg->td->limit_index == LIMIT_LOW) - return tg->iops[rw][LIMIT_MAX]; + if (ret == 0 && tg->td->limit_index == LIMIT_LOW) { + /* intermediate node or bps isn't 0 */ + if (!list_empty(&blkg->blkcg->css.children) || + tg->bps[rw][td->limit_index]) + return UINT_MAX; + else + return MIN_THROTL_IOPS; + } if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] && tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) { @@ -482,6 +503,9 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) /* LIMIT_LOW will have default value 0 */ tg->latency_target = DFL_LATENCY_TARGET; + tg->latency_target_conf = DFL_LATENCY_TARGET; + tg->idletime_threshold = DFL_IDLE_THRESHOLD; + tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD; return &tg->pd; } @@ -510,8 +534,6 @@ static void throtl_pd_init(struct blkg_policy_data *pd) if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; tg->td = td; - - tg->idletime_threshold = td->dft_idletime_threshold; } /* @@ -684,7 +706,7 @@ static void throtl_dequeue_tg(struct throtl_grp *tg) static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, unsigned long expires) { - unsigned long max_expire = jiffies + 8 * sq_to_tg(sq)->td->throtl_slice; + unsigned long max_expire = jiffies + 8 * sq_to_td(sq)->throtl_slice; /* * Since we are adjusting the throttle limit dynamically, the sleep @@ -1349,7 +1371,7 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v) return 0; } -static void 
tg_conf_updated(struct throtl_grp *tg) +static void tg_conf_updated(struct throtl_grp *tg, bool global) { struct throtl_service_queue *sq = &tg->service_queue; struct cgroup_subsys_state *pos_css; @@ -1367,8 +1389,26 @@ static void tg_conf_updated(struct throtl_grp *tg) * restrictions in the whole hierarchy and allows them to bypass * blk-throttle. */ - blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg)) - tg_update_has_rules(blkg_to_tg(blkg)); + blkg_for_each_descendant_pre(blkg, pos_css, + global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) { + struct throtl_grp *this_tg = blkg_to_tg(blkg); + struct throtl_grp *parent_tg; + + tg_update_has_rules(this_tg); + /* ignore root/second level */ + if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent || + !blkg->parent->parent) + continue; + parent_tg = blkg_to_tg(blkg->parent); + /* + * make sure all children has lower idle time threshold and + * higher latency target + */ + this_tg->idletime_threshold = min(this_tg->idletime_threshold, + parent_tg->idletime_threshold); + this_tg->latency_target = max(this_tg->latency_target, + parent_tg->latency_target); + } /* * We're already holding queue_lock and know @tg is valid. Let's @@ -1413,7 +1453,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, else *(unsigned int *)((void *)tg + of_cft(of)->private) = v; - tg_conf_updated(tg); + tg_conf_updated(tg, false); ret = 0; out_finish: blkg_conf_finish(&ctx); @@ -1497,34 +1537,34 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, tg->iops_conf[READ][off] == iops_dft && tg->iops_conf[WRITE][off] == iops_dft && (off != LIMIT_LOW || - (tg->idletime_threshold == tg->td->dft_idletime_threshold && - tg->latency_target == DFL_LATENCY_TARGET))) + (tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD && + tg->latency_target_conf == DFL_LATENCY_TARGET))) return 0; - if (tg->bps_conf[READ][off] != bps_dft) + if (tg->bps_conf[READ][off] != U64_MAX) snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps_conf[READ][off]); - if (tg->bps_conf[WRITE][off] != bps_dft) + if (tg->bps_conf[WRITE][off] != U64_MAX) snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps_conf[WRITE][off]); - if (tg->iops_conf[READ][off] != iops_dft) + if (tg->iops_conf[READ][off] != UINT_MAX) snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops_conf[READ][off]); - if (tg->iops_conf[WRITE][off] != iops_dft) + if (tg->iops_conf[WRITE][off] != UINT_MAX) snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops_conf[WRITE][off]); if (off == LIMIT_LOW) { - if (tg->idletime_threshold == ULONG_MAX) + if (tg->idletime_threshold_conf == ULONG_MAX) strcpy(idle_time, " idle=max"); else snprintf(idle_time, sizeof(idle_time), " idle=%lu", - tg->idletime_threshold); + tg->idletime_threshold_conf); - if (tg->latency_target == ULONG_MAX) + if (tg->latency_target_conf == ULONG_MAX) strcpy(latency_time, " latency=max"); else snprintf(latency_time, sizeof(latency_time), - " latency=%lu", tg->latency_target); + " latency=%lu", tg->latency_target_conf); } seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n", @@ -1563,8 +1603,8 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, v[2] = tg->iops_conf[READ][index]; v[3] = tg->iops_conf[WRITE][index]; - idle_time = tg->idletime_threshold; - latency_time = tg->latency_target; + idle_time = tg->idletime_threshold_conf; + latency_time = tg->latency_target_conf; while (true) { char tok[27]; /* wiops=18446744073709551616 */ char *p; @@ -1623,17 +1663,33 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, 
tg->iops_conf[READ][LIMIT_MAX]); tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW], tg->iops_conf[WRITE][LIMIT_MAX]); + tg->idletime_threshold_conf = idle_time; + tg->latency_target_conf = latency_time; + + /* force user to configure all settings for low limit */ + if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] || + tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) || + tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD || + tg->latency_target_conf == DFL_LATENCY_TARGET) { + tg->bps[READ][LIMIT_LOW] = 0; + tg->bps[WRITE][LIMIT_LOW] = 0; + tg->iops[READ][LIMIT_LOW] = 0; + tg->iops[WRITE][LIMIT_LOW] = 0; + tg->idletime_threshold = DFL_IDLE_THRESHOLD; + tg->latency_target = DFL_LATENCY_TARGET; + } else if (index == LIMIT_LOW) { + tg->idletime_threshold = tg->idletime_threshold_conf; + tg->latency_target = tg->latency_target_conf; + } - if (index == LIMIT_LOW) { - blk_throtl_update_limit_valid(tg->td); - if (tg->td->limit_valid[LIMIT_LOW]) + blk_throtl_update_limit_valid(tg->td); + if (tg->td->limit_valid[LIMIT_LOW]) { + if (index == LIMIT_LOW) tg->td->limit_index = LIMIT_LOW; - tg->idletime_threshold = (idle_time == ULONG_MAX) ? - ULONG_MAX : idle_time; - tg->latency_target = (latency_time == ULONG_MAX) ? - ULONG_MAX : latency_time; - } - tg_conf_updated(tg); + } else + tg->td->limit_index = LIMIT_MAX; + tg_conf_updated(tg, index == LIMIT_LOW && + tg->td->limit_valid[LIMIT_LOW]); ret = 0; out_finish: blkg_conf_finish(&ctx); @@ -1722,17 +1778,25 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg) /* * cgroup is idle if: * - single idle is too long, longer than a fixed value (in case user - * configure a too big threshold) or 4 times of slice + * configure a too big threshold) or 4 times of idletime threshold * - average think time is more than threshold * - IO latency is largely below threshold */ - unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice); - - time = min_t(unsigned long, MAX_IDLE_TIME, time); - return (ktime_get_ns() >> 10) - tg->last_finish_time > time || - tg->avg_idletime > tg->idletime_threshold || - (tg->latency_target && tg->bio_cnt && + unsigned long time; + bool ret; + + time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold); + ret = tg->latency_target == DFL_LATENCY_TARGET || + tg->idletime_threshold == DFL_IDLE_THRESHOLD || + (ktime_get_ns() >> 10) - tg->last_finish_time > time || + tg->avg_idletime > tg->idletime_threshold || + (tg->latency_target && tg->bio_cnt && tg->bad_bio_cnt * 5 < tg->bio_cnt); + throtl_log(&tg->service_queue, + "avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d", + tg->avg_idletime, tg->idletime_threshold, tg->bad_bio_cnt, + tg->bio_cnt, ret, tg->td->scale); + return ret; } static bool throtl_tg_can_upgrade(struct throtl_grp *tg) @@ -1828,6 +1892,7 @@ static void throtl_upgrade_state(struct throtl_data *td) struct cgroup_subsys_state *pos_css; struct blkcg_gq *blkg; + throtl_log(&td->service_queue, "upgrade to max"); td->limit_index = LIMIT_MAX; td->low_upgrade_time = jiffies; td->scale = 0; @@ -1850,6 +1915,7 @@ static void throtl_downgrade_state(struct throtl_data *td, int new) { td->scale /= 2; + throtl_log(&td->service_queue, "downgrade, scale %d", td->scale); if (td->scale) { td->low_upgrade_time = jiffies - td->scale * td->throtl_slice; return; @@ -2023,6 +2089,11 @@ static void throtl_update_latency_buckets(struct throtl_data *td) td->avg_buckets[i].valid = true; last_latency = td->avg_buckets[i].latency; } + + for (i = 0; i < 
LATENCY_BUCKET_SIZE; i++) + throtl_log(&td->service_queue, + "Latency bucket %d: latency=%ld, valid=%d", i, + td->avg_buckets[i].latency, td->avg_buckets[i].valid); } #else static inline void throtl_update_latency_buckets(struct throtl_data *td) @@ -2218,7 +2289,7 @@ void blk_throtl_bio_endio(struct bio *bio) throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat), bio_op(bio), lat); - if (tg->latency_target) { + if (tg->latency_target && lat >= tg->td->filtered_latency) { int bucket; unsigned int threshold; @@ -2354,18 +2425,19 @@ void blk_throtl_exit(struct request_queue *q) void blk_throtl_register_queue(struct request_queue *q) { struct throtl_data *td; - struct cgroup_subsys_state *pos_css; - struct blkcg_gq *blkg; + int i; td = q->td; BUG_ON(!td); if (blk_queue_nonrot(q)) { td->throtl_slice = DFL_THROTL_SLICE_SSD; - td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_SSD; + td->filtered_latency = LATENCY_FILTERED_SSD; } else { td->throtl_slice = DFL_THROTL_SLICE_HD; - td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_HD; + td->filtered_latency = LATENCY_FILTERED_HD; + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) + td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY; } #ifndef CONFIG_BLK_DEV_THROTTLING_LOW /* if no low limit, use previous default */ @@ -2375,18 +2447,6 @@ void blk_throtl_register_queue(struct request_queue *q) td->track_bio_latency = !q->mq_ops && !q->request_fn; if (!td->track_bio_latency) blk_stat_enable_accounting(q); - - /* - * some tg are created before queue is fully initialized, eg, nonrot - * isn't initialized yet - */ - rcu_read_lock(); - blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { - struct throtl_grp *tg = blkg_to_tg(blkg); - - tg->idletime_threshold = td->dft_idletime_threshold; - } - rcu_read_unlock(); } #ifdef CONFIG_BLK_DEV_THROTTLING_LOW diff --git a/block/blk.h b/block/blk.h index 2ed70228e44f..83c8e1100525 100644 --- a/block/blk.h +++ b/block/blk.h @@ -59,7 +59,7 @@ void blk_free_flush_queue(struct blk_flush_queue *q); int blk_init_rl(struct request_list *rl, struct request_queue *q, gfp_t gfp_mask); -void blk_exit_rl(struct request_list *rl); +void blk_exit_rl(struct request_queue *q, struct request_list *rl); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); void blk_queue_bypass_start(struct request_queue *q); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index da69b079725f..b7e9c7feeab2 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -38,9 +38,13 @@ static const u64 cfq_target_latency = (u64)NSEC_PER_SEC * 3/10; /* 300 ms */ static const int cfq_hist_divisor = 4; /* - * offset from end of service tree + * offset from end of queue service tree for idle class */ #define CFQ_IDLE_DELAY (NSEC_PER_SEC / 5) +/* offset from end of group service tree under time slice mode */ +#define CFQ_SLICE_MODE_GROUP_DELAY (NSEC_PER_SEC / 5) +/* offset from end of group service under IOPS mode */ +#define CFQ_IOPS_MODE_GROUP_DELAY (HZ / 5) /* * below this threshold, we consider thinktime immediate @@ -1362,6 +1366,14 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) cfqg->vfraction = max_t(unsigned, vfr, 1); } +static inline u64 cfq_get_cfqg_vdisktime_delay(struct cfq_data *cfqd) +{ + if (!iops_mode(cfqd)) + return CFQ_SLICE_MODE_GROUP_DELAY; + else + return CFQ_IOPS_MODE_GROUP_DELAY; +} + static void cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg) { @@ -1381,7 +1393,8 @@ cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group 
*cfqg) n = rb_last(&st->rb); if (n) { __cfqg = rb_entry_cfqg(n); - cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; + cfqg->vdisktime = __cfqg->vdisktime + + cfq_get_cfqg_vdisktime_delay(cfqd); } else cfqg->vdisktime = st->min_vdisktime; cfq_group_service_tree_add(st, cfqg); diff --git a/block/partition-generic.c b/block/partition-generic.c index ff07b9143ca4..c5ec8246e25e 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -320,8 +320,10 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, if (info) { struct partition_meta_info *pinfo = alloc_part_info(disk); - if (!pinfo) + if (!pinfo) { + err = -ENOMEM; goto out_free_stats; + } memcpy(pinfo, info, sizeof(*info)); p->info = pinfo; } diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 93e7c1b32edd..5610cd537da7 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -300,6 +300,8 @@ static void parse_bsd(struct parsed_partitions *state, continue; bsd_start = le32_to_cpu(p->p_offset); bsd_size = le32_to_cpu(p->p_size); + if (memcmp(flavour, "bsd\0", 4) == 0) + bsd_start += offset; if (offset == bsd_start && size == bsd_size) /* full parent partition, we have it already */ continue; |
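
Note on the largest hunk above: the block/bfq-cgroup.c changes give bfq_group a private reference counter (bfqg->ref) in addition to the blkg reference, so a group allocated in bfq_pd_alloc() can outlive the blkg that owns it for as long as some bfq_queue still points at it; bfqg_and_blkg_get()/bfqg_and_blkg_put() then pin or release both objects together. The standalone C program below is only a single-threaded sketch of that split-reference pattern under simplified assumptions; the names group, grp_alloc, grp_get and grp_put are invented stand-ins that do not appear in the patch, and the blkg half of the pairing (blkg_get/blkg_put) as well as the bfqd->lock protection are omitted.

#include <stdio.h>
#include <stdlib.h>

/*
 * Minimal stand-in for struct bfq_group: the only field that matters for
 * the pattern is the private reference counter the patch adds.
 */
struct group {
	int ref;
};

static struct group *grp_alloc(void)		/* cf. bfq_pd_alloc() */
{
	struct group *g = calloc(1, sizeof(*g));

	g->ref = 1;				/* creation reference */
	return g;
}

static void grp_get(struct group *g)		/* cf. bfqg_get() */
{
	g->ref++;
}

static void grp_put(struct group *g)		/* cf. bfqg_put() */
{
	if (--g->ref == 0) {
		printf("group freed\n");
		free(g);
	}
}

int main(void)
{
	struct group *g = grp_alloc();

	grp_get(g);	/* a queue pins the group (the bfqg half of
			 * bfqg_and_blkg_get())			      */
	grp_put(g);	/* the blkg is torn down: bfq_pd_free() drops the
			 * creation reference, but the group survives... */
	grp_put(g);	/* ...until the last queue reference is dropped   */
	return 0;
}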