diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-09-18 01:57:47 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-09-18 01:57:47 +0200 |
commit | 7ad67ca5534ee7c958559c4ad610f05c4578e361 (patch) | |
tree | dc6b6a8a6b70b5f25b07bcdc06d8e77e705f6822 /block | |
parent | Merge tag 'for-5.4/libata-2019-09-15' of git://git.kernel.dk/linux-block (diff) | |
parent | null_blk: format pr_* logs with pr_fmt (diff) | |
download | linux-7ad67ca5534ee7c958559c4ad610f05c4578e361.tar.xz linux-7ad67ca5534ee7c958559c4ad610f05c4578e361.zip |
Merge tag 'for-5.4/block-2019-09-16' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
- Two NVMe pull requests:
- ana log parse fix from Anton
- nvme quirks support for Apple devices from Ben
- fix missing bio completion tracing for multipath stack devices
from Hannes and Mikhail
- IP TOS settings for nvme rdma and tcp transports from Israel
- rq_dma_dir cleanups from Israel
- tracing for Get LBA Status command from Minwoo
- Some nvme-tcp cleanups from Minwoo, Potnuri and myself
- Some consolidation between the fabrics transports for handling
the CAP register
- reset race with ns scanning fix for fabrics (move fabrics
commands to a dedicated request queue with a different lifetime
from the admin request queue).
- controller reset and namespace scan races fixes
- nvme discovery log change uevent support
- naming improvements from Keith
- multiple discovery controllers reject fix from James
- some regular cleanups from various people
- Series fixing (and re-fixing) null_blk debug printing and nr_devices
checks (André)
- A few pull requests from Song, with fixes from Andy, Guoqing,
Guilherme, Neil, Nigel, and Yufen.
- REQ_OP_ZONE_RESET_ALL support (Chaitanya)
- Bio merge handling unification (Christoph)
- Pick default elevator correctly for devices with special needs
(Damien)
- Block stats fixes (Hou)
- Timeout and support devices nbd fixes (Mike)
- Series fixing races around elevator switching and device add/remove
(Ming)
- sed-opal cleanups (Revanth)
- Per device weight support for BFQ (Fam)
- Support for blk-iocost, a new model that can properly account cost of
IO workloads. (Tejun)
- blk-cgroup writeback fixes (Tejun)
- paride queue init fixes (zhengbin)
- blk_set_runtime_active() cleanup (Stanley)
- Block segment mapping optimizations (Bart)
- lightnvm fixes (Hans/Minwoo/YueHaibing)
- Various little fixes and cleanups
* tag 'for-5.4/block-2019-09-16' of git://git.kernel.dk/linux-block: (186 commits)
null_blk: format pr_* logs with pr_fmt
null_blk: match the type of parameter nr_devices
null_blk: do not fail the module load with zero devices
block: also check RQF_STATS in blk_mq_need_time_stamp()
block: make rq sector size accessible for block stats
bfq: Fix bfq linkage error
raid5: use bio_end_sector in r5_next_bio
raid5: remove STRIPE_OPS_REQ_PENDING
md: add feature flag MD_FEATURE_RAID0_LAYOUT
md/raid0: avoid RAID0 data corruption due to layout confusion.
raid5: don't set STRIPE_HANDLE to stripe which is in batch list
raid5: don't increment read_errors on EILSEQ return
nvmet: fix a wrong error status returned in error log page
nvme: send discovery log page change events to userspace
nvme: add uevent variables for controller devices
nvme: enable aen regardless of the presence of I/O queues
nvme-fabrics: allow discovery subsystems accept a kato
nvmet: Use PTR_ERR_OR_ZERO() in nvmet_init_discovery()
nvme: Remove redundant assignment of cq vector
nvme: Assign subsys instance from first ctrl
...
Diffstat (limited to 'block')
-rw-r--r-- | block/Kconfig | 13 | ||||
-rw-r--r-- | block/Makefile | 1 | ||||
-rw-r--r-- | block/bfq-cgroup.c | 156 | ||||
-rw-r--r-- | block/bfq-iosched.h | 3 | ||||
-rw-r--r-- | block/bfq-wf2q.c | 2 | ||||
-rw-r--r-- | block/bio.c | 60 | ||||
-rw-r--r-- | block/blk-cgroup.c | 73 | ||||
-rw-r--r-- | block/blk-core.c | 37 | ||||
-rw-r--r-- | block/blk-iocost.c | 2457 | ||||
-rw-r--r-- | block/blk-iolatency.c | 8 | ||||
-rw-r--r-- | block/blk-merge.c | 151 | ||||
-rw-r--r-- | block/blk-mq-cpumap.c | 29 | ||||
-rw-r--r-- | block/blk-mq-sysfs.c | 23 | ||||
-rw-r--r-- | block/blk-mq-tag.c | 32 | ||||
-rw-r--r-- | block/blk-mq.c | 69 | ||||
-rw-r--r-- | block/blk-pm.c | 12 | ||||
-rw-r--r-- | block/blk-rq-qos.c | 18 | ||||
-rw-r--r-- | block/blk-rq-qos.h | 28 | ||||
-rw-r--r-- | block/blk-settings.c | 18 | ||||
-rw-r--r-- | block/blk-sysfs.c | 50 | ||||
-rw-r--r-- | block/blk-throttle.c | 9 | ||||
-rw-r--r-- | block/blk-wbt.c | 20 | ||||
-rw-r--r-- | block/blk-wbt.h | 4 | ||||
-rw-r--r-- | block/blk-zoned.c | 39 | ||||
-rw-r--r-- | block/blk.h | 4 | ||||
-rw-r--r-- | block/elevator.c | 217 | ||||
-rw-r--r-- | block/genhd.c | 9 | ||||
-rw-r--r-- | block/mq-deadline.c | 20 | ||||
-rw-r--r-- | block/opal_proto.h | 5 | ||||
-rw-r--r-- | block/sed-opal.c | 49 |
30 files changed, 3274 insertions, 342 deletions
diff --git a/block/Kconfig b/block/Kconfig index 8b5f8e560eb4..41c0917ce622 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -26,6 +26,9 @@ menuconfig BLOCK if BLOCK +config BLK_RQ_ALLOC_TIME + bool + config BLK_SCSI_REQUEST bool @@ -132,6 +135,16 @@ config BLK_CGROUP_IOLATENCY Note, this is an experimental interface and could be changed someday. +config BLK_CGROUP_IOCOST + bool "Enable support for cost model based cgroup IO controller" + depends on BLK_CGROUP=y + select BLK_RQ_ALLOC_TIME + ---help--- + Enabling this option enables the .weight interface for cost + model based proportional IO control. The IO controller + distributes IO capacity between different groups based on + their share of the overall weight distribution. + config BLK_WBT_MQ bool "Multiqueue writeback throttling" default y diff --git a/block/Makefile b/block/Makefile index eee1b4ceecf9..9ef57ace90d4 100644 --- a/block/Makefile +++ b/block/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o +obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 0f6cd688924f..86a607cf19a1 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -501,11 +501,12 @@ static void bfq_cpd_free(struct blkcg_policy_data *cpd) kfree(cpd_to_bfqgd(cpd)); } -static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) +static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, struct request_queue *q, + struct blkcg *blkcg) { struct bfq_group *bfqg; - bfqg = kzalloc_node(sizeof(*bfqg), gfp, node); + bfqg = kzalloc_node(sizeof(*bfqg), gfp, q->node); if (!bfqg) return NULL; @@ -904,7 +905,7 @@ void bfq_end_wr_async(struct bfq_data *bfqd) 
bfq_end_wr_async_queues(bfqd, bfqd->root_group); } -static int bfq_io_show_weight(struct seq_file *sf, void *v) +static int bfq_io_show_weight_legacy(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); @@ -918,6 +919,60 @@ static int bfq_io_show_weight(struct seq_file *sf, void *v) return 0; } +static u64 bfqg_prfill_weight_device(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct bfq_group *bfqg = pd_to_bfqg(pd); + + if (!bfqg->entity.dev_weight) + return 0; + return __blkg_prfill_u64(sf, pd, bfqg->entity.dev_weight); +} + +static int bfq_io_show_weight(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); + + seq_printf(sf, "default %u\n", bfqgd->weight); + blkcg_print_blkgs(sf, blkcg, bfqg_prfill_weight_device, + &blkcg_policy_bfq, 0, false); + return 0; +} + +static void bfq_group_set_weight(struct bfq_group *bfqg, u64 weight, u64 dev_weight) +{ + weight = dev_weight ?: weight; + + bfqg->entity.dev_weight = dev_weight; + /* + * Setting the prio_changed flag of the entity + * to 1 with new_weight == weight would re-set + * the value of the weight to its ioprio mapping. + * Set the flag only if necessary. + */ + if ((unsigned short)weight != bfqg->entity.new_weight) { + bfqg->entity.new_weight = (unsigned short)weight; + /* + * Make sure that the above new value has been + * stored in bfqg->entity.new_weight before + * setting the prio_changed flag. In fact, + * this flag may be read asynchronously (in + * critical sections protected by a different + * lock than that held here), and finding this + * flag set may cause the execution of the code + * for updating parameters whose value may + * depend also on bfqg->entity.new_weight (in + * __bfq_entity_update_weight_prio). 
+ * This barrier makes sure that the new value + * of bfqg->entity.new_weight is correctly + * seen in that code. + */ + smp_wmb(); + bfqg->entity.prio_changed = 1; + } +} + static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, struct cftype *cftype, u64 val) @@ -936,55 +991,72 @@ static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { struct bfq_group *bfqg = blkg_to_bfqg(blkg); - if (!bfqg) - continue; - /* - * Setting the prio_changed flag of the entity - * to 1 with new_weight == weight would re-set - * the value of the weight to its ioprio mapping. - * Set the flag only if necessary. - */ - if ((unsigned short)val != bfqg->entity.new_weight) { - bfqg->entity.new_weight = (unsigned short)val; - /* - * Make sure that the above new value has been - * stored in bfqg->entity.new_weight before - * setting the prio_changed flag. In fact, - * this flag may be read asynchronously (in - * critical sections protected by a different - * lock than that held here), and finding this - * flag set may cause the execution of the code - * for updating parameters whose value may - * depend also on bfqg->entity.new_weight (in - * __bfq_entity_update_weight_prio). - * This barrier makes sure that the new value - * of bfqg->entity.new_weight is correctly - * seen in that code. 
- */ - smp_wmb(); - bfqg->entity.prio_changed = 1; - } + if (bfqg) + bfq_group_set_weight(bfqg, val, 0); } spin_unlock_irq(&blkcg->lock); return ret; } -static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, - char *buf, size_t nbytes, - loff_t off) +static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) { - u64 weight; - /* First unsigned long found in the file is used */ - int ret = kstrtoull(strim(buf), 0, &weight); + int ret; + struct blkg_conf_ctx ctx; + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct bfq_group *bfqg; + u64 v; + ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, buf, &ctx); if (ret) return ret; - ret = bfq_io_set_weight_legacy(of_css(of), NULL, weight); + if (sscanf(ctx.body, "%llu", &v) == 1) { + /* require "default" on dfl */ + ret = -ERANGE; + if (!v) + goto out; + } else if (!strcmp(strim(ctx.body), "default")) { + v = 0; + } else { + ret = -EINVAL; + goto out; + } + + bfqg = blkg_to_bfqg(ctx.blkg); + + ret = -ERANGE; + if (!v || (v >= BFQ_MIN_WEIGHT && v <= BFQ_MAX_WEIGHT)) { + bfq_group_set_weight(bfqg, bfqg->entity.weight, v); + ret = 0; + } +out: + blkg_conf_finish(&ctx); return ret ?: nbytes; } +static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + char *endp; + int ret; + u64 v; + + buf = strim(buf); + + /* "WEIGHT" or "default WEIGHT" sets the default weight */ + v = simple_strtoull(buf, &endp, 0); + if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { + ret = bfq_io_set_weight_legacy(of_css(of), NULL, v); + return ret ?: nbytes; + } + + return bfq_io_set_device_weight(of, buf, nbytes, off); +} + #ifdef CONFIG_BFQ_CGROUP_DEBUG static int bfqg_print_stat(struct seq_file *sf, void *v) { @@ -1141,9 +1213,15 @@ struct cftype bfq_blkcg_legacy_files[] = { { .name = "bfq.weight", .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = bfq_io_show_weight, + .seq_show = bfq_io_show_weight_legacy, .write_u64 = 
bfq_io_set_weight_legacy, }, + { + .name = "bfq.weight_device", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = bfq_io_show_weight, + .write = bfq_io_set_weight, + }, /* statistics, covers only the tasks in the bfqg */ { diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index e80adf822bbe..5d1a519640f6 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -168,6 +168,9 @@ struct bfq_entity { /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ int budget; + /* device weight, if non-zero, it overrides the default weight of + * bfq_group_data */ + int dev_weight; /* weight of the queue */ int weight; /* next weight if a change is in progress */ diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index c9ba225081ce..05f0bf4a1144 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -744,6 +744,8 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, } #endif + /* Matches the smp_wmb() in bfq_group_set_weight. */ + smp_rmb(); old_st->wsum -= entity->weight; if (entity->new_weight != entity->orig_weight) { diff --git a/block/bio.c b/block/bio.c index 299a0e7651ec..8f0ed6228fc5 100644 --- a/block/bio.c +++ b/block/bio.c @@ -646,25 +646,20 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, return true; } -/* - * Check if the @page can be added to the current segment(@bv), and make - * sure to call it only if page_is_mergeable(@bv, @page) is true - */ -static bool can_add_page_to_seg(struct request_queue *q, - struct bio_vec *bv, struct page *page, unsigned len, - unsigned offset) +static bool bio_try_merge_pc_page(struct request_queue *q, struct bio *bio, + struct page *page, unsigned len, unsigned offset, + bool *same_page) { + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; unsigned long mask = queue_segment_boundary(q); phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset; phys_addr_t addr2 = page_to_phys(page) + offset + len - 1; if ((addr1 | mask) != (addr2 | mask)) return false; - if 
(bv->bv_len + len > queue_max_segment_size(q)) return false; - - return true; + return __bio_try_merge_page(bio, page, len, offset, same_page); } /** @@ -674,7 +669,7 @@ static bool can_add_page_to_seg(struct request_queue *q, * @page: page to add * @len: vec entry length * @offset: vec entry offset - * @put_same_page: put the page if it is same with last added page + * @same_page: return if the merge happen inside the same page * * Attempt to add a page to the bio_vec maplist. This can fail for a * number of reasons, such as the bio being full or target block device @@ -685,10 +680,9 @@ static bool can_add_page_to_seg(struct request_queue *q, */ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, - bool put_same_page) + bool *same_page) { struct bio_vec *bvec; - bool same_page = false; /* * cloned bio must not modify vec list @@ -700,28 +694,16 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio, return 0; if (bio->bi_vcnt > 0) { - bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; - - if (page == bvec->bv_page && - offset == bvec->bv_offset + bvec->bv_len) { - if (put_same_page) - put_page(page); - bvec->bv_len += len; - goto done; - } + if (bio_try_merge_pc_page(q, bio, page, len, offset, same_page)) + return len; /* - * If the queue doesn't support SG gaps and adding this - * offset would create a gap, disallow it. + * If the queue doesn't support SG gaps and adding this segment + * would create a gap, disallow it. 
*/ + bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; if (bvec_gap_to_prev(q, bvec, offset)) return 0; - - if (page_is_mergeable(bvec, page, len, offset, &same_page) && - can_add_page_to_seg(q, bvec, page, len, offset)) { - bvec->bv_len += len; - goto done; - } } if (bio_full(bio, len)) @@ -735,7 +717,6 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio, bvec->bv_len = len; bvec->bv_offset = offset; bio->bi_vcnt++; - done: bio->bi_iter.bi_size += len; return len; } @@ -743,7 +724,8 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio, int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { - return __bio_add_pc_page(q, bio, page, len, offset, false); + bool same_page = false; + return __bio_add_pc_page(q, bio, page, len, offset, &same_page); } EXPORT_SYMBOL(bio_add_pc_page); @@ -806,6 +788,9 @@ void __bio_add_page(struct bio *bio, struct page *page, bio->bi_iter.bi_size += len; bio->bi_vcnt++; + + if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page))) + bio_set_flag(bio, BIO_WORKINGSET); } EXPORT_SYMBOL_GPL(__bio_add_page); @@ -1384,13 +1369,17 @@ struct bio *bio_map_user_iov(struct request_queue *q, for (j = 0; j < npages; j++) { struct page *page = pages[j]; unsigned int n = PAGE_SIZE - offs; + bool same_page = false; if (n > bytes) n = bytes; if (!__bio_add_pc_page(q, bio, page, n, offs, - true)) + &same_page)) { + if (same_page) + put_page(page); break; + } added += n; bytes -= n; @@ -1521,7 +1510,6 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, bio->bi_end_io = bio_map_kern_endio; return bio; } -EXPORT_SYMBOL(bio_map_kern); static void bio_copy_kern_endio(struct bio *bio) { @@ -1842,8 +1830,8 @@ EXPORT_SYMBOL(bio_endio); * @bio, and updates @bio to represent the remaining sectors. 
* * Unless this is a discard request the newly allocated bio will point - * to @bio's bi_io_vec; it is the caller's responsibility to ensure that - * @bio is not freed before the split. + * to @bio's bi_io_vec. It is the caller's responsibility to ensure that + * neither @bio nor @bs are freed before the split bio. */ struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 55a7dc227dfb..b6f20be0fc78 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -175,7 +175,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, continue; /* alloc per-policy data and attach it to blkg */ - pd = pol->pd_alloc_fn(gfp_mask, q->node); + pd = pol->pd_alloc_fn(gfp_mask, q, blkcg); if (!pd) goto err_free; @@ -755,6 +755,44 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, /** * blkg_conf_prep - parse and prepare for per-blkg config update + * @inputp: input string pointer + * + * Parse the device node prefix part, MAJ:MIN, of per-blkg config update + * from @input and get and return the matching gendisk. *@inputp is + * updated to point past the device node prefix. Returns an ERR_PTR() + * value on error. + * + * Use this function iff blkg_conf_prep() can't be used for some reason. 
+ */ +struct gendisk *blkcg_conf_get_disk(char **inputp) +{ + char *input = *inputp; + unsigned int major, minor; + struct gendisk *disk; + int key_len, part; + + if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) + return ERR_PTR(-EINVAL); + + input += key_len; + if (!isspace(*input)) + return ERR_PTR(-EINVAL); + input = skip_spaces(input); + + disk = get_gendisk(MKDEV(major, minor), &part); + if (!disk) + return ERR_PTR(-ENODEV); + if (part) { + put_disk_and_module(disk); + return ERR_PTR(-ENODEV); + } + + *inputp = input; + return disk; +} + +/** + * blkg_conf_prep - parse and prepare for per-blkg config update * @blkcg: target block cgroup * @pol: target policy * @input: input string @@ -772,25 +810,11 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct gendisk *disk; struct request_queue *q; struct blkcg_gq *blkg; - unsigned int major, minor; - int key_len, part, ret; - char *body; - - if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) - return -EINVAL; - - body = input + key_len; - if (!isspace(*body)) - return -EINVAL; - body = skip_spaces(body); + int ret; - disk = get_gendisk(MKDEV(major, minor), &part); - if (!disk) - return -ENODEV; - if (part) { - ret = -ENODEV; - goto fail; - } + disk = blkcg_conf_get_disk(&input); + if (IS_ERR(disk)) + return PTR_ERR(disk); q = disk->queue; @@ -856,7 +880,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, success: ctx->disk = disk; ctx->blkg = blkg; - ctx->body = body; + ctx->body = input; return 0; fail_unlock: @@ -876,6 +900,7 @@ fail: } return ret; } +EXPORT_SYMBOL_GPL(blkg_conf_prep); /** * blkg_conf_finish - finish up per-blkg config update @@ -891,6 +916,7 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx) rcu_read_unlock(); put_disk_and_module(ctx->disk); } +EXPORT_SYMBOL_GPL(blkg_conf_finish); static int blkcg_print_stat(struct seq_file *sf, void *v) { @@ -1346,7 +1372,7 @@ int blkcg_activate_policy(struct request_queue *q, 
blk_mq_freeze_queue(q); pd_prealloc: if (!pd_prealloc) { - pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node); + pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q, &blkcg_root); if (!pd_prealloc) { ret = -ENOMEM; goto out_bypass_end; @@ -1362,7 +1388,7 @@ pd_prealloc: if (blkg->pd[pol->plid]) continue; - pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q->node); + pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q, &blkcg_root); if (!pd) swap(pd, pd_prealloc); if (!pd) { @@ -1475,7 +1501,8 @@ int blkcg_policy_register(struct blkcg_policy *pol) blkcg->cpd[pol->plid] = cpd; cpd->blkcg = blkcg; cpd->plid = pol->plid; - pol->cpd_init_fn(cpd); + if (pol->cpd_init_fn) + pol->cpd_init_fn(cpd); } } diff --git a/block/blk-core.c b/block/blk-core.c index d0cc6e14d2f0..875e8d105067 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -36,6 +36,7 @@ #include <linux/blk-cgroup.h> #include <linux/debugfs.h> #include <linux/bpf.h> +#include <linux/psi.h> #define CREATE_TRACE_POINTS #include <trace/events/block.h> @@ -129,6 +130,7 @@ static const char *const blk_op_name[] = { REQ_OP_NAME(DISCARD), REQ_OP_NAME(SECURE_ERASE), REQ_OP_NAME(ZONE_RESET), + REQ_OP_NAME(ZONE_RESET_ALL), REQ_OP_NAME(WRITE_SAME), REQ_OP_NAME(WRITE_ZEROES), REQ_OP_NAME(SCSI_IN), @@ -344,7 +346,8 @@ void blk_cleanup_queue(struct request_queue *q) /* * Drain all requests queued before DYING marking. Set DEAD flag to - * prevent that q->request_fn() gets invoked after draining finished. + * prevent that blk_mq_run_hw_queues() accesses the hardware queues + * after draining finished. 
*/ blk_freeze_queue(q); @@ -479,7 +482,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (!q) return NULL; - INIT_LIST_HEAD(&q->queue_head); q->last_merge = NULL; q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); @@ -518,6 +520,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) mutex_init(&q->blk_trace_mutex); #endif mutex_init(&q->sysfs_lock); + mutex_init(&q->sysfs_dir_lock); spin_lock_init(&q->queue_lock); init_waitqueue_head(&q->mq_freeze_wq); @@ -601,6 +604,7 @@ bool bio_attempt_back_merge(struct request *req, struct bio *bio, return false; trace_block_bio_backmerge(req->q, req, bio); + rq_qos_merge(req->q, req, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) blk_rq_set_mixed_merge(req); @@ -622,6 +626,7 @@ bool bio_attempt_front_merge(struct request *req, struct bio *bio, return false; trace_block_bio_frontmerge(req->q, req, bio); + rq_qos_merge(req->q, req, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) blk_rq_set_mixed_merge(req); @@ -647,6 +652,8 @@ bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, blk_rq_get_max_sectors(req, blk_rq_pos(req))) goto no_merge; + rq_qos_merge(q, req, bio); + req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; @@ -931,6 +938,10 @@ generic_make_request_checks(struct bio *bio) if (!blk_queue_is_zoned(q)) goto not_supported; break; + case REQ_OP_ZONE_RESET_ALL: + if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q)) + goto not_supported; + break; case REQ_OP_WRITE_ZEROES: if (!q->limits.max_write_zeroes_sectors) goto not_supported; @@ -1128,6 +1139,10 @@ EXPORT_SYMBOL_GPL(direct_make_request); */ blk_qc_t submit_bio(struct bio *bio) { + bool workingset_read = false; + unsigned long pflags; + blk_qc_t ret; + if (blkcg_punt_bio_submit(bio)) return BLK_QC_T_NONE; @@ -1146,6 +1161,8 @@ blk_qc_t submit_bio(struct bio *bio) if (op_is_write(bio_op(bio))) { count_vm_events(PGPGOUT, count); } 
else { + if (bio_flagged(bio, BIO_WORKINGSET)) + workingset_read = true; task_io_account_read(bio->bi_iter.bi_size); count_vm_events(PGPGIN, count); } @@ -1160,7 +1177,21 @@ blk_qc_t submit_bio(struct bio *bio) } } - return generic_make_request(bio); + /* + * If we're reading data that is part of the userspace + * workingset, count submission time as memory stall. When the + * device is congested, or the submitting cgroup IO-throttled, + * submission can be a significant part of overall IO time. + */ + if (workingset_read) + psi_memstall_enter(&pflags); + + ret = generic_make_request(bio); + + if (workingset_read) + psi_memstall_leave(&pflags); + + return ret; } EXPORT_SYMBOL(submit_bio); diff --git a/block/blk-iocost.c b/block/blk-iocost.c new file mode 100644 index 000000000000..3b39deb8b9f8 --- /dev/null +++ b/block/blk-iocost.c @@ -0,0 +1,2457 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * IO cost model based controller. + * + * Copyright (C) 2019 Tejun Heo <tj@kernel.org> + * Copyright (C) 2019 Andy Newell <newella@fb.com> + * Copyright (C) 2019 Facebook + * + * One challenge of controlling IO resources is the lack of trivially + * observable cost metric. This is distinguished from CPU and memory where + * wallclock time and the number of bytes can serve as accurate enough + * approximations. + * + * Bandwidth and iops are the most commonly used metrics for IO devices but + * depending on the type and specifics of the device, different IO patterns + * easily lead to multiple orders of magnitude variations rendering them + * useless for the purpose of IO capacity distribution. While on-device + * time, with a lot of clutches, could serve as a useful approximation for + * non-queued rotational devices, this is no longer viable with modern + * devices, even the rotational ones. + * + * While there is no cost metric we can trivially observe, it isn't a + * complete mystery. 
For example, on a rotational device, seek cost + * dominates while a contiguous transfer contributes a smaller amount + * proportional to the size. If we can characterize at least the relative + * costs of these different types of IOs, it should be possible to + * implement a reasonable work-conserving proportional IO resource + * distribution. + * + * 1. IO Cost Model + * + * IO cost model estimates the cost of an IO given its basic parameters and + * history (e.g. the end sector of the last IO). The cost is measured in + * device time. If a given IO is estimated to cost 10ms, the device should + * be able to process ~100 of those IOs in a second. + * + * Currently, there's only one builtin cost model - linear. Each IO is + * classified as sequential or random and given a base cost accordingly. + * On top of that, a size cost proportional to the length of the IO is + * added. While simple, this model captures the operational + * characteristics of a wide variety of devices well enough. Default + * parameters for several different classes of devices are provided and the + * parameters can be configured from userspace via + * /sys/fs/cgroup/io.cost.model. + * + * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate + * device-specific coefficients. + * + * 2. Control Strategy + * + * The device virtual time (vtime) is used as the primary control metric. + * The control strategy is composed of the following three parts. + * + * 2-1. Vtime Distribution + * + * When a cgroup becomes active in terms of IOs, its hierarchical share is + * calculated. Please consider the following hierarchy where the numbers + * inside parentheses denote the configured weights.
+ * + * root + * / \ + * A (w:100) B (w:300) + * / \ + * A0 (w:100) A1 (w:100) + * + * If B is idle and only A0 and A1 are actively issuing IOs, as the two are + * of equal weight, each gets 50% share. If then B starts issuing IOs, B + * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest, + * 12.5% each. The distribution mechanism only cares about these flattened + * shares. They're called hweights (hierarchical weights) and always add + * up to 1 (HWEIGHT_WHOLE). + * + * A given cgroup's vtime runs slower in inverse proportion to its hweight. + * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5) + * against the device vtime - an IO which takes 10ms on the underlying + * device is considered to take 80ms on A0. + * + * This constitutes the basis of IO capacity distribution. Each cgroup's + * vtime is running at a rate determined by its hweight. A cgroup tracks + * the vtime consumed by past IOs and can issue a new IO iff doing so + * wouldn't outrun the current device vtime. Otherwise, the IO is + * suspended until the vtime has progressed enough to cover it. + * + * 2-2. Vrate Adjustment + * + * It's unrealistic to expect the cost model to be perfect. There are too + * many devices and even on the same device the overall performance + * fluctuates depending on numerous factors such as IO mixture and device + * internal garbage collection. The controller needs to adapt dynamically. + * + * This is achieved by adjusting the overall IO rate according to how busy + * the device is. If the device becomes overloaded, we're sending down too + * many IOs and should generally slow down. If there are waiting issuers + * but the device isn't saturated, we're issuing too few and should + * generally speed up. + * + * To slow down, we lower the vrate - the rate at which the device vtime + * passes compared to the wall clock.
For example, if the vtime is running + * at the vrate of 75%, all cgroups added up would only be able to issue + * 750ms worth of IOs per second, and vice-versa for speeding up. + * + * Device busyness is determined using two criteria - rq wait and + * completion latencies. + * + * When a device gets saturated, the on-device and then the request queues + * fill up and a bio which is ready to be issued has to wait for a request + * to become available. When this delay becomes noticeable, it's a clear + * indication that the device is saturated and we lower the vrate. This + * saturation signal is fairly conservative as it only triggers when both + * hardware and software queues are filled up, and is used as the default + * busy signal. + * + * As devices can have deep queues and be unfair in how the queued commands + * are executed, solely depending on rq wait may not result in satisfactory + * control quality. For a better control quality, completion latency QoS + * parameters can be configured so that the device is considered saturated + * if N'th percentile completion latency rises above the set point. + * + * The completion latency requirements are a function of both the + * underlying device characteristics and the desired IO latency quality of + * service. There is an inherent trade-off - the tighter the latency QoS, + * the higher the bandwidth lossage. Latency QoS is disabled by default + * and can be set through /sys/fs/cgroup/io.cost.qos. + * + * 2-3. Work Conservation + * + * Imagine two cgroups A and B with equal weights. A is issuing a small IO + * periodically while B is sending out enough parallel IOs to saturate the + * device on its own. Let's say A's usage amounts to 100ms worth of IO + * cost per second, i.e., 10% of the device capacity. The naive + * distribution of half and half would lead to 60% utilization of the + * device, a significant reduction in the total amount of work done + * compared to free-for-all competition.
This is too high a cost to pay + * for IO control. + * + * To conserve the total amount of work done, we keep track of how much + * each active cgroup is actually using and yield part of its weight if + * there are other cgroups which can make use of it. In the above case, + * A's weight will be lowered so that it hovers above the actual usage and + * B would be able to use the rest. + * + * As we don't want to penalize a cgroup for donating its weight, the + * surplus weight adjustment factors in a margin and has an immediate + * snapback mechanism in case the cgroup needs more IO vtime for itself. + * + * Note that adjusting down surplus weights has the same effects as + * accelerating vtime for other cgroups and work conservation can also be + * implemented by adjusting vrate dynamically. However, squaring who can + * donate and should take back how much requires hweight propagations + * anyway making it easier to implement and understand as a separate + * mechanism. + * + * 3. Monitoring + * + * Instead of debugfs or other clumsy monitoring mechanisms, this + * controller uses a drgn based monitoring script - + * tools/cgroup/iocost_monitor.py. For details on drgn, please see + * https://github.com/osandov/drgn. The output looks like the following.
+ * + * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12% + * active weight hweight% inflt% dbt delay usages% + * test/a * 50/ 50 33.33/ 33.33 27.65 2 0*041 033:033:033 + * test/b * 100/ 100 66.67/ 66.67 17.56 0 0*000 066:079:077 + * + * - per : Timer period + * - cur_per : Internal wall and device vtime clock + * - vrate : Device virtual time rate against wall clock + * - weight : Surplus-adjusted and configured weights + * - hweight : Surplus-adjusted and configured hierarchical weights + * - inflt : The percentage of in-flight IO cost at the end of last period + * - del_ms : Deferred issuer delay induction level and duration + * - usages : Usage history + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/timer.h> +#include <linux/time64.h> +#include <linux/parser.h> +#include <linux/sched/signal.h> +#include <linux/blk-cgroup.h> +#include "blk-rq-qos.h" +#include "blk-stat.h" +#include "blk-wbt.h" + +#ifdef CONFIG_TRACEPOINTS + +/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */ +#define TRACE_IOCG_PATH_LEN 1024 +static DEFINE_SPINLOCK(trace_iocg_path_lock); +static char trace_iocg_path[TRACE_IOCG_PATH_LEN]; + +#define TRACE_IOCG_PATH(type, iocg, ...) \ + do { \ + unsigned long flags; \ + if (trace_iocost_##type##_enabled()) { \ + spin_lock_irqsave(&trace_iocg_path_lock, flags); \ + cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \ + trace_iocg_path, TRACE_IOCG_PATH_LEN); \ + trace_iocost_##type(iocg, trace_iocg_path, \ + ##__VA_ARGS__); \ + spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \ + } \ + } while (0) + +#else /* CONFIG_TRACE_POINTS */ +#define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0) +#endif /* CONFIG_TRACE_POINTS */ + +enum { + MILLION = 1000000, + + /* timer period is calculated from latency requirements, bound it */ + MIN_PERIOD = USEC_PER_MSEC, + MAX_PERIOD = USEC_PER_SEC, + + /* + * A cgroup's vtime can run 50% behind the device vtime, which + * serves as its IO credit buffer. 
Surplus weight adjustment is + * immediately canceled if the vtime margin runs below 10%. + */ + MARGIN_PCT = 50, + INUSE_MARGIN_PCT = 10, + + /* Have some play in waitq timer operations */ + WAITQ_TIMER_MARGIN_PCT = 5, + + /* + * vtime can wrap well within a reasonable uptime when vrate is + * consistently raised. Don't trust recorded cgroup vtime if the + * period counter indicates that it's older than 5mins. + */ + VTIME_VALID_DUR = 300 * USEC_PER_SEC, + + /* + * Remember the past three non-zero usages and use the max for + * surplus calculation. Three slots guarantee that we remember one + * full period usage from the last active stretch even after + * partial deactivation and re-activation periods. Don't start + * giving away weight before collecting two data points to prevent + * hweight adjustments based on one partial activation period. + */ + NR_USAGE_SLOTS = 3, + MIN_VALID_USAGES = 2, + + /* 1/64k is granular enough and can easily be handled w/ u32 */ + HWEIGHT_WHOLE = 1 << 16, + + /* + * As vtime is used to calculate the cost of each IO, it needs to + * be fairly high precision. For example, it should be able to + * represent the cost of a single page worth of discard with + * suffificient accuracy. At the same time, it should be able to + * represent reasonably long enough durations to be useful and + * convenient during operation. + * + * 1s worth of vtime is 2^37. This gives us both sub-nanosecond + * granularity and days of wrap-around time even at extreme vrates. 
+ */ + VTIME_PER_SEC_SHIFT = 37, + VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT, + VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC, + + /* bound vrate adjustments within two orders of magnitude */ + VRATE_MIN_PPM = 10000, /* 1% */ + VRATE_MAX_PPM = 100000000, /* 10000% */ + + VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION, + VRATE_CLAMP_ADJ_PCT = 4, + + /* if IOs end up waiting for requests, issue less */ + RQ_WAIT_BUSY_PCT = 5, + + /* unbusy hysterisis */ + UNBUSY_THR_PCT = 75, + + /* don't let cmds which take a very long time pin lagging for too long */ + MAX_LAGGING_PERIODS = 10, + + /* + * If usage% * 1.25 + 2% is lower than hweight% by more than 3%, + * donate the surplus. + */ + SURPLUS_SCALE_PCT = 125, /* * 125% */ + SURPLUS_SCALE_ABS = HWEIGHT_WHOLE / 50, /* + 2% */ + SURPLUS_MIN_ADJ_DELTA = HWEIGHT_WHOLE / 33, /* 3% */ + + /* switch iff the conditions are met for longer than this */ + AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC, + + /* + * Count IO size in 4k pages. The 12bit shift helps keeping + * size-proportional components of cost calculation in closer + * numbers of digits to per-IO cost components. 
+ */ + IOC_PAGE_SHIFT = 12, + IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT, + IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT, + + /* if apart further than 16M, consider randio for linear model */ + LCOEF_RANDIO_PAGES = 4096, +}; + +enum ioc_running { + IOC_IDLE, + IOC_RUNNING, + IOC_STOP, +}; + +/* io.cost.qos controls including per-dev enable of the whole controller */ +enum { + QOS_ENABLE, + QOS_CTRL, + NR_QOS_CTRL_PARAMS, +}; + +/* io.cost.qos params */ +enum { + QOS_RPPM, + QOS_RLAT, + QOS_WPPM, + QOS_WLAT, + QOS_MIN, + QOS_MAX, + NR_QOS_PARAMS, +}; + +/* io.cost.model controls */ +enum { + COST_CTRL, + COST_MODEL, + NR_COST_CTRL_PARAMS, +}; + +/* builtin linear cost model coefficients */ +enum { + I_LCOEF_RBPS, + I_LCOEF_RSEQIOPS, + I_LCOEF_RRANDIOPS, + I_LCOEF_WBPS, + I_LCOEF_WSEQIOPS, + I_LCOEF_WRANDIOPS, + NR_I_LCOEFS, +}; + +enum { + LCOEF_RPAGE, + LCOEF_RSEQIO, + LCOEF_RRANDIO, + LCOEF_WPAGE, + LCOEF_WSEQIO, + LCOEF_WRANDIO, + NR_LCOEFS, +}; + +enum { + AUTOP_INVALID, + AUTOP_HDD, + AUTOP_SSD_QD1, + AUTOP_SSD_DFL, + AUTOP_SSD_FAST, +}; + +struct ioc_gq; + +struct ioc_params { + u32 qos[NR_QOS_PARAMS]; + u64 i_lcoefs[NR_I_LCOEFS]; + u64 lcoefs[NR_LCOEFS]; + u32 too_fast_vrate_pct; + u32 too_slow_vrate_pct; +}; + +struct ioc_missed { + u32 nr_met; + u32 nr_missed; + u32 last_met; + u32 last_missed; +}; + +struct ioc_pcpu_stat { + struct ioc_missed missed[2]; + + u64 rq_wait_ns; + u64 last_rq_wait_ns; +}; + +/* per device */ +struct ioc { + struct rq_qos rqos; + + bool enabled; + + struct ioc_params params; + u32 period_us; + u32 margin_us; + u64 vrate_min; + u64 vrate_max; + + spinlock_t lock; + struct timer_list timer; + struct list_head active_iocgs; /* active cgroups */ + struct ioc_pcpu_stat __percpu *pcpu_stat; + + enum ioc_running running; + atomic64_t vtime_rate; + + seqcount_t period_seqcount; + u32 period_at; /* wallclock starttime */ + u64 period_at_vtime; /* vtime starttime */ + + atomic64_t cur_period; /* inc'd each period */ + int busy_level; /* 
saturation history */ + + u64 inuse_margin_vtime; + bool weights_updated; + atomic_t hweight_gen; /* for lazy hweights */ + + u64 autop_too_fast_at; + u64 autop_too_slow_at; + int autop_idx; + bool user_qos_params:1; + bool user_cost_model:1; +}; + +/* per device-cgroup pair */ +struct ioc_gq { + struct blkg_policy_data pd; + struct ioc *ioc; + + /* + * A iocg can get its weight from two sources - an explicit + * per-device-cgroup configuration or the default weight of the + * cgroup. `cfg_weight` is the explicit per-device-cgroup + * configuration. `weight` is the effective considering both + * sources. + * + * When an idle cgroup becomes active its `active` goes from 0 to + * `weight`. `inuse` is the surplus adjusted active weight. + * `active` and `inuse` are used to calculate `hweight_active` and + * `hweight_inuse`. + * + * `last_inuse` remembers `inuse` while an iocg is idle to persist + * surplus adjustments. + */ + u32 cfg_weight; + u32 weight; + u32 active; + u32 inuse; + u32 last_inuse; + + sector_t cursor; /* to detect randio */ + + /* + * `vtime` is this iocg's vtime cursor which progresses as IOs are + * issued. If lagging behind device vtime, the delta represents + * the currently available IO budget. If runnning ahead, the + * overage. + * + * `vtime_done` is the same but progressed on completion rather + * than issue. The delta behind `vtime` represents the cost of + * currently in-flight IOs. + * + * `last_vtime` is used to remember `vtime` at the end of the last + * period to calculate utilization. + */ + atomic64_t vtime; + atomic64_t done_vtime; + atomic64_t abs_vdebt; + u64 last_vtime; + + /* + * The period this iocg was last active in. Used for deactivation + * and invalidating `vtime`. 
+ */ + atomic64_t active_period; + struct list_head active_list; + + /* see __propagate_active_weight() and current_hweight() for details */ + u64 child_active_sum; + u64 child_inuse_sum; + int hweight_gen; + u32 hweight_active; + u32 hweight_inuse; + bool has_surplus; + + struct wait_queue_head waitq; + struct hrtimer waitq_timer; + struct hrtimer delay_timer; + + /* usage is recorded as fractions of HWEIGHT_WHOLE */ + int usage_idx; + u32 usages[NR_USAGE_SLOTS]; + + /* this iocg's depth in the hierarchy and ancestors including self */ + int level; + struct ioc_gq *ancestors[]; +}; + +/* per cgroup */ +struct ioc_cgrp { + struct blkcg_policy_data cpd; + unsigned int dfl_weight; +}; + +struct ioc_now { + u64 now_ns; + u32 now; + u64 vnow; + u64 vrate; +}; + +struct iocg_wait { + struct wait_queue_entry wait; + struct bio *bio; + u64 abs_cost; + bool committed; +}; + +struct iocg_wake_ctx { + struct ioc_gq *iocg; + u32 hw_inuse; + s64 vbudget; +}; + +static const struct ioc_params autop[] = { + [AUTOP_HDD] = { + .qos = { + [QOS_RLAT] = 50000, /* 50ms */ + [QOS_WLAT] = 50000, + [QOS_MIN] = VRATE_MIN_PPM, + [QOS_MAX] = VRATE_MAX_PPM, + }, + .i_lcoefs = { + [I_LCOEF_RBPS] = 174019176, + [I_LCOEF_RSEQIOPS] = 41708, + [I_LCOEF_RRANDIOPS] = 370, + [I_LCOEF_WBPS] = 178075866, + [I_LCOEF_WSEQIOPS] = 42705, + [I_LCOEF_WRANDIOPS] = 378, + }, + }, + [AUTOP_SSD_QD1] = { + .qos = { + [QOS_RLAT] = 25000, /* 25ms */ + [QOS_WLAT] = 25000, + [QOS_MIN] = VRATE_MIN_PPM, + [QOS_MAX] = VRATE_MAX_PPM, + }, + .i_lcoefs = { + [I_LCOEF_RBPS] = 245855193, + [I_LCOEF_RSEQIOPS] = 61575, + [I_LCOEF_RRANDIOPS] = 6946, + [I_LCOEF_WBPS] = 141365009, + [I_LCOEF_WSEQIOPS] = 33716, + [I_LCOEF_WRANDIOPS] = 26796, + }, + }, + [AUTOP_SSD_DFL] = { + .qos = { + [QOS_RLAT] = 25000, /* 25ms */ + [QOS_WLAT] = 25000, + [QOS_MIN] = VRATE_MIN_PPM, + [QOS_MAX] = VRATE_MAX_PPM, + }, + .i_lcoefs = { + [I_LCOEF_RBPS] = 488636629, + [I_LCOEF_RSEQIOPS] = 8932, + [I_LCOEF_RRANDIOPS] = 8518, + [I_LCOEF_WBPS] = 
427891549, + [I_LCOEF_WSEQIOPS] = 28755, + [I_LCOEF_WRANDIOPS] = 21940, + }, + .too_fast_vrate_pct = 500, + }, + [AUTOP_SSD_FAST] = { + .qos = { + [QOS_RLAT] = 5000, /* 5ms */ + [QOS_WLAT] = 5000, + [QOS_MIN] = VRATE_MIN_PPM, + [QOS_MAX] = VRATE_MAX_PPM, + }, + .i_lcoefs = { + [I_LCOEF_RBPS] = 3102524156LLU, + [I_LCOEF_RSEQIOPS] = 724816, + [I_LCOEF_RRANDIOPS] = 778122, + [I_LCOEF_WBPS] = 1742780862LLU, + [I_LCOEF_WSEQIOPS] = 425702, + [I_LCOEF_WRANDIOPS] = 443193, + }, + .too_slow_vrate_pct = 10, + }, +}; + +/* + * vrate adjust percentages indexed by ioc->busy_level. We adjust up on + * vtime credit shortage and down on device saturation. + */ +static u32 vrate_adj_pct[] = + { 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 }; + +static struct blkcg_policy blkcg_policy_iocost; + +/* accessors and helpers */ +static struct ioc *rqos_to_ioc(struct rq_qos *rqos) +{ + return container_of(rqos, struct ioc, rqos); +} + +static struct ioc *q_to_ioc(struct request_queue *q) +{ + return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST)); +} + +static const char *q_name(struct request_queue *q) +{ + if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) + return kobject_name(q->kobj.parent); + else + return "<unknown>"; +} + +static const char __maybe_unused *ioc_name(struct ioc *ioc) +{ + return q_name(ioc->rqos.q); +} + +static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd) +{ + return pd ? 
container_of(pd, struct ioc_gq, pd) : NULL; +} + +static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg) +{ + return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost)); +} + +static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg) +{ + return pd_to_blkg(&iocg->pd); +} + +static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg) +{ + return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost), + struct ioc_cgrp, cpd); +} + +/* + * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical + * weight, the more expensive each IO. Must round up. + */ +static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse) +{ + return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse); +} + +/* + * The inverse of abs_cost_to_cost(). Must round up. + */ +static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse) +{ + return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE); +} + +static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost) +{ + bio->bi_iocost_cost = cost; + atomic64_add(cost, &iocg->vtime); +} + +#define CREATE_TRACE_POINTS +#include <trace/events/iocost.h> + +/* latency Qos params changed, update period_us and all the dependent params */ +static void ioc_refresh_period_us(struct ioc *ioc) +{ + u32 ppm, lat, multi, period_us; + + lockdep_assert_held(&ioc->lock); + + /* pick the higher latency target */ + if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) { + ppm = ioc->params.qos[QOS_RPPM]; + lat = ioc->params.qos[QOS_RLAT]; + } else { + ppm = ioc->params.qos[QOS_WPPM]; + lat = ioc->params.qos[QOS_WLAT]; + } + + /* + * We want the period to be long enough to contain a healthy number + * of IOs while short enough for granular control. Define it as a + * multiple of the latency target. Ideally, the multiplier should + * be scaled according to the percentile so that it would nominally + * contain a certain number of requests. Let's be simpler and + * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50). 
+ */ + if (ppm) + multi = max_t(u32, (MILLION - ppm) / 50000, 2); + else + multi = 2; + period_us = multi * lat; + period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD); + + /* calculate dependent params */ + ioc->period_us = period_us; + ioc->margin_us = period_us * MARGIN_PCT / 100; + ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP( + period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100); +} + +static int ioc_autop_idx(struct ioc *ioc) +{ + int idx = ioc->autop_idx; + const struct ioc_params *p = &autop[idx]; + u32 vrate_pct; + u64 now_ns; + + /* rotational? */ + if (!blk_queue_nonrot(ioc->rqos.q)) + return AUTOP_HDD; + + /* handle SATA SSDs w/ broken NCQ */ + if (blk_queue_depth(ioc->rqos.q) == 1) + return AUTOP_SSD_QD1; + + /* use one of the normal ssd sets */ + if (idx < AUTOP_SSD_DFL) + return AUTOP_SSD_DFL; + + /* if user is overriding anything, maintain what was there */ + if (ioc->user_qos_params || ioc->user_cost_model) + return idx; + + /* step up/down based on the vrate */ + vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100, + VTIME_PER_USEC); + now_ns = ktime_get_ns(); + + if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { + if (!ioc->autop_too_fast_at) + ioc->autop_too_fast_at = now_ns; + if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC) + return idx + 1; + } else { + ioc->autop_too_fast_at = 0; + } + + if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) { + if (!ioc->autop_too_slow_at) + ioc->autop_too_slow_at = now_ns; + if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC) + return idx - 1; + } else { + ioc->autop_too_slow_at = 0; + } + + return idx; +} + +/* + * Take the followings as input + * + * @bps maximum sequential throughput + * @seqiops maximum sequential 4k iops + * @randiops maximum random 4k iops + * + * and calculate the linear model cost coefficients. 
+ * + * *@page per-page cost 1s / (@bps / 4096) + * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0) + * @randiops base cost of a rand IO max((1s / @randiops) - *@page, 0) + */ +static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops, + u64 *page, u64 *seqio, u64 *randio) +{ + u64 v; + + *page = *seqio = *randio = 0; + + if (bps) + *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, + DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE)); + + if (seqiops) { + v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops); + if (v > *page) + *seqio = v - *page; + } + + if (randiops) { + v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops); + if (v > *page) + *randio = v - *page; + } +} + +static void ioc_refresh_lcoefs(struct ioc *ioc) +{ + u64 *u = ioc->params.i_lcoefs; + u64 *c = ioc->params.lcoefs; + + calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS], + &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]); + calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS], + &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]); +} + +static bool ioc_refresh_params(struct ioc *ioc, bool force) +{ + const struct ioc_params *p; + int idx; + + lockdep_assert_held(&ioc->lock); + + idx = ioc_autop_idx(ioc); + p = &autop[idx]; + + if (idx == ioc->autop_idx && !force) + return false; + + if (idx != ioc->autop_idx) + atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); + + ioc->autop_idx = idx; + ioc->autop_too_fast_at = 0; + ioc->autop_too_slow_at = 0; + + if (!ioc->user_qos_params) + memcpy(ioc->params.qos, p->qos, sizeof(p->qos)); + if (!ioc->user_cost_model) + memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs)); + + ioc_refresh_period_us(ioc); + ioc_refresh_lcoefs(ioc); + + ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] * + VTIME_PER_USEC, MILLION); + ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] * + VTIME_PER_USEC, MILLION); + + return true; +} + +/* take a snapshot of the current [v]time and vrate */ +static void ioc_now(struct 
ioc *ioc, struct ioc_now *now) +{ + unsigned seq; + + now->now_ns = ktime_get(); + now->now = ktime_to_us(now->now_ns); + now->vrate = atomic64_read(&ioc->vtime_rate); + + /* + * The current vtime is + * + * vtime at period start + (wallclock time since the start) * vrate + * + * As a consistent snapshot of `period_at_vtime` and `period_at` is + * needed, they're seqcount protected. + */ + do { + seq = read_seqcount_begin(&ioc->period_seqcount); + now->vnow = ioc->period_at_vtime + + (now->now - ioc->period_at) * now->vrate; + } while (read_seqcount_retry(&ioc->period_seqcount, seq)); +} + +static void ioc_start_period(struct ioc *ioc, struct ioc_now *now) +{ + lockdep_assert_held(&ioc->lock); + WARN_ON_ONCE(ioc->running != IOC_RUNNING); + + write_seqcount_begin(&ioc->period_seqcount); + ioc->period_at = now->now; + ioc->period_at_vtime = now->vnow; + write_seqcount_end(&ioc->period_seqcount); + + ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us); + add_timer(&ioc->timer); +} + +/* + * Update @iocg's `active` and `inuse` to @active and @inuse, update level + * weight sums and propagate upwards accordingly. + */ +static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse) +{ + struct ioc *ioc = iocg->ioc; + int lvl; + + lockdep_assert_held(&ioc->lock); + + inuse = min(active, inuse); + + for (lvl = iocg->level - 1; lvl >= 0; lvl--) { + struct ioc_gq *parent = iocg->ancestors[lvl]; + struct ioc_gq *child = iocg->ancestors[lvl + 1]; + u32 parent_active = 0, parent_inuse = 0; + + /* update the level sums */ + parent->child_active_sum += (s32)(active - child->active); + parent->child_inuse_sum += (s32)(inuse - child->inuse); + /* apply the udpates */ + child->active = active; + child->inuse = inuse; + + /* + * The delta between inuse and active sums indicates that + * that much of weight is being given away. Parent's inuse + * and active should reflect the ratio. 
+ */ + if (parent->child_active_sum) { + parent_active = parent->weight; + parent_inuse = DIV64_U64_ROUND_UP( + parent_active * parent->child_inuse_sum, + parent->child_active_sum); + } + + /* do we need to keep walking up? */ + if (parent_active == parent->active && + parent_inuse == parent->inuse) + break; + + active = parent_active; + inuse = parent_inuse; + } + + ioc->weights_updated = true; +} + +static void commit_active_weights(struct ioc *ioc) +{ + lockdep_assert_held(&ioc->lock); + + if (ioc->weights_updated) { + /* paired with rmb in current_hweight(), see there */ + smp_wmb(); + atomic_inc(&ioc->hweight_gen); + ioc->weights_updated = false; + } +} + +static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse) +{ + __propagate_active_weight(iocg, active, inuse); + commit_active_weights(iocg->ioc); +} + +static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep) +{ + struct ioc *ioc = iocg->ioc; + int lvl; + u32 hwa, hwi; + int ioc_gen; + + /* hot path - if uptodate, use cached */ + ioc_gen = atomic_read(&ioc->hweight_gen); + if (ioc_gen == iocg->hweight_gen) + goto out; + + /* + * Paired with wmb in commit_active_weights(). If we saw the + * updated hweight_gen, all the weight updates from + * __propagate_active_weight() are visible too. + * + * We can race with weight updates during calculation and get it + * wrong. However, hweight_gen would have changed and a future + * reader will recalculate and we're guaranteed to discard the + * wrong result soon. 
+ */ + smp_rmb(); + + hwa = hwi = HWEIGHT_WHOLE; + for (lvl = 0; lvl <= iocg->level - 1; lvl++) { + struct ioc_gq *parent = iocg->ancestors[lvl]; + struct ioc_gq *child = iocg->ancestors[lvl + 1]; + u32 active_sum = READ_ONCE(parent->child_active_sum); + u32 inuse_sum = READ_ONCE(parent->child_inuse_sum); + u32 active = READ_ONCE(child->active); + u32 inuse = READ_ONCE(child->inuse); + + /* we can race with deactivations and either may read as zero */ + if (!active_sum || !inuse_sum) + continue; + + active_sum = max(active, active_sum); + hwa = hwa * active / active_sum; /* max 16bits * 10000 */ + + inuse_sum = max(inuse, inuse_sum); + hwi = hwi * inuse / inuse_sum; /* max 16bits * 10000 */ + } + + iocg->hweight_active = max_t(u32, hwa, 1); + iocg->hweight_inuse = max_t(u32, hwi, 1); + iocg->hweight_gen = ioc_gen; +out: + if (hw_activep) + *hw_activep = iocg->hweight_active; + if (hw_inusep) + *hw_inusep = iocg->hweight_inuse; +} + +static void weight_updated(struct ioc_gq *iocg) +{ + struct ioc *ioc = iocg->ioc; + struct blkcg_gq *blkg = iocg_to_blkg(iocg); + struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg); + u32 weight; + + lockdep_assert_held(&ioc->lock); + + weight = iocg->cfg_weight ?: iocc->dfl_weight; + if (weight != iocg->weight && iocg->active) + propagate_active_weight(iocg, weight, + DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight)); + iocg->weight = weight; +} + +static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) +{ + struct ioc *ioc = iocg->ioc; + u64 last_period, cur_period, max_period_delta; + u64 vtime, vmargin, vmin; + int i; + + /* + * If seem to be already active, just update the stamp to tell the + * timer that we're still active. We don't mind occassional races. 
+ */ + if (!list_empty(&iocg->active_list)) { + ioc_now(ioc, now); + cur_period = atomic64_read(&ioc->cur_period); + if (atomic64_read(&iocg->active_period) != cur_period) + atomic64_set(&iocg->active_period, cur_period); + return true; + } + + /* racy check on internal node IOs, treat as root level IOs */ + if (iocg->child_active_sum) + return false; + + spin_lock_irq(&ioc->lock); + + ioc_now(ioc, now); + + /* update period */ + cur_period = atomic64_read(&ioc->cur_period); + last_period = atomic64_read(&iocg->active_period); + atomic64_set(&iocg->active_period, cur_period); + + /* already activated or breaking leaf-only constraint? */ + for (i = iocg->level; i > 0; i--) + if (!list_empty(&iocg->active_list)) + goto fail_unlock; + if (iocg->child_active_sum) + goto fail_unlock; + + /* + * vtime may wrap when vrate is raised substantially due to + * underestimated IO costs. Look at the period and ignore its + * vtime if the iocg has been idle for too long. Also, cap the + * budget it can start with to the margin. + */ + max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us); + vtime = atomic64_read(&iocg->vtime); + vmargin = ioc->margin_us * now->vrate; + vmin = now->vnow - vmargin; + + if (last_period + max_period_delta < cur_period || + time_before64(vtime, vmin)) { + atomic64_add(vmin - vtime, &iocg->vtime); + atomic64_add(vmin - vtime, &iocg->done_vtime); + vtime = vmin; + } + + /* + * Activate, propagate weight and start period timer if not + * running. Reset hweight_gen to avoid accidental match from + * wrapping. 
+ */ + iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1; + list_add(&iocg->active_list, &ioc->active_iocgs); + propagate_active_weight(iocg, iocg->weight, + iocg->last_inuse ?: iocg->weight); + + TRACE_IOCG_PATH(iocg_activate, iocg, now, + last_period, cur_period, vtime); + + iocg->last_vtime = vtime; + + if (ioc->running == IOC_IDLE) { + ioc->running = IOC_RUNNING; + ioc_start_period(ioc, now); + } + + spin_unlock_irq(&ioc->lock); + return true; + +fail_unlock: + spin_unlock_irq(&ioc->lock); + return false; +} + +static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, + int flags, void *key) +{ + struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait); + struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key; + u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse); + + ctx->vbudget -= cost; + + if (ctx->vbudget < 0) + return -1; + + iocg_commit_bio(ctx->iocg, wait->bio, cost); + + /* + * autoremove_wake_function() removes the wait entry only when it + * actually changed the task state. We want the wait always + * removed. Remove explicitly and use default_wake_function(). 
+ */ + list_del_init(&wq_entry->entry); + wait->committed = true; + + default_wake_function(wq_entry, mode, flags, key); + return 0; +} + +static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now) +{ + struct ioc *ioc = iocg->ioc; + struct iocg_wake_ctx ctx = { .iocg = iocg }; + u64 margin_ns = (u64)(ioc->period_us * + WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC; + u64 abs_vdebt, vdebt, vshortage, expires, oexpires; + s64 vbudget; + u32 hw_inuse; + + lockdep_assert_held(&iocg->waitq.lock); + + current_hweight(iocg, NULL, &hw_inuse); + vbudget = now->vnow - atomic64_read(&iocg->vtime); + + /* pay off debt */ + abs_vdebt = atomic64_read(&iocg->abs_vdebt); + vdebt = abs_cost_to_cost(abs_vdebt, hw_inuse); + if (vdebt && vbudget > 0) { + u64 delta = min_t(u64, vbudget, vdebt); + u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse), + abs_vdebt); + + atomic64_add(delta, &iocg->vtime); + atomic64_add(delta, &iocg->done_vtime); + atomic64_sub(abs_delta, &iocg->abs_vdebt); + if (WARN_ON_ONCE(atomic64_read(&iocg->abs_vdebt) < 0)) + atomic64_set(&iocg->abs_vdebt, 0); + } + + /* + * Wake up the ones which are due and see how much vtime we'll need + * for the next one. 
+ */ + ctx.hw_inuse = hw_inuse; + ctx.vbudget = vbudget - vdebt; + __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx); + if (!waitqueue_active(&iocg->waitq)) + return; + if (WARN_ON_ONCE(ctx.vbudget >= 0)) + return; + + /* determine next wakeup, add a quarter margin to guarantee chunking */ + vshortage = -ctx.vbudget; + expires = now->now_ns + + DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC; + expires += margin_ns / 4; + + /* if already active and close enough, don't bother */ + oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer)); + if (hrtimer_is_queued(&iocg->waitq_timer) && + abs(oexpires - expires) <= margin_ns / 4) + return; + + hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires), + margin_ns / 4, HRTIMER_MODE_ABS); +} + +static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer) +{ + struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer); + struct ioc_now now; + unsigned long flags; + + ioc_now(iocg->ioc, &now); + + spin_lock_irqsave(&iocg->waitq.lock, flags); + iocg_kick_waitq(iocg, &now); + spin_unlock_irqrestore(&iocg->waitq.lock, flags); + + return HRTIMER_NORESTART; +} + +static void iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost) +{ + struct ioc *ioc = iocg->ioc; + struct blkcg_gq *blkg = iocg_to_blkg(iocg); + u64 vtime = atomic64_read(&iocg->vtime); + u64 vmargin = ioc->margin_us * now->vrate; + u64 margin_ns = ioc->margin_us * NSEC_PER_USEC; + u64 expires, oexpires; + u32 hw_inuse; + + /* debt-adjust vtime */ + current_hweight(iocg, NULL, &hw_inuse); + vtime += abs_cost_to_cost(atomic64_read(&iocg->abs_vdebt), hw_inuse); + + /* clear or maintain depending on the overage */ + if (time_before_eq64(vtime, now->vnow)) { + blkcg_clear_delay(blkg); + return; + } + if (!atomic_read(&blkg->use_delay) && + time_before_eq64(vtime, now->vnow + vmargin)) + return; + + /* use delay */ + if (cost) { + u64 cost_ns = DIV64_U64_ROUND_UP(cost * NSEC_PER_USEC, + now->vrate); + 
blkcg_add_delay(blkg, now->now_ns, cost_ns); + } + blkcg_use_delay(blkg); + + expires = now->now_ns + DIV64_U64_ROUND_UP(vtime - now->vnow, + now->vrate) * NSEC_PER_USEC; + + /* if already active and close enough, don't bother */ + oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer)); + if (hrtimer_is_queued(&iocg->delay_timer) && + abs(oexpires - expires) <= margin_ns / 4) + return; + + hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires), + margin_ns / 4, HRTIMER_MODE_ABS); +} + +static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer) +{ + struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer); + struct ioc_now now; + + ioc_now(iocg->ioc, &now); + iocg_kick_delay(iocg, &now, 0); + + return HRTIMER_NORESTART; +} + +static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p) +{ + u32 nr_met[2] = { }; + u32 nr_missed[2] = { }; + u64 rq_wait_ns = 0; + int cpu, rw; + + for_each_online_cpu(cpu) { + struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu); + u64 this_rq_wait_ns; + + for (rw = READ; rw <= WRITE; rw++) { + u32 this_met = READ_ONCE(stat->missed[rw].nr_met); + u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed); + + nr_met[rw] += this_met - stat->missed[rw].last_met; + nr_missed[rw] += this_missed - stat->missed[rw].last_missed; + stat->missed[rw].last_met = this_met; + stat->missed[rw].last_missed = this_missed; + } + + this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns); + rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns; + stat->last_rq_wait_ns = this_rq_wait_ns; + } + + for (rw = READ; rw <= WRITE; rw++) { + if (nr_met[rw] + nr_missed[rw]) + missed_ppm_ar[rw] = + DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION, + nr_met[rw] + nr_missed[rw]); + else + missed_ppm_ar[rw] = 0; + } + + *rq_wait_pct_p = div64_u64(rq_wait_ns * 100, + ioc->period_us * NSEC_PER_USEC); +} + +/* was iocg idle this period? 
*/ +static bool iocg_is_idle(struct ioc_gq *iocg) +{ + struct ioc *ioc = iocg->ioc; + + /* did something get issued this period? */ + if (atomic64_read(&iocg->active_period) == + atomic64_read(&ioc->cur_period)) + return false; + + /* is something in flight? */ + if (atomic64_read(&iocg->done_vtime) < atomic64_read(&iocg->vtime)) + return false; + + return true; +} + +/* returns usage with margin added if surplus is large enough */ +static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse) +{ + /* add margin */ + usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100); + usage += SURPLUS_SCALE_ABS; + + /* don't bother if the surplus is too small */ + if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse) + return 0; + + return usage; +} + +static void ioc_timer_fn(struct timer_list *timer) +{ + struct ioc *ioc = container_of(timer, struct ioc, timer); + struct ioc_gq *iocg, *tiocg; + struct ioc_now now; + int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0; + u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; + u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; + u32 missed_ppm[2], rq_wait_pct; + u64 period_vtime; + int i; + + /* how were the latencies during the period? */ + ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); + + /* take care of active iocgs */ + spin_lock_irq(&ioc->lock); + + ioc_now(ioc, &now); + + period_vtime = now.vnow - ioc->period_at_vtime; + if (WARN_ON_ONCE(!period_vtime)) { + spin_unlock_irq(&ioc->lock); + return; + } + + /* + * Waiters determine the sleep durations based on the vrate they + * saw at the time of sleep. If vrate has increased, some waiters + * could be sleeping for too long. Wake up tardy waiters which + * should have woken up in the last period and expire idle iocgs. 
+ */ + list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { + if (!waitqueue_active(&iocg->waitq) && + !atomic64_read(&iocg->abs_vdebt) && !iocg_is_idle(iocg)) + continue; + + spin_lock(&iocg->waitq.lock); + + if (waitqueue_active(&iocg->waitq) || + atomic64_read(&iocg->abs_vdebt)) { + /* might be oversleeping vtime / hweight changes, kick */ + iocg_kick_waitq(iocg, &now); + iocg_kick_delay(iocg, &now, 0); + } else if (iocg_is_idle(iocg)) { + /* no waiter and idle, deactivate */ + iocg->last_inuse = iocg->inuse; + __propagate_active_weight(iocg, 0, 0); + list_del_init(&iocg->active_list); + } + + spin_unlock(&iocg->waitq.lock); + } + commit_active_weights(ioc); + + /* calc usages and see whether some weights need to be moved around */ + list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { + u64 vdone, vtime, vusage, vmargin, vmin; + u32 hw_active, hw_inuse, usage; + + /* + * Collect unused and wind vtime closer to vnow to prevent + * iocgs from accumulating a large amount of budget. + */ + vdone = atomic64_read(&iocg->done_vtime); + vtime = atomic64_read(&iocg->vtime); + current_hweight(iocg, &hw_active, &hw_inuse); + + /* + * Latency QoS detection doesn't account for IOs which are + * in-flight for longer than a period. Detect them by + * comparing vdone against period start. If lagging behind + * IOs from past periods, don't increase vrate. + */ + if (!atomic_read(&iocg_to_blkg(iocg)->use_delay) && + time_after64(vtime, vdone) && + time_after64(vtime, now.vnow - + MAX_LAGGING_PERIODS * period_vtime) && + time_before64(vdone, now.vnow - period_vtime)) + nr_lagging++; + + if (waitqueue_active(&iocg->waitq)) + vusage = now.vnow - iocg->last_vtime; + else if (time_before64(iocg->last_vtime, vtime)) + vusage = vtime - iocg->last_vtime; + else + vusage = 0; + + iocg->last_vtime += vusage; + /* + * Factor in in-flight vtime into vusage to avoid + * high-latency completions appearing as idle. 
This should + * be done after the above ->last_time adjustment. + */ + vusage = max(vusage, vtime - vdone); + + /* calculate hweight based usage ratio and record */ + if (vusage) { + usage = DIV64_U64_ROUND_UP(vusage * hw_inuse, + period_vtime); + iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS; + iocg->usages[iocg->usage_idx] = usage; + } else { + usage = 0; + } + + /* see whether there's surplus vtime */ + vmargin = ioc->margin_us * now.vrate; + vmin = now.vnow - vmargin; + + iocg->has_surplus = false; + + if (!waitqueue_active(&iocg->waitq) && + time_before64(vtime, vmin)) { + u64 delta = vmin - vtime; + + /* throw away surplus vtime */ + atomic64_add(delta, &iocg->vtime); + atomic64_add(delta, &iocg->done_vtime); + iocg->last_vtime += delta; + /* if usage is sufficiently low, maybe it can donate */ + if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) { + iocg->has_surplus = true; + nr_surpluses++; + } + } else if (hw_inuse < hw_active) { + u32 new_hwi, new_inuse; + + /* was donating but might need to take back some */ + if (waitqueue_active(&iocg->waitq)) { + new_hwi = hw_active; + } else { + new_hwi = max(hw_inuse, + usage * SURPLUS_SCALE_PCT / 100 + + SURPLUS_SCALE_ABS); + } + + new_inuse = div64_u64((u64)iocg->inuse * new_hwi, + hw_inuse); + new_inuse = clamp_t(u32, new_inuse, 1, iocg->active); + + if (new_inuse > iocg->inuse) { + TRACE_IOCG_PATH(inuse_takeback, iocg, &now, + iocg->inuse, new_inuse, + hw_inuse, new_hwi); + __propagate_active_weight(iocg, iocg->weight, + new_inuse); + } + } else { + /* genuninely out of vtime */ + nr_shortages++; + } + } + + if (!nr_shortages || !nr_surpluses) + goto skip_surplus_transfers; + + /* there are both shortages and surpluses, transfer surpluses */ + list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { + u32 usage, hw_active, hw_inuse, new_hwi, new_inuse; + int nr_valid = 0; + + if (!iocg->has_surplus) + continue; + + /* base the decision on max historical usage */ + for (i = 0, usage = 0; i < 
NR_USAGE_SLOTS; i++) { + if (iocg->usages[i]) { + usage = max(usage, iocg->usages[i]); + nr_valid++; + } + } + if (nr_valid < MIN_VALID_USAGES) + continue; + + current_hweight(iocg, &hw_active, &hw_inuse); + new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse); + if (!new_hwi) + continue; + + new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi, + hw_inuse); + if (new_inuse < iocg->inuse) { + TRACE_IOCG_PATH(inuse_giveaway, iocg, &now, + iocg->inuse, new_inuse, + hw_inuse, new_hwi); + __propagate_active_weight(iocg, iocg->weight, new_inuse); + } + } +skip_surplus_transfers: + commit_active_weights(ioc); + + /* + * If q is getting clogged or we're missing too much, we're issuing + * too much IO and should lower vtime rate. If we're not missing + * and experiencing shortages but not surpluses, we're too stingy + * and should increase vtime rate. + */ + if (rq_wait_pct > RQ_WAIT_BUSY_PCT || + missed_ppm[READ] > ppm_rthr || + missed_ppm[WRITE] > ppm_wthr) { + ioc->busy_level = max(ioc->busy_level, 0); + ioc->busy_level++; + } else if (nr_lagging) { + ioc->busy_level = max(ioc->busy_level, 0); + } else if (nr_shortages && !nr_surpluses && + rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 && + missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 && + missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) { + ioc->busy_level = min(ioc->busy_level, 0); + ioc->busy_level--; + } else { + ioc->busy_level = 0; + } + + ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); + + if (ioc->busy_level) { + u64 vrate = atomic64_read(&ioc->vtime_rate); + u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; + + /* rq_wait signal is always reliable, ignore user vrate_min */ + if (rq_wait_pct > RQ_WAIT_BUSY_PCT) + vrate_min = VRATE_MIN; + + /* + * If vrate is out of bounds, apply clamp gradually as the + * bounds can change abruptly. Otherwise, apply busy_level + * based adjustment. 
+ */ + if (vrate < vrate_min) { + vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), + 100); + vrate = min(vrate, vrate_min); + } else if (vrate > vrate_max) { + vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), + 100); + vrate = max(vrate, vrate_max); + } else { + int idx = min_t(int, abs(ioc->busy_level), + ARRAY_SIZE(vrate_adj_pct) - 1); + u32 adj_pct = vrate_adj_pct[idx]; + + if (ioc->busy_level > 0) + adj_pct = 100 - adj_pct; + else + adj_pct = 100 + adj_pct; + + vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100), + vrate_min, vrate_max); + } + + trace_iocost_ioc_vrate_adj(ioc, vrate, &missed_ppm, rq_wait_pct, + nr_lagging, nr_shortages, + nr_surpluses); + + atomic64_set(&ioc->vtime_rate, vrate); + ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP( + ioc->period_us * vrate * INUSE_MARGIN_PCT, 100); + } + + ioc_refresh_params(ioc, false); + + /* + * This period is done. Move onto the next one. If nothing's + * going on with the device, stop the timer. + */ + atomic64_inc(&ioc->cur_period); + + if (ioc->running != IOC_STOP) { + if (!list_empty(&ioc->active_iocgs)) { + ioc_start_period(ioc, &now); + } else { + ioc->busy_level = 0; + ioc->running = IOC_IDLE; + } + } + + spin_unlock_irq(&ioc->lock); +} + +static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg, + bool is_merge, u64 *costp) +{ + struct ioc *ioc = iocg->ioc; + u64 coef_seqio, coef_randio, coef_page; + u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1); + u64 seek_pages = 0; + u64 cost = 0; + + switch (bio_op(bio)) { + case REQ_OP_READ: + coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO]; + coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO]; + coef_page = ioc->params.lcoefs[LCOEF_RPAGE]; + break; + case REQ_OP_WRITE: + coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO]; + coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO]; + coef_page = ioc->params.lcoefs[LCOEF_WPAGE]; + break; + default: + goto out; + } + + if (iocg->cursor) { + seek_pages = abs(bio->bi_iter.bi_sector - 
iocg->cursor); + seek_pages >>= IOC_SECT_TO_PAGE_SHIFT; + } + + if (!is_merge) { + if (seek_pages > LCOEF_RANDIO_PAGES) { + cost += coef_randio; + } else { + cost += coef_seqio; + } + } + cost += pages * coef_page; +out: + *costp = cost; +} + +static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge) +{ + u64 cost; + + calc_vtime_cost_builtin(bio, iocg, is_merge, &cost); + return cost; +} + +static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) +{ + struct blkcg_gq *blkg = bio->bi_blkg; + struct ioc *ioc = rqos_to_ioc(rqos); + struct ioc_gq *iocg = blkg_to_iocg(blkg); + struct ioc_now now; + struct iocg_wait wait; + u32 hw_active, hw_inuse; + u64 abs_cost, cost, vtime; + + /* bypass IOs if disabled or for root cgroup */ + if (!ioc->enabled || !iocg->level) + return; + + /* always activate so that even 0 cost IOs get protected to some level */ + if (!iocg_activate(iocg, &now)) + return; + + /* calculate the absolute vtime cost */ + abs_cost = calc_vtime_cost(bio, iocg, false); + if (!abs_cost) + return; + + iocg->cursor = bio_end_sector(bio); + + vtime = atomic64_read(&iocg->vtime); + current_hweight(iocg, &hw_active, &hw_inuse); + + if (hw_inuse < hw_active && + time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) { + TRACE_IOCG_PATH(inuse_reset, iocg, &now, + iocg->inuse, iocg->weight, hw_inuse, hw_active); + spin_lock_irq(&ioc->lock); + propagate_active_weight(iocg, iocg->weight, iocg->weight); + spin_unlock_irq(&ioc->lock); + current_hweight(iocg, &hw_active, &hw_inuse); + } + + cost = abs_cost_to_cost(abs_cost, hw_inuse); + + /* + * If no one's waiting and within budget, issue right away. The + * tests are racy but the races aren't systemic - we only miss once + * in a while which is fine. + */ + if (!waitqueue_active(&iocg->waitq) && + !atomic64_read(&iocg->abs_vdebt) && + time_before_eq64(vtime + cost, now.vnow)) { + iocg_commit_bio(iocg, bio, cost); + return; + } + + /* + * We're over budget. 
If @bio has to be issued regardless, + * remember the abs_cost instead of advancing vtime. + * iocg_kick_waitq() will pay off the debt before waking more IOs. + * This way, the debt is continuously paid off each period with the + * actual budget available to the cgroup. If we just wound vtime, + * we would incorrectly use the current hw_inuse for the entire + * amount which, for example, can lead to the cgroup staying + * blocked for a long time even with substantially raised hw_inuse. + */ + if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) { + atomic64_add(abs_cost, &iocg->abs_vdebt); + iocg_kick_delay(iocg, &now, cost); + return; + } + + /* + * Append self to the waitq and schedule the wakeup timer if we're + * the first waiter. The timer duration is calculated based on the + * current vrate. vtime and hweight changes can make it too short + * or too long. Each wait entry records the absolute cost it's + * waiting for to allow re-evaluation using a custom wait entry. + * + * If too short, the timer simply reschedules itself. If too long, + * the period timer will notice and trigger wakeups. + * + * All waiters are on iocg->waitq and the wait states are + * synchronized using waitq.lock. + */ + spin_lock_irq(&iocg->waitq.lock); + + /* + * We activated above but w/o any synchronization. Deactivation is + * synchronized with waitq.lock and we won't get deactivated as + * long as we're waiting, so we're good if we're activated here. + * In the unlikely case that we are deactivated, just issue the IO. 
+ */ + if (unlikely(list_empty(&iocg->active_list))) { + spin_unlock_irq(&iocg->waitq.lock); + iocg_commit_bio(iocg, bio, cost); + return; + } + + init_waitqueue_func_entry(&wait.wait, iocg_wake_fn); + wait.wait.private = current; + wait.bio = bio; + wait.abs_cost = abs_cost; + wait.committed = false; /* will be set true by waker */ + + __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait); + iocg_kick_waitq(iocg, &now); + + spin_unlock_irq(&iocg->waitq.lock); + + while (true) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (wait.committed) + break; + io_schedule(); + } + + /* waker already committed us, proceed */ + finish_wait(&iocg->waitq, &wait.wait); +} + +static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, + struct bio *bio) +{ + struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); + struct ioc *ioc = iocg->ioc; + sector_t bio_end = bio_end_sector(bio); + struct ioc_now now; + u32 hw_inuse; + u64 abs_cost, cost; + + /* bypass if disabled or for root cgroup */ + if (!ioc->enabled || !iocg->level) + return; + + abs_cost = calc_vtime_cost(bio, iocg, true); + if (!abs_cost) + return; + + ioc_now(ioc, &now); + current_hweight(iocg, NULL, &hw_inuse); + cost = abs_cost_to_cost(abs_cost, hw_inuse); + + /* update cursor if backmerging into the request at the cursor */ + if (blk_rq_pos(rq) < bio_end && + blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor) + iocg->cursor = bio_end; + + /* + * Charge if there's enough vtime budget and the existing request + * has cost assigned. Otherwise, account it as debt. See debt + * handling in ioc_rqos_throttle() for details. 
+ */ + if (rq->bio && rq->bio->bi_iocost_cost && + time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) + iocg_commit_bio(iocg, bio, cost); + else + atomic64_add(abs_cost, &iocg->abs_vdebt); +} + +static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) +{ + struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); + + if (iocg && bio->bi_iocost_cost) + atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime); +} + +static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) +{ + struct ioc *ioc = rqos_to_ioc(rqos); + u64 on_q_ns, rq_wait_ns; + int pidx, rw; + + if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns) + return; + + switch (req_op(rq) & REQ_OP_MASK) { + case REQ_OP_READ: + pidx = QOS_RLAT; + rw = READ; + break; + case REQ_OP_WRITE: + pidx = QOS_WLAT; + rw = WRITE; + break; + default: + return; + } + + on_q_ns = ktime_get_ns() - rq->alloc_time_ns; + rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; + + if (on_q_ns <= ioc->params.qos[pidx] * NSEC_PER_USEC) + this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met); + else + this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed); + + this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns); +} + +static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos) +{ + struct ioc *ioc = rqos_to_ioc(rqos); + + spin_lock_irq(&ioc->lock); + ioc_refresh_params(ioc, false); + spin_unlock_irq(&ioc->lock); +} + +static void ioc_rqos_exit(struct rq_qos *rqos) +{ + struct ioc *ioc = rqos_to_ioc(rqos); + + blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost); + + spin_lock_irq(&ioc->lock); + ioc->running = IOC_STOP; + spin_unlock_irq(&ioc->lock); + + del_timer_sync(&ioc->timer); + free_percpu(ioc->pcpu_stat); + kfree(ioc); +} + +static struct rq_qos_ops ioc_rqos_ops = { + .throttle = ioc_rqos_throttle, + .merge = ioc_rqos_merge, + .done_bio = ioc_rqos_done_bio, + .done = ioc_rqos_done, + .queue_depth_changed = ioc_rqos_queue_depth_changed, + .exit = ioc_rqos_exit, +}; + +static int blk_iocost_init(struct 
request_queue *q) +{ + struct ioc *ioc; + struct rq_qos *rqos; + int ret; + + ioc = kzalloc(sizeof(*ioc), GFP_KERNEL); + if (!ioc) + return -ENOMEM; + + ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat); + if (!ioc->pcpu_stat) { + kfree(ioc); + return -ENOMEM; + } + + rqos = &ioc->rqos; + rqos->id = RQ_QOS_COST; + rqos->ops = &ioc_rqos_ops; + rqos->q = q; + + spin_lock_init(&ioc->lock); + timer_setup(&ioc->timer, ioc_timer_fn, 0); + INIT_LIST_HEAD(&ioc->active_iocgs); + + ioc->running = IOC_IDLE; + atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); + seqcount_init(&ioc->period_seqcount); + ioc->period_at = ktime_to_us(ktime_get()); + atomic64_set(&ioc->cur_period, 0); + atomic_set(&ioc->hweight_gen, 0); + + spin_lock_irq(&ioc->lock); + ioc->autop_idx = AUTOP_INVALID; + ioc_refresh_params(ioc, true); + spin_unlock_irq(&ioc->lock); + + rq_qos_add(q, rqos); + ret = blkcg_activate_policy(q, &blkcg_policy_iocost); + if (ret) { + rq_qos_del(q, rqos); + free_percpu(ioc->pcpu_stat); + kfree(ioc); + return ret; + } + return 0; +} + +static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp) +{ + struct ioc_cgrp *iocc; + + iocc = kzalloc(sizeof(struct ioc_cgrp), gfp); + if (!iocc) + return NULL; + + iocc->dfl_weight = CGROUP_WEIGHT_DFL; + return &iocc->cpd; +} + +static void ioc_cpd_free(struct blkcg_policy_data *cpd) +{ + kfree(container_of(cpd, struct ioc_cgrp, cpd)); +} + +static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q, + struct blkcg *blkcg) +{ + int levels = blkcg->css.cgroup->level + 1; + struct ioc_gq *iocg; + + iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]), + gfp, q->node); + if (!iocg) + return NULL; + + return &iocg->pd; +} + +static void ioc_pd_init(struct blkg_policy_data *pd) +{ + struct ioc_gq *iocg = pd_to_iocg(pd); + struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd); + struct ioc *ioc = q_to_ioc(blkg->q); + struct ioc_now now; + struct blkcg_gq *tblkg; + unsigned long flags; + + ioc_now(ioc, &now); + + 
iocg->ioc = ioc; + atomic64_set(&iocg->vtime, now.vnow); + atomic64_set(&iocg->done_vtime, now.vnow); + atomic64_set(&iocg->abs_vdebt, 0); + atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); + INIT_LIST_HEAD(&iocg->active_list); + iocg->hweight_active = HWEIGHT_WHOLE; + iocg->hweight_inuse = HWEIGHT_WHOLE; + + init_waitqueue_head(&iocg->waitq); + hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + iocg->waitq_timer.function = iocg_waitq_timer_fn; + hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + iocg->delay_timer.function = iocg_delay_timer_fn; + + iocg->level = blkg->blkcg->css.cgroup->level; + + for (tblkg = blkg; tblkg; tblkg = tblkg->parent) { + struct ioc_gq *tiocg = blkg_to_iocg(tblkg); + iocg->ancestors[tiocg->level] = tiocg; + } + + spin_lock_irqsave(&ioc->lock, flags); + weight_updated(iocg); + spin_unlock_irqrestore(&ioc->lock, flags); +} + +static void ioc_pd_free(struct blkg_policy_data *pd) +{ + struct ioc_gq *iocg = pd_to_iocg(pd); + struct ioc *ioc = iocg->ioc; + + if (ioc) { + spin_lock(&ioc->lock); + if (!list_empty(&iocg->active_list)) { + propagate_active_weight(iocg, 0, 0); + list_del_init(&iocg->active_list); + } + spin_unlock(&ioc->lock); + + hrtimer_cancel(&iocg->waitq_timer); + hrtimer_cancel(&iocg->delay_timer); + } + kfree(iocg); +} + +static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + const char *dname = blkg_dev_name(pd->blkg); + struct ioc_gq *iocg = pd_to_iocg(pd); + + if (dname && iocg->cfg_weight) + seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight); + return 0; +} + + +static int ioc_weight_show(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); + + seq_printf(sf, "default %u\n", iocc->dfl_weight); + blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill, + &blkcg_policy_iocost, seq_cft(sf)->private, false); + return 0; +} + +static ssize_t 
ioc_weight_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); + struct blkg_conf_ctx ctx; + struct ioc_gq *iocg; + u32 v; + int ret; + + if (!strchr(buf, ':')) { + struct blkcg_gq *blkg; + + if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v)) + return -EINVAL; + + if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX) + return -EINVAL; + + spin_lock(&blkcg->lock); + iocc->dfl_weight = v; + hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { + struct ioc_gq *iocg = blkg_to_iocg(blkg); + + if (iocg) { + spin_lock_irq(&iocg->ioc->lock); + weight_updated(iocg); + spin_unlock_irq(&iocg->ioc->lock); + } + } + spin_unlock(&blkcg->lock); + + return nbytes; + } + + ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx); + if (ret) + return ret; + + iocg = blkg_to_iocg(ctx.blkg); + + if (!strncmp(ctx.body, "default", 7)) { + v = 0; + } else { + if (!sscanf(ctx.body, "%u", &v)) + goto einval; + if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX) + goto einval; + } + + spin_lock_irq(&iocg->ioc->lock); + iocg->cfg_weight = v; + weight_updated(iocg); + spin_unlock_irq(&iocg->ioc->lock); + + blkg_conf_finish(&ctx); + return nbytes; + +einval: + blkg_conf_finish(&ctx); + return -EINVAL; +} + +static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + const char *dname = blkg_dev_name(pd->blkg); + struct ioc *ioc = pd_to_iocg(pd)->ioc; + + if (!dname) + return 0; + + seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n", + dname, ioc->enabled, ioc->user_qos_params ? 
"user" : "auto", + ioc->params.qos[QOS_RPPM] / 10000, + ioc->params.qos[QOS_RPPM] % 10000 / 100, + ioc->params.qos[QOS_RLAT], + ioc->params.qos[QOS_WPPM] / 10000, + ioc->params.qos[QOS_WPPM] % 10000 / 100, + ioc->params.qos[QOS_WLAT], + ioc->params.qos[QOS_MIN] / 10000, + ioc->params.qos[QOS_MIN] % 10000 / 100, + ioc->params.qos[QOS_MAX] / 10000, + ioc->params.qos[QOS_MAX] % 10000 / 100); + return 0; +} + +static int ioc_qos_show(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + + blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill, + &blkcg_policy_iocost, seq_cft(sf)->private, false); + return 0; +} + +static const match_table_t qos_ctrl_tokens = { + { QOS_ENABLE, "enable=%u" }, + { QOS_CTRL, "ctrl=%s" }, + { NR_QOS_CTRL_PARAMS, NULL }, +}; + +static const match_table_t qos_tokens = { + { QOS_RPPM, "rpct=%s" }, + { QOS_RLAT, "rlat=%u" }, + { QOS_WPPM, "wpct=%s" }, + { QOS_WLAT, "wlat=%u" }, + { QOS_MIN, "min=%s" }, + { QOS_MAX, "max=%s" }, + { NR_QOS_PARAMS, NULL }, +}; + +static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, + size_t nbytes, loff_t off) +{ + struct gendisk *disk; + struct ioc *ioc; + u32 qos[NR_QOS_PARAMS]; + bool enable, user; + char *p; + int ret; + + disk = blkcg_conf_get_disk(&input); + if (IS_ERR(disk)) + return PTR_ERR(disk); + + ioc = q_to_ioc(disk->queue); + if (!ioc) { + ret = blk_iocost_init(disk->queue); + if (ret) + goto err; + ioc = q_to_ioc(disk->queue); + } + + spin_lock_irq(&ioc->lock); + memcpy(qos, ioc->params.qos, sizeof(qos)); + enable = ioc->enabled; + user = ioc->user_qos_params; + spin_unlock_irq(&ioc->lock); + + while ((p = strsep(&input, " \t\n"))) { + substring_t args[MAX_OPT_ARGS]; + char buf[32]; + int tok; + s64 v; + + if (!*p) + continue; + + switch (match_token(p, qos_ctrl_tokens, args)) { + case QOS_ENABLE: + match_u64(&args[0], &v); + enable = v; + continue; + case QOS_CTRL: + match_strlcpy(buf, &args[0], sizeof(buf)); + if (!strcmp(buf, "auto")) + user = false; + else 
if (!strcmp(buf, "user")) + user = true; + else + goto einval; + continue; + } + + tok = match_token(p, qos_tokens, args); + switch (tok) { + case QOS_RPPM: + case QOS_WPPM: + if (match_strlcpy(buf, &args[0], sizeof(buf)) >= + sizeof(buf)) + goto einval; + if (cgroup_parse_float(buf, 2, &v)) + goto einval; + if (v < 0 || v > 10000) + goto einval; + qos[tok] = v * 100; + break; + case QOS_RLAT: + case QOS_WLAT: + if (match_u64(&args[0], &v)) + goto einval; + qos[tok] = v; + break; + case QOS_MIN: + case QOS_MAX: + if (match_strlcpy(buf, &args[0], sizeof(buf)) >= + sizeof(buf)) + goto einval; + if (cgroup_parse_float(buf, 2, &v)) + goto einval; + if (v < 0) + goto einval; + qos[tok] = clamp_t(s64, v * 100, + VRATE_MIN_PPM, VRATE_MAX_PPM); + break; + default: + goto einval; + } + user = true; + } + + if (qos[QOS_MIN] > qos[QOS_MAX]) + goto einval; + + spin_lock_irq(&ioc->lock); + + if (enable) { + blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q); + ioc->enabled = true; + } else { + blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q); + ioc->enabled = false; + } + + if (user) { + memcpy(ioc->params.qos, qos, sizeof(qos)); + ioc->user_qos_params = true; + } else { + ioc->user_qos_params = false; + } + + ioc_refresh_params(ioc, true); + spin_unlock_irq(&ioc->lock); + + put_disk_and_module(disk); + return nbytes; +einval: + ret = -EINVAL; +err: + put_disk_and_module(disk); + return ret; +} + +static u64 ioc_cost_model_prfill(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + const char *dname = blkg_dev_name(pd->blkg); + struct ioc *ioc = pd_to_iocg(pd)->ioc; + u64 *u = ioc->params.i_lcoefs; + + if (!dname) + return 0; + + seq_printf(sf, "%s ctrl=%s model=linear " + "rbps=%llu rseqiops=%llu rrandiops=%llu " + "wbps=%llu wseqiops=%llu wrandiops=%llu\n", + dname, ioc->user_cost_model ? 
"user" : "auto", + u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS], + u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]); + return 0; +} + +static int ioc_cost_model_show(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + + blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill, + &blkcg_policy_iocost, seq_cft(sf)->private, false); + return 0; +} + +static const match_table_t cost_ctrl_tokens = { + { COST_CTRL, "ctrl=%s" }, + { COST_MODEL, "model=%s" }, + { NR_COST_CTRL_PARAMS, NULL }, +}; + +static const match_table_t i_lcoef_tokens = { + { I_LCOEF_RBPS, "rbps=%u" }, + { I_LCOEF_RSEQIOPS, "rseqiops=%u" }, + { I_LCOEF_RRANDIOPS, "rrandiops=%u" }, + { I_LCOEF_WBPS, "wbps=%u" }, + { I_LCOEF_WSEQIOPS, "wseqiops=%u" }, + { I_LCOEF_WRANDIOPS, "wrandiops=%u" }, + { NR_I_LCOEFS, NULL }, +}; + +static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, + size_t nbytes, loff_t off) +{ + struct gendisk *disk; + struct ioc *ioc; + u64 u[NR_I_LCOEFS]; + bool user; + char *p; + int ret; + + disk = blkcg_conf_get_disk(&input); + if (IS_ERR(disk)) + return PTR_ERR(disk); + + ioc = q_to_ioc(disk->queue); + if (!ioc) { + ret = blk_iocost_init(disk->queue); + if (ret) + goto err; + ioc = q_to_ioc(disk->queue); + } + + spin_lock_irq(&ioc->lock); + memcpy(u, ioc->params.i_lcoefs, sizeof(u)); + user = ioc->user_cost_model; + spin_unlock_irq(&ioc->lock); + + while ((p = strsep(&input, " \t\n"))) { + substring_t args[MAX_OPT_ARGS]; + char buf[32]; + int tok; + u64 v; + + if (!*p) + continue; + + switch (match_token(p, cost_ctrl_tokens, args)) { + case COST_CTRL: + match_strlcpy(buf, &args[0], sizeof(buf)); + if (!strcmp(buf, "auto")) + user = false; + else if (!strcmp(buf, "user")) + user = true; + else + goto einval; + continue; + case COST_MODEL: + match_strlcpy(buf, &args[0], sizeof(buf)); + if (strcmp(buf, "linear")) + goto einval; + continue; + } + + tok = match_token(p, i_lcoef_tokens, args); + if (tok == 
NR_I_LCOEFS) + goto einval; + if (match_u64(&args[0], &v)) + goto einval; + u[tok] = v; + user = true; + } + + spin_lock_irq(&ioc->lock); + if (user) { + memcpy(ioc->params.i_lcoefs, u, sizeof(u)); + ioc->user_cost_model = true; + } else { + ioc->user_cost_model = false; + } + ioc_refresh_params(ioc, true); + spin_unlock_irq(&ioc->lock); + + put_disk_and_module(disk); + return nbytes; + +einval: + ret = -EINVAL; +err: + put_disk_and_module(disk); + return ret; +} + +static struct cftype ioc_files[] = { + { + .name = "weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = ioc_weight_show, + .write = ioc_weight_write, + }, + { + .name = "cost.qos", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = ioc_qos_show, + .write = ioc_qos_write, + }, + { + .name = "cost.model", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = ioc_cost_model_show, + .write = ioc_cost_model_write, + }, + {} +}; + +static struct blkcg_policy blkcg_policy_iocost = { + .dfl_cftypes = ioc_files, + .cpd_alloc_fn = ioc_cpd_alloc, + .cpd_free_fn = ioc_cpd_free, + .pd_alloc_fn = ioc_pd_alloc, + .pd_init_fn = ioc_pd_init, + .pd_free_fn = ioc_pd_free, +}; + +static int __init ioc_init(void) +{ + return blkcg_policy_register(&blkcg_policy_iocost); +} + +static void __exit ioc_exit(void) +{ + return blkcg_policy_unregister(&blkcg_policy_iocost); +} + +module_init(ioc_init); +module_exit(ioc_exit); diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 0fff7b56df0e..c128d50cb410 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -725,7 +725,7 @@ int blk_iolatency_init(struct request_queue *q) return -ENOMEM; rqos = &blkiolat->rqos; - rqos->id = RQ_QOS_CGROUP; + rqos->id = RQ_QOS_LATENCY; rqos->ops = &blkcg_iolatency_ops; rqos->q = q; @@ -934,11 +934,13 @@ static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf, } -static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node) +static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, + struct request_queue *q, + 
struct blkcg *blkcg) { struct iolatency_grp *iolat; - iolat = kzalloc_node(sizeof(*iolat), gfp, node); + iolat = kzalloc_node(sizeof(*iolat), gfp, q->node); if (!iolat) return NULL; iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat), diff --git a/block/blk-merge.c b/block/blk-merge.c index 57f7990b342d..48e6725b32ee 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -132,19 +132,32 @@ static struct bio *blk_bio_write_same_split(struct request_queue *q, return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs); } +/* + * Return the maximum number of sectors from the start of a bio that may be + * submitted as a single request to a block device. If enough sectors remain, + * align the end to the physical block size. Otherwise align the end to the + * logical block size. This approach minimizes the number of non-aligned + * requests that are submitted to a block device if the start of a bio is not + * aligned to a physical block boundary. + */ static inline unsigned get_max_io_size(struct request_queue *q, struct bio *bio) { unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector); - unsigned mask = queue_logical_block_size(q) - 1; + unsigned max_sectors = sectors; + unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT; + unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT; + unsigned start_offset = bio->bi_iter.bi_sector & (pbs - 1); - /* aligned to logical block size */ - sectors &= ~(mask >> 9); + max_sectors += start_offset; + max_sectors &= ~(pbs - 1); + if (max_sectors > start_offset) + return max_sectors - start_offset; - return sectors; + return sectors & (lbs - 1); } -static unsigned get_max_segment_size(struct request_queue *q, +static unsigned get_max_segment_size(const struct request_queue *q, unsigned offset) { unsigned long mask = queue_segment_boundary(q); @@ -157,26 +170,41 @@ static unsigned get_max_segment_size(struct request_queue *q, queue_max_segment_size(q)); } -/* - * Split the bvec @bv into 
segments, and update all kinds of - * variables. +/** + * bvec_split_segs - verify whether or not a bvec should be split in the middle + * @q: [in] request queue associated with the bio associated with @bv + * @bv: [in] bvec to examine + * @nsegs: [in,out] Number of segments in the bio being built. Incremented + * by the number of segments from @bv that may be appended to that + * bio without exceeding @max_segs + * @sectors: [in,out] Number of sectors in the bio being built. Incremented + * by the number of sectors from @bv that may be appended to that + * bio without exceeding @max_sectors + * @max_segs: [in] upper bound for *@nsegs + * @max_sectors: [in] upper bound for *@sectors + * + * When splitting a bio, it can happen that a bvec is encountered that is too + * big to fit in a single segment and hence that it has to be split in the + * middle. This function verifies whether or not that should happen. The value + * %true is returned if and only if appending the entire @bv to a bio with + * *@nsegs segments and *@sectors sectors would make that bio unacceptable for + * the block driver. */ -static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv, - unsigned *nsegs, unsigned *sectors, unsigned max_segs) +static bool bvec_split_segs(const struct request_queue *q, + const struct bio_vec *bv, unsigned *nsegs, + unsigned *sectors, unsigned max_segs, + unsigned max_sectors) { - unsigned len = bv->bv_len; + unsigned max_len = (min(max_sectors, UINT_MAX >> 9) - *sectors) << 9; + unsigned len = min(bv->bv_len, max_len); unsigned total_len = 0; - unsigned new_nsegs = 0, seg_size = 0; + unsigned seg_size = 0; - /* - * Multi-page bvec may be too big to hold in one segment, so the - * current bvec has to be splitted as multiple segments. 
- */ - while (len && new_nsegs + *nsegs < max_segs) { + while (len && *nsegs < max_segs) { seg_size = get_max_segment_size(q, bv->bv_offset + total_len); seg_size = min(seg_size, len); - new_nsegs++; + (*nsegs)++; total_len += seg_size; len -= seg_size; @@ -184,16 +212,31 @@ static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv, break; } - if (new_nsegs) { - *nsegs += new_nsegs; - if (sectors) - *sectors += total_len >> 9; - } + *sectors += total_len >> 9; - /* split in the middle of the bvec if len != 0 */ - return !!len; + /* tell the caller to split the bvec if it is too big to fit */ + return len > 0 || bv->bv_len > max_len; } +/** + * blk_bio_segment_split - split a bio in two bios + * @q: [in] request queue pointer + * @bio: [in] bio to be split + * @bs: [in] bio set to allocate the clone from + * @segs: [out] number of segments in the bio with the first half of the sectors + * + * Clone @bio, update the bi_iter of the clone to represent the first sectors + * of @bio and update @bio->bi_iter to represent the remaining sectors. The + * following is guaranteed for the cloned bio: + * - That it has at most get_max_io_size(@q, @bio) sectors. + * - That it has at most queue_max_segments(@q) segments. + * + * Except for discard requests the cloned bio will point at the bi_io_vec of + * the original bio. It is the responsibility of the caller to ensure that the + * original bio is not freed before the cloned bio. The caller is also + * responsible for ensuring that @bs is only destroyed after processing of the + * split bio has finished. + */ static struct bio *blk_bio_segment_split(struct request_queue *q, struct bio *bio, struct bio_set *bs, @@ -213,34 +256,18 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset)) goto split; - if (sectors + (bv.bv_len >> 9) > max_sectors) { - /* - * Consider this a new segment if we're splitting in - * the middle of this vector. 
- */ - if (nsegs < max_segs && - sectors < max_sectors) { - /* split in the middle of bvec */ - bv.bv_len = (max_sectors - sectors) << 9; - bvec_split_segs(q, &bv, &nsegs, - &sectors, max_segs); - } + if (nsegs < max_segs && + sectors + (bv.bv_len >> 9) <= max_sectors && + bv.bv_offset + bv.bv_len <= PAGE_SIZE) { + nsegs++; + sectors += bv.bv_len >> 9; + } else if (bvec_split_segs(q, &bv, &nsegs, &sectors, max_segs, + max_sectors)) { goto split; } - if (nsegs == max_segs) - goto split; - bvprv = bv; bvprvp = &bvprv; - - if (bv.bv_offset + bv.bv_len <= PAGE_SIZE) { - nsegs++; - sectors += bv.bv_len >> 9; - } else if (bvec_split_segs(q, &bv, &nsegs, &sectors, - max_segs)) { - goto split; - } } *segs = nsegs; @@ -250,6 +277,19 @@ split: return bio_split(bio, sectors, GFP_NOIO, bs); } +/** + * __blk_queue_split - split a bio and submit the second half + * @q: [in] request queue pointer + * @bio: [in, out] bio to be split + * @nr_segs: [out] number of segments in the first bio + * + * Split a bio into two bios, chain the two bios, submit the second half and + * store a pointer to the first half in *@bio. If the second bio is still too + * big it will be split by a recursive call to this function. Since this + * function may allocate a new bio from @q->bio_split, it is the responsibility + * of the caller to ensure that @q is only released after processing of the + * split bio has finished. + */ void __blk_queue_split(struct request_queue *q, struct bio **bio, unsigned int *nr_segs) { @@ -294,6 +334,17 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio, } } +/** + * blk_queue_split - split a bio and submit the second half + * @q: [in] request queue pointer + * @bio: [in, out] bio to be split + * + * Split a bio into two bios, chains the two bios, submit the second half and + * store a pointer to the first half in *@bio. 
Since this function may allocate + * a new bio from @q->bio_split, it is the responsibility of the caller to + * ensure that @q is only released after processing of the split bio has + * finished. + */ void blk_queue_split(struct request_queue *q, struct bio **bio) { unsigned int nr_segs; @@ -305,6 +356,7 @@ EXPORT_SYMBOL(blk_queue_split); unsigned int blk_recalc_rq_segments(struct request *rq) { unsigned int nr_phys_segs = 0; + unsigned int nr_sectors = 0; struct req_iterator iter; struct bio_vec bv; @@ -321,7 +373,8 @@ unsigned int blk_recalc_rq_segments(struct request *rq) } rq_for_each_bvec(bv, rq, iter) - bvec_split_segs(rq->q, &bv, &nr_phys_segs, NULL, UINT_MAX); + bvec_split_segs(rq->q, &bv, &nr_phys_segs, &nr_sectors, + UINT_MAX, UINT_MAX); return nr_phys_segs; } diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index f945621a0e8f..0157f2b3485a 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -15,10 +15,10 @@ #include "blk.h" #include "blk-mq.h" -static int cpu_to_queue_index(struct blk_mq_queue_map *qmap, - unsigned int nr_queues, const int cpu) +static int queue_index(struct blk_mq_queue_map *qmap, + unsigned int nr_queues, const int q) { - return qmap->queue_offset + (cpu % nr_queues); + return qmap->queue_offset + (q % nr_queues); } static int get_first_sibling(unsigned int cpu) @@ -36,21 +36,36 @@ int blk_mq_map_queues(struct blk_mq_queue_map *qmap) { unsigned int *map = qmap->mq_map; unsigned int nr_queues = qmap->nr_queues; - unsigned int cpu, first_sibling; + unsigned int cpu, first_sibling, q = 0; + + for_each_possible_cpu(cpu) + map[cpu] = -1; + + /* + * Spread queues among present CPUs first for minimizing + * count of dead queues which are mapped by all un-present CPUs + */ + for_each_present_cpu(cpu) { + if (q >= nr_queues) + break; + map[cpu] = queue_index(qmap, nr_queues, q++); + } for_each_possible_cpu(cpu) { + if (map[cpu] != -1) + continue; /* * First do sequential mapping between CPUs and queues. 
* In case we still have CPUs to map, and we have some number of * threads per cores then map sibling threads to the same queue * for performance optimizations. */ - if (cpu < nr_queues) { - map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu); + if (q < nr_queues) { + map[cpu] = queue_index(qmap, nr_queues, q++); } else { first_sibling = get_first_sibling(cpu); if (first_sibling == cpu) - map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu); + map[cpu] = queue_index(qmap, nr_queues, q++); else map[cpu] = map[first_sibling]; } diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index d6e1a9bd7131..a0d3ce30fa08 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -270,7 +270,7 @@ void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; - lockdep_assert_held(&q->sysfs_lock); + lockdep_assert_held(&q->sysfs_dir_lock); queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); @@ -320,7 +320,7 @@ int __blk_mq_register_dev(struct device *dev, struct request_queue *q) int ret, i; WARN_ON_ONCE(!q->kobj.parent); - lockdep_assert_held(&q->sysfs_lock); + lockdep_assert_held(&q->sysfs_dir_lock); ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); if (ret < 0) @@ -349,23 +349,12 @@ unreg: return ret; } -int blk_mq_register_dev(struct device *dev, struct request_queue *q) -{ - int ret; - - mutex_lock(&q->sysfs_lock); - ret = __blk_mq_register_dev(dev, q); - mutex_unlock(&q->sysfs_lock); - - return ret; -} - void blk_mq_sysfs_unregister(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i; - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) goto unlock; @@ -373,7 +362,7 @@ void blk_mq_sysfs_unregister(struct request_queue *q) blk_mq_unregister_hctx(hctx); unlock: - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->sysfs_dir_lock); } int blk_mq_sysfs_register(struct request_queue *q) @@ -381,7 +370,7 @@ int blk_mq_sysfs_register(struct request_queue 
*q) struct blk_mq_hw_ctx *hctx; int i, ret = 0; - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) goto unlock; @@ -392,7 +381,7 @@ int blk_mq_sysfs_register(struct request_queue *q) } unlock: - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->sysfs_dir_lock); return ret; } diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index da19f0bc8876..008388e82b5c 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/blk-mq.h> +#include <linux/delay.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" @@ -354,6 +355,37 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); +static bool blk_mq_tagset_count_completed_rqs(struct request *rq, + void *data, bool reserved) +{ + unsigned *count = data; + + if (blk_mq_request_completed(rq)) + (*count)++; + return true; +} + +/** + * blk_mq_tagset_wait_completed_request - wait until all completed req's + * complete function is run + * @tagset: Tag set to drain completed request + * + * Note: This function has to be run after all IO queues are shutdown + */ +void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset) +{ + while (true) { + unsigned count = 0; + + blk_mq_tagset_busy_iter(tagset, + blk_mq_tagset_count_completed_rqs, &count); + if (!count) + break; + msleep(5); + } +} +EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request); + /** * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag * @q: Request queue to examine. 
diff --git a/block/blk-mq.c b/block/blk-mq.c index e0b849bfe74d..20a49be536b5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -44,12 +44,12 @@ static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); static int blk_mq_poll_stats_bkt(const struct request *rq) { - int ddir, bytes, bucket; + int ddir, sectors, bucket; ddir = rq_data_dir(rq); - bytes = blk_rq_bytes(rq); + sectors = blk_rq_stats_sectors(rq); - bucket = ddir + 2*(ilog2(bytes) - 9); + bucket = ddir + 2 * ilog2(sectors); if (bucket < 0) return -1; @@ -282,16 +282,16 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) EXPORT_SYMBOL(blk_mq_can_queue); /* - * Only need start/end time stamping if we have stats enabled, or using - * an IO scheduler. + * Only need start/end time stamping if we have iostat or + * blk stats enabled, or using an IO scheduler. */ static inline bool blk_mq_need_time_stamp(struct request *rq) { - return (rq->rq_flags & RQF_IO_STAT) || rq->q->elevator; + return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator; } static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, - unsigned int tag, unsigned int op) + unsigned int tag, unsigned int op, u64 alloc_time_ns) { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct request *rq = tags->static_rqs[tag]; @@ -325,11 +325,15 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, RB_CLEAR_NODE(&rq->rb_node); rq->rq_disk = NULL; rq->part = NULL; +#ifdef CONFIG_BLK_RQ_ALLOC_TIME + rq->alloc_time_ns = alloc_time_ns; +#endif if (blk_mq_need_time_stamp(rq)) rq->start_time_ns = ktime_get_ns(); else rq->start_time_ns = 0; rq->io_start_time_ns = 0; + rq->stats_sectors = 0; rq->nr_phys_segments = 0; #if defined(CONFIG_BLK_DEV_INTEGRITY) rq->nr_integrity_segments = 0; @@ -356,8 +360,14 @@ static struct request *blk_mq_get_request(struct request_queue *q, struct request *rq; unsigned int tag; bool clear_ctx_on_error = false; + u64 alloc_time_ns = 0; blk_queue_enter_live(q); + + /* 
alloc_time includes depth and tag waits */ + if (blk_queue_rq_alloc_time(q)) + alloc_time_ns = ktime_get_ns(); + data->q = q; if (likely(!data->ctx)) { data->ctx = blk_mq_get_ctx(q); @@ -393,7 +403,7 @@ static struct request *blk_mq_get_request(struct request_queue *q, return NULL; } - rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags); + rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags, alloc_time_ns); if (!op_is_flush(data->cmd_flags)) { rq->elv.icq = NULL; if (e && e->type->ops.prepare_request) { @@ -652,19 +662,18 @@ bool blk_mq_complete_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_complete_request); -void blk_mq_complete_request_sync(struct request *rq) -{ - WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); - rq->q->mq_ops->complete(rq); -} -EXPORT_SYMBOL_GPL(blk_mq_complete_request_sync); - int blk_mq_request_started(struct request *rq) { return blk_mq_rq_state(rq) != MQ_RQ_IDLE; } EXPORT_SYMBOL_GPL(blk_mq_request_started); +int blk_mq_request_completed(struct request *rq) +{ + return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE; +} +EXPORT_SYMBOL_GPL(blk_mq_request_completed); + void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; @@ -673,9 +682,7 @@ void blk_mq_start_request(struct request *rq) if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { rq->io_start_time_ns = ktime_get_ns(); -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - rq->throtl_size = blk_rq_sectors(rq); -#endif + rq->stats_sectors = blk_rq_sectors(rq); rq->rq_flags |= RQF_STATS; rq_qos_issue(q, rq); } @@ -2453,11 +2460,6 @@ static void blk_mq_map_swqueue(struct request_queue *q) struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; - /* - * Avoid others reading imcomplete hctx->cpumask through sysfs - */ - mutex_lock(&q->sysfs_lock); - queue_for_each_hw_ctx(q, hctx, i) { cpumask_clear(hctx->cpumask); hctx->nr_ctx = 0; @@ -2518,8 +2520,6 @@ static void blk_mq_map_swqueue(struct request_queue *q) HCTX_TYPE_DEFAULT, i); } - mutex_unlock(&q->sysfs_lock); - 
queue_for_each_hw_ctx(q, hctx, i) { /* * If no software queues are mapped to this hardware queue, @@ -2688,7 +2688,11 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (!uninit_q) return ERR_PTR(-ENOMEM); - q = blk_mq_init_allocated_queue(set, uninit_q); + /* + * Initialize the queue without an elevator. device_add_disk() will do + * the initialization. + */ + q = blk_mq_init_allocated_queue(set, uninit_q, false); if (IS_ERR(q)) blk_cleanup_queue(uninit_q); @@ -2839,7 +2843,8 @@ static unsigned int nr_hw_queues(struct blk_mq_tag_set *set) } struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, - struct request_queue *q) + struct request_queue *q, + bool elevator_init) { /* mark the queue as mq asap */ q->mq_ops = set->ops; @@ -2901,18 +2906,14 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, blk_mq_add_queue_tag_set(set, q); blk_mq_map_swqueue(q); - if (!(set->flags & BLK_MQ_F_NO_SCHED)) { - int ret; - - ret = elevator_init_mq(q); - if (ret) - return ERR_PTR(ret); - } + if (elevator_init) + elevator_init_mq(q); return q; err_hctxs: kfree(q->queue_hw_ctx); + q->nr_hw_queues = 0; err_sys_init: blk_mq_sysfs_deinit(q); err_poll: diff --git a/block/blk-pm.c b/block/blk-pm.c index 0a028c189897..1adc1cd748b4 100644 --- a/block/blk-pm.c +++ b/block/blk-pm.c @@ -207,10 +207,12 @@ EXPORT_SYMBOL(blk_post_runtime_resume); */ void blk_set_runtime_active(struct request_queue *q) { - spin_lock_irq(&q->queue_lock); - q->rpm_status = RPM_ACTIVE; - pm_runtime_mark_last_busy(q->dev); - pm_request_autosuspend(q->dev); - spin_unlock_irq(&q->queue_lock); + if (q->dev) { + spin_lock_irq(&q->queue_lock); + q->rpm_status = RPM_ACTIVE; + pm_runtime_mark_last_busy(q->dev); + pm_request_autosuspend(q->dev); + spin_unlock_irq(&q->queue_lock); + } } EXPORT_SYMBOL(blk_set_runtime_active); diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 3954c0dc1443..61b635bc2a31 100644 --- a/block/blk-rq-qos.c +++ 
b/block/blk-rq-qos.c @@ -83,6 +83,15 @@ void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio) } while (rqos); } +void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio) +{ + do { + if (rqos->ops->merge) + rqos->ops->merge(rqos, rq, bio); + rqos = rqos->next; + } while (rqos); +} + void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio) { do { @@ -92,6 +101,15 @@ void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio) } while (rqos); } +void __rq_qos_queue_depth_changed(struct rq_qos *rqos) +{ + do { + if (rqos->ops->queue_depth_changed) + rqos->ops->queue_depth_changed(rqos); + rqos = rqos->next; + } while (rqos); +} + /* * Return true, if we can't increase the depth further by scaling */ diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 2300e038b9fa..08a09dbe0f4b 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -14,7 +14,8 @@ struct blk_mq_debugfs_attr; enum rq_qos_id { RQ_QOS_WBT, - RQ_QOS_CGROUP, + RQ_QOS_LATENCY, + RQ_QOS_COST, }; struct rq_wait { @@ -35,11 +36,13 @@ struct rq_qos { struct rq_qos_ops { void (*throttle)(struct rq_qos *, struct bio *); void (*track)(struct rq_qos *, struct request *, struct bio *); + void (*merge)(struct rq_qos *, struct request *, struct bio *); void (*issue)(struct rq_qos *, struct request *); void (*requeue)(struct rq_qos *, struct request *); void (*done)(struct rq_qos *, struct request *); void (*done_bio)(struct rq_qos *, struct bio *); void (*cleanup)(struct rq_qos *, struct bio *); + void (*queue_depth_changed)(struct rq_qos *); void (*exit)(struct rq_qos *); const struct blk_mq_debugfs_attr *debugfs_attrs; }; @@ -72,7 +75,7 @@ static inline struct rq_qos *wbt_rq_qos(struct request_queue *q) static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q) { - return rq_qos_id(q, RQ_QOS_CGROUP); + return rq_qos_id(q, RQ_QOS_LATENCY); } static inline const char *rq_qos_id_to_name(enum rq_qos_id id) @@ -80,8 +83,10 @@ static inline const char 
*rq_qos_id_to_name(enum rq_qos_id id) switch (id) { case RQ_QOS_WBT: return "wbt"; - case RQ_QOS_CGROUP: - return "cgroup"; + case RQ_QOS_LATENCY: + return "latency"; + case RQ_QOS_COST: + return "cost"; } return "unknown"; } @@ -135,7 +140,9 @@ void __rq_qos_issue(struct rq_qos *rqos, struct request *rq); void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq); void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio); void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio); +void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio); void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio); +void __rq_qos_queue_depth_changed(struct rq_qos *rqos); static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio) { @@ -185,6 +192,19 @@ static inline void rq_qos_track(struct request_queue *q, struct request *rq, __rq_qos_track(q->rq_qos, rq, bio); } +static inline void rq_qos_merge(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + if (q->rq_qos) + __rq_qos_merge(q->rq_qos, rq, bio); +} + +static inline void rq_qos_queue_depth_changed(struct request_queue *q) +{ + if (q->rq_qos) + __rq_qos_queue_depth_changed(q->rq_qos); +} + void rq_qos_exit(struct request_queue *); #endif diff --git a/block/blk-settings.c b/block/blk-settings.c index 2c1831207a8f..6bd1e3b082d8 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -805,7 +805,7 @@ EXPORT_SYMBOL(blk_queue_update_dma_alignment); void blk_set_queue_depth(struct request_queue *q, unsigned int depth) { q->queue_depth = depth; - wbt_set_queue_depth(q, depth); + rq_qos_queue_depth_changed(q); } EXPORT_SYMBOL(blk_set_queue_depth); @@ -832,6 +832,22 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) } EXPORT_SYMBOL_GPL(blk_queue_write_cache); +/** + * blk_queue_required_elevator_features - Set a queue required elevator features + * @q: the request queue for the target device + * @features: Required elevator 
features OR'ed together + * + * Tell the block layer that for the device controlled through @q, the + * only elevators that can be used are those that implement at least the set of + * features specified by @features. + */ +void blk_queue_required_elevator_features(struct request_queue *q, + unsigned int features) +{ + q->required_elevator_features = features; +} +EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features); + static int __init blk_settings_init(void) { blk_max_low_pfn = max_low_pfn - 1; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 9bfa3ea4ed63..b82736c781c5 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -941,14 +941,14 @@ int blk_register_queue(struct gendisk *disk) int ret; struct device *dev = disk_to_dev(disk); struct request_queue *q = disk->queue; + bool has_elevator = false; if (WARN_ON(!q)) return -ENXIO; - WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags), + WARN_ONCE(blk_queue_registered(q), "%s is registering an already registered queue\n", kobject_name(&dev->kobj)); - blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); /* * SCSI probing may synchronously create and destroy a lot of @@ -968,8 +968,7 @@ int blk_register_queue(struct gendisk *disk) if (ret) return ret; - /* Prevent changes through sysfs until registration is completed. */ - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->sysfs_dir_lock); ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); if (ret < 0) { @@ -990,26 +989,36 @@ int blk_register_queue(struct gendisk *disk) blk_mq_debugfs_register(q); } - kobject_uevent(&q->kobj, KOBJ_ADD); - - wbt_enable_default(q); - - blk_throtl_register_queue(q); - + /* + * The flag of QUEUE_FLAG_REGISTERED isn't set yet, so elevator + * switch won't happen at all. 
+ */ if (q->elevator) { - ret = elv_register_queue(q); + ret = elv_register_queue(q, false); if (ret) { - mutex_unlock(&q->sysfs_lock); - kobject_uevent(&q->kobj, KOBJ_REMOVE); + mutex_unlock(&q->sysfs_dir_lock); kobject_del(&q->kobj); blk_trace_remove_sysfs(dev); kobject_put(&dev->kobj); return ret; } + has_elevator = true; } + + mutex_lock(&q->sysfs_lock); + blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); + wbt_enable_default(q); + blk_throtl_register_queue(q); + + /* Now everything is ready and send out KOBJ_ADD uevent */ + kobject_uevent(&q->kobj, KOBJ_ADD); + if (has_elevator) + kobject_uevent(&q->elevator->kobj, KOBJ_ADD); + mutex_unlock(&q->sysfs_lock); + ret = 0; unlock: - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->sysfs_dir_lock); return ret; } EXPORT_SYMBOL_GPL(blk_register_queue); @@ -1029,7 +1038,7 @@ void blk_unregister_queue(struct gendisk *disk) return; /* Return early if disk->queue was never registered. */ - if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) + if (!blk_queue_registered(q)) return; /* @@ -1038,25 +1047,28 @@ void blk_unregister_queue(struct gendisk *disk) * concurrent elv_iosched_store() calls. */ mutex_lock(&q->sysfs_lock); - blk_queue_flag_clear(QUEUE_FLAG_REGISTERED, q); + mutex_unlock(&q->sysfs_lock); + mutex_lock(&q->sysfs_dir_lock); /* * Remove the sysfs attributes before unregistering the queue data * structures that can be modified through sysfs. */ if (queue_is_mq(q)) blk_mq_unregister_dev(disk_to_dev(disk), q); - mutex_unlock(&q->sysfs_lock); kobject_uevent(&q->kobj, KOBJ_REMOVE); kobject_del(&q->kobj); blk_trace_remove_sysfs(disk_to_dev(disk)); - mutex_lock(&q->sysfs_lock); + /* + * q->kobj has been removed, so it is safe to check if elevator + * exists without holding q->sysfs_lock. 
+ */ if (q->elevator) elv_unregister_queue(q); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->sysfs_dir_lock); kobject_put(&disk_to_dev(disk)->kobj); } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 8ab6c8153223..18f773e52dfb 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -478,12 +478,14 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq) timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0); } -static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) +static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, + struct request_queue *q, + struct blkcg *blkcg) { struct throtl_grp *tg; int rw; - tg = kzalloc_node(sizeof(*tg), gfp, node); + tg = kzalloc_node(sizeof(*tg), gfp, q->node); if (!tg) return NULL; @@ -2246,7 +2248,8 @@ void blk_throtl_stat_add(struct request *rq, u64 time_ns) struct request_queue *q = rq->q; struct throtl_data *td = q->td; - throtl_track_latency(td, rq->throtl_size, req_op(rq), time_ns >> 10); + throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq), + time_ns >> 10); } void blk_throtl_bio_endio(struct bio *bio) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 313f45a37e9d..8af553a0ba00 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -629,15 +629,6 @@ static void wbt_requeue(struct rq_qos *rqos, struct request *rq) } } -void wbt_set_queue_depth(struct request_queue *q, unsigned int depth) -{ - struct rq_qos *rqos = wbt_rq_qos(q); - if (rqos) { - RQWB(rqos)->rq_depth.queue_depth = depth; - __wbt_update_limits(RQWB(rqos)); - } -} - void wbt_set_write_cache(struct request_queue *q, bool write_cache_on) { struct rq_qos *rqos = wbt_rq_qos(q); @@ -656,7 +647,7 @@ void wbt_enable_default(struct request_queue *q) return; /* Queue not registered? Maybe shutting down... 
*/ - if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) + if (!blk_queue_registered(q)) return; if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ)) @@ -689,6 +680,12 @@ static int wbt_data_dir(const struct request *rq) return -1; } +static void wbt_queue_depth_changed(struct rq_qos *rqos) +{ + RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->q); + __wbt_update_limits(RQWB(rqos)); +} + static void wbt_exit(struct rq_qos *rqos) { struct rq_wb *rwb = RQWB(rqos); @@ -811,6 +808,7 @@ static struct rq_qos_ops wbt_rqos_ops = { .requeue = wbt_requeue, .done = wbt_done, .cleanup = wbt_cleanup, + .queue_depth_changed = wbt_queue_depth_changed, .exit = wbt_exit, #ifdef CONFIG_BLK_DEBUG_FS .debugfs_attrs = wbt_debugfs_attrs, @@ -853,7 +851,7 @@ int wbt_init(struct request_queue *q) rwb->min_lat_nsec = wbt_default_latency_nsec(q); - wbt_set_queue_depth(q, blk_queue_depth(q)); + wbt_queue_depth_changed(&rwb->rqos); wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); return 0; diff --git a/block/blk-wbt.h b/block/blk-wbt.h index f47218d5b3b2..8e4e37660971 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -95,7 +95,6 @@ void wbt_enable_default(struct request_queue *); u64 wbt_get_min_lat(struct request_queue *q); void wbt_set_min_lat(struct request_queue *q, u64 val); -void wbt_set_queue_depth(struct request_queue *, unsigned int); void wbt_set_write_cache(struct request_queue *, bool); u64 wbt_default_latency_nsec(struct request_queue *); @@ -118,9 +117,6 @@ static inline void wbt_disable_default(struct request_queue *q) static inline void wbt_enable_default(struct request_queue *q) { } -static inline void wbt_set_queue_depth(struct request_queue *q, unsigned int depth) -{ -} static inline void wbt_set_write_cache(struct request_queue *q, bool wc) { } diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 6c503824ba3f..4bc5f260248a 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -202,6 +202,42 @@ int blkdev_report_zones(struct 
block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(blkdev_report_zones); +/* + * Special case of zone reset operation to reset all zones in one command, + * useful for applications like mkfs. + */ +static int __blkdev_reset_all_zones(struct block_device *bdev, gfp_t gfp_mask) +{ + struct bio *bio = bio_alloc(gfp_mask, 0); + int ret; + + /* across the zones operations, don't need any sectors */ + bio_set_dev(bio, bdev); + bio_set_op_attrs(bio, REQ_OP_ZONE_RESET_ALL, 0); + + ret = submit_bio_wait(bio); + bio_put(bio); + + return ret; +} + +static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev, + sector_t nr_sectors) +{ + if (!blk_queue_zone_resetall(bdev_get_queue(bdev))) + return false; + + if (nr_sectors != part_nr_sects_read(bdev->bd_part)) + return false; + /* + * REQ_OP_ZONE_RESET_ALL can be executed only if the block device is + * the entire disk, that is, if the blocks device start offset is 0 and + * its capacity is the same as the entire disk. + */ + return get_start_sect(bdev) == 0 && + part_nr_sects_read(bdev->bd_part) == get_capacity(bdev->bd_disk); +} + /** * blkdev_reset_zones - Reset zones write pointer * @bdev: Target block device @@ -235,6 +271,9 @@ int blkdev_reset_zones(struct block_device *bdev, /* Out of range */ return -EINVAL; + if (blkdev_allow_reset_all_zones(bdev, nr_sectors)) + return __blkdev_reset_all_zones(bdev, gfp_mask); + /* Check alignment (handle eventual smaller last zone) */ zone_sectors = blk_queue_zone_sectors(q); if (sector & (zone_sectors - 1)) diff --git a/block/blk.h b/block/blk.h index de6b2e146d6e..ed347f7a97b1 100644 --- a/block/blk.h +++ b/block/blk.h @@ -184,11 +184,11 @@ void blk_account_io_done(struct request *req, u64 now); void blk_insert_flush(struct request *rq); -int elevator_init_mq(struct request_queue *q); +void elevator_init_mq(struct request_queue *q); int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e); void __elevator_exit(struct request_queue *, struct 
elevator_queue *); -int elv_register_queue(struct request_queue *q); +int elv_register_queue(struct request_queue *q, bool uevent); void elv_unregister_queue(struct request_queue *q); static inline void elevator_exit(struct request_queue *q, diff --git a/block/elevator.c b/block/elevator.c index 2f17d66d0e61..bba10e83478a 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -83,8 +83,26 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_bio_merge_ok); -static bool elevator_match(const struct elevator_type *e, const char *name) +static inline bool elv_support_features(unsigned int elv_features, + unsigned int required_features) { + return (required_features & elv_features) == required_features; +} + +/** + * elevator_match - Test an elevator name and features + * @e: Scheduler to test + * @name: Elevator name to test + * @required_features: Features that the elevator must provide + * + * Return true is the elevator @e name matches @name and if @e provides all the + * the feratures spcified by @required_features. + */ +static bool elevator_match(const struct elevator_type *e, const char *name, + unsigned int required_features) +{ + if (!elv_support_features(e->elevator_features, required_features)) + return false; if (!strcmp(e->elevator_name, name)) return true; if (e->elevator_alias && !strcmp(e->elevator_alias, name)) @@ -93,15 +111,21 @@ static bool elevator_match(const struct elevator_type *e, const char *name) return false; } -/* - * Return scheduler with name 'name' +/** + * elevator_find - Find an elevator + * @name: Name of the elevator to find + * @required_features: Features that the elevator must provide + * + * Return the first registered scheduler with name @name and supporting the + * features @required_features and NULL otherwise. 
*/ -static struct elevator_type *elevator_find(const char *name) +static struct elevator_type *elevator_find(const char *name, + unsigned int required_features) { struct elevator_type *e; list_for_each_entry(e, &elv_list, list) { - if (elevator_match(e, name)) + if (elevator_match(e, name, required_features)) return e; } @@ -120,12 +144,12 @@ static struct elevator_type *elevator_get(struct request_queue *q, spin_lock(&elv_list_lock); - e = elevator_find(name); + e = elevator_find(name, q->required_elevator_features); if (!e && try_loading) { spin_unlock(&elv_list_lock); request_module("%s-iosched", name); spin_lock(&elv_list_lock); - e = elevator_find(name); + e = elevator_find(name, q->required_elevator_features); } if (e && !try_module_get(e->elevator_owner)) @@ -135,20 +159,6 @@ static struct elevator_type *elevator_get(struct request_queue *q, return e; } -static char chosen_elevator[ELV_NAME_MAX]; - -static int __init elevator_setup(char *str) -{ - /* - * Be backwards-compatible with previous kernels, so users - * won't get the wrong elevator. - */ - strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1); - return 1; -} - -__setup("elevator=", elevator_setup); - static struct kobj_type elv_ktype; struct elevator_queue *elevator_alloc(struct request_queue *q, @@ -470,13 +480,16 @@ static struct kobj_type elv_ktype = { .release = elevator_release, }; -int elv_register_queue(struct request_queue *q) +/* + * elv_register_queue is called from either blk_register_queue or + * elevator_switch, elevator switch is prevented from being happen + * in the two paths, so it is safe to not hold q->sysfs_lock. 
+ */ +int elv_register_queue(struct request_queue *q, bool uevent) { struct elevator_queue *e = q->elevator; int error; - lockdep_assert_held(&q->sysfs_lock); - error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); if (!error) { struct elv_fs_entry *attr = e->type->elevator_attrs; @@ -487,24 +500,34 @@ int elv_register_queue(struct request_queue *q) attr++; } } - kobject_uevent(&e->kobj, KOBJ_ADD); + if (uevent) + kobject_uevent(&e->kobj, KOBJ_ADD); + + mutex_lock(&q->sysfs_lock); e->registered = 1; + mutex_unlock(&q->sysfs_lock); } return error; } +/* + * elv_unregister_queue is called from either blk_unregister_queue or + * elevator_switch, elevator switch is prevented from being happen + * in the two paths, so it is safe to not hold q->sysfs_lock. + */ void elv_unregister_queue(struct request_queue *q) { - lockdep_assert_held(&q->sysfs_lock); - if (q) { struct elevator_queue *e = q->elevator; kobject_uevent(&e->kobj, KOBJ_REMOVE); kobject_del(&e->kobj); + + mutex_lock(&q->sysfs_lock); e->registered = 0; /* Re-enable throttling in case elevator disabled it */ wbt_enable_default(q); + mutex_unlock(&q->sysfs_lock); } } @@ -526,7 +549,7 @@ int elv_register(struct elevator_type *e) /* register, don't allow duplicate names */ spin_lock(&elv_list_lock); - if (elevator_find(e->elevator_name)) { + if (elevator_find(e->elevator_name, 0)) { spin_unlock(&elv_list_lock); kmem_cache_destroy(e->icq_cache); return -EBUSY; @@ -567,10 +590,32 @@ int elevator_switch_mq(struct request_queue *q, lockdep_assert_held(&q->sysfs_lock); if (q->elevator) { - if (q->elevator->registered) + if (q->elevator->registered) { + mutex_unlock(&q->sysfs_lock); + + /* + * Concurrent elevator switch can't happen becasue + * sysfs write is always exclusively on same file. + * + * Also the elevator queue won't be freed after + * sysfs_lock is released becasue kobject_del() in + * blk_unregister_queue() waits for completion of + * .store & .show on its attributes. 
+ */ elv_unregister_queue(q); + + mutex_lock(&q->sysfs_lock); + } ioc_clear_queue(q); elevator_exit(q, q->elevator); + + /* + * sysfs_lock may be dropped, so re-check if queue is + * unregistered. If yes, don't switch to new elevator + * any more + */ + if (!blk_queue_registered(q)) + return 0; } ret = blk_mq_init_sched(q, new_e); @@ -578,7 +623,11 @@ int elevator_switch_mq(struct request_queue *q, goto out; if (new_e) { - ret = elv_register_queue(q); + mutex_unlock(&q->sysfs_lock); + + ret = elv_register_queue(q, true); + + mutex_lock(&q->sysfs_lock); if (ret) { elevator_exit(q, q->elevator); goto out; @@ -594,37 +643,89 @@ out: return ret; } +static inline bool elv_support_iosched(struct request_queue *q) +{ + if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED)) + return false; + return true; +} + /* - * For blk-mq devices, we default to using mq-deadline, if available, for single - * queue devices. If deadline isn't available OR we have multiple queues, - * default to "none". + * For single queue devices, default to using mq-deadline. If we have multiple + * queues or mq-deadline is not available, default to "none". */ -int elevator_init_mq(struct request_queue *q) +static struct elevator_type *elevator_get_default(struct request_queue *q) +{ + if (q->nr_hw_queues != 1) + return NULL; + + return elevator_get(q, "mq-deadline", false); +} + +/* + * Get the first elevator providing the features required by the request queue. + * Default to "none" if no matching elevator is found. 
+ */ +static struct elevator_type *elevator_get_by_features(struct request_queue *q) +{ + struct elevator_type *e, *found = NULL; + + spin_lock(&elv_list_lock); + + list_for_each_entry(e, &elv_list, list) { + if (elv_support_features(e->elevator_features, + q->required_elevator_features)) { + found = e; + break; + } + } + + if (found && !try_module_get(found->elevator_owner)) + found = NULL; + + spin_unlock(&elv_list_lock); + return found; +} + +/* + * For a device queue that has no required features, use the default elevator + * settings. Otherwise, use the first elevator available matching the required + * features. If no suitable elevator is find or if the chosen elevator + * initialization fails, fall back to the "none" elevator (no elevator). + */ +void elevator_init_mq(struct request_queue *q) { struct elevator_type *e; - int err = 0; + int err; - if (q->nr_hw_queues != 1) - return 0; + if (!elv_support_iosched(q)) + return; + + WARN_ON_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)); - /* - * q->sysfs_lock must be held to provide mutual exclusion between - * elevator_switch() and here. 
- */ - mutex_lock(&q->sysfs_lock); if (unlikely(q->elevator)) - goto out_unlock; + return; - e = elevator_get(q, "mq-deadline", false); + if (!q->required_elevator_features) + e = elevator_get_default(q); + else + e = elevator_get_by_features(q); if (!e) - goto out_unlock; + return; + + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); err = blk_mq_init_sched(q, e); - if (err) + + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); + + if (err) { + pr_warn("\"%s\" elevator initialization failed, " + "falling back to \"none\"\n", e->elevator_name); elevator_put(e); -out_unlock: - mutex_unlock(&q->sysfs_lock); - return err; + } } @@ -660,7 +761,7 @@ static int __elevator_change(struct request_queue *q, const char *name) struct elevator_type *e; /* Make sure queue is not in the middle of being removed */ - if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) + if (!blk_queue_registered(q)) return -ENOENT; /* @@ -677,7 +778,8 @@ static int __elevator_change(struct request_queue *q, const char *name) if (!e) return -EINVAL; - if (q->elevator && elevator_match(q->elevator->type, elevator_name)) { + if (q->elevator && + elevator_match(q->elevator->type, elevator_name, 0)) { elevator_put(e); return 0; } @@ -685,13 +787,6 @@ static int __elevator_change(struct request_queue *q, const char *name) return elevator_switch(q, e); } -static inline bool elv_support_iosched(struct request_queue *q) -{ - if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED)) - return false; - return true; -} - ssize_t elv_iosched_store(struct request_queue *q, const char *name, size_t count) { @@ -724,11 +819,13 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) spin_lock(&elv_list_lock); list_for_each_entry(__e, &elv_list, list) { - if (elv && elevator_match(elv, __e->elevator_name)) { + if (elv && elevator_match(elv, __e->elevator_name, 0)) { len += sprintf(name+len, "[%s] ", elv->elevator_name); continue; } - if (elv_support_iosched(q)) + if (elv_support_iosched(q) && + 
elevator_match(__e, __e->elevator_name, + q->required_elevator_features)) len += sprintf(name+len, "%s ", __e->elevator_name); } spin_unlock(&elv_list_lock); diff --git a/block/genhd.c b/block/genhd.c index 54f1f0d381f4..26b31fcae217 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -695,6 +695,15 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, dev_t devt; int retval; + /* + * The disk queue should now be all set with enough information about + * the device for the elevator code to pick an adequate default + * elevator if one is needed, that is, for devices requesting queue + * registration. + */ + if (register_queue) + elevator_init_mq(disk->queue); + /* minors == 0 indicates to use ext devt from part0 and should * be accompanied with EXT_DEVT flag. Make sure all * parameters make sense. diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 2a2a2e82832e..b490f47fd553 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -377,13 +377,6 @@ done: * hardware queue, but we may return a request that is for a * different hardware queue. This is because mq-deadline has shared * state for all hardware queues, in terms of sorting, FIFOs, etc. - * - * For a zoned block device, __dd_dispatch_request() may return NULL - * if all the queued write requests are directed at zones that are already - * locked due to on-going write requests. In this case, make sure to mark - * the queue as needing a restart to ensure that the queue is run again - * and the pending writes dispatched once the target zones for the ongoing - * write requests are unlocked in dd_finish_request(). 
*/ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { @@ -392,9 +385,6 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) spin_lock(&dd->lock); rq = __dd_dispatch_request(dd); - if (!rq && blk_queue_is_zoned(hctx->queue) && - !list_empty(&dd->fifo_list[WRITE])) - blk_mq_sched_mark_restart_hctx(hctx); spin_unlock(&dd->lock); return rq; @@ -561,6 +551,13 @@ static void dd_prepare_request(struct request *rq, struct bio *bio) * spinlock so that the zone is never unlocked while deadline_fifo_request() * or deadline_next_request() are executing. This function is called for * all requests, whether or not these requests complete successfully. + * + * For a zoned block device, __dd_dispatch_request() may have stopped + * dispatching requests if all the queued requests are write requests directed + * at zones that are already locked due to on-going write requests. To ensure + * write request dispatch progress in this case, mark the queue as needing a + * restart to ensure that the queue is run again after completion of the + * request and zones being unlocked. 
*/ static void dd_finish_request(struct request *rq) { @@ -572,6 +569,8 @@ static void dd_finish_request(struct request *rq) spin_lock_irqsave(&dd->zone_lock, flags); blk_req_zone_write_unlock(rq); + if (!list_empty(&dd->fifo_list[WRITE])) + blk_mq_sched_mark_restart_hctx(rq->mq_hctx); spin_unlock_irqrestore(&dd->zone_lock, flags); } } @@ -795,6 +794,7 @@ static struct elevator_type mq_deadline = { .elevator_attrs = deadline_attrs, .elevator_name = "mq-deadline", .elevator_alias = "deadline", + .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE, .elevator_owner = THIS_MODULE, }; MODULE_ALIAS("mq-deadline-iosched"); diff --git a/block/opal_proto.h b/block/opal_proto.h index 466ec7be16ef..5532412d567c 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -119,8 +119,6 @@ enum opal_uid { OPAL_UID_HEXFF, }; -#define OPAL_METHOD_LENGTH 8 - /* Enum for indexing the OPALMETHOD array */ enum opal_method { OPAL_PROPERTIES, @@ -167,7 +165,6 @@ enum opal_token { OPAL_TABLE_LASTID = 0x0A, OPAL_TABLE_MIN = 0x0B, OPAL_TABLE_MAX = 0x0C, - /* authority table */ OPAL_PIN = 0x03, /* locking tokens */ @@ -182,7 +179,7 @@ enum opal_token { OPAL_LIFECYCLE = 0x06, /* locking info table */ OPAL_MAXRANGES = 0x04, - /* mbr control */ + /* mbr control */ OPAL_MBRENABLE = 0x01, OPAL_MBRDONE = 0x02, /* properties */ diff --git a/block/sed-opal.c b/block/sed-opal.c index 7e1a444a25b2..4e95a9792162 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -129,7 +129,6 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = { { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x84, 0x01 }, /* tables */ - [OPAL_TABLE_TABLE] { 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01 }, [OPAL_LOCKINGRANGE_GLOBAL] = @@ -152,7 +151,6 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = { { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x00 }, /* C_PIN_TABLE object ID's */ - [OPAL_C_PIN_MSID] = { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x84, 0x02}, [OPAL_C_PIN_SID] = @@ -161,7 +159,6 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = { { 0x00, 
0x00, 0x00, 0x0B, 0x00, 0x01, 0x00, 0x01}, /* half UID's (only first 4 bytes used) */ - [OPAL_HALF_UID_AUTHORITY_OBJ_REF] = { 0x00, 0x00, 0x0C, 0x05, 0xff, 0xff, 0xff, 0xff }, [OPAL_HALF_UID_BOOLEAN_ACE] = @@ -517,6 +514,7 @@ static int opal_discovery0(struct opal_dev *dev, void *data) ret = opal_recv_cmd(dev); if (ret) return ret; + return opal_discovery0_end(dev); } @@ -525,6 +523,7 @@ static int opal_discovery0_step(struct opal_dev *dev) const struct opal_step discovery0_step = { opal_discovery0, }; + return execute_step(dev, &discovery0_step, 0); } @@ -551,6 +550,7 @@ static void add_token_u8(int *err, struct opal_dev *cmd, u8 tok) { if (!can_add(err, cmd, 1)) return; + cmd->cmd[cmd->pos++] = tok; } @@ -577,6 +577,7 @@ static void add_medium_atom_header(struct opal_dev *cmd, bool bytestring, header0 |= bytestring ? MEDIUM_ATOM_BYTESTRING : 0; header0 |= has_sign ? MEDIUM_ATOM_SIGNED : 0; header0 |= (len >> 8) & MEDIUM_ATOM_LEN_MASK; + cmd->cmd[cmd->pos++] = header0; cmd->cmd[cmd->pos++] = len; } @@ -649,6 +650,7 @@ static int build_locking_range(u8 *buffer, size_t length, u8 lr) if (lr == 0) return 0; + buffer[5] = LOCKING_RANGE_NON_GLOBAL; buffer[7] = lr; @@ -903,10 +905,6 @@ static int response_parse(const u8 *buf, size_t length, num_entries++; } - if (num_entries == 0) { - pr_debug("Couldn't parse response.\n"); - return -EINVAL; - } resp->num = num_entries; return 0; @@ -945,6 +943,7 @@ static size_t response_get_string(const struct parsed_resp *resp, int n, } *store = tok->pos + skip; + return tok->len - skip; } @@ -1062,6 +1061,7 @@ static int start_opal_session_cont(struct opal_dev *dev) dev->hsn = hsn; dev->tsn = tsn; + return 0; } @@ -1084,6 +1084,7 @@ static int end_session_cont(struct opal_dev *dev) { dev->hsn = 0; dev->tsn = 0; + return parse_and_check_status(dev); } @@ -1172,6 +1173,7 @@ static int gen_key(struct opal_dev *dev, void *data) return err; } + return finalize_and_send(dev, parse_and_check_status); } @@ -1184,12 +1186,14 @@ static int 
get_active_key_cont(struct opal_dev *dev) error = parse_and_check_status(dev); if (error) return error; + keylen = response_get_string(&dev->parsed, 4, &activekey); if (!activekey) { pr_debug("%s: Couldn't extract the Activekey from the response\n", __func__); return OPAL_INVAL_PARAM; } + dev->prev_data = kmemdup(activekey, keylen, GFP_KERNEL); if (!dev->prev_data) @@ -1251,6 +1255,7 @@ static int generic_lr_enable_disable(struct opal_dev *dev, add_token_u8(&err, dev, OPAL_ENDLIST); add_token_u8(&err, dev, OPAL_ENDNAME); + return err; } @@ -1263,6 +1268,7 @@ static inline int enable_global_lr(struct opal_dev *dev, u8 *uid, 0, 0); if (err) pr_debug("Failed to create enable global lr command\n"); + return err; } @@ -1313,7 +1319,6 @@ static int setup_locking_range(struct opal_dev *dev, void *data) if (err) { pr_debug("Error building Setup Locking range command.\n"); return err; - } return finalize_and_send(dev, parse_and_check_status); @@ -1393,6 +1398,7 @@ static int start_SIDASP_opal_session(struct opal_dev *dev, void *data) kfree(key); dev->prev_data = NULL; } + return ret; } @@ -1518,6 +1524,7 @@ static int erase_locking_range(struct opal_dev *dev, void *data) pr_debug("Error building Erase Locking Range Command.\n"); return err; } + return finalize_and_send(dev, parse_and_check_status); } @@ -1636,6 +1643,7 @@ static int write_shadow_mbr(struct opal_dev *dev, void *data) off += len; } + return err; } @@ -1816,6 +1824,7 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data) pr_debug("Error building SET command.\n"); return err; } + return finalize_and_send(dev, parse_and_check_status); } @@ -1857,6 +1866,7 @@ static int lock_unlock_locking_range_sum(struct opal_dev *dev, void *data) pr_debug("Error building SET command.\n"); return ret; } + return finalize_and_send(dev, parse_and_check_status); } @@ -1957,6 +1967,7 @@ static int end_opal_session(struct opal_dev *dev, void *data) if (err < 0) return err; + return finalize_and_send(dev, 
end_session_cont); } @@ -1965,6 +1976,7 @@ static int end_opal_session_error(struct opal_dev *dev) const struct opal_step error_end_session = { end_opal_session, }; + return execute_step(dev, &error_end_session, 0); } @@ -1984,6 +1996,7 @@ static int check_opal_support(struct opal_dev *dev) ret = opal_discovery0_step(dev); dev->supported = !ret; mutex_unlock(&dev->dev_lock); + return ret; } @@ -2004,6 +2017,7 @@ void free_opal_dev(struct opal_dev *dev) { if (!dev) return; + clean_opal_dev(dev); kfree(dev); } @@ -2026,6 +2040,7 @@ struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv) kfree(dev); return NULL; } + return dev; } EXPORT_SYMBOL(init_opal_dev); @@ -2045,6 +2060,7 @@ static int opal_secure_erase_locking_range(struct opal_dev *dev, setup_opal_dev(dev); ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps)); mutex_unlock(&dev->dev_lock); + return ret; } @@ -2062,6 +2078,7 @@ static int opal_erase_locking_range(struct opal_dev *dev, setup_opal_dev(dev); ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps)); mutex_unlock(&dev->dev_lock); + return ret; } @@ -2089,6 +2106,7 @@ static int opal_enable_disable_shadow_mbr(struct opal_dev *dev, setup_opal_dev(dev); ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); mutex_unlock(&dev->dev_lock); + return ret; } @@ -2113,6 +2131,7 @@ static int opal_set_mbr_done(struct opal_dev *dev, setup_opal_dev(dev); ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); mutex_unlock(&dev->dev_lock); + return ret; } @@ -2133,6 +2152,7 @@ static int opal_write_shadow_mbr(struct opal_dev *dev, setup_opal_dev(dev); ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); mutex_unlock(&dev->dev_lock); + return ret; } @@ -2151,6 +2171,7 @@ static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk) setup_opal_dev(dev); add_suspend_info(dev, suspend); mutex_unlock(&dev->dev_lock); + return 0; } @@ -2169,12 +2190,14 @@ static int opal_add_user_to_lr(struct opal_dev *dev, 
pr_debug("Locking state was not RO or RW\n"); return -EINVAL; } + if (lk_unlk->session.who < OPAL_USER1 || lk_unlk->session.who > OPAL_USER9) { pr_debug("Authority was not within the range of users: %d\n", lk_unlk->session.who); return -EINVAL; } + if (lk_unlk->session.sum) { pr_debug("%s not supported in sum. Use setup locking range\n", __func__); @@ -2185,6 +2208,7 @@ static int opal_add_user_to_lr(struct opal_dev *dev, setup_opal_dev(dev); ret = execute_steps(dev, steps, ARRAY_SIZE(steps)); mutex_unlock(&dev->dev_lock); + return ret; } @@ -2267,6 +2291,7 @@ static int opal_lock_unlock(struct opal_dev *dev, mutex_lock(&dev->dev_lock); ret = __opal_lock_unlock(dev, lk_unlk); mutex_unlock(&dev->dev_lock); + return ret; } @@ -2289,6 +2314,7 @@ static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal) setup_opal_dev(dev); ret = execute_steps(dev, owner_steps, ARRAY_SIZE(owner_steps)); mutex_unlock(&dev->dev_lock); + return ret; } @@ -2310,6 +2336,7 @@ static int opal_activate_lsp(struct opal_dev *dev, setup_opal_dev(dev); ret = execute_steps(dev, active_steps, ARRAY_SIZE(active_steps)); mutex_unlock(&dev->dev_lock); + return ret; } @@ -2327,6 +2354,7 @@ static int opal_setup_locking_range(struct opal_dev *dev, setup_opal_dev(dev); ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps)); mutex_unlock(&dev->dev_lock); + return ret; } @@ -2347,6 +2375,7 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) setup_opal_dev(dev); ret = execute_steps(dev, pw_steps, ARRAY_SIZE(pw_steps)); mutex_unlock(&dev->dev_lock); + return ret; } @@ -2371,6 +2400,7 @@ static int opal_activate_user(struct opal_dev *dev, setup_opal_dev(dev); ret = execute_steps(dev, act_steps, ARRAY_SIZE(act_steps)); mutex_unlock(&dev->dev_lock); + return ret; } @@ -2382,6 +2412,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev) if (!dev) return false; + if (!dev->supported) return false; @@ -2399,6 +2430,7 @@ bool opal_unlock_from_suspend(struct opal_dev 
*dev) suspend->unlk.session.sum); was_failure = true; } + if (dev->mbr_enabled) { ret = __opal_set_mbr_done(dev, &suspend->unlk.session.opal_key); if (ret) @@ -2406,6 +2438,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev) } } mutex_unlock(&dev->dev_lock); + return was_failure; } EXPORT_SYMBOL(opal_unlock_from_suspend); |