diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2010-10-23 02:00:32 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2010-10-23 02:00:32 +0200 |
commit | e9dd2b6837e26fe202708cce5ea4bb4ee3e3482e (patch) | |
tree | f42fd892495bfc4cbb740d06b016d267c9c42d00 /block | |
parent | Merge branch 'linux-next' of git://git.infradead.org/ubi-2.6 (diff) | |
parent | cfq-iosched: Fix a gcc 4.5 warning and put some comments (diff) | |
download | linux-e9dd2b6837e26fe202708cce5ea4bb4ee3e3482e.tar.xz linux-e9dd2b6837e26fe202708cce5ea4bb4ee3e3482e.zip |
Merge branch 'for-2.6.37/core' of git://git.kernel.dk/linux-2.6-block
* 'for-2.6.37/core' of git://git.kernel.dk/linux-2.6-block: (39 commits)
cfq-iosched: Fix a gcc 4.5 warning and put some comments
block: Turn bvec_k{un,}map_irq() into static inline functions
block: fix accounting bug on cross partition merges
block: Make the integrity mapped property a bio flag
block: Fix double free in blk_integrity_unregister
block: Ensure physical block size is unsigned int
blkio-throttle: Fix possible multiplication overflow in iops calculations
blkio-throttle: limit max iops value to UINT_MAX
blkio-throttle: There is no need to convert jiffies to milli seconds
blkio-throttle: Fix link failure failure on i386
blkio: Recalculate the throttled bio dispatch time upon throttle limit change
blkio: Add root group to td->tg_list
blkio: deletion of a cgroup was causes oops
blkio: Do not export throttle files if CONFIG_BLK_DEV_THROTTLING=n
block: set the bounce_pfn to the actual DMA limit rather than to max memory
block: revert bad fix for memory hotplug causing bounces
Fix compile error in blk-exec.c for !CONFIG_DETECT_HUNG_TASK
block: set the bounce_pfn to the actual DMA limit rather than to max memory
block: Prevent hang_check firing during long I/O
cfq: improve fsync performance for small files
...
Fix up trivial conflicts due to __rcu sparse annotation in include/linux/genhd.h
Diffstat (limited to 'block')
-rw-r--r-- | block/Kconfig | 12 | ||||
-rw-r--r-- | block/Makefile | 1 | ||||
-rw-r--r-- | block/blk-cgroup.c | 804 | ||||
-rw-r--r-- | block/blk-cgroup.h | 87 | ||||
-rw-r--r-- | block/blk-core.c | 53 | ||||
-rw-r--r-- | block/blk-exec.c | 9 | ||||
-rw-r--r-- | block/blk-integrity.c | 94 | ||||
-rw-r--r-- | block/blk-map.c | 5 | ||||
-rw-r--r-- | block/blk-merge.c | 25 | ||||
-rw-r--r-- | block/blk-settings.c | 12 | ||||
-rw-r--r-- | block/blk-sysfs.c | 11 | ||||
-rw-r--r-- | block/blk-throttle.c | 1123 | ||||
-rw-r--r-- | block/blk.h | 12 | ||||
-rw-r--r-- | block/cfq-iosched.c | 39 | ||||
-rw-r--r-- | block/cfq.h | 2 | ||||
-rw-r--r-- | block/genhd.c | 30 | ||||
-rw-r--r-- | block/ioctl.c | 2 |
17 files changed, 2069 insertions, 252 deletions
diff --git a/block/Kconfig b/block/Kconfig index 9be0b56eaee1..6c9213ef15a1 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -77,6 +77,18 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. +config BLK_DEV_THROTTLING + bool "Block layer bio throttling support" + depends on BLK_CGROUP=y && EXPERIMENTAL + default n + ---help--- + Block layer bio throttling support. It can be used to limit + the IO rate to a device. IO rate policies are per cgroup and + one needs to mount and use blkio cgroup controller for creating + cgroups and specifying per device IO rate policies. + + See Documentation/cgroups/blkio-controller.txt for more information. + endif # BLOCK config BLOCK_COMPAT diff --git a/block/Makefile b/block/Makefile index 0bb499a739cd..c850d5ef80a2 100644 --- a/block/Makefile +++ b/block/Makefile @@ -9,6 +9,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o +obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 2fef1ef931a0..b1febd0f6d2a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -37,6 +37,12 @@ static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *, static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); +/* for encoding cft->private value on file */ +#define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val)) +/* What policy owns the file, proportional or throttle */ +#define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff) +#define BLKIOFILE_ATTR(val) ((val) & 0xffff) + struct cgroup_subsys blkio_subsys = { .name = "blkio", .create = blkiocg_create, @@ -59,6 +65,27 @@ static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg, list_add(&pn->node, &blkcg->policy_list); } +static inline bool cftype_blkg_same_policy(struct cftype *cft, + struct blkio_group *blkg) +{ + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + + if (blkg->plid == plid) + return 1; + + return 0; +} + +/* Determines if policy node matches cgroup file being accessed */ +static inline bool pn_matches_cftype(struct cftype *cft, + struct blkio_policy_node *pn) +{ + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int fileid = BLKIOFILE_ATTR(cft->private); + + return (plid == pn->plid && fileid == pn->fileid); +} + /* Must be called with blkcg->lock held */ static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) { @@ -67,12 +94,13 @@ static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) /* Must be called with blkcg->lock held */ static struct blkio_policy_node * -blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev) +blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev, + enum blkio_policy_id plid, int fileid) { struct blkio_policy_node *pn; list_for_each_entry(pn, &blkcg->policy_list, node) { - if (pn->dev == dev) + if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid) return pn; } @@ -86,6 +114,67 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) } EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); +static inline void +blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) +{ + struct blkio_policy_type *blkiop; + + list_for_each_entry(blkiop, &blkio_list, list) { + /* If this policy does not own the blkg, do not send updates */ + if (blkiop->plid != blkg->plid) + continue; + if (blkiop->ops.blkio_update_group_weight_fn) + blkiop->ops.blkio_update_group_weight_fn(blkg->key, + blkg, weight); + } +} + +static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, + int fileid) +{ + struct blkio_policy_type *blkiop; + + list_for_each_entry(blkiop, &blkio_list, list) { + + /* If this policy does not own the blkg, do not send updates */ + if (blkiop->plid != blkg->plid) + continue; + + if (fileid == BLKIO_THROTL_read_bps_device + && blkiop->ops.blkio_update_group_read_bps_fn) + blkiop->ops.blkio_update_group_read_bps_fn(blkg->key, + blkg, bps); + + if (fileid == BLKIO_THROTL_write_bps_device + && blkiop->ops.blkio_update_group_write_bps_fn) + blkiop->ops.blkio_update_group_write_bps_fn(blkg->key, + blkg, bps); + } +} + +static inline void blkio_update_group_iops(struct blkio_group *blkg, + unsigned int iops, int fileid) +{ + struct blkio_policy_type *blkiop; + + list_for_each_entry(blkiop, &blkio_list, list) { + + /* If this policy does not own the blkg, do not send updates */ + if (blkiop->plid != blkg->plid) + continue; + + if (fileid == BLKIO_THROTL_read_iops_device + && blkiop->ops.blkio_update_group_read_iops_fn) + blkiop->ops.blkio_update_group_read_iops_fn(blkg->key, + blkg, iops); + + if (fileid == BLKIO_THROTL_write_iops_device + && blkiop->ops.blkio_update_group_write_iops_fn) + blkiop->ops.blkio_update_group_write_iops_fn(blkg->key, + blkg,iops); + } +} + /* * Add to the appropriate stat variable depending on the request type. * This should be called with the blkg->stats_lock held. @@ -341,7 +430,8 @@ void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev) + struct blkio_group *blkg, void *key, dev_t dev, + enum blkio_policy_id plid) { unsigned long flags; @@ -350,6 +440,7 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, rcu_assign_pointer(blkg->key, key); blkg->blkcg_id = css_id(&blkcg->css); hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); + blkg->plid = plid; spin_unlock_irqrestore(&blkcg->lock, flags); /* Need to take css reference ? */ cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); @@ -408,51 +499,6 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) } EXPORT_SYMBOL_GPL(blkiocg_lookup_group); -#define SHOW_FUNCTION(__VAR) \ -static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \ - struct cftype *cftype) \ -{ \ - struct blkio_cgroup *blkcg; \ - \ - blkcg = cgroup_to_blkio_cgroup(cgroup); \ - return (u64)blkcg->__VAR; \ -} - -SHOW_FUNCTION(weight); -#undef SHOW_FUNCTION - -static int -blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) -{ - struct blkio_cgroup *blkcg; - struct blkio_group *blkg; - struct hlist_node *n; - struct blkio_policy_type *blkiop; - struct blkio_policy_node *pn; - - if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) - return -EINVAL; - - blkcg = cgroup_to_blkio_cgroup(cgroup); - spin_lock(&blkio_list_lock); - spin_lock_irq(&blkcg->lock); - blkcg->weight = (unsigned int)val; - - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - pn = blkio_policy_search_node(blkcg, blkg->dev); - - if (pn) - continue; - - list_for_each_entry(blkiop, &blkio_list, list) - blkiop->ops.blkio_update_group_weight_fn(blkg, - blkcg->weight); - } - spin_unlock_irq(&blkcg->lock); - spin_unlock(&blkio_list_lock); - return 0; -} - static int blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) { @@ -593,52 +639,6 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg, return disk_total; } -#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \ -static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ - struct cftype *cftype, struct cgroup_map_cb *cb) \ -{ \ - struct blkio_cgroup *blkcg; \ - struct blkio_group *blkg; \ - struct hlist_node *n; \ - uint64_t cgroup_total = 0; \ - \ - if (!cgroup_lock_live_group(cgroup)) \ - return -ENODEV; \ - \ - blkcg = cgroup_to_blkio_cgroup(cgroup); \ - rcu_read_lock(); \ - hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ - if (blkg->dev) { \ - spin_lock_irq(&blkg->stats_lock); \ - cgroup_total += blkio_get_stat(blkg, cb, \ - blkg->dev, type); \ - spin_unlock_irq(&blkg->stats_lock); \ - } \ - } \ - if (show_total) \ - cb->fill(cb, "Total", cgroup_total); \ - rcu_read_unlock(); \ - cgroup_unlock(); \ - return 0; \ -} - -SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0); -SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0); -SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1); -SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1); -SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1); -SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1); -SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1); -SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1); -#ifdef CONFIG_DEBUG_BLK_CGROUP -SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0); -SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0); -SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0); -SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0); -SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0); -#endif -#undef SHOW_FUNCTION_PER_GROUP - static int blkio_check_dev_num(dev_t dev) { int part = 0; @@ -652,13 +652,14 @@ static int blkio_check_dev_num(dev_t dev) } static int blkio_policy_parse_and_set(char *buf, - struct blkio_policy_node *newpn) + struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid) { char *s[4], *p, *major_s = NULL, *minor_s = NULL; int ret; unsigned long major, minor, temp; int i = 0; dev_t dev; + u64 bps, iops; memset(s, 0, sizeof(s)); @@ -705,12 +706,47 @@ static int blkio_policy_parse_and_set(char *buf, if (s[1] == NULL) return -EINVAL; - ret = strict_strtoul(s[1], 10, &temp); - if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || - temp > BLKIO_WEIGHT_MAX) - return -EINVAL; + switch (plid) { + case BLKIO_POLICY_PROP: + ret = strict_strtoul(s[1], 10, &temp); + if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || + temp > BLKIO_WEIGHT_MAX) + return -EINVAL; - newpn->weight = temp; + newpn->plid = plid; + newpn->fileid = fileid; + newpn->val.weight = temp; + break; + case BLKIO_POLICY_THROTL: + switch(fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + ret = strict_strtoull(s[1], 10, &bps); + if (ret) + return -EINVAL; + + newpn->plid = plid; + newpn->fileid = fileid; + newpn->val.bps = bps; + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + ret = strict_strtoull(s[1], 10, &iops); + if (ret) + return -EINVAL; + + if (iops > THROTL_IOPS_MAX) + return -EINVAL; + + newpn->plid = plid; + newpn->fileid = fileid; + newpn->val.iops = (unsigned int)iops; + break; + } + break; + default: + BUG(); + } return 0; } @@ -720,26 +756,180 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, { struct blkio_policy_node *pn; - pn = blkio_policy_search_node(blkcg, dev); + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP, + BLKIO_PROP_weight_device); if (pn) - return pn->weight; + return pn->val.weight; else return blkcg->weight; } EXPORT_SYMBOL_GPL(blkcg_get_weight); +uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_bps_device); + if (pn) + return pn->val.bps; + else + return -1; +} + +uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_bps_device); + if (pn) + return pn->val.bps; + else + return -1; +} + +unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_iops_device); + if (pn) + return pn->val.iops; + else + return -1; +} + +unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_iops_device); + if (pn) + return pn->val.iops; + else + return -1; +} + +/* Checks whether user asked for deleting a policy rule */ +static bool blkio_delete_rule_command(struct blkio_policy_node *pn) +{ + switch(pn->plid) { + case BLKIO_POLICY_PROP: + if (pn->val.weight == 0) + return 1; + break; + case BLKIO_POLICY_THROTL: + switch(pn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + if (pn->val.bps == 0) + return 1; + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + if (pn->val.iops == 0) + return 1; + } + break; + default: + BUG(); + } + + return 0; +} + +static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, + struct blkio_policy_node *newpn) +{ + switch(oldpn->plid) { + case BLKIO_POLICY_PROP: + oldpn->val.weight = newpn->val.weight; + break; + case BLKIO_POLICY_THROTL: + switch(newpn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + oldpn->val.bps = newpn->val.bps; + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + oldpn->val.iops = newpn->val.iops; + } + break; + default: + BUG(); + } +} + +/* + * Some rules/values in blkg have changed. Propogate those to respective + * policies. + */ +static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, struct blkio_policy_node *pn) +{ + unsigned int weight, iops; + u64 bps; + + switch(pn->plid) { + case BLKIO_POLICY_PROP: + weight = pn->val.weight ? pn->val.weight : + blkcg->weight; + blkio_update_group_weight(blkg, weight); + break; + case BLKIO_POLICY_THROTL: + switch(pn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + bps = pn->val.bps ? pn->val.bps : (-1); + blkio_update_group_bps(blkg, bps, pn->fileid); + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + iops = pn->val.iops ? pn->val.iops : (-1); + blkio_update_group_iops(blkg, iops, pn->fileid); + break; + } + break; + default: + BUG(); + } +} + +/* + * A policy node rule has been updated. Propogate this update to all the + * block groups which might be affected by this update. + */ +static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg, + struct blkio_policy_node *pn) +{ + struct blkio_group *blkg; + struct hlist_node *n; + + spin_lock(&blkio_list_lock); + spin_lock_irq(&blkcg->lock); + + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + if (pn->dev != blkg->dev || pn->plid != blkg->plid) + continue; + blkio_update_blkg_policy(blkcg, blkg, pn); + } + + spin_unlock_irq(&blkcg->lock); + spin_unlock(&blkio_list_lock); +} -static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) +static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) { int ret = 0; char *buf; struct blkio_policy_node *newpn, *pn; struct blkio_cgroup *blkcg; - struct blkio_group *blkg; int keep_newpn = 0; - struct hlist_node *n; - struct blkio_policy_type *blkiop; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int fileid = BLKIOFILE_ATTR(cft->private); buf = kstrdup(buffer, GFP_KERNEL); if (!buf) @@ -751,7 +941,7 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, goto free_buf; } - ret = blkio_policy_parse_and_set(buf, newpn); + ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid); if (ret) goto free_newpn; @@ -759,9 +949,9 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, spin_lock_irq(&blkcg->lock); - pn = blkio_policy_search_node(blkcg, newpn->dev); + pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid); if (!pn) { - if (newpn->weight != 0) { + if (!blkio_delete_rule_command(newpn)) { blkio_policy_insert_node(blkcg, newpn); keep_newpn = 1; } @@ -769,33 +959,17 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, goto update_io_group; } - if (newpn->weight == 0) { - /* weight == 0 means deleteing a specific weight */ + if (blkio_delete_rule_command(newpn)) { blkio_policy_delete_node(pn); spin_unlock_irq(&blkcg->lock); goto update_io_group; } spin_unlock_irq(&blkcg->lock); - pn->weight = newpn->weight; + blkio_update_policy_rule(pn, newpn); update_io_group: - /* update weight for each cfqg */ - spin_lock(&blkio_list_lock); - spin_lock_irq(&blkcg->lock); - - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - if (newpn->dev == blkg->dev) { - list_for_each_entry(blkiop, &blkio_list, list) - blkiop->ops.blkio_update_group_weight_fn(blkg, - newpn->weight ? - newpn->weight : - blkcg->weight); - } - } - - spin_unlock_irq(&blkcg->lock); - spin_unlock(&blkio_list_lock); + blkio_update_policy_node_blkg(blkcg, newpn); free_newpn: if (!keep_newpn) @@ -805,23 +979,256 @@ free_buf: return ret; } -static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *m) +static void +blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) { - struct blkio_cgroup *blkcg; - struct blkio_policy_node *pn; + switch(pn->plid) { + case BLKIO_POLICY_PROP: + if (pn->fileid == BLKIO_PROP_weight_device) + seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->val.weight); + break; + case BLKIO_POLICY_THROTL: + switch(pn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->val.bps); + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->val.iops); + break; + } + break; + default: + BUG(); + } +} - seq_printf(m, "dev\tweight\n"); +/* cgroup files which read their data from policy nodes end up here */ +static void blkio_read_policy_node_files(struct cftype *cft, + struct blkio_cgroup *blkcg, struct seq_file *m) +{ + struct blkio_policy_node *pn; - blkcg = cgroup_to_blkio_cgroup(cgrp); if (!list_empty(&blkcg->policy_list)) { spin_lock_irq(&blkcg->lock); list_for_each_entry(pn, &blkcg->policy_list, node) { - seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), - MINOR(pn->dev), pn->weight); + if (!pn_matches_cftype(cft, pn)) + continue; + blkio_print_policy_node(m, pn); } spin_unlock_irq(&blkcg->lock); } +} + +static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *m) +{ + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_weight_device: + blkio_read_policy_node_files(cft, blkcg, m); + return 0; + default: + BUG(); + } + break; + case BLKIO_POLICY_THROTL: + switch(name){ + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + blkio_read_policy_node_files(cft, blkcg, m); + return 0; + default: + BUG(); + } + break; + default: + BUG(); + } + + return 0; +} + +static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, + struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type, + bool show_total) +{ + struct blkio_group *blkg; + struct hlist_node *n; + uint64_t cgroup_total = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { + if (blkg->dev) { + if (!cftype_blkg_same_policy(cft, blkg)) + continue; + spin_lock_irq(&blkg->stats_lock); + cgroup_total += blkio_get_stat(blkg, cb, blkg->dev, + type); + spin_unlock_irq(&blkg->stats_lock); + } + } + if (show_total) + cb->fill(cb, "Total", cgroup_total); + rcu_read_unlock(); + return 0; +} + +/* All map kind of cgroup file get serviced by this function */ +static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_TIME, 0); + case BLKIO_PROP_sectors: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SECTORS, 0); + case BLKIO_PROP_io_service_bytes: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICE_BYTES, 1); + case BLKIO_PROP_io_serviced: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICED, 1); + case BLKIO_PROP_io_service_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICE_TIME, 1); + case BLKIO_PROP_io_wait_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_WAIT_TIME, 1); + case BLKIO_PROP_io_merged: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_MERGED, 1); + case BLKIO_PROP_io_queued: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_QUEUED, 1); +#ifdef CONFIG_DEBUG_BLK_CGROUP + case BLKIO_PROP_dequeue: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_DEQUEUE, 0); + case BLKIO_PROP_avg_queue_size: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_AVG_QUEUE_SIZE, 0); + case BLKIO_PROP_group_wait_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_GROUP_WAIT_TIME, 0); + case BLKIO_PROP_idle_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_IDLE_TIME, 0); + case BLKIO_PROP_empty_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_EMPTY_TIME, 0); +#endif + default: + BUG(); + } + break; + case BLKIO_POLICY_THROTL: + switch(name){ + case BLKIO_THROTL_io_service_bytes: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICE_BYTES, 1); + case BLKIO_THROTL_io_serviced: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICED, 1); + default: + BUG(); + } + break; + default: + BUG(); + } + + return 0; +} + +static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val) +{ + struct blkio_group *blkg; + struct hlist_node *n; + struct blkio_policy_node *pn; + + if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) + return -EINVAL; + + spin_lock(&blkio_list_lock); + spin_lock_irq(&blkcg->lock); + blkcg->weight = (unsigned int)val; + + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + pn = blkio_policy_search_node(blkcg, blkg->dev, + BLKIO_POLICY_PROP, BLKIO_PROP_weight_device); + if (pn) + continue; + + blkio_update_group_weight(blkg, blkcg->weight); + } + spin_unlock_irq(&blkcg->lock); + spin_unlock(&blkio_list_lock); + return 0; +} + +static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) { + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_weight: + return (u64)blkcg->weight; + } + break; + default: + BUG(); + } + return 0; +} + +static int +blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) +{ + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_weight: + return blkio_weight_write(blkcg, val); + } + break; + default: + BUG(); + } return 0; } @@ -829,71 +1236,151 @@ static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, struct cftype blkio_files[] = { { .name = "weight_device", - .read_seq_string = blkiocg_weight_device_read, - .write_string = blkiocg_weight_device_write, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_weight_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, .max_write_len = 256, }, { .name = "weight", - .read_u64 = blkiocg_weight_read, - .write_u64 = blkiocg_weight_write, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_weight), + .read_u64 = blkiocg_file_read_u64, + .write_u64 = blkiocg_file_write_u64, }, { .name = "time", - .read_map = blkiocg_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_time), + .read_map = blkiocg_file_read_map, }, { .name = "sectors", - .read_map = blkiocg_sectors_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_sectors), + .read_map = blkiocg_file_read_map, }, { .name = "io_service_bytes", - .read_map = blkiocg_io_service_bytes_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_service_bytes), + .read_map = blkiocg_file_read_map, }, { .name = "io_serviced", - .read_map = blkiocg_io_serviced_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_serviced), + .read_map = blkiocg_file_read_map, }, { .name = "io_service_time", - .read_map = blkiocg_io_service_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_service_time), + .read_map = blkiocg_file_read_map, }, { .name = "io_wait_time", - .read_map = blkiocg_io_wait_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_wait_time), + .read_map = blkiocg_file_read_map, }, { .name = "io_merged", - .read_map = blkiocg_io_merged_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_merged), + .read_map = blkiocg_file_read_map, }, { .name = "io_queued", - .read_map = blkiocg_io_queued_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_queued), + .read_map = blkiocg_file_read_map, }, { .name = "reset_stats", .write_u64 = blkiocg_reset_stats, }, +#ifdef CONFIG_BLK_DEV_THROTTLING + { + .name = "throttle.read_bps_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_bps_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + + { + .name = "throttle.write_bps_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_bps_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + + { + .name = "throttle.read_iops_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_iops_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + + { + .name = "throttle.write_iops_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_iops_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + { + .name = "throttle.io_service_bytes", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_io_service_bytes), + .read_map = blkiocg_file_read_map, + }, + { + .name = "throttle.io_serviced", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_io_serviced), + .read_map = blkiocg_file_read_map, + }, +#endif /* CONFIG_BLK_DEV_THROTTLING */ + #ifdef CONFIG_DEBUG_BLK_CGROUP { .name = "avg_queue_size", - .read_map = blkiocg_avg_queue_size_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_avg_queue_size), + .read_map = blkiocg_file_read_map, }, { .name = "group_wait_time", - .read_map = blkiocg_group_wait_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_group_wait_time), + .read_map = blkiocg_file_read_map, }, { .name = "idle_time", - .read_map = blkiocg_idle_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_idle_time), + .read_map = blkiocg_file_read_map, }, { .name = "empty_time", - .read_map = blkiocg_empty_time_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_empty_time), + .read_map = blkiocg_file_read_map, }, { .name = "dequeue", - .read_map = blkiocg_dequeue_read, + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_dequeue), + .read_map = blkiocg_file_read_map, }, #endif }; @@ -932,13 +1419,14 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) /* * This blkio_group is being unlinked as associated cgroup is * going away. Let all the IO controlling policies know about - * this event. Currently this is static call to one io - * controlling policy. Once we have more policies in place, we - * need some dynamic registration of callback function. + * this event. */ spin_lock(&blkio_list_lock); - list_for_each_entry(blkiop, &blkio_list, list) + list_for_each_entry(blkiop, &blkio_list, list) { + if (blkiop->plid != blkg->plid) + continue; blkiop->ops.blkio_unlink_group_fn(key, blkg); + } spin_unlock(&blkio_list_lock); } while (1); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 2b866ec1dcea..ea4861bdd549 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -15,6 +15,14 @@ #include <linux/cgroup.h> +enum blkio_policy_id { + BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ + BLKIO_POLICY_THROTL, /* Throttling */ +}; + +/* Max limits for throttle policy */ +#define THROTL_IOPS_MAX UINT_MAX + #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) #ifndef CONFIG_BLK_CGROUP @@ -65,6 +73,35 @@ enum blkg_state_flags { BLKG_empty, }; +/* cgroup files owned by proportional weight policy */ +enum blkcg_file_name_prop { + BLKIO_PROP_weight = 1, + BLKIO_PROP_weight_device, + BLKIO_PROP_io_service_bytes, + BLKIO_PROP_io_serviced, + BLKIO_PROP_time, + BLKIO_PROP_sectors, + BLKIO_PROP_io_service_time, + BLKIO_PROP_io_wait_time, + BLKIO_PROP_io_merged, + BLKIO_PROP_io_queued, + BLKIO_PROP_avg_queue_size, + BLKIO_PROP_group_wait_time, + BLKIO_PROP_idle_time, + BLKIO_PROP_empty_time, + BLKIO_PROP_dequeue, +}; + +/* cgroup files owned by throttle policy */ +enum blkcg_file_name_throtl { + BLKIO_THROTL_read_bps_device, + BLKIO_THROTL_write_bps_device, + BLKIO_THROTL_read_iops_device, + BLKIO_THROTL_write_iops_device, + BLKIO_THROTL_io_service_bytes, + BLKIO_THROTL_io_serviced, +}; + struct blkio_cgroup { struct cgroup_subsys_state css; unsigned int weight; @@ -112,6 +149,8 @@ struct blkio_group { char path[128]; /* The device MKDEV(major, minor), this group has been created for */ dev_t dev; + /* policy which owns this blk group */ + enum blkio_policy_id plid; /* Need to serialize the stats in the case of reset/update */ spinlock_t stats_lock; @@ -121,24 +160,60 @@ struct blkio_group { struct blkio_policy_node { struct list_head node; dev_t dev; - unsigned int weight; + /* This node belongs to max bw policy or porportional weight policy */ + enum blkio_policy_id plid; + /* cgroup file to which this rule belongs to */ + int fileid; + + union { + unsigned int weight; + /* + * Rate read/write in terms of byptes per second + * Whether this rate represents read or write is determined + * by file type "fileid". + */ + u64 bps; + unsigned int iops; + } val; }; extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, dev_t dev); +extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, + dev_t dev); +extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, + dev_t dev); +extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, + dev_t dev); +extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, + dev_t dev); typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); -typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, - unsigned int weight); + +typedef void (blkio_update_group_weight_fn) (void *key, + struct blkio_group *blkg, unsigned int weight); +typedef void (blkio_update_group_read_bps_fn) (void * key, + struct blkio_group *blkg, u64 read_bps); +typedef void (blkio_update_group_write_bps_fn) (void *key, + struct blkio_group *blkg, u64 write_bps); +typedef void (blkio_update_group_read_iops_fn) (void *key, + struct blkio_group *blkg, unsigned int read_iops); +typedef void (blkio_update_group_write_iops_fn) (void *key, + struct blkio_group *blkg, unsigned int write_iops); struct blkio_policy_ops { blkio_unlink_group_fn *blkio_unlink_group_fn; blkio_update_group_weight_fn *blkio_update_group_weight_fn; + blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn; + blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn; + blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn; + blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn; }; struct blkio_policy_type { struct list_head list; struct blkio_policy_ops ops; + enum blkio_policy_id plid; }; /* Blkio controller policy registration */ @@ -212,7 +287,8 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {} extern struct blkio_cgroup blkio_root_cgroup; extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev); + struct blkio_group *blkg, void *key, dev_t dev, + enum blkio_policy_id plid); extern int blkiocg_del_blkio_group(struct blkio_group *blkg); extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key); @@ -234,7 +310,8 @@ static inline struct blkio_cgroup * cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev) {} + struct blkio_group *blkg, void *key, dev_t dev, + enum blkio_policy_id plid) {} static inline int blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } diff --git a/block/blk-core.c b/block/blk-core.c index 32a1c123dfb3..500eb859886e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -64,13 +64,15 @@ static void drive_stat_acct(struct request *rq, int new_io) return; cpu = part_stat_lock(); - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); - if (!new_io) + if (!new_io) { + part = rq->part; part_stat_inc(cpu, part, merges[rw]); - else { + } else { + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); part_round_stats(cpu, part); part_inc_in_flight(part, rw); + rq->part = part; } part_stat_unlock(); @@ -128,6 +130,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->ref_count = 1; rq->start_time = jiffies; set_start_time_ns(rq); + rq->part = NULL; } EXPORT_SYMBOL(blk_rq_init); @@ -382,6 +385,7 @@ void blk_sync_queue(struct request_queue *q) del_timer_sync(&q->unplug_timer); del_timer_sync(&q->timeout); cancel_work_sync(&q->unplug_work); + throtl_shutdown_timer_wq(q); } EXPORT_SYMBOL(blk_sync_queue); @@ -459,6 +463,8 @@ void blk_cleanup_queue(struct request_queue *q) if (q->elevator) elevator_exit(q->elevator); + blk_throtl_exit(q); + blk_put_queue(q); } EXPORT_SYMBOL(blk_cleanup_queue); @@ -515,6 +521,11 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) return NULL; } + if (blk_throtl_init(q)) { + kmem_cache_free(blk_requestq_cachep, q); + return NULL; + } + setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); init_timer(&q->unplug_timer); @@ -796,11 +807,16 @@ static struct request *get_request(struct request_queue *q, int rw_flags, rl->starved[is_sync] = 0; priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); - if (priv) + if (priv) { rl->elvpriv++; - if (blk_queue_io_stat(q)) - rw_flags |= REQ_IO_STAT; + /* + * Don't do stats for non-priv requests + */ + if (blk_queue_io_stat(q)) + rw_flags |= REQ_IO_STAT; + } + spin_unlock_irq(q->queue_lock); rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); @@ -1522,6 +1538,15 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; } + blk_throtl_bio(q, &bio); + + /* + * If bio = NULL, bio has been throttled and will be submitted + * later. + */ + if (!bio) + break; + trace_block_bio_queue(q, bio); ret = q->make_request_fn(q, bio); @@ -1612,11 +1637,12 @@ void submit_bio(int rw, struct bio *bio) if (unlikely(block_dump)) { char b[BDEVNAME_SIZE]; - printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", + printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n", current->comm, task_pid_nr(current), (rw & WRITE) ? "WRITE" : "READ", (unsigned long long)bio->bi_sector, - bdevname(bio->bi_bdev, b)); + bdevname(bio->bi_bdev, b), + count); } } @@ -1759,7 +1785,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_stat_add(cpu, part, sectors[rw], bytes >> 9); part_stat_unlock(); } @@ -1779,7 +1805,7 @@ static void blk_account_io_done(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); @@ -2579,6 +2605,13 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) } EXPORT_SYMBOL(kblockd_schedule_work); +int kblockd_schedule_delayed_work(struct request_queue *q, + struct delayed_work *dwork, unsigned long delay) +{ + return queue_delayed_work(kblockd_workqueue, dwork, delay); +} +EXPORT_SYMBOL(kblockd_schedule_delayed_work); + int __init blk_dev_init(void) { BUILD_BUG_ON(__REQ_NR_BITS > 8 * diff --git a/block/blk-exec.c b/block/blk-exec.c index e1672f14840e..cf1456a02acd 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -80,6 +80,7 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, DECLARE_COMPLETION_ONSTACK(wait); char sense[SCSI_SENSE_BUFFERSIZE]; int err = 0; + unsigned long hang_check; /* * we need an extra reference to the request, so we can look at @@ -95,7 +96,13 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, rq->end_io_data = &wait; blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); - wait_for_completion(&wait); + + /* Prevent hang_check timer from firing at us during very long I/O */ + hang_check = sysctl_hung_task_timeout_secs; + if (hang_check) + while (!wait_for_completion_timeout(&wait, hang_check * (HZ/2))); + else + wait_for_completion(&wait); if (rq->errors) err = -EIO; diff --git a/block/blk-integrity.c b/block/blk-integrity.c index edce1ef7933d..54bcba6c02a7 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -32,24 +32,37 @@ static struct kmem_cache *integrity_cachep; /** * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements - * @rq: request with integrity metadata attached + * @q: request queue + * @bio: bio with integrity metadata attached * * Description: Returns the number of elements required in a - * scatterlist corresponding to the integrity metadata in a request. + * scatterlist corresponding to the integrity metadata in a bio. */ -int blk_rq_count_integrity_sg(struct request *rq) +int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio) { - struct bio_vec *iv, *ivprv; - struct req_iterator iter; - unsigned int segments; + struct bio_vec *iv, *ivprv = NULL; + unsigned int segments = 0; + unsigned int seg_size = 0; + unsigned int i = 0; - ivprv = NULL; - segments = 0; + bio_for_each_integrity_vec(iv, bio, i) { - rq_for_each_integrity_segment(iv, rq, iter) { + if (ivprv) { + if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) + goto new_segment; + + if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv)) + goto new_segment; - if (!ivprv || !BIOVEC_PHYS_MERGEABLE(ivprv, iv)) + if (seg_size + iv->bv_len > queue_max_segment_size(q)) + goto new_segment; + + seg_size += iv->bv_len; + } else { +new_segment: segments++; + seg_size = iv->bv_len; + } ivprv = iv; } @@ -60,30 +73,34 @@ EXPORT_SYMBOL(blk_rq_count_integrity_sg); /** * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist - * @rq: request with integrity metadata attached + * @q: request queue + * @bio: bio with integrity metadata attached * @sglist: target scatterlist * * Description: Map the integrity vectors in request into a * scatterlist. The scatterlist must be big enough to hold all * elements. I.e. sized using blk_rq_count_integrity_sg(). */ -int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) +int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio, + struct scatterlist *sglist) { - struct bio_vec *iv, *ivprv; - struct req_iterator iter; - struct scatterlist *sg; - unsigned int segments; - - ivprv = NULL; - sg = NULL; - segments = 0; + struct bio_vec *iv, *ivprv = NULL; + struct scatterlist *sg = NULL; + unsigned int segments = 0; + unsigned int i = 0; - rq_for_each_integrity_segment(iv, rq, iter) { + bio_for_each_integrity_vec(iv, bio, i) { if (ivprv) { if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) goto new_segment; + if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv)) + goto new_segment; + + if (sg->length + iv->bv_len > queue_max_segment_size(q)) + goto new_segment; + sg->length += iv->bv_len; } else { new_segment: @@ -162,6 +179,40 @@ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) } EXPORT_SYMBOL(blk_integrity_compare); +int blk_integrity_merge_rq(struct request_queue *q, struct request *req, + struct request *next) +{ + if (blk_integrity_rq(req) != blk_integrity_rq(next)) + return -1; + + if (req->nr_integrity_segments + next->nr_integrity_segments > + q->limits.max_integrity_segments) + return -1; + + return 0; +} +EXPORT_SYMBOL(blk_integrity_merge_rq); + +int blk_integrity_merge_bio(struct request_queue *q, struct request *req, + struct bio *bio) +{ + int nr_integrity_segs; + struct bio *next = bio->bi_next; + + bio->bi_next = NULL; + nr_integrity_segs = blk_rq_count_integrity_sg(q, bio); + bio->bi_next = next; + + if (req->nr_integrity_segments + nr_integrity_segs > + q->limits.max_integrity_segments) + return -1; + + req->nr_integrity_segments += nr_integrity_segs; + + return 0; +} +EXPORT_SYMBOL(blk_integrity_merge_bio); + struct integrity_sysfs_entry { struct attribute attr; ssize_t (*show)(struct blk_integrity *, char *); @@ -381,7 +432,6 @@ void blk_integrity_unregister(struct gendisk *disk) kobject_uevent(&bi->kobj, KOBJ_REMOVE); kobject_del(&bi->kobj); kobject_put(&bi->kobj); - kmem_cache_free(integrity_cachep, bi); disk->integrity = NULL; } EXPORT_SYMBOL(blk_integrity_unregister); diff --git a/block/blk-map.c b/block/blk-map.c index ade0a08c9099..d4a586d8691e 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -54,7 +54,7 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq, * direct dma. else, set up kernel bounce buffers */ uaddr = (unsigned long) ubuf; - if (blk_rq_aligned(q, ubuf, len) && !map_data) + if (blk_rq_aligned(q, uaddr, len) && !map_data) bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask); else bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask); @@ -288,6 +288,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, unsigned int len, gfp_t gfp_mask) { int reading = rq_data_dir(rq) == READ; + unsigned long addr = (unsigned long) kbuf; int do_copy = 0; struct bio *bio; int ret; @@ -297,7 +298,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, if (!len || !kbuf) return -EINVAL; - do_copy = !blk_rq_aligned(q, kbuf, len) || object_is_on_stack(kbuf); + do_copy = !blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf); if (do_copy) bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); else diff --git a/block/blk-merge.c b/block/blk-merge.c index eafc94f68d79..0a2fd8a48a38 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -205,12 +205,11 @@ static inline int ll_new_hw_segment(struct request_queue *q, { int nr_phys_segs = bio_phys_segments(q, bio); - if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) { - req->cmd_flags |= REQ_NOMERGE; - if (req == q->last_merge) - q->last_merge = NULL; - return 0; - } + if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) + goto no_merge; + + if (bio_integrity(bio) && blk_integrity_merge_bio(q, req, bio)) + goto no_merge; /* * This will form the start of a new hw segment. Bump both @@ -218,6 +217,12 @@ static inline int ll_new_hw_segment(struct request_queue *q, */ req->nr_phys_segments += nr_phys_segs; return 1; + +no_merge: + req->cmd_flags |= REQ_NOMERGE; + if (req == q->last_merge) + q->last_merge = NULL; + return 0; } int ll_back_merge_fn(struct request_queue *q, struct request *req, @@ -301,6 +306,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, if (total_phys_segments > queue_max_segments(q)) return 0; + if (blk_integrity_rq(req) && blk_integrity_merge_rq(q, req, next)) + return 0; + /* Merge is OK... */ req->nr_phys_segments = total_phys_segments; return 1; @@ -343,7 +351,7 @@ static void blk_account_io_merge(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); + part = req->part; part_round_stats(cpu, part); part_dec_in_flight(part, rq_data_dir(req)); @@ -384,9 +392,6 @@ static int attempt_merge(struct request_queue *q, struct request *req, || next->special) return 0; - if (blk_integrity_rq(req) != blk_integrity_rq(next)) - return 0; - /* * If we are allowed to merge, then append bio list * from next to rq and release next. merge_requests_fn diff --git a/block/blk-settings.c b/block/blk-settings.c index a234f4bf1d6f..315b88c8cbbb 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -111,6 +111,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy); void blk_set_default_limits(struct queue_limits *lim) { lim->max_segments = BLK_MAX_SEGMENTS; + lim->max_integrity_segments = 0; lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; lim->max_sectors = BLK_DEF_MAX_SECTORS; @@ -213,7 +214,7 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) */ if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) dma = 1; - q->limits.bounce_pfn = max_low_pfn; + q->limits.bounce_pfn = max(max_low_pfn, b_pfn); #else if (b_pfn < blk_max_low_pfn) dma = 1; @@ -343,7 +344,7 @@ EXPORT_SYMBOL(blk_queue_logical_block_size); * hardware can operate on without reverting to read-modify-write * operations. */ -void blk_queue_physical_block_size(struct request_queue *q, unsigned short size) +void blk_queue_physical_block_size(struct request_queue *q, unsigned int size) { q->limits.physical_block_size = size; @@ -455,11 +456,6 @@ void blk_queue_io_opt(struct request_queue *q, unsigned int opt) } EXPORT_SYMBOL(blk_queue_io_opt); -/* - * Returns the minimum that is _not_ zero, unless both are zero. - */ -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) - /** * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers * @t: the stacking driver (top) @@ -514,6 +510,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->seg_boundary_mask); t->max_segments = min_not_zero(t->max_segments, b->max_segments); + t->max_integrity_segments = min_not_zero(t->max_integrity_segments, + b->max_integrity_segments); t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 0749b89c6885..da8a8a40cd4c 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -112,6 +112,11 @@ static ssize_t queue_max_segments_show(struct request_queue *q, char *page) return queue_var_show(queue_max_segments(q), (page)); } +static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->limits.max_integrity_segments, (page)); +} + static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) { if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) @@ -288,6 +293,11 @@ static struct queue_sysfs_entry queue_max_segments_entry = { .show = queue_max_segments_show, }; +static struct queue_sysfs_entry queue_max_integrity_segments_entry = { + .attr = {.name = "max_integrity_segments", .mode = S_IRUGO }, + .show = queue_max_integrity_segments_show, +}; + static struct queue_sysfs_entry queue_max_segment_size_entry = { .attr = {.name = "max_segment_size", .mode = S_IRUGO }, .show = queue_max_segment_size_show, @@ -375,6 +385,7 @@ static struct attribute *default_attrs[] = { &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, &queue_max_segments_entry.attr, + &queue_max_integrity_segments_entry.attr, &queue_max_segment_size_entry.attr, &queue_iosched_entry.attr, &queue_hw_sector_size_entry.attr, diff --git a/block/blk-throttle.c b/block/blk-throttle.c new file mode 100644 index 000000000000..56ad4531b412 --- /dev/null +++ b/block/blk-throttle.c @@ -0,0 +1,1123 @@ +/* + * Interface for controlling IO bandwidth on a request queue + * + * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com> + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/blkdev.h> +#include <linux/bio.h> +#include <linux/blktrace_api.h> +#include "blk-cgroup.h" + +/* Max dispatch from a group in 1 round */ +static int throtl_grp_quantum = 8; + +/* Total max dispatch from all groups in one round */ +static int throtl_quantum = 32; + +/* Throttling is performed over 100ms slice and after that slice is renewed */ +static unsigned long throtl_slice = HZ/10; /* 100 ms */ + +struct throtl_rb_root { + struct rb_root rb; + struct rb_node *left; + unsigned int count; + unsigned long min_disptime; +}; + +#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \ + .count = 0, .min_disptime = 0} + +#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) + +struct throtl_grp { + /* List of throtl groups on the request queue*/ + struct hlist_node tg_node; + + /* active throtl group service_tree member */ + struct rb_node rb_node; + + /* + * Dispatch time in jiffies. This is the estimated time when group + * will unthrottle and is ready to dispatch more bio. It is used as + * key to sort active groups in service tree. + */ + unsigned long disptime; + + struct blkio_group blkg; + atomic_t ref; + unsigned int flags; + + /* Two lists for READ and WRITE */ + struct bio_list bio_lists[2]; + + /* Number of queued bios on READ and WRITE lists */ + unsigned int nr_queued[2]; + + /* bytes per second rate limits */ + uint64_t bps[2]; + + /* IOPS limits */ + unsigned int iops[2]; + + /* Number of bytes disptached in current slice */ + uint64_t bytes_disp[2]; + /* Number of bio's dispatched in current slice */ + unsigned int io_disp[2]; + + /* When did we start a new slice */ + unsigned long slice_start[2]; + unsigned long slice_end[2]; + + /* Some throttle limits got updated for the group */ + bool limits_changed; +}; + +struct throtl_data +{ + /* List of throtl groups */ + struct hlist_head tg_list; + + /* service tree for active throtl groups */ + struct throtl_rb_root tg_service_tree; + + struct throtl_grp root_tg; + struct request_queue *queue; + + /* Total Number of queued bios on READ and WRITE lists */ + unsigned int nr_queued[2]; + + /* + * number of total undestroyed groups + */ + unsigned int nr_undestroyed_grps; + + /* Work for dispatching throttled bios */ + struct delayed_work throtl_work; + + atomic_t limits_changed; +}; + +enum tg_state_flags { + THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ +}; + +#define THROTL_TG_FNS(name) \ +static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \ +{ \ + (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \ +} \ +static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \ +{ \ + (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \ +} \ +static inline int throtl_tg_##name(const struct throtl_grp *tg) \ +{ \ + return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \ +} + +THROTL_TG_FNS(on_rr); + +#define throtl_log_tg(td, tg, fmt, args...) \ + blk_add_trace_msg((td)->queue, "throtl %s " fmt, \ + blkg_path(&(tg)->blkg), ##args); \ + +#define throtl_log(td, fmt, args...) \ + blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) + +static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg) +{ + if (blkg) + return container_of(blkg, struct throtl_grp, blkg); + + return NULL; +} + +static inline int total_nr_queued(struct throtl_data *td) +{ + return (td->nr_queued[0] + td->nr_queued[1]); +} + +static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) +{ + atomic_inc(&tg->ref); + return tg; +} + +static void throtl_put_tg(struct throtl_grp *tg) +{ + BUG_ON(atomic_read(&tg->ref) <= 0); + if (!atomic_dec_and_test(&tg->ref)) + return; + kfree(tg); +} + +static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, + struct cgroup *cgroup) +{ + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); + struct throtl_grp *tg = NULL; + void *key = td; + struct backing_dev_info *bdi = &td->queue->backing_dev_info; + unsigned int major, minor; + + /* + * TODO: Speed up blkiocg_lookup_group() by maintaining a radix + * tree of blkg (instead of traversing through hash list all + * the time. + */ + tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); + + /* Fill in device details for root group */ + if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + tg->blkg.dev = MKDEV(major, minor); + goto done; + } + + if (tg) + goto done; + + tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); + if (!tg) + goto done; + + INIT_HLIST_NODE(&tg->tg_node); + RB_CLEAR_NODE(&tg->rb_node); + bio_list_init(&tg->bio_lists[0]); + bio_list_init(&tg->bio_lists[1]); + + /* + * Take the initial reference that will be released on destroy + * This can be thought of a joint reference by cgroup and + * request queue which will be dropped by either request queue + * exit or cgroup deletion path depending on who is exiting first. + */ + atomic_set(&tg->ref, 1); + + /* Add group onto cgroup list */ + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, + MKDEV(major, minor), BLKIO_POLICY_THROTL); + + tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); + tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); + tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); + tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); + + hlist_add_head(&tg->tg_node, &td->tg_list); + td->nr_undestroyed_grps++; +done: + return tg; +} + +static struct throtl_grp * throtl_get_tg(struct throtl_data *td) +{ + struct cgroup *cgroup; + struct throtl_grp *tg = NULL; + + rcu_read_lock(); + cgroup = task_cgroup(current, blkio_subsys_id); + tg = throtl_find_alloc_tg(td, cgroup); + if (!tg) + tg = &td->root_tg; + rcu_read_unlock(); + return tg; +} + +static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root) +{ + /* Service tree is empty */ + if (!root->count) + return NULL; + + if (!root->left) + root->left = rb_first(&root->rb); + + if (root->left) + return rb_entry_tg(root->left); + + return NULL; +} + +static void rb_erase_init(struct rb_node *n, struct rb_root *root) +{ + rb_erase(n, root); + RB_CLEAR_NODE(n); +} + +static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root) +{ + if (root->left == n) + root->left = NULL; + rb_erase_init(n, &root->rb); + --root->count; +} + +static void update_min_dispatch_time(struct throtl_rb_root *st) +{ + struct throtl_grp *tg; + + tg = throtl_rb_first(st); + if (!tg) + return; + + st->min_disptime = tg->disptime; +} + +static void +tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg) +{ + struct rb_node **node = &st->rb.rb_node; + struct rb_node *parent = NULL; + struct throtl_grp *__tg; + unsigned long key = tg->disptime; + int left = 1; + + while (*node != NULL) { + parent = *node; + __tg = rb_entry_tg(parent); + + if (time_before(key, __tg->disptime)) + node = &parent->rb_left; + else { + node = &parent->rb_right; + left = 0; + } + } + + if (left) + st->left = &tg->rb_node; + + rb_link_node(&tg->rb_node, parent, node); + rb_insert_color(&tg->rb_node, &st->rb); +} + +static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + struct throtl_rb_root *st = &td->tg_service_tree; + + tg_service_tree_add(st, tg); + throtl_mark_tg_on_rr(tg); + st->count++; +} + +static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + if (!throtl_tg_on_rr(tg)) + __throtl_enqueue_tg(td, tg); +} + +static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + throtl_rb_erase(&tg->rb_node, &td->tg_service_tree); + throtl_clear_tg_on_rr(tg); +} + +static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + if (throtl_tg_on_rr(tg)) + __throtl_dequeue_tg(td, tg); +} + +static void throtl_schedule_next_dispatch(struct throtl_data *td) +{ + struct throtl_rb_root *st = &td->tg_service_tree; + + /* + * If there are more bios pending, schedule more work. + */ + if (!total_nr_queued(td)) + return; + + BUG_ON(!st->count); + + update_min_dispatch_time(st); + + if (time_before_eq(st->min_disptime, jiffies)) + throtl_schedule_delayed_work(td->queue, 0); + else + throtl_schedule_delayed_work(td->queue, + (st->min_disptime - jiffies)); +} + +static inline void +throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) +{ + tg->bytes_disp[rw] = 0; + tg->io_disp[rw] = 0; + tg->slice_start[rw] = jiffies; + tg->slice_end[rw] = jiffies + throtl_slice; + throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', tg->slice_start[rw], + tg->slice_end[rw], jiffies); +} + +static inline void throtl_extend_slice(struct throtl_data *td, + struct throtl_grp *tg, bool rw, unsigned long jiffy_end) +{ + tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); + throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', tg->slice_start[rw], + tg->slice_end[rw], jiffies); +} + +/* Determine if previously allocated or extended slice is complete or not */ +static bool +throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) +{ + if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) + return 0; + + return 1; +} + +/* Trim the used slices and adjust slice start accordingly */ +static inline void +throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) +{ + unsigned long nr_slices, time_elapsed, io_trim; + u64 bytes_trim, tmp; + + BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); + + /* + * If bps are unlimited (-1), then time slice don't get + * renewed. Don't try to trim the slice if slice is used. A new + * slice will start when appropriate. + */ + if (throtl_slice_used(td, tg, rw)) + return; + + time_elapsed = jiffies - tg->slice_start[rw]; + + nr_slices = time_elapsed / throtl_slice; + + if (!nr_slices) + return; + tmp = tg->bps[rw] * throtl_slice * nr_slices; + do_div(tmp, HZ); + bytes_trim = tmp; + + io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ; + + if (!bytes_trim && !io_trim) + return; + + if (tg->bytes_disp[rw] >= bytes_trim) + tg->bytes_disp[rw] -= bytes_trim; + else + tg->bytes_disp[rw] = 0; + + if (tg->io_disp[rw] >= io_trim) + tg->io_disp[rw] -= io_trim; + else + tg->io_disp[rw] = 0; + + tg->slice_start[rw] += nr_slices * throtl_slice; + + throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu" + " start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, + tg->slice_start[rw], tg->slice_end[rw], jiffies); +} + +static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + unsigned int io_allowed; + unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; + u64 tmp; + + jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; + + /* Slice has just started. Consider one slice interval */ + if (!jiffy_elapsed) + jiffy_elapsed_rnd = throtl_slice; + + jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); + + /* + * jiffy_elapsed_rnd should not be a big value as minimum iops can be + * 1 then at max jiffy elapsed should be equivalent of 1 second as we + * will allow dispatch after 1 second and after that slice should + * have been trimmed. + */ + + tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd; + do_div(tmp, HZ); + + if (tmp > UINT_MAX) + io_allowed = UINT_MAX; + else + io_allowed = tmp; + + if (tg->io_disp[rw] + 1 <= io_allowed) { + if (wait) + *wait = 0; + return 1; + } + + /* Calc approx time to dispatch */ + jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1; + + if (jiffy_wait > jiffy_elapsed) + jiffy_wait = jiffy_wait - jiffy_elapsed; + else + jiffy_wait = 1; + + if (wait) + *wait = jiffy_wait; + return 0; +} + +static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + u64 bytes_allowed, extra_bytes, tmp; + unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; + + jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; + + /* Slice has just started. Consider one slice interval */ + if (!jiffy_elapsed) + jiffy_elapsed_rnd = throtl_slice; + + jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); + + tmp = tg->bps[rw] * jiffy_elapsed_rnd; + do_div(tmp, HZ); + bytes_allowed = tmp; + + if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { + if (wait) + *wait = 0; + return 1; + } + + /* Calc approx time to dispatch */ + extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed; + jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]); + + if (!jiffy_wait) + jiffy_wait = 1; + + /* + * This wait time is without taking into consideration the rounding + * up we did. Add that time also. + */ + jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); + if (wait) + *wait = jiffy_wait; + return 0; +} + +/* + * Returns whether one can dispatch a bio or not. Also returns approx number + * of jiffies to wait before this bio is with-in IO rate and can be dispatched + */ +static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; + + /* + * Currently whole state machine of group depends on first bio + * queued in the group bio list. So one should not be calling + * this function with a different bio if there are other bios + * queued. + */ + BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); + + /* If tg->bps = -1, then BW is unlimited */ + if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { + if (wait) + *wait = 0; + return 1; + } + + /* + * If previous slice expired, start a new one otherwise renew/extend + * existing slice to make sure it is at least throtl_slice interval + * long since now. + */ + if (throtl_slice_used(td, tg, rw)) + throtl_start_new_slice(td, tg, rw); + else { + if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) + throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); + } + + if (tg_with_in_bps_limit(td, tg, bio, &bps_wait) + && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) { + if (wait) + *wait = 0; + return 1; + } + + max_wait = max(bps_wait, iops_wait); + + if (wait) + *wait = max_wait; + + if (time_before(tg->slice_end[rw], jiffies + max_wait)) + throtl_extend_slice(td, tg, rw, jiffies + max_wait); + + return 0; +} + +static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) +{ + bool rw = bio_data_dir(bio); + bool sync = bio->bi_rw & REQ_SYNC; + + /* Charge the bio to the group */ + tg->bytes_disp[rw] += bio->bi_size; + tg->io_disp[rw]++; + + /* + * TODO: This will take blkg->stats_lock. Figure out a way + * to avoid this cost. + */ + blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); +} + +static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio) +{ + bool rw = bio_data_dir(bio); + + bio_list_add(&tg->bio_lists[rw], bio); + /* Take a bio reference on tg */ + throtl_ref_get_tg(tg); + tg->nr_queued[rw]++; + td->nr_queued[rw]++; + throtl_enqueue_tg(td, tg); +} + +static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg) +{ + unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; + struct bio *bio; + + if ((bio = bio_list_peek(&tg->bio_lists[READ]))) + tg_may_dispatch(td, tg, bio, &read_wait); + + if ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) + tg_may_dispatch(td, tg, bio, &write_wait); + + min_wait = min(read_wait, write_wait); + disptime = jiffies + min_wait; + + /* Update dispatch time */ + throtl_dequeue_tg(td, tg); + tg->disptime = disptime; + throtl_enqueue_tg(td, tg); +} + +static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg, + bool rw, struct bio_list *bl) +{ + struct bio *bio; + + bio = bio_list_pop(&tg->bio_lists[rw]); + tg->nr_queued[rw]--; + /* Drop bio reference on tg */ + throtl_put_tg(tg); + + BUG_ON(td->nr_queued[rw] <= 0); + td->nr_queued[rw]--; + + throtl_charge_bio(tg, bio); + bio_list_add(bl, bio); + bio->bi_rw |= REQ_THROTTLED; + + throtl_trim_slice(td, tg, rw); +} + +static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, + struct bio_list *bl) +{ + unsigned int nr_reads = 0, nr_writes = 0; + unsigned int max_nr_reads = throtl_grp_quantum*3/4; + unsigned int max_nr_writes = throtl_grp_quantum - nr_reads; + struct bio *bio; + + /* Try to dispatch 75% READS and 25% WRITES */ + + while ((bio = bio_list_peek(&tg->bio_lists[READ])) + && tg_may_dispatch(td, tg, bio, NULL)) { + + tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); + nr_reads++; + + if (nr_reads >= max_nr_reads) + break; + } + + while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) + && tg_may_dispatch(td, tg, bio, NULL)) { + + tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); + nr_writes++; + + if (nr_writes >= max_nr_writes) + break; + } + + return nr_reads + nr_writes; +} + +static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) +{ + unsigned int nr_disp = 0; + struct throtl_grp *tg; + struct throtl_rb_root *st = &td->tg_service_tree; + + while (1) { + tg = throtl_rb_first(st); + + if (!tg) + break; + + if (time_before(jiffies, tg->disptime)) + break; + + throtl_dequeue_tg(td, tg); + + nr_disp += throtl_dispatch_tg(td, tg, bl); + + if (tg->nr_queued[0] || tg->nr_queued[1]) { + tg_update_disptime(td, tg); + throtl_enqueue_tg(td, tg); + } + + if (nr_disp >= throtl_quantum) + break; + } + + return nr_disp; +} + +static void throtl_process_limit_change(struct throtl_data *td) +{ + struct throtl_grp *tg; + struct hlist_node *pos, *n; + + /* + * Make sure atomic_inc() effects from + * throtl_update_blkio_group_read_bps(), group of functions are + * visible. + * Is this required or smp_mb__after_atomic_inc() was suffcient + * after the atomic_inc(). + */ + smp_rmb(); + if (!atomic_read(&td->limits_changed)) + return; + + throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed)); + + hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { + /* + * Do I need an smp_rmb() here to make sure tg->limits_changed + * update is visible. I am relying on smp_rmb() at the + * beginning of function and not putting a new one here. + */ + + if (throtl_tg_on_rr(tg) && tg->limits_changed) { + throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" + " riops=%u wiops=%u", tg->bps[READ], + tg->bps[WRITE], tg->iops[READ], + tg->iops[WRITE]); + tg_update_disptime(td, tg); + tg->limits_changed = false; + } + } + + smp_mb__before_atomic_dec(); + atomic_dec(&td->limits_changed); + smp_mb__after_atomic_dec(); +} + +/* Dispatch throttled bios. Should be called without queue lock held. */ +static int throtl_dispatch(struct request_queue *q) +{ + struct throtl_data *td = q->td; + unsigned int nr_disp = 0; + struct bio_list bio_list_on_stack; + struct bio *bio; + + spin_lock_irq(q->queue_lock); + + throtl_process_limit_change(td); + + if (!total_nr_queued(td)) + goto out; + + bio_list_init(&bio_list_on_stack); + + throtl_log(td, "dispatch nr_queued=%lu read=%u write=%u", + total_nr_queued(td), td->nr_queued[READ], + td->nr_queued[WRITE]); + + nr_disp = throtl_select_dispatch(td, &bio_list_on_stack); + + if (nr_disp) + throtl_log(td, "bios disp=%u", nr_disp); + + throtl_schedule_next_dispatch(td); +out: + spin_unlock_irq(q->queue_lock); + + /* + * If we dispatched some requests, unplug the queue to make sure + * immediate dispatch + */ + if (nr_disp) { + while((bio = bio_list_pop(&bio_list_on_stack))) + generic_make_request(bio); + blk_unplug(q); + } + return nr_disp; +} + +void blk_throtl_work(struct work_struct *work) +{ + struct throtl_data *td = container_of(work, struct throtl_data, + throtl_work.work); + struct request_queue *q = td->queue; + + throtl_dispatch(q); +} + +/* Call with queue lock held */ +void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay) +{ + + struct throtl_data *td = q->td; + struct delayed_work *dwork = &td->throtl_work; + + if (total_nr_queued(td) > 0) { + /* + * We might have a work scheduled to be executed in future. + * Cancel that and schedule a new one. + */ + __cancel_delayed_work(dwork); + kblockd_schedule_delayed_work(q, dwork, delay); + throtl_log(td, "schedule work. delay=%lu jiffies=%lu", + delay, jiffies); + } +} +EXPORT_SYMBOL(throtl_schedule_delayed_work); + +static void +throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + /* Something wrong if we are trying to remove same group twice */ + BUG_ON(hlist_unhashed(&tg->tg_node)); + + hlist_del_init(&tg->tg_node); + + /* + * Put the reference taken at the time of creation so that when all + * queues are gone, group can be destroyed. + */ + throtl_put_tg(tg); + td->nr_undestroyed_grps--; +} + +static void throtl_release_tgs(struct throtl_data *td) +{ + struct hlist_node *pos, *n; + struct throtl_grp *tg; + + hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { + /* + * If cgroup removal path got to blk_group first and removed + * it from cgroup list, then it will take care of destroying + * cfqg also. + */ + if (!blkiocg_del_blkio_group(&tg->blkg)) + throtl_destroy_tg(td, tg); + } +} + +static void throtl_td_free(struct throtl_data *td) +{ + kfree(td); +} + +/* + * Blk cgroup controller notification saying that blkio_group object is being + * delinked as associated cgroup object is going away. That also means that + * no new IO will come in this group. So get rid of this group as soon as + * any pending IO in the group is finished. + * + * This function is called under rcu_read_lock(). key is the rcu protected + * pointer. That means "key" is a valid throtl_data pointer as long as we are + * rcu read lock. + * + * "key" was fetched from blkio_group under blkio_cgroup->lock. That means + * it should not be NULL as even if queue was going away, cgroup deltion + * path got to it first. + */ +void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg) +{ + unsigned long flags; + struct throtl_data *td = key; + + spin_lock_irqsave(td->queue->queue_lock, flags); + throtl_destroy_tg(td, tg_of_blkg(blkg)); + spin_unlock_irqrestore(td->queue->queue_lock, flags); +} + +/* + * For all update functions, key should be a valid pointer because these + * update functions are called under blkcg_lock, that means, blkg is + * valid and in turn key is valid. queue exit path can not race becuase + * of blkcg_lock + * + * Can not take queue lock in update functions as queue lock under blkcg_lock + * is not allowed. Under other paths we take blkcg_lock under queue_lock. + */ +static void throtl_update_blkio_group_read_bps(void *key, + struct blkio_group *blkg, u64 read_bps) +{ + struct throtl_data *td = key; + + tg_of_blkg(blkg)->bps[READ] = read_bps; + /* Make sure read_bps is updated before setting limits_changed */ + smp_wmb(); + tg_of_blkg(blkg)->limits_changed = true; + + /* Make sure tg->limits_changed is updated before td->limits_changed */ + smp_mb__before_atomic_inc(); + atomic_inc(&td->limits_changed); + smp_mb__after_atomic_inc(); + + /* Schedule a work now to process the limit change */ + throtl_schedule_delayed_work(td->queue, 0); +} + +static void throtl_update_blkio_group_write_bps(void *key, + struct blkio_group *blkg, u64 write_bps) +{ + struct throtl_data *td = key; + + tg_of_blkg(blkg)->bps[WRITE] = write_bps; + smp_wmb(); + tg_of_blkg(blkg)->limits_changed = true; + smp_mb__before_atomic_inc(); + atomic_inc(&td->limits_changed); + smp_mb__after_atomic_inc(); + throtl_schedule_delayed_work(td->queue, 0); +} + +static void throtl_update_blkio_group_read_iops(void *key, + struct blkio_group *blkg, unsigned int read_iops) +{ + struct throtl_data *td = key; + + tg_of_blkg(blkg)->iops[READ] = read_iops; + smp_wmb(); + tg_of_blkg(blkg)->limits_changed = true; + smp_mb__before_atomic_inc(); + atomic_inc(&td->limits_changed); + smp_mb__after_atomic_inc(); + throtl_schedule_delayed_work(td->queue, 0); +} + +static void throtl_update_blkio_group_write_iops(void *key, + struct blkio_group *blkg, unsigned int write_iops) +{ + struct throtl_data *td = key; + + tg_of_blkg(blkg)->iops[WRITE] = write_iops; + smp_wmb(); + tg_of_blkg(blkg)->limits_changed = true; + smp_mb__before_atomic_inc(); + atomic_inc(&td->limits_changed); + smp_mb__after_atomic_inc(); + throtl_schedule_delayed_work(td->queue, 0); +} + +void throtl_shutdown_timer_wq(struct request_queue *q) +{ + struct throtl_data *td = q->td; + + cancel_delayed_work_sync(&td->throtl_work); +} + +static struct blkio_policy_type blkio_policy_throtl = { + .ops = { + .blkio_unlink_group_fn = throtl_unlink_blkio_group, + .blkio_update_group_read_bps_fn = + throtl_update_blkio_group_read_bps, + .blkio_update_group_write_bps_fn = + throtl_update_blkio_group_write_bps, + .blkio_update_group_read_iops_fn = + throtl_update_blkio_group_read_iops, + .blkio_update_group_write_iops_fn = + throtl_update_blkio_group_write_iops, + }, + .plid = BLKIO_POLICY_THROTL, +}; + +int blk_throtl_bio(struct request_queue *q, struct bio **biop) +{ + struct throtl_data *td = q->td; + struct throtl_grp *tg; + struct bio *bio = *biop; + bool rw = bio_data_dir(bio), update_disptime = true; + + if (bio->bi_rw & REQ_THROTTLED) { + bio->bi_rw &= ~REQ_THROTTLED; + return 0; + } + + spin_lock_irq(q->queue_lock); + tg = throtl_get_tg(td); + + if (tg->nr_queued[rw]) { + /* + * There is already another bio queued in same dir. No + * need to update dispatch time. + * Still update the disptime if rate limits on this group + * were changed. + */ + if (!tg->limits_changed) + update_disptime = false; + else + tg->limits_changed = false; + + goto queue_bio; + } + + /* Bio is with-in rate limit of group */ + if (tg_may_dispatch(td, tg, bio, NULL)) { + throtl_charge_bio(tg, bio); + goto out; + } + +queue_bio: + throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu" + " iodisp=%u iops=%u queued=%d/%d", + rw == READ ? 'R' : 'W', + tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], + tg->io_disp[rw], tg->iops[rw], + tg->nr_queued[READ], tg->nr_queued[WRITE]); + + throtl_add_bio_tg(q->td, tg, bio); + *biop = NULL; + + if (update_disptime) { + tg_update_disptime(td, tg); + throtl_schedule_next_dispatch(td); + } + +out: + spin_unlock_irq(q->queue_lock); + return 0; +} + +int blk_throtl_init(struct request_queue *q) +{ + struct throtl_data *td; + struct throtl_grp *tg; + + td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); + if (!td) + return -ENOMEM; + + INIT_HLIST_HEAD(&td->tg_list); + td->tg_service_tree = THROTL_RB_ROOT; + atomic_set(&td->limits_changed, 0); + + /* Init root group */ + tg = &td->root_tg; + INIT_HLIST_NODE(&tg->tg_node); + RB_CLEAR_NODE(&tg->rb_node); + bio_list_init(&tg->bio_lists[0]); + bio_list_init(&tg->bio_lists[1]); + + /* Practically unlimited BW */ + tg->bps[0] = tg->bps[1] = -1; + tg->iops[0] = tg->iops[1] = -1; + + /* + * Set root group reference to 2. One reference will be dropped when + * all groups on tg_list are being deleted during queue exit. Other + * reference will remain there as we don't want to delete this group + * as it is statically allocated and gets destroyed when throtl_data + * goes away. + */ + atomic_set(&tg->ref, 2); + hlist_add_head(&tg->tg_node, &td->tg_list); + td->nr_undestroyed_grps++; + + INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); + + rcu_read_lock(); + blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td, + 0, BLKIO_POLICY_THROTL); + rcu_read_unlock(); + + /* Attach throtl data to request queue */ + td->queue = q; + q->td = td; + return 0; +} + +void blk_throtl_exit(struct request_queue *q) +{ + struct throtl_data *td = q->td; + bool wait = false; + + BUG_ON(!td); + + throtl_shutdown_timer_wq(q); + + spin_lock_irq(q->queue_lock); + throtl_release_tgs(td); + + /* If there are other groups */ + if (td->nr_undestroyed_grps > 0) + wait = true; + + spin_unlock_irq(q->queue_lock); + + /* + * Wait for tg->blkg->key accessors to exit their grace periods. + * Do this wait only if there are other undestroyed groups out + * there (other than root group). This can happen if cgroup deletion + * path claimed the responsibility of cleaning up a group before + * queue cleanup code get to the group. + * + * Do not call synchronize_rcu() unconditionally as there are drivers + * which create/delete request queue hundreds of times during scan/boot + * and synchronize_rcu() can take significant time and slow down boot. + */ + if (wait) + synchronize_rcu(); + + /* + * Just being safe to make sure after previous flush if some body did + * update limits through cgroup and another work got queued, cancel + * it. + */ + throtl_shutdown_timer_wq(q); + throtl_td_free(td); +} + +static int __init throtl_init(void) +{ + blkio_policy_register(&blkio_policy_throtl); + return 0; +} + +module_init(throtl_init); diff --git a/block/blk.h b/block/blk.h index d6b911ac002c..f864012ec300 100644 --- a/block/blk.h +++ b/block/blk.h @@ -110,10 +110,6 @@ void blk_queue_congestion_threshold(struct request_queue *q); int blk_dev_init(void); -void elv_quiesce_start(struct request_queue *q); -void elv_quiesce_end(struct request_queue *q); - - /* * Return the threshold (number of used requests) at which the queue is * considered to be congested. It include a little hysteresis to keep the @@ -132,14 +128,6 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) return q->nr_congestion_off; } -#if defined(CONFIG_BLK_DEV_INTEGRITY) - -#define rq_for_each_integrity_segment(bvl, _rq, _iter) \ - __rq_for_each_bio(_iter.bio, _rq) \ - bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i) - -#endif /* BLK_DEV_INTEGRITY */ - static inline int blk_cpu_to_group(int cpu) { int group = NR_CPUS; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 9eba291eb6fd..4cd59b0d7c15 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -160,6 +160,7 @@ enum wl_prio_t { BE_WORKLOAD = 0, RT_WORKLOAD = 1, IDLE_WORKLOAD = 2, + CFQ_PRIO_NR, }; /* @@ -184,10 +185,19 @@ struct cfq_group { /* number of cfqq currently on this group */ int nr_cfqq; - /* Per group busy queus average. Useful for workload slice calc. */ - unsigned int busy_queues_avg[2]; /* - * rr lists of queues with requests, onle rr for each priority class. + * Per group busy queus average. Useful for workload slice calc. We + * create the array for each prio class but at run time it is used + * only for RT and BE class and slot for IDLE class remains unused. + * This is primarily done to avoid confusion and a gcc warning. + */ + unsigned int busy_queues_avg[CFQ_PRIO_NR]; + /* + * rr lists of queues with requests. We maintain service trees for + * RT and BE classes. These trees are subdivided in subclasses + * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE + * class there is no subclassification and all the cfq queues go on + * a single tree service_tree_idle. * Counts are embedded in the cfq_rb_root */ struct cfq_rb_root service_trees[2][3]; @@ -221,7 +231,6 @@ struct cfq_data { enum wl_type_t serving_type; unsigned long workload_expires; struct cfq_group *serving_group; - bool noidle_tree_requires_idle; /* * Each priority tree is sorted by next_request position. These @@ -977,8 +986,8 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) return NULL; } -void -cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) +void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, + unsigned int weight) { cfqg_of_blkg(blkg)->weight = weight; } @@ -2180,7 +2189,6 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) slice = max_t(unsigned, slice, CFQ_MIN_TT); cfq_log(cfqd, "workload slice:%d", slice); cfqd->workload_expires = jiffies + slice; - cfqd->noidle_tree_requires_idle = false; } static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) @@ -3177,7 +3185,9 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (cfqq->queued[0] + cfqq->queued[1] >= 4) cfq_mark_cfqq_deep(cfqq); - if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || + if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) + enable_idle = 0; + else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) enable_idle = 0; else if (sample_valid(cic->ttime_samples)) { @@ -3494,17 +3504,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_slice_expired(cfqd, 1); else if (sync && cfqq_empty && !cfq_close_cooperator(cfqd, cfqq)) { - cfqd->noidle_tree_requires_idle |= - !(rq->cmd_flags & REQ_NOIDLE); - /* - * Idling is enabled for SYNC_WORKLOAD. - * SYNC_NOIDLE_WORKLOAD idles at the end of the tree - * only if we processed at least one !REQ_NOIDLE request - */ - if (cfqd->serving_type == SYNC_WORKLOAD - || cfqd->noidle_tree_requires_idle - || cfqq->cfqg->nr_cfqq == 1) - cfq_arm_slice_timer(cfqd); + cfq_arm_slice_timer(cfqd); } } @@ -4090,6 +4090,7 @@ static struct blkio_policy_type blkio_policy_cfq = { .blkio_unlink_group_fn = cfq_unlink_blkio_group, .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, }, + .plid = BLKIO_POLICY_PROP, }; #else static struct blkio_policy_type blkio_policy_cfq; diff --git a/block/cfq.h b/block/cfq.h index 93448e5a2e41..54a6d90f8e8c 100644 --- a/block/cfq.h +++ b/block/cfq.h @@ -69,7 +69,7 @@ static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev) { - blkiocg_add_blkio_group(blkcg, blkg, key, dev); + blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP); } static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) diff --git a/block/genhd.c b/block/genhd.c index 59a2db6fecef..8313834596db 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -541,13 +541,15 @@ void add_disk(struct gendisk *disk) disk->major = MAJOR(devt); disk->first_minor = MINOR(devt); + /* Register BDI before referencing it from bdev */ + bdi = &disk->queue->backing_dev_info; + bdi_register_dev(bdi, disk_devt(disk)); + blk_register_region(disk_devt(disk), disk->minors, NULL, exact_match, exact_lock, disk); register_disk(disk); blk_register_queue(disk); - bdi = &disk->queue->backing_dev_info; - bdi_register_dev(bdi, disk_devt(disk)); retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, "bdi"); WARN_ON(retval); @@ -642,6 +644,7 @@ void __init printk_all_partitions(void) struct hd_struct *part; char name_buf[BDEVNAME_SIZE]; char devt_buf[BDEVT_SIZE]; + u8 uuid[PARTITION_META_INFO_UUIDLTH * 2 + 1]; /* * Don't show empty devices or things that have been @@ -660,10 +663,14 @@ void __init printk_all_partitions(void) while ((part = disk_part_iter_next(&piter))) { bool is_part0 = part == &disk->part0; - printk("%s%s %10llu %s", is_part0 ? "" : " ", + uuid[0] = 0; + if (part->info) + part_unpack_uuid(part->info->uuid, uuid); + + printk("%s%s %10llu %s %s", is_part0 ? "" : " ", bdevt_str(part_devt(part), devt_buf), (unsigned long long)part->nr_sects >> 1, - disk_name(disk, part->partno, name_buf)); + disk_name(disk, part->partno, name_buf), uuid); if (is_part0) { if (disk->driverfs_dev != NULL && disk->driverfs_dev->driver != NULL) @@ -925,8 +932,15 @@ static void disk_free_ptbl_rcu_cb(struct rcu_head *head) { struct disk_part_tbl *ptbl = container_of(head, struct disk_part_tbl, rcu_head); + struct gendisk *disk = ptbl->disk; + struct request_queue *q = disk->queue; + unsigned long flags; kfree(ptbl); + + spin_lock_irqsave(q->queue_lock, flags); + elv_quiesce_end(q); + spin_unlock_irqrestore(q->queue_lock, flags); } /** @@ -944,11 +958,17 @@ static void disk_replace_part_tbl(struct gendisk *disk, struct disk_part_tbl *new_ptbl) { struct disk_part_tbl *old_ptbl = disk->part_tbl; + struct request_queue *q = disk->queue; rcu_assign_pointer(disk->part_tbl, new_ptbl); if (old_ptbl) { rcu_assign_pointer(old_ptbl->last_lookup, NULL); + + spin_lock_irq(q->queue_lock); + elv_quiesce_start(q); + spin_unlock_irq(q->queue_lock); + call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb); } } @@ -989,6 +1009,7 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) return -ENOMEM; new_ptbl->len = target; + new_ptbl->disk = disk; for (i = 0; i < len; i++) rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]); @@ -1004,6 +1025,7 @@ static void disk_release(struct device *dev) kfree(disk->random); disk_replace_part_tbl(disk, NULL); free_part_stats(&disk->part0); + free_part_info(&disk->part0); kfree(disk); } struct class block_class = { diff --git a/block/ioctl.c b/block/ioctl.c index d8052f0dabd3..2c15fe0912c4 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -62,7 +62,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user /* all seems OK */ part = add_partition(disk, partno, start, length, - ADDPART_FLAG_NONE); + ADDPART_FLAG_NONE, NULL); mutex_unlock(&bdev->bd_mutex); return IS_ERR(part) ? PTR_ERR(part) : 0; case BLKPG_DEL_PARTITION: |