From e43473b7f223ec866f7db273697e76c337c390f9 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 15 Sep 2010 17:06:35 -0400 Subject: blkio: Core implementation of throttle policy o Actual implementation of throttling policy in block layer. Currently it implements READ and WRITE bytes per second throttling logic. IOPS throttling comes in later patches. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 909 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 909 insertions(+) create mode 100644 block/blk-throttle.c (limited to 'block/blk-throttle.c') diff --git a/block/blk-throttle.c b/block/blk-throttle.c new file mode 100644 index 000000000000..4b492011e0de --- /dev/null +++ b/block/blk-throttle.c @@ -0,0 +1,909 @@ +/* + * Interface for controlling IO bandwidth on a request queue + * + * Copyright (C) 2010 Vivek Goyal + */ + +#include +#include +#include +#include +#include +#include "blk-cgroup.h" + +/* Max dispatch from a group in 1 round */ +static int throtl_grp_quantum = 8; + +/* Total max dispatch from all groups in one round */ +static int throtl_quantum = 32; + +/* Throttling is performed over 100ms slice and after that slice is renewed */ +static unsigned long throtl_slice = HZ/10; /* 100 ms */ + +struct throtl_rb_root { + struct rb_root rb; + struct rb_node *left; + unsigned int count; + unsigned long min_disptime; +}; + +#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \ + .count = 0, .min_disptime = 0} + +#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) + +struct throtl_grp { + /* List of throtl groups on the request queue*/ + struct hlist_node tg_node; + + /* active throtl group service_tree member */ + struct rb_node rb_node; + + /* + * Dispatch time in jiffies. This is the estimated time when group + * will unthrottle and is ready to dispatch more bio. It is used as + * key to sort active groups in service tree. + */ + unsigned long disptime; + + struct blkio_group blkg; + atomic_t ref; + unsigned int flags; + + /* Two lists for READ and WRITE */ + struct bio_list bio_lists[2]; + + /* Number of queued bios on READ and WRITE lists */ + unsigned int nr_queued[2]; + + /* bytes per second rate limits */ + uint64_t bps[2]; + + /* Number of bytes disptached in current slice */ + uint64_t bytes_disp[2]; + + /* When did we start a new slice */ + unsigned long slice_start[2]; + unsigned long slice_end[2]; +}; + +struct throtl_data +{ + /* List of throtl groups */ + struct hlist_head tg_list; + + /* service tree for active throtl groups */ + struct throtl_rb_root tg_service_tree; + + struct throtl_grp root_tg; + struct request_queue *queue; + + /* Total Number of queued bios on READ and WRITE lists */ + unsigned int nr_queued[2]; + + /* + * number of total undestroyed groups (excluding root group) + */ + unsigned int nr_undestroyed_grps; + + /* Work for dispatching throttled bios */ + struct delayed_work throtl_work; +}; + +enum tg_state_flags { + THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ +}; + +#define THROTL_TG_FNS(name) \ +static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \ +{ \ + (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \ +} \ +static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \ +{ \ + (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \ +} \ +static inline int throtl_tg_##name(const struct throtl_grp *tg) \ +{ \ + return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \ +} + +THROTL_TG_FNS(on_rr); + +#define throtl_log_tg(td, tg, fmt, args...) \ + blk_add_trace_msg((td)->queue, "throtl %s " fmt, \ + blkg_path(&(tg)->blkg), ##args); \ + +#define throtl_log(td, fmt, args...) \ + blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) + +static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg) +{ + if (blkg) + return container_of(blkg, struct throtl_grp, blkg); + + return NULL; +} + +static inline int total_nr_queued(struct throtl_data *td) +{ + return (td->nr_queued[0] + td->nr_queued[1]); +} + +static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) +{ + atomic_inc(&tg->ref); + return tg; +} + +static void throtl_put_tg(struct throtl_grp *tg) +{ + BUG_ON(atomic_read(&tg->ref) <= 0); + if (!atomic_dec_and_test(&tg->ref)) + return; + kfree(tg); +} + +static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, + struct cgroup *cgroup) +{ + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); + struct throtl_grp *tg = NULL; + void *key = td; + struct backing_dev_info *bdi = &td->queue->backing_dev_info; + unsigned int major, minor; + + /* + * TODO: Speed up blkiocg_lookup_group() by maintaining a radix + * tree of blkg (instead of traversing through hash list all + * the time. + */ + tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); + + /* Fill in device details for root group */ + if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + tg->blkg.dev = MKDEV(major, minor); + goto done; + } + + if (tg) + goto done; + + tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); + if (!tg) + goto done; + + INIT_HLIST_NODE(&tg->tg_node); + RB_CLEAR_NODE(&tg->rb_node); + bio_list_init(&tg->bio_lists[0]); + bio_list_init(&tg->bio_lists[1]); + + /* + * Take the initial reference that will be released on destroy + * This can be thought of a joint reference by cgroup and + * request queue which will be dropped by either request queue + * exit or cgroup deletion path depending on who is exiting first. + */ + atomic_set(&tg->ref, 1); + + /* Add group onto cgroup list */ + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, + MKDEV(major, minor), BLKIO_POLICY_THROTL); + + tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); + tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); + + hlist_add_head(&tg->tg_node, &td->tg_list); + td->nr_undestroyed_grps++; +done: + return tg; +} + +static struct throtl_grp * throtl_get_tg(struct throtl_data *td) +{ + struct cgroup *cgroup; + struct throtl_grp *tg = NULL; + + rcu_read_lock(); + cgroup = task_cgroup(current, blkio_subsys_id); + tg = throtl_find_alloc_tg(td, cgroup); + if (!tg) + tg = &td->root_tg; + rcu_read_unlock(); + return tg; +} + +static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root) +{ + /* Service tree is empty */ + if (!root->count) + return NULL; + + if (!root->left) + root->left = rb_first(&root->rb); + + if (root->left) + return rb_entry_tg(root->left); + + return NULL; +} + +static void rb_erase_init(struct rb_node *n, struct rb_root *root) +{ + rb_erase(n, root); + RB_CLEAR_NODE(n); +} + +static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root) +{ + if (root->left == n) + root->left = NULL; + rb_erase_init(n, &root->rb); + --root->count; +} + +static void update_min_dispatch_time(struct throtl_rb_root *st) +{ + struct throtl_grp *tg; + + tg = throtl_rb_first(st); + if (!tg) + return; + + st->min_disptime = tg->disptime; +} + +static void +tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg) +{ + struct rb_node **node = &st->rb.rb_node; + struct rb_node *parent = NULL; + struct throtl_grp *__tg; + unsigned long key = tg->disptime; + int left = 1; + + while (*node != NULL) { + parent = *node; + __tg = rb_entry_tg(parent); + + if (time_before(key, __tg->disptime)) + node = &parent->rb_left; + else { + node = &parent->rb_right; + left = 0; + } + } + + if (left) + st->left = &tg->rb_node; + + rb_link_node(&tg->rb_node, parent, node); + rb_insert_color(&tg->rb_node, &st->rb); +} + +static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + struct throtl_rb_root *st = &td->tg_service_tree; + + tg_service_tree_add(st, tg); + throtl_mark_tg_on_rr(tg); + st->count++; +} + +static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + if (!throtl_tg_on_rr(tg)) + __throtl_enqueue_tg(td, tg); +} + +static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + throtl_rb_erase(&tg->rb_node, &td->tg_service_tree); + throtl_clear_tg_on_rr(tg); +} + +static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + if (throtl_tg_on_rr(tg)) + __throtl_dequeue_tg(td, tg); +} + +static void throtl_schedule_next_dispatch(struct throtl_data *td) +{ + struct throtl_rb_root *st = &td->tg_service_tree; + + /* + * If there are more bios pending, schedule more work. + */ + if (!total_nr_queued(td)) + return; + + BUG_ON(!st->count); + + update_min_dispatch_time(st); + + if (time_before_eq(st->min_disptime, jiffies)) + throtl_schedule_delayed_work(td->queue, 0); + else + throtl_schedule_delayed_work(td->queue, + (st->min_disptime - jiffies)); +} + +static inline void +throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) +{ + tg->bytes_disp[rw] = 0; + tg->slice_start[rw] = jiffies; + tg->slice_end[rw] = jiffies + throtl_slice; + throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', tg->slice_start[rw], + tg->slice_end[rw], jiffies); +} + +static inline void throtl_extend_slice(struct throtl_data *td, + struct throtl_grp *tg, bool rw, unsigned long jiffy_end) +{ + tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); + throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', tg->slice_start[rw], + tg->slice_end[rw], jiffies); +} + +/* Determine if previously allocated or extended slice is complete or not */ +static bool +throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) +{ + if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) + return 0; + + return 1; +} + +/* Trim the used slices and adjust slice start accordingly */ +static inline void +throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) +{ + unsigned long nr_slices, bytes_trim, time_elapsed; + + BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); + + /* + * If bps are unlimited (-1), then time slice don't get + * renewed. Don't try to trim the slice if slice is used. A new + * slice will start when appropriate. + */ + if (throtl_slice_used(td, tg, rw)) + return; + + time_elapsed = jiffies - tg->slice_start[rw]; + + nr_slices = time_elapsed / throtl_slice; + + if (!nr_slices) + return; + + bytes_trim = (tg->bps[rw] * throtl_slice * nr_slices)/HZ; + + if (!bytes_trim) + return; + + if (tg->bytes_disp[rw] >= bytes_trim) + tg->bytes_disp[rw] -= bytes_trim; + else + tg->bytes_disp[rw] = 0; + + tg->slice_start[rw] += nr_slices * throtl_slice; + + throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu" + " start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', nr_slices, bytes_trim, + tg->slice_start[rw], tg->slice_end[rw], jiffies); +} + +/* + * Returns whether one can dispatch a bio or not. Also returns approx number + * of jiffies to wait before this bio is with-in IO rate and can be dispatched + */ +static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + u64 bytes_allowed, extra_bytes; + unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; + + /* + * Currently whole state machine of group depends on first bio + * queued in the group bio list. So one should not be calling + * this function with a different bio if there are other bios + * queued. + */ + BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); + + /* If tg->bps = -1, then BW is unlimited */ + if (tg->bps[rw] == -1) { + if (wait) + *wait = 0; + return 1; + } + + /* + * If previous slice expired, start a new one otherwise renew/extend + * existing slice to make sure it is at least throtl_slice interval + * long since now. + */ + if (throtl_slice_used(td, tg, rw)) + throtl_start_new_slice(td, tg, rw); + else { + if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) + throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); + } + + jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; + + /* Slice has just started. Consider one slice interval */ + if (!jiffy_elapsed) + jiffy_elapsed_rnd = throtl_slice; + + jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); + + bytes_allowed = (tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd)) + / MSEC_PER_SEC; + + if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { + if (wait) + *wait = 0; + return 1; + } + + /* Calc approx time to dispatch */ + extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed; + jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]); + + if (!jiffy_wait) + jiffy_wait = 1; + + /* + * This wait time is without taking into consideration the rounding + * up we did. Add that time also. + */ + jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); + + if (wait) + *wait = jiffy_wait; + + if (time_before(tg->slice_end[rw], jiffies + jiffy_wait)) + throtl_extend_slice(td, tg, rw, jiffies + jiffy_wait); + + return 0; +} + +static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) +{ + bool rw = bio_data_dir(bio); + bool sync = bio->bi_rw & REQ_SYNC; + + /* Charge the bio to the group */ + tg->bytes_disp[rw] += bio->bi_size; + + /* + * TODO: This will take blkg->stats_lock. Figure out a way + * to avoid this cost. + */ + blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); + +} + +static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio) +{ + bool rw = bio_data_dir(bio); + + bio_list_add(&tg->bio_lists[rw], bio); + /* Take a bio reference on tg */ + throtl_ref_get_tg(tg); + tg->nr_queued[rw]++; + td->nr_queued[rw]++; + throtl_enqueue_tg(td, tg); +} + +static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg) +{ + unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; + struct bio *bio; + + if ((bio = bio_list_peek(&tg->bio_lists[READ]))) + tg_may_dispatch(td, tg, bio, &read_wait); + + if ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) + tg_may_dispatch(td, tg, bio, &write_wait); + + min_wait = min(read_wait, write_wait); + disptime = jiffies + min_wait; + + /* + * If group is already on active tree, then update dispatch time + * only if it is lesser than existing dispatch time. Otherwise + * always update the dispatch time + */ + + if (throtl_tg_on_rr(tg) && time_before(disptime, tg->disptime)) + return; + + /* Update dispatch time */ + throtl_dequeue_tg(td, tg); + tg->disptime = disptime; + throtl_enqueue_tg(td, tg); +} + +static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg, + bool rw, struct bio_list *bl) +{ + struct bio *bio; + + bio = bio_list_pop(&tg->bio_lists[rw]); + tg->nr_queued[rw]--; + /* Drop bio reference on tg */ + throtl_put_tg(tg); + + BUG_ON(td->nr_queued[rw] <= 0); + td->nr_queued[rw]--; + + throtl_charge_bio(tg, bio); + bio_list_add(bl, bio); + bio->bi_rw |= REQ_THROTTLED; + + throtl_trim_slice(td, tg, rw); +} + +static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, + struct bio_list *bl) +{ + unsigned int nr_reads = 0, nr_writes = 0; + unsigned int max_nr_reads = throtl_grp_quantum*3/4; + unsigned int max_nr_writes = throtl_grp_quantum - nr_reads; + struct bio *bio; + + /* Try to dispatch 75% READS and 25% WRITES */ + + while ((bio = bio_list_peek(&tg->bio_lists[READ])) + && tg_may_dispatch(td, tg, bio, NULL)) { + + tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); + nr_reads++; + + if (nr_reads >= max_nr_reads) + break; + } + + while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) + && tg_may_dispatch(td, tg, bio, NULL)) { + + tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); + nr_writes++; + + if (nr_writes >= max_nr_writes) + break; + } + + return nr_reads + nr_writes; +} + +static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) +{ + unsigned int nr_disp = 0; + struct throtl_grp *tg; + struct throtl_rb_root *st = &td->tg_service_tree; + + while (1) { + tg = throtl_rb_first(st); + + if (!tg) + break; + + if (time_before(jiffies, tg->disptime)) + break; + + throtl_dequeue_tg(td, tg); + + nr_disp += throtl_dispatch_tg(td, tg, bl); + + if (tg->nr_queued[0] || tg->nr_queued[1]) { + tg_update_disptime(td, tg); + throtl_enqueue_tg(td, tg); + } + + if (nr_disp >= throtl_quantum) + break; + } + + return nr_disp; +} + +/* Dispatch throttled bios. Should be called without queue lock held. */ +static int throtl_dispatch(struct request_queue *q) +{ + struct throtl_data *td = q->td; + unsigned int nr_disp = 0; + struct bio_list bio_list_on_stack; + struct bio *bio; + + spin_lock_irq(q->queue_lock); + + if (!total_nr_queued(td)) + goto out; + + bio_list_init(&bio_list_on_stack); + + throtl_log(td, "dispatch nr_queued=%lu read=%u write=%u", + total_nr_queued(td), td->nr_queued[READ], + td->nr_queued[WRITE]); + + nr_disp = throtl_select_dispatch(td, &bio_list_on_stack); + + if (nr_disp) + throtl_log(td, "bios disp=%u", nr_disp); + + throtl_schedule_next_dispatch(td); +out: + spin_unlock_irq(q->queue_lock); + + /* + * If we dispatched some requests, unplug the queue to make sure + * immediate dispatch + */ + if (nr_disp) { + while((bio = bio_list_pop(&bio_list_on_stack))) + generic_make_request(bio); + blk_unplug(q); + } + return nr_disp; +} + +void blk_throtl_work(struct work_struct *work) +{ + struct throtl_data *td = container_of(work, struct throtl_data, + throtl_work.work); + struct request_queue *q = td->queue; + + throtl_dispatch(q); +} + +/* Call with queue lock held */ +void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay) +{ + + struct throtl_data *td = q->td; + struct delayed_work *dwork = &td->throtl_work; + + if (total_nr_queued(td) > 0) { + /* + * We might have a work scheduled to be executed in future. + * Cancel that and schedule a new one. + */ + __cancel_delayed_work(dwork); + kblockd_schedule_delayed_work(q, dwork, delay); + throtl_log(td, "schedule work. delay=%lu jiffies=%lu", + delay, jiffies); + } +} +EXPORT_SYMBOL(throtl_schedule_delayed_work); + +static void +throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + /* Something wrong if we are trying to remove same group twice */ + BUG_ON(hlist_unhashed(&tg->tg_node)); + + hlist_del_init(&tg->tg_node); + + /* + * Put the reference taken at the time of creation so that when all + * queues are gone, group can be destroyed. + */ + throtl_put_tg(tg); + td->nr_undestroyed_grps--; +} + +static void throtl_release_tgs(struct throtl_data *td) +{ + struct hlist_node *pos, *n; + struct throtl_grp *tg; + + hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { + /* + * If cgroup removal path got to blk_group first and removed + * it from cgroup list, then it will take care of destroying + * cfqg also. + */ + if (!blkiocg_del_blkio_group(&tg->blkg)) + throtl_destroy_tg(td, tg); + } +} + +static void throtl_td_free(struct throtl_data *td) +{ + kfree(td); +} + +/* + * Blk cgroup controller notification saying that blkio_group object is being + * delinked as associated cgroup object is going away. That also means that + * no new IO will come in this group. So get rid of this group as soon as + * any pending IO in the group is finished. + * + * This function is called under rcu_read_lock(). key is the rcu protected + * pointer. That means "key" is a valid throtl_data pointer as long as we are + * rcu read lock. + * + * "key" was fetched from blkio_group under blkio_cgroup->lock. That means + * it should not be NULL as even if queue was going away, cgroup deltion + * path got to it first. + */ +void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg) +{ + unsigned long flags; + struct throtl_data *td = key; + + spin_lock_irqsave(td->queue->queue_lock, flags); + throtl_destroy_tg(td, tg_of_blkg(blkg)); + spin_unlock_irqrestore(td->queue->queue_lock, flags); +} + +static void throtl_update_blkio_group_read_bps (struct blkio_group *blkg, + u64 read_bps) +{ + tg_of_blkg(blkg)->bps[READ] = read_bps; +} + +static void throtl_update_blkio_group_write_bps (struct blkio_group *blkg, + u64 write_bps) +{ + tg_of_blkg(blkg)->bps[WRITE] = write_bps; +} + +void throtl_shutdown_timer_wq(struct request_queue *q) +{ + struct throtl_data *td = q->td; + + cancel_delayed_work_sync(&td->throtl_work); +} + +static struct blkio_policy_type blkio_policy_throtl = { + .ops = { + .blkio_unlink_group_fn = throtl_unlink_blkio_group, + .blkio_update_group_read_bps_fn = + throtl_update_blkio_group_read_bps, + .blkio_update_group_write_bps_fn = + throtl_update_blkio_group_write_bps, + }, +}; + +int blk_throtl_bio(struct request_queue *q, struct bio **biop) +{ + struct throtl_data *td = q->td; + struct throtl_grp *tg; + struct bio *bio = *biop; + bool rw = bio_data_dir(bio), update_disptime = true; + + if (bio->bi_rw & REQ_THROTTLED) { + bio->bi_rw &= ~REQ_THROTTLED; + return 0; + } + + spin_lock_irq(q->queue_lock); + tg = throtl_get_tg(td); + + if (tg->nr_queued[rw]) { + /* + * There is already another bio queued in same dir. No + * need to update dispatch time. + */ + update_disptime = false; + goto queue_bio; + } + + /* Bio is with-in rate limit of group */ + if (tg_may_dispatch(td, tg, bio, NULL)) { + throtl_charge_bio(tg, bio); + goto out; + } + +queue_bio: + throtl_log_tg(td, tg, "[%c] bio. disp=%u sz=%u bps=%llu" + " queued=%d/%d", rw == READ ? 'R' : 'W', + tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], + tg->nr_queued[READ], tg->nr_queued[WRITE]); + + throtl_add_bio_tg(q->td, tg, bio); + *biop = NULL; + + if (update_disptime) { + tg_update_disptime(td, tg); + throtl_schedule_next_dispatch(td); + } + +out: + spin_unlock_irq(q->queue_lock); + return 0; +} + +int blk_throtl_init(struct request_queue *q) +{ + struct throtl_data *td; + struct throtl_grp *tg; + + td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); + if (!td) + return -ENOMEM; + + INIT_HLIST_HEAD(&td->tg_list); + td->tg_service_tree = THROTL_RB_ROOT; + + /* Init root group */ + tg = &td->root_tg; + INIT_HLIST_NODE(&tg->tg_node); + RB_CLEAR_NODE(&tg->rb_node); + bio_list_init(&tg->bio_lists[0]); + bio_list_init(&tg->bio_lists[1]); + + /* Practically unlimited BW */ + tg->bps[0] = tg->bps[1] = -1; + atomic_set(&tg->ref, 1); + + INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); + + rcu_read_lock(); + blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td, + 0, BLKIO_POLICY_THROTL); + rcu_read_unlock(); + + /* Attach throtl data to request queue */ + td->queue = q; + q->td = td; + return 0; +} + +void blk_throtl_exit(struct request_queue *q) +{ + struct throtl_data *td = q->td; + bool wait = false; + + BUG_ON(!td); + + throtl_shutdown_timer_wq(q); + + spin_lock_irq(q->queue_lock); + throtl_release_tgs(td); + blkiocg_del_blkio_group(&td->root_tg.blkg); + + /* If there are other groups */ + if (td->nr_undestroyed_grps >= 1) + wait = true; + + spin_unlock_irq(q->queue_lock); + + /* + * Wait for tg->blkg->key accessors to exit their grace periods. + * Do this wait only if there are other undestroyed groups out + * there (other than root group). This can happen if cgroup deletion + * path claimed the responsibility of cleaning up a group before + * queue cleanup code get to the group. + * + * Do not call synchronize_rcu() unconditionally as there are drivers + * which create/delete request queue hundreds of times during scan/boot + * and synchronize_rcu() can take significant time and slow down boot. + */ + if (wait) + synchronize_rcu(); + throtl_td_free(td); +} + +static int __init throtl_init(void) +{ + blkio_policy_register(&blkio_policy_throtl); + return 0; +} + +module_init(throtl_init); -- cgit v1.2.3 From 8e89d13f4ede2467629a971618537430fafaaea3 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 15 Sep 2010 17:06:37 -0400 Subject: blkio: Implementation of IOPS limit logic o core logic of implementing IOPS throttling. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 164 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 127 insertions(+), 37 deletions(-) (limited to 'block/blk-throttle.c') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 4b492011e0de..af53f37c1b13 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -59,8 +59,13 @@ struct throtl_grp { /* bytes per second rate limits */ uint64_t bps[2]; + /* IOPS limits */ + unsigned int iops[2]; + /* Number of bytes disptached in current slice */ uint64_t bytes_disp[2]; + /* Number of bio's dispatched in current slice */ + unsigned int io_disp[2]; /* When did we start a new slice */ unsigned long slice_start[2]; @@ -194,6 +199,8 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); + tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); + tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); hlist_add_head(&tg->tg_node, &td->tg_list); td->nr_undestroyed_grps++; @@ -335,6 +342,7 @@ static inline void throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) { tg->bytes_disp[rw] = 0; + tg->io_disp[rw] = 0; tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + throtl_slice; throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", @@ -365,7 +373,7 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) static inline void throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) { - unsigned long nr_slices, bytes_trim, time_elapsed; + unsigned long nr_slices, bytes_trim, time_elapsed, io_trim; BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); @@ -385,8 +393,9 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) return; bytes_trim = (tg->bps[rw] * throtl_slice * nr_slices)/HZ; + io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ; - if (!bytes_trim) + if (!bytes_trim && !io_trim) return; if (tg->bytes_disp[rw] >= bytes_trim) @@ -394,51 +403,62 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) else tg->bytes_disp[rw] = 0; + if (tg->io_disp[rw] >= io_trim) + tg->io_disp[rw] -= io_trim; + else + tg->io_disp[rw] = 0; + tg->slice_start[rw] += nr_slices * throtl_slice; - throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu" + throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu io=%lu" " start=%lu end=%lu jiffies=%lu", - rw == READ ? 'R' : 'W', nr_slices, bytes_trim, + rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw], jiffies); } -/* - * Returns whether one can dispatch a bio or not. Also returns approx number - * of jiffies to wait before this bio is with-in IO rate and can be dispatched - */ -static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, - struct bio *bio, unsigned long *wait) +static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) { bool rw = bio_data_dir(bio); - u64 bytes_allowed, extra_bytes; + unsigned int io_allowed; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; - /* - * Currently whole state machine of group depends on first bio - * queued in the group bio list. So one should not be calling - * this function with a different bio if there are other bios - * queued. - */ - BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); + jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; - /* If tg->bps = -1, then BW is unlimited */ - if (tg->bps[rw] == -1) { + /* Slice has just started. Consider one slice interval */ + if (!jiffy_elapsed) + jiffy_elapsed_rnd = throtl_slice; + + jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); + + io_allowed = (tg->iops[rw] * jiffies_to_msecs(jiffy_elapsed_rnd)) + / MSEC_PER_SEC; + + if (tg->io_disp[rw] + 1 <= io_allowed) { if (wait) *wait = 0; return 1; } - /* - * If previous slice expired, start a new one otherwise renew/extend - * existing slice to make sure it is at least throtl_slice interval - * long since now. - */ - if (throtl_slice_used(td, tg, rw)) - throtl_start_new_slice(td, tg, rw); - else { - if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) - throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); - } + /* Calc approx time to dispatch */ + jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1; + + if (jiffy_wait > jiffy_elapsed) + jiffy_wait = jiffy_wait - jiffy_elapsed; + else + jiffy_wait = 1; + + if (wait) + *wait = jiffy_wait; + return 0; +} + +static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + u64 bytes_allowed, extra_bytes; + unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; @@ -469,12 +489,62 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, * up we did. Add that time also. */ jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); - if (wait) *wait = jiffy_wait; + return 0; +} + +/* + * Returns whether one can dispatch a bio or not. Also returns approx number + * of jiffies to wait before this bio is with-in IO rate and can be dispatched + */ +static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; + + /* + * Currently whole state machine of group depends on first bio + * queued in the group bio list. So one should not be calling + * this function with a different bio if there are other bios + * queued. + */ + BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); - if (time_before(tg->slice_end[rw], jiffies + jiffy_wait)) - throtl_extend_slice(td, tg, rw, jiffies + jiffy_wait); + /* If tg->bps = -1, then BW is unlimited */ + if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { + if (wait) + *wait = 0; + return 1; + } + + /* + * If previous slice expired, start a new one otherwise renew/extend + * existing slice to make sure it is at least throtl_slice interval + * long since now. + */ + if (throtl_slice_used(td, tg, rw)) + throtl_start_new_slice(td, tg, rw); + else { + if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) + throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); + } + + if (tg_with_in_bps_limit(td, tg, bio, &bps_wait) + && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) { + if (wait) + *wait = 0; + return 1; + } + + max_wait = max(bps_wait, iops_wait); + + if (wait) + *wait = max_wait; + + if (time_before(tg->slice_end[rw], jiffies + max_wait)) + throtl_extend_slice(td, tg, rw, jiffies + max_wait); return 0; } @@ -486,13 +556,13 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) /* Charge the bio to the group */ tg->bytes_disp[rw] += bio->bi_size; + tg->io_disp[rw]++; /* * TODO: This will take blkg->stats_lock. Figure out a way * to avoid this cost. */ blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); - } static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, @@ -763,6 +833,18 @@ static void throtl_update_blkio_group_write_bps (struct blkio_group *blkg, tg_of_blkg(blkg)->bps[WRITE] = write_bps; } +static void throtl_update_blkio_group_read_iops (struct blkio_group *blkg, + unsigned int read_iops) +{ + tg_of_blkg(blkg)->iops[READ] = read_iops; +} + +static void throtl_update_blkio_group_write_iops (struct blkio_group *blkg, + unsigned int write_iops) +{ + tg_of_blkg(blkg)->iops[WRITE] = write_iops; +} + void throtl_shutdown_timer_wq(struct request_queue *q) { struct throtl_data *td = q->td; @@ -777,7 +859,12 @@ static struct blkio_policy_type blkio_policy_throtl = { throtl_update_blkio_group_read_bps, .blkio_update_group_write_bps_fn = throtl_update_blkio_group_write_bps, + .blkio_update_group_read_iops_fn = + throtl_update_blkio_group_read_iops, + .blkio_update_group_write_iops_fn = + throtl_update_blkio_group_write_iops, }, + .plid = BLKIO_POLICY_THROTL, }; int blk_throtl_bio(struct request_queue *q, struct bio **biop) @@ -811,9 +898,11 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) } queue_bio: - throtl_log_tg(td, tg, "[%c] bio. disp=%u sz=%u bps=%llu" - " queued=%d/%d", rw == READ ? 'R' : 'W', + throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu" + " iodisp=%u iops=%u queued=%d/%d", + rw == READ ? 'R' : 'W', tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], + tg->io_disp[rw], tg->iops[rw], tg->nr_queued[READ], tg->nr_queued[WRITE]); throtl_add_bio_tg(q->td, tg, bio); @@ -850,6 +939,7 @@ int blk_throtl_init(struct request_queue *q) /* Practically unlimited BW */ tg->bps[0] = tg->bps[1] = -1; + tg->iops[0] = tg->iops[1] = -1; atomic_set(&tg->ref, 1); INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); -- cgit v1.2.3 From 02977e4af7ed3b478c505e50491ffdf3e1314cf4 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 14:49:48 +0200 Subject: blkio: Add root group to td->tg_list o Currently all the dynamically allocated groups, except root grp is added to td->tg_list. This was not a problem so far but in next patch I will travel through td->tg_list to process any updates of limits on the group. If root group is not in tg_list, then root group's updates are not processed. o It is better to root group also to tg_list instead of doing special processing for it during limit updates. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'block/blk-throttle.c') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index af53f37c1b13..bc2936b80add 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -87,7 +87,7 @@ struct throtl_data unsigned int nr_queued[2]; /* - * number of total undestroyed groups (excluding root group) + * number of total undestroyed groups */ unsigned int nr_undestroyed_grps; @@ -940,7 +940,17 @@ int blk_throtl_init(struct request_queue *q) /* Practically unlimited BW */ tg->bps[0] = tg->bps[1] = -1; tg->iops[0] = tg->iops[1] = -1; - atomic_set(&tg->ref, 1); + + /* + * Set root group reference to 2. One reference will be dropped when + * all groups on tg_list are being deleted during queue exit. Other + * reference will remain there as we don't want to delete this group + * as it is statically allocated and gets destroyed when throtl_data + * goes away. + */ + atomic_set(&tg->ref, 2); + hlist_add_head(&tg->tg_node, &td->tg_list); + td->nr_undestroyed_grps++; INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); @@ -966,10 +976,9 @@ void blk_throtl_exit(struct request_queue *q) spin_lock_irq(q->queue_lock); throtl_release_tgs(td); - blkiocg_del_blkio_group(&td->root_tg.blkg); /* If there are other groups */ - if (td->nr_undestroyed_grps >= 1) + if (td->nr_undestroyed_grps > 0) wait = true; spin_unlock_irq(q->queue_lock); -- cgit v1.2.3 From fe0714377ee2ca161bf2afb7773e22f15f1786d4 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 14:49:49 +0200 Subject: blkio: Recalculate the throttled bio dispatch time upon throttle limit change o Currently any cgroup throttle limit changes are processed asynchronousy and the change does not take affect till a new bio is dispatched from same group. o It might happen that a user sets a redicuously low limit on throttling. Say 1 bytes per second on reads. In such cases simple operations like mount a disk can wait for a very long time. o Once bio is throttled, there is no easy way to come out of that wait even if user increases the read limit later. o This patch fixes it. Now if a user changes the cgroup limits, we recalculate the bio dispatch time according to new limits. o Can't take queueu lock under blkcg_lock, hence after the change I wake up the dispatch thread again which recalculates the time. So there are some variables being synchronized across two threads without lock and I had to make use of barriers. Hoping I have used barriers correctly. Any review of memory barrier code especially will help. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 15 ++++-- block/blk-cgroup.h | 21 ++++---- block/blk-throttle.c | 134 ++++++++++++++++++++++++++++++++++++++++++++------- block/cfq-iosched.c | 4 +- 4 files changed, 139 insertions(+), 35 deletions(-) (limited to 'block/blk-throttle.c') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b06ca70354e3..52c12130a5de 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -124,7 +124,8 @@ blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) if (blkiop->plid != blkg->plid) continue; if (blkiop->ops.blkio_update_group_weight_fn) - blkiop->ops.blkio_update_group_weight_fn(blkg, weight); + blkiop->ops.blkio_update_group_weight_fn(blkg->key, + blkg, weight); } } @@ -141,11 +142,13 @@ static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, if (fileid == BLKIO_THROTL_read_bps_device && blkiop->ops.blkio_update_group_read_bps_fn) - blkiop->ops.blkio_update_group_read_bps_fn(blkg, bps); + blkiop->ops.blkio_update_group_read_bps_fn(blkg->key, + blkg, bps); if (fileid == BLKIO_THROTL_write_bps_device && blkiop->ops.blkio_update_group_write_bps_fn) - blkiop->ops.blkio_update_group_write_bps_fn(blkg, bps); + blkiop->ops.blkio_update_group_write_bps_fn(blkg->key, + blkg, bps); } } @@ -162,11 +165,13 @@ static inline void blkio_update_group_iops(struct blkio_group *blkg, if (fileid == BLKIO_THROTL_read_iops_device && blkiop->ops.blkio_update_group_read_iops_fn) - blkiop->ops.blkio_update_group_read_iops_fn(blkg, iops); + blkiop->ops.blkio_update_group_read_iops_fn(blkg->key, + blkg, iops); if (fileid == BLKIO_THROTL_write_iops_device && blkiop->ops.blkio_update_group_write_iops_fn) - blkiop->ops.blkio_update_group_write_iops_fn(blkg,iops); + blkiop->ops.blkio_update_group_write_iops_fn(blkg->key, + blkg,iops); } } diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 2070053a30b1..034c35562dba 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -186,16 +186,17 @@ extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev); typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); -typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, - unsigned int weight); -typedef void (blkio_update_group_read_bps_fn) (struct blkio_group *blkg, - u64 read_bps); -typedef void (blkio_update_group_write_bps_fn) (struct blkio_group *blkg, - u64 write_bps); -typedef void (blkio_update_group_read_iops_fn) (struct blkio_group *blkg, - unsigned int read_iops); -typedef void (blkio_update_group_write_iops_fn) (struct blkio_group *blkg, - unsigned int write_iops); + +typedef void (blkio_update_group_weight_fn) (void *key, + struct blkio_group *blkg, unsigned int weight); +typedef void (blkio_update_group_read_bps_fn) (void * key, + struct blkio_group *blkg, u64 read_bps); +typedef void (blkio_update_group_write_bps_fn) (void *key, + struct blkio_group *blkg, u64 write_bps); +typedef void (blkio_update_group_read_iops_fn) (void *key, + struct blkio_group *blkg, unsigned int read_iops); +typedef void (blkio_update_group_write_iops_fn) (void *key, + struct blkio_group *blkg, unsigned int write_iops); struct blkio_policy_ops { blkio_unlink_group_fn *blkio_unlink_group_fn; diff --git a/block/blk-throttle.c b/block/blk-throttle.c index bc2936b80add..11713ed852f4 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -70,6 +70,9 @@ struct throtl_grp { /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; + + /* Some throttle limits got updated for the group */ + bool limits_changed; }; struct throtl_data @@ -93,6 +96,8 @@ struct throtl_data /* Work for dispatching throttled bios */ struct delayed_work throtl_work; + + atomic_t limits_changed; }; enum tg_state_flags { @@ -592,15 +597,6 @@ static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg) min_wait = min(read_wait, write_wait); disptime = jiffies + min_wait; - /* - * If group is already on active tree, then update dispatch time - * only if it is lesser than existing dispatch time. Otherwise - * always update the dispatch time - */ - - if (throtl_tg_on_rr(tg) && time_before(disptime, tg->disptime)) - return; - /* Update dispatch time */ throtl_dequeue_tg(td, tg); tg->disptime = disptime; @@ -691,6 +687,46 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) return nr_disp; } +static void throtl_process_limit_change(struct throtl_data *td) +{ + struct throtl_grp *tg; + struct hlist_node *pos, *n; + + /* + * Make sure atomic_inc() effects from + * throtl_update_blkio_group_read_bps(), group of functions are + * visible. + * Is this required or smp_mb__after_atomic_inc() was suffcient + * after the atomic_inc(). + */ + smp_rmb(); + if (!atomic_read(&td->limits_changed)) + return; + + throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed)); + + hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { + /* + * Do I need an smp_rmb() here to make sure tg->limits_changed + * update is visible. I am relying on smp_rmb() at the + * beginning of function and not putting a new one here. + */ + + if (throtl_tg_on_rr(tg) && tg->limits_changed) { + throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" + " riops=%u wiops=%u", tg->bps[READ], + tg->bps[WRITE], tg->iops[READ], + tg->iops[WRITE]); + tg_update_disptime(td, tg); + tg->limits_changed = false; + } + } + + smp_mb__before_atomic_dec(); + atomic_dec(&td->limits_changed); + smp_mb__after_atomic_dec(); +} + /* Dispatch throttled bios. Should be called without queue lock held. */ static int throtl_dispatch(struct request_queue *q) { @@ -701,6 +737,8 @@ static int throtl_dispatch(struct request_queue *q) spin_lock_irq(q->queue_lock); + throtl_process_limit_change(td); + if (!total_nr_queued(td)) goto out; @@ -821,28 +859,74 @@ void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg) spin_unlock_irqrestore(td->queue->queue_lock, flags); } -static void throtl_update_blkio_group_read_bps (struct blkio_group *blkg, - u64 read_bps) +/* + * For all update functions, key should be a valid pointer because these + * update functions are called under blkcg_lock, that means, blkg is + * valid and in turn key is valid. queue exit path can not race becuase + * of blkcg_lock + * + * Can not take queue lock in update functions as queue lock under blkcg_lock + * is not allowed. Under other paths we take blkcg_lock under queue_lock. + */ +static void throtl_update_blkio_group_read_bps(void *key, + struct blkio_group *blkg, u64 read_bps) { + struct throtl_data *td = key; + tg_of_blkg(blkg)->bps[READ] = read_bps; + /* Make sure read_bps is updated before setting limits_changed */ + smp_wmb(); + tg_of_blkg(blkg)->limits_changed = true; + + /* Make sure tg->limits_changed is updated before td->limits_changed */ + smp_mb__before_atomic_inc(); + atomic_inc(&td->limits_changed); + smp_mb__after_atomic_inc(); + + /* Schedule a work now to process the limit change */ + throtl_schedule_delayed_work(td->queue, 0); } -static void throtl_update_blkio_group_write_bps (struct blkio_group *blkg, - u64 write_bps) +static void throtl_update_blkio_group_write_bps(void *key, + struct blkio_group *blkg, u64 write_bps) { + struct throtl_data *td = key; + tg_of_blkg(blkg)->bps[WRITE] = write_bps; + smp_wmb(); + tg_of_blkg(blkg)->limits_changed = true; + smp_mb__before_atomic_inc(); + atomic_inc(&td->limits_changed); + smp_mb__after_atomic_inc(); + throtl_schedule_delayed_work(td->queue, 0); } -static void throtl_update_blkio_group_read_iops (struct blkio_group *blkg, - unsigned int read_iops) +static void throtl_update_blkio_group_read_iops(void *key, + struct blkio_group *blkg, unsigned int read_iops) { + struct throtl_data *td = key; + tg_of_blkg(blkg)->iops[READ] = read_iops; + smp_wmb(); + tg_of_blkg(blkg)->limits_changed = true; + smp_mb__before_atomic_inc(); + atomic_inc(&td->limits_changed); + smp_mb__after_atomic_inc(); + throtl_schedule_delayed_work(td->queue, 0); } -static void throtl_update_blkio_group_write_iops (struct blkio_group *blkg, - unsigned int write_iops) +static void throtl_update_blkio_group_write_iops(void *key, + struct blkio_group *blkg, unsigned int write_iops) { + struct throtl_data *td = key; + tg_of_blkg(blkg)->iops[WRITE] = write_iops; + smp_wmb(); + tg_of_blkg(blkg)->limits_changed = true; + smp_mb__before_atomic_inc(); + atomic_inc(&td->limits_changed); + smp_mb__after_atomic_inc(); + throtl_schedule_delayed_work(td->queue, 0); } void throtl_shutdown_timer_wq(struct request_queue *q) @@ -886,8 +970,14 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) /* * There is already another bio queued in same dir. No * need to update dispatch time. + * Still update the disptime if rate limits on this group + * were changed. */ - update_disptime = false; + if (!tg->limits_changed) + update_disptime = false; + else + tg->limits_changed = false; + goto queue_bio; } @@ -929,6 +1019,7 @@ int blk_throtl_init(struct request_queue *q) INIT_HLIST_HEAD(&td->tg_list); td->tg_service_tree = THROTL_RB_ROOT; + atomic_set(&td->limits_changed, 0); /* Init root group */ tg = &td->root_tg; @@ -996,6 +1087,13 @@ void blk_throtl_exit(struct request_queue *q) */ if (wait) synchronize_rcu(); + + /* + * Just being safe to make sure after previous flush if some body did + * update limits through cgroup and another work got queued, cancel + * it. + */ + throtl_shutdown_timer_wq(q); throtl_td_free(td); } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 684592621736..86338d5d4d09 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -951,8 +951,8 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) return NULL; } -void -cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) +void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, + unsigned int weight) { cfqg_of_blkg(blkg)->weight = weight; } -- cgit v1.2.3 From 3aad5d3ee4e4fce8f4b5bb6ca73342dcade42b33 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 14:51:14 +0200 Subject: blkio-throttle: Fix link failure failure on i386 o Randy Dunlap reported following linux-next failure. This patch fixes it. on i386: blk-throttle.c:(.text+0x1abb8): undefined reference to `__udivdi3' blk-throttle.c:(.text+0x1b1dc): undefined reference to `__udivdi3' o bytes_per_second interface is 64bit and I was continuing to do 64 bit division even on 32bit platform without help of special macros/functions hence the failure. Signed-off-by: Vivek Goyal Reported-by: Randy Dunlap Signed-off-by: Jens Axboe --- block/blk-throttle.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'block/blk-throttle.c') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 11713ed852f4..a46700255719 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -378,7 +378,8 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) static inline void throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) { - unsigned long nr_slices, bytes_trim, time_elapsed, io_trim; + unsigned long nr_slices, time_elapsed, io_trim; + u64 bytes_trim, tmp; BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); @@ -396,8 +397,10 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) if (!nr_slices) return; + tmp = tg->bps[rw] * throtl_slice * nr_slices; + do_div(tmp, HZ); + bytes_trim = tmp; - bytes_trim = (tg->bps[rw] * throtl_slice * nr_slices)/HZ; io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ; if (!bytes_trim && !io_trim) @@ -415,7 +418,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) tg->slice_start[rw] += nr_slices * throtl_slice; - throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu io=%lu" + throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu" " start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw], jiffies); @@ -462,7 +465,7 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, struct bio *bio, unsigned long *wait) { bool rw = bio_data_dir(bio); - u64 bytes_allowed, extra_bytes; + u64 bytes_allowed, extra_bytes, tmp; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; @@ -473,8 +476,9 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); - bytes_allowed = (tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd)) - / MSEC_PER_SEC; + tmp = tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd); + do_div(tmp, MSEC_PER_SEC); + bytes_allowed = tmp; if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { if (wait) -- cgit v1.2.3 From 5e901a2b95db709c5e40599ff4df6029be1e2a12 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 21:16:38 +0200 Subject: blkio-throttle: There is no need to convert jiffies to milli seconds o Do not convert jiffies to mili seconds as it is not required. Just work with jiffies and HZ. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'block/blk-throttle.c') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a46700255719..c1bc1b6c887a 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -439,8 +439,7 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); - io_allowed = (tg->iops[rw] * jiffies_to_msecs(jiffy_elapsed_rnd)) - / MSEC_PER_SEC; + io_allowed = (tg->iops[rw] * jiffy_elapsed_rnd) / HZ; if (tg->io_disp[rw] + 1 <= io_allowed) { if (wait) @@ -476,8 +475,8 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); - tmp = tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd); - do_div(tmp, MSEC_PER_SEC); + tmp = tg->bps[rw] * jiffy_elapsed_rnd; + do_div(tmp, HZ); bytes_allowed = tmp; if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { -- cgit v1.2.3 From c49c06e4960949a9bced708858433fcf6ca36a9c Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 1 Oct 2010 21:16:42 +0200 Subject: blkio-throttle: Fix possible multiplication overflow in iops calculations o User can specify max iops value of 32bit (UINT_MAX), through cgroup interface. If a user has specified say 4294967294 (UNIT_MAX - 2), then on 32bit platform, following multiplication can overflow. io_allowed = (tg->iops[rw] * jiffy_elapsed_rnd) o Explicitly cast the multiplication to 64bit and then perform division and then check whether result is still great then UNINT_MAX. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'block/blk-throttle.c') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c1bc1b6c887a..56ad4531b412 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -430,6 +430,7 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, bool rw = bio_data_dir(bio); unsigned int io_allowed; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; + u64 tmp; jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; @@ -439,7 +440,20 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); - io_allowed = (tg->iops[rw] * jiffy_elapsed_rnd) / HZ; + /* + * jiffy_elapsed_rnd should not be a big value as minimum iops can be + * 1 then at max jiffy elapsed should be equivalent of 1 second as we + * will allow dispatch after 1 second and after that slice should + * have been trimmed. + */ + + tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd; + do_div(tmp, HZ); + + if (tmp > UINT_MAX) + io_allowed = UINT_MAX; + else + io_allowed = tmp; if (tg->io_disp[rw] + 1 <= io_allowed) { if (wait) -- cgit v1.2.3