author    Linus Torvalds <torvalds@linux-foundation.org>  2023-02-20 23:27:21 +0100
committer Linus Torvalds <torvalds@linux-foundation.org>  2023-02-20 23:27:21 +0100
commit    5b0ed5964928b0aaf0d644c17c886c7f5ea4bb3f (patch)
tree      02df7848b8c28552039bf463e0034f5d5518b2a9 /block/blk-cgroup.c
parent    Merge tag 'for-6.3/dio-2023-02-16' of git://git.kernel.dk/linux (diff)
parent    brd: use radix_tree_maybe_preload instead of radix_tree_preload (diff)
Merge tag 'for-6.3/block-2023-02-16' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe:
- NVMe updates via Christoph:
- Small improvements to the logging functionality (Amit Engel)
- Authentication cleanups (Hannes Reinecke)
      - Cleanup and optimize the DMA mapping code in the PCIe driver
(Keith Busch)
- Work around the command effects for Format NVM (Keith Busch)
- Misc cleanups (Keith Busch, Christoph Hellwig)
- Fix and cleanup freeing single sgl (Keith Busch)
- MD updates via Song:
- Fix a rare crash during the takeover process
- Don't update recovery_cp when curr_resync is ACTIVE
- Free writes_pending in md_stop
- Change active_io to percpu
- Updates to drbd, inching us closer to unifying the out-of-tree driver
with the in-tree one (Andreas, Christoph, Lars, Robert)
- BFQ update adding support for multi-actuator drives (Paolo, Federico,
Davide)
- Make brd compliant with REQ_NOWAIT (me)
- Fix for IOPOLL and queue entering, fixing stalled IO waiting on
timeouts (me)
- Fix for REQ_NOWAIT with multiple bios (me)
- Fix memory leak in blktrace cleanup (Greg)
- Clean up sbitmap and fix a potential hang (Kemeng)
- Clean up some bits in BFQ, and fix a bug in the request injection
(Kemeng)
- Clean up the request allocation and issue code, and fix some bugs
related to that (Kemeng)
- ublk updates and fixes:
- Add support for unprivileged ublk (Ming)
- Improve device deletion handling (Ming)
- Misc (Liu, Ziyang)
- s390 dasd fixes (Alexander, Qiheng)
 - Improve the utility of request caching, plus fixes (Anuj, Xiao)
- zoned cleanups (Pankaj)
- More constification for kobjs (Thomas)
- blk-iocost cleanups (Yu)
- Remove bio splitting from drivers that don't need it (Christoph)
- Switch blk-cgroups to use struct gendisk. Some of this is now
incomplete as select late reverts were done. (Christoph)
 - Add bvec initialization helpers, and convert callers to use them rather
   than open-coding it (Christoph); see the sketch after this list
- Misc fixes and cleanups (Jinke, Keith, Arnd, Bart, Li, Martin,
Matthew, Ulf, Zhong)
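As a rough illustration of the bvec initialization helper conversion mentioned above, here is a minimal before/after sketch using bvec_set_page() from include/linux/bvec.h. The caller and function names are hypothetical; the series converts the real in-tree callers that previously open-coded the three bio_vec fields.

#include <linux/bvec.h>
#include <linux/mm.h>

/* Before (hypothetical caller): open-coded initialization of a bio_vec. */
static void example_fill_bvec_open_coded(struct bio_vec *bv, struct page *page)
{
	bv->bv_page = page;
	bv->bv_len = PAGE_SIZE;
	bv->bv_offset = 0;
}

/* After: the same initialization through the new helper. */
static void example_fill_bvec(struct bio_vec *bv, struct page *page)
{
	bvec_set_page(bv, page, PAGE_SIZE, 0);
}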
* tag 'for-6.3/block-2023-02-16' of git://git.kernel.dk/linux: (169 commits)
brd: use radix_tree_maybe_preload instead of radix_tree_preload
block: use proper return value from bio_failfast()
block: bio-integrity: Copy flags when bio_integrity_payload is cloned
block: Fix io statistics for cgroup in throttle path
brd: mark as nowait compatible
brd: check for REQ_NOWAIT and set correct page allocation mask
brd: return 0/-error from brd_insert_page()
block: sync mixed merged request's failfast with 1st bio's
Revert "blk-cgroup: pin the gendisk in struct blkcg_gq"
Revert "blk-cgroup: pass a gendisk to blkg_lookup"
Revert "blk-cgroup: delay blk-cgroup initialization until add_disk"
Revert "blk-cgroup: delay calling blkcg_exit_disk until disk_release"
Revert "blk-cgroup: move the cgroup information to struct gendisk"
nvme-pci: remove iod use_sgls
nvme-pci: fix freeing single sgl
block: ublk: check IO buffer based on flag need_get_data
s390/dasd: Fix potential memleak in dasd_eckd_init()
s390/dasd: sort out physical vs virtual pointers usage
block: Remove the ALLOC_CACHE_SLACK constant
block: make kobj_type structures constant
...
Diffstat (limited to 'block/blk-cgroup.c')
-rw-r--r-- | block/blk-cgroup.c | 150
1 file changed, 91 insertions, 59 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 9ac1efb053e0..bd50b55bdb61 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -118,14 +118,26 @@ static void blkg_free_workfn(struct work_struct *work)
 {
 	struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
 					     free_work);
+	struct request_queue *q = blkg->q;
 	int i;
 
+	/*
+	 * pd_free_fn() can also be called from blkcg_deactivate_policy(),
+	 * in order to make sure pd_free_fn() is called in order, the deletion
+	 * of the list blkg->q_node is delayed to here from blkg_destroy(), and
+	 * blkcg_mutex is used to synchronize blkg_free_workfn() and
+	 * blkcg_deactivate_policy().
+	 */
+	mutex_lock(&q->blkcg_mutex);
 	for (i = 0; i < BLKCG_MAX_POLS; i++)
 		if (blkg->pd[i])
 			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
+	if (blkg->parent)
+		blkg_put(blkg->parent);
+	list_del_init(&blkg->q_node);
+	mutex_unlock(&q->blkcg_mutex);
 
-	if (blkg->q)
-		blk_put_queue(blkg->q);
+	blk_put_queue(q);
 	free_percpu(blkg->iostat_cpu);
 	percpu_ref_exit(&blkg->refcnt);
 	kfree(blkg);
@@ -158,8 +170,6 @@ static void __blkg_release(struct rcu_head *rcu)
 
 	/* release the blkcg and parent blkg refs this blkg has been holding */
 	css_put(&blkg->blkcg->css);
-	if (blkg->parent)
-		blkg_put(blkg->parent);
 
 	blkg_free(blkg);
 }
@@ -249,16 +259,13 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
 	blkg = kzalloc_node(sizeof(*blkg), gfp_mask, disk->queue->node);
 	if (!blkg)
 		return NULL;
-
 	if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask))
-		goto err_free;
-
+		goto out_free_blkg;
 	blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
 	if (!blkg->iostat_cpu)
-		goto err_free;
-
+		goto out_exit_refcnt;
 	if (!blk_get_queue(disk->queue))
-		goto err_free;
+		goto out_free_iostat;
 
 	blkg->q = disk->queue;
 	INIT_LIST_HEAD(&blkg->q_node);
@@ -281,19 +288,28 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
 			continue;
 
 		/* alloc per-policy data and attach it to blkg */
-		pd = pol->pd_alloc_fn(gfp_mask, disk->queue, blkcg);
+		pd = pol->pd_alloc_fn(disk, blkcg, gfp_mask);
 		if (!pd)
-			goto err_free;
-
+			goto out_free_pds;
 		blkg->pd[i] = pd;
 		pd->blkg = blkg;
 		pd->plid = i;
+		pd->online = false;
 	}
 
 	return blkg;
 
-err_free:
-	blkg_free(blkg);
+out_free_pds:
+	while (--i >= 0)
+		if (blkg->pd[i])
+			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
+	blk_put_queue(disk->queue);
+out_free_iostat:
+	free_percpu(blkg->iostat_cpu);
+out_exit_refcnt:
+	percpu_ref_exit(&blkg->refcnt);
+out_free_blkg:
+	kfree(blkg);
 	return NULL;
 }
 
@@ -359,8 +375,11 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
 		for (i = 0; i < BLKCG_MAX_POLS; i++) {
 			struct blkcg_policy *pol = blkcg_policy[i];
 
-			if (blkg->pd[i] && pol->pd_online_fn)
-				pol->pd_online_fn(blkg->pd[i]);
+			if (blkg->pd[i]) {
+				if (pol->pd_online_fn)
+					pol->pd_online_fn(blkg->pd[i]);
+				blkg->pd[i]->online = true;
+			}
 		}
 	}
 	blkg->online = true;
@@ -376,7 +395,8 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
 
 err_put_css:
 	css_put(&blkcg->css);
 err_free_blkg:
-	blkg_free(new_blkg);
+	if (new_blkg)
+		blkg_free(new_blkg);
 	return ERR_PTR(ret);
 }
 
@@ -458,21 +478,28 @@ static void blkg_destroy(struct blkcg_gq *blkg)
 	lockdep_assert_held(&blkg->q->queue_lock);
 	lockdep_assert_held(&blkcg->lock);
 
-	/* Something wrong if we are trying to remove same group twice */
-	WARN_ON_ONCE(list_empty(&blkg->q_node));
-	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
+	/*
+	 * blkg stays on the queue list until blkg_free_workfn(), see details in
+	 * blkg_free_workfn(), hence this function can be called from
+	 * blkcg_destroy_blkgs() first and again from blkg_destroy_all() before
+	 * blkg_free_workfn().
+	 */
+	if (hlist_unhashed(&blkg->blkcg_node))
+		return;
 
 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
 		struct blkcg_policy *pol = blkcg_policy[i];
 
-		if (blkg->pd[i] && pol->pd_offline_fn)
-			pol->pd_offline_fn(blkg->pd[i]);
+		if (blkg->pd[i] && blkg->pd[i]->online) {
+			blkg->pd[i]->online = false;
+			if (pol->pd_offline_fn)
+				pol->pd_offline_fn(blkg->pd[i]);
+		}
 	}
 
 	blkg->online = false;
 
 	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
-	list_del_init(&blkg->q_node);
 	hlist_del_init_rcu(&blkg->blkcg_node);
 
 	/*
@@ -559,7 +586,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
 
 const char *blkg_dev_name(struct blkcg_gq *blkg)
 {
-	if (!blkg->q->disk || !blkg->q->disk->bdi->dev)
+	if (!blkg->q->disk)
 		return NULL;
 	return bdi_dev_name(blkg->q->disk->bdi);
 }
@@ -1273,6 +1300,7 @@ int blkcg_init_disk(struct gendisk *disk)
 	int ret;
 
 	INIT_LIST_HEAD(&q->blkg_list);
+	mutex_init(&q->blkcg_mutex);
 
 	new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
 	if (!new_blkg)
@@ -1349,9 +1377,9 @@ static void blkcg_bind(struct cgroup_subsys_state *root_css)
 
 static void blkcg_exit(struct task_struct *tsk)
 {
-	if (tsk->throttle_queue)
-		blk_put_queue(tsk->throttle_queue);
-	tsk->throttle_queue = NULL;
+	if (tsk->throttle_disk)
+		put_disk(tsk->throttle_disk);
+	tsk->throttle_disk = NULL;
 }
 
 struct cgroup_subsys io_cgrp_subsys = {
@@ -1377,14 +1405,14 @@ struct cgroup_subsys io_cgrp_subsys = {
 EXPORT_SYMBOL_GPL(io_cgrp_subsys);
 
 /**
- * blkcg_activate_policy - activate a blkcg policy on a request_queue
- * @q: request_queue of interest
+ * blkcg_activate_policy - activate a blkcg policy on a gendisk
+ * @disk: gendisk of interest
  * @pol: blkcg policy to activate
  *
- * Activate @pol on @q. Requires %GFP_KERNEL context. @q goes through
+ * Activate @pol on @disk. Requires %GFP_KERNEL context. @disk goes through
  * bypass mode to populate its blkgs with policy_data for @pol.
  *
- * Activation happens with @q bypassed, so nobody would be accessing blkgs
+ * Activation happens with @disk bypassed, so nobody would be accessing blkgs
  * from IO path. Update of each blkg is protected by both queue and blkcg
  * locks so that holding either lock and testing blkcg_policy_enabled() is
  * always enough for dereferencing policy data.
@@ -1392,9 +1420,9 @@ EXPORT_SYMBOL_GPL(io_cgrp_subsys);
  * The caller is responsible for synchronizing [de]activations and policy
  * [un]registerations. Returns 0 on success, -errno on failure.
  */
-int blkcg_activate_policy(struct request_queue *q,
-			  const struct blkcg_policy *pol)
+int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 {
+	struct request_queue *q = disk->queue;
 	struct blkg_policy_data *pd_prealloc = NULL;
 	struct blkcg_gq *blkg, *pinned_blkg = NULL;
 	int ret;
@@ -1419,8 +1447,8 @@ retry:
 			pd = pd_prealloc;
 			pd_prealloc = NULL;
 		} else {
-			pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q,
-					      blkg->blkcg);
+			pd = pol->pd_alloc_fn(disk, blkg->blkcg,
+					      GFP_NOWAIT | __GFP_NOWARN);
 		}
 
 		if (!pd) {
@@ -1437,8 +1465,8 @@ retry:
 
 			if (pd_prealloc)
 				pol->pd_free_fn(pd_prealloc);
-			pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q,
-						       blkg->blkcg);
+			pd_prealloc = pol->pd_alloc_fn(disk, blkg->blkcg,
+						       GFP_KERNEL);
 			if (pd_prealloc)
 				goto retry;
 			else
@@ -1448,6 +1476,7 @@ retry:
 		blkg->pd[pol->plid] = pd;
 		pd->blkg = blkg;
 		pd->plid = pol->plid;
+		pd->online = false;
 	}
 
 	/* all allocated, init in the same order */
@@ -1455,9 +1484,11 @@ retry:
 		list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)
 			pol->pd_init_fn(blkg->pd[pol->plid]);
 
-	if (pol->pd_online_fn)
-		list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)
+	list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
+		if (pol->pd_online_fn)
 			pol->pd_online_fn(blkg->pd[pol->plid]);
+		blkg->pd[pol->plid]->online = true;
+	}
 
 	__set_bit(pol->plid, q->blkcg_pols);
 	ret = 0;
@@ -1492,16 +1523,17 @@ enomem:
 EXPORT_SYMBOL_GPL(blkcg_activate_policy);
 
 /**
- * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
- * @q: request_queue of interest
+ * blkcg_deactivate_policy - deactivate a blkcg policy on a gendisk
+ * @disk: gendisk of interest
  * @pol: blkcg policy to deactivate
 *
- * Deactivate @pol on @q. Follows the same synchronization rules as
+ * Deactivate @pol on @disk. Follows the same synchronization rules as
 * blkcg_activate_policy().
 */
-void blkcg_deactivate_policy(struct request_queue *q,
+void blkcg_deactivate_policy(struct gendisk *disk,
 			     const struct blkcg_policy *pol)
 {
+	struct request_queue *q = disk->queue;
 	struct blkcg_gq *blkg;
 
 	if (!blkcg_policy_enabled(q, pol))
@@ -1510,6 +1542,7 @@ void blkcg_deactivate_policy(struct request_queue *q,
 	if (queue_is_mq(q))
 		blk_mq_freeze_queue(q);
 
+	mutex_lock(&q->blkcg_mutex);
 	spin_lock_irq(&q->queue_lock);
 
 	__clear_bit(pol->plid, q->blkcg_pols);
@@ -1519,7 +1552,7 @@ void blkcg_deactivate_policy(struct request_queue *q,
 
 		spin_lock(&blkcg->lock);
 		if (blkg->pd[pol->plid]) {
-			if (pol->pd_offline_fn)
+			if (blkg->pd[pol->plid]->online && pol->pd_offline_fn)
 				pol->pd_offline_fn(blkg->pd[pol->plid]);
 			pol->pd_free_fn(blkg->pd[pol->plid]);
 			blkg->pd[pol->plid] = NULL;
@@ -1528,6 +1561,7 @@ void blkcg_deactivate_policy(struct request_queue *q,
 	}
 
 	spin_unlock_irq(&q->queue_lock);
+	mutex_unlock(&q->blkcg_mutex);
 
 	if (queue_is_mq(q))
 		blk_mq_unfreeze_queue(q);
@@ -1797,29 +1831,29 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
 *
 * This is only called if we've been marked with set_notify_resume(). Obviously
 * we can be set_notify_resume() for reasons other than blkcg throttling, so we
- * check to see if current->throttle_queue is set and if not this doesn't do
+ * check to see if current->throttle_disk is set and if not this doesn't do
 * anything. This should only ever be called by the resume code, it's not meant
 * to be called by people willy-nilly as it will actually do the work to
 * throttle the task if it is setup for throttling.
 */
 void blkcg_maybe_throttle_current(void)
 {
-	struct request_queue *q = current->throttle_queue;
+	struct gendisk *disk = current->throttle_disk;
 	struct blkcg *blkcg;
 	struct blkcg_gq *blkg;
 	bool use_memdelay = current->use_memdelay;
 
-	if (!q)
+	if (!disk)
 		return;
 
-	current->throttle_queue = NULL;
+	current->throttle_disk = NULL;
 	current->use_memdelay = false;
 
 	rcu_read_lock();
 	blkcg = css_to_blkcg(blkcg_css());
 	if (!blkcg)
 		goto out;
-	blkg = blkg_lookup(blkcg, q);
+	blkg = blkg_lookup(blkcg, disk->queue);
 	if (!blkg)
 		goto out;
 	if (!blkg_tryget(blkg))
@@ -1828,11 +1862,10 @@ void blkcg_maybe_throttle_current(void)
 
 	blkcg_maybe_throttle_blkg(blkg, use_memdelay);
 	blkg_put(blkg);
-	blk_put_queue(q);
+	put_disk(disk);
 	return;
 out:
 	rcu_read_unlock();
-	blk_put_queue(q);
 }
 
 /**
@@ -1854,18 +1887,17 @@ out:
 */
 void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay)
 {
-	struct request_queue *q = disk->queue;
-
 	if (unlikely(current->flags & PF_KTHREAD))
 		return;
 
-	if (current->throttle_queue != q) {
-		if (!blk_get_queue(q))
+	if (current->throttle_disk != disk) {
+		if (test_bit(GD_DEAD, &disk->state))
 			return;
+		get_device(disk_to_dev(disk));
 
-		if (current->throttle_queue)
-			blk_put_queue(current->throttle_queue);
-		current->throttle_queue = q;
+		if (current->throttle_disk)
+			put_disk(current->throttle_disk);
+		current->throttle_disk = disk;
 	}
 
 	if (use_memdelay)
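To tie the interface changes in this diff together, below is a minimal, hypothetical blkcg policy skeleton written against the callbacks as they look after this merge: pd_alloc_fn() receives a gendisk (plus blkcg and gfp flags), the core maintains pd->online so pd_offline_fn() only runs for policy data that was actually onlined, and blkcg_activate_policy()/blkcg_deactivate_policy() take a gendisk. The policy and structure names are made up for illustration; a real in-tree policy (iocost, iolatency, bfq) also calls blkcg_policy_register() and typically fills in more callbacks.

#include <linux/blkdev.h>
#include <linux/slab.h>
#include "blk-cgroup.h"	/* in-tree policies live under block/ */

/* Hypothetical per-blkg policy data; only the embedded blkg_policy_data
 * member is required by the blk-cgroup core. */
struct example_blkg_pd {
	struct blkg_policy_data pd;
	u64 some_counter;
};

/* pd_alloc_fn() now takes the gendisk, matching the calls made from
 * blkg_alloc() and blkcg_activate_policy() in the diff above. */
static struct blkg_policy_data *example_pd_alloc(struct gendisk *disk,
						 struct blkcg *blkcg, gfp_t gfp)
{
	struct example_blkg_pd *epd;

	epd = kzalloc_node(sizeof(*epd), gfp, disk->queue->node);
	if (!epd)
		return NULL;
	return &epd->pd;
}

static void example_pd_free(struct blkg_policy_data *pd)
{
	kfree(container_of(pd, struct example_blkg_pd, pd));
}

static struct blkcg_policy example_policy = {
	.pd_alloc_fn	= example_pd_alloc,
	.pd_free_fn	= example_pd_free,
	/* pd_online_fn()/pd_offline_fn() are optional; with this merge the
	 * core tracks pd->online and only offlines data it onlined. */
};

/* Activation and deactivation are now expressed per gendisk. */
static int example_policy_enable(struct gendisk *disk)
{
	/* assumes blkcg_policy_register(&example_policy) already succeeded */
	return blkcg_activate_policy(disk, &example_policy);
}

static void example_policy_disable(struct gendisk *disk)
{
	blkcg_deactivate_policy(disk, &example_policy);
}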