diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-02-21 20:02:48 +0100 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-02-21 20:02:48 +0100 |
commit | 582cd91f69de8e44857cb610ebca661dac8656b7 (patch) | |
tree | 0d680db02a5c236ee87b408b3f13ce33ebaca907 /block/bio.c | |
parent | Merge tag 'for-5.12/libata-2021-02-17' of git://git.kernel.dk/linux-block (diff) | |
parent | mm: simplify swapdev_block (diff) | |
download | linux-582cd91f69de8e44857cb610ebca661dac8656b7.tar.xz linux-582cd91f69de8e44857cb610ebca661dac8656b7.zip |
Merge tag 'for-5.12/block-2021-02-17' of git://git.kernel.dk/linux-block
Pull core block updates from Jens Axboe:
"Another nice round of removing more code than what is added, mostly
due to Christoph's relentless pursuit of tech debt removal/cleanups.
This pull request contains:
- Two series of BFQ improvements (Paolo, Jan, Jia)
- Block iov_iter improvements (Pavel)
- bsg error path fix (Pan)
- blk-mq scheduler improvements (Jan)
- -EBUSY discard fix (Jan)
- bvec allocation improvements (Ming, Christoph)
- bio allocation and init improvements (Christoph)
- Store bdev pointer in bio instead of gendisk + partno (Christoph)
- Block trace point cleanups (Christoph)
- hard read-only vs read-only split (Christoph)
- Block based swap cleanups (Christoph)
- Zoned write granularity support (Damien)
- Various fixes/tweaks (Chunguang, Guoqing, Lei, Lukas, Huhai)"
* tag 'for-5.12/block-2021-02-17' of git://git.kernel.dk/linux-block: (104 commits)
mm: simplify swapdev_block
sd_zbc: clear zone resources for non-zoned case
block: introduce blk_queue_clear_zone_settings()
zonefs: use zone write granularity as block size
block: introduce zone_write_granularity limit
block: use blk_queue_set_zoned in add_partition()
nullb: use blk_queue_set_zoned() to setup zoned devices
nvme: cleanup zone information initialization
block: document zone_append_max_bytes attribute
block: use bi_max_vecs to find the bvec pool
md/raid10: remove dead code in reshape_request
block: mark the bio as cloned in bio_iov_bvec_set
block: set BIO_NO_PAGE_REF in bio_iov_bvec_set
block: remove a layer of indentation in bio_iov_iter_get_pages
block: turn the nr_iovecs argument to bio_alloc* into an unsigned short
block: remove the 1 and 4 vec bvec_slabs entries
block: streamline bvec_alloc
block: factor out a bvec_alloc_gfp helper
block: move struct biovec_slab to bio.c
block: reuse BIO_INLINE_VECS for integrity bvecs
...
Diffstat (limited to 'block/bio.c')
-rw-r--r-- | block/bio.c | 571 |
1 files changed, 253 insertions, 318 deletions
diff --git a/block/bio.c b/block/bio.c index 2f21d2958b60..a1c4d2900c7a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -19,27 +19,40 @@ #include <linux/highmem.h> #include <linux/sched/sysctl.h> #include <linux/blk-crypto.h> +#include <linux/xarray.h> #include <trace/events/block.h> #include "blk.h" #include "blk-rq-qos.h" -/* - * Test patch to inline a certain number of bi_io_vec's inside the bio - * itself, to shrink a bio data allocation from two mempool calls to one - */ -#define BIO_INLINE_VECS 4 - -/* - * if you change this list, also change bvec_alloc or things will - * break badly! cannot be bigger than what you can fit into an - * unsigned short - */ -#define BV(x, n) { .nr_vecs = x, .name = "biovec-"#n } -static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = { - BV(1, 1), BV(4, 4), BV(16, 16), BV(64, 64), BV(128, 128), BV(BIO_MAX_PAGES, max), +static struct biovec_slab { + int nr_vecs; + char *name; + struct kmem_cache *slab; +} bvec_slabs[] __read_mostly = { + { .nr_vecs = 16, .name = "biovec-16" }, + { .nr_vecs = 64, .name = "biovec-64" }, + { .nr_vecs = 128, .name = "biovec-128" }, + { .nr_vecs = BIO_MAX_PAGES, .name = "biovec-max" }, }; -#undef BV + +static struct biovec_slab *biovec_slab(unsigned short nr_vecs) +{ + switch (nr_vecs) { + /* smaller bios use inline vecs */ + case 5 ... 16: + return &bvec_slabs[0]; + case 17 ... 64: + return &bvec_slabs[1]; + case 65 ... 128: + return &bvec_slabs[2]; + case 129 ... BIO_MAX_PAGES: + return &bvec_slabs[3]; + default: + BUG(); + return NULL; + } +} /* * fs_bio_set is the bio_set containing bio and iovec memory pools used by @@ -58,178 +71,133 @@ struct bio_slab { char name[8]; }; static DEFINE_MUTEX(bio_slab_lock); -static struct bio_slab *bio_slabs; -static unsigned int bio_slab_nr, bio_slab_max; +static DEFINE_XARRAY(bio_slabs); -static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) +static struct bio_slab *create_bio_slab(unsigned int size) { - unsigned int sz = sizeof(struct bio) + extra_size; - struct kmem_cache *slab = NULL; - struct bio_slab *bslab, *new_bio_slabs; - unsigned int new_bio_slab_max; - unsigned int i, entry = -1; + struct bio_slab *bslab = kzalloc(sizeof(*bslab), GFP_KERNEL); - mutex_lock(&bio_slab_lock); + if (!bslab) + return NULL; - i = 0; - while (i < bio_slab_nr) { - bslab = &bio_slabs[i]; + snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size); + bslab->slab = kmem_cache_create(bslab->name, size, + ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN, NULL); + if (!bslab->slab) + goto fail_alloc_slab; - if (!bslab->slab && entry == -1) - entry = i; - else if (bslab->slab_size == sz) { - slab = bslab->slab; - bslab->slab_ref++; - break; - } - i++; - } + bslab->slab_ref = 1; + bslab->slab_size = size; - if (slab) - goto out_unlock; - - if (bio_slab_nr == bio_slab_max && entry == -1) { - new_bio_slab_max = bio_slab_max << 1; - new_bio_slabs = krealloc(bio_slabs, - new_bio_slab_max * sizeof(struct bio_slab), - GFP_KERNEL); - if (!new_bio_slabs) - goto out_unlock; - bio_slab_max = new_bio_slab_max; - bio_slabs = new_bio_slabs; - } - if (entry == -1) - entry = bio_slab_nr++; + if (!xa_err(xa_store(&bio_slabs, size, bslab, GFP_KERNEL))) + return bslab; + + kmem_cache_destroy(bslab->slab); - bslab = &bio_slabs[entry]; +fail_alloc_slab: + kfree(bslab); + return NULL; +} + +static inline unsigned int bs_bio_slab_size(struct bio_set *bs) +{ + return bs->front_pad + sizeof(struct bio) + bs->back_pad; +} - snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry); - slab = kmem_cache_create(bslab->name, sz, ARCH_KMALLOC_MINALIGN, - SLAB_HWCACHE_ALIGN, NULL); - if (!slab) - goto out_unlock; +static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs) +{ + unsigned int size = bs_bio_slab_size(bs); + struct bio_slab *bslab; - bslab->slab = slab; - bslab->slab_ref = 1; - bslab->slab_size = sz; -out_unlock: + mutex_lock(&bio_slab_lock); + bslab = xa_load(&bio_slabs, size); + if (bslab) + bslab->slab_ref++; + else + bslab = create_bio_slab(size); mutex_unlock(&bio_slab_lock); - return slab; + + if (bslab) + return bslab->slab; + return NULL; } static void bio_put_slab(struct bio_set *bs) { struct bio_slab *bslab = NULL; - unsigned int i; + unsigned int slab_size = bs_bio_slab_size(bs); mutex_lock(&bio_slab_lock); - for (i = 0; i < bio_slab_nr; i++) { - if (bs->bio_slab == bio_slabs[i].slab) { - bslab = &bio_slabs[i]; - break; - } - } - + bslab = xa_load(&bio_slabs, slab_size); if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) goto out; + WARN_ON_ONCE(bslab->slab != bs->bio_slab); + WARN_ON(!bslab->slab_ref); if (--bslab->slab_ref) goto out; + xa_erase(&bio_slabs, slab_size); + kmem_cache_destroy(bslab->slab); - bslab->slab = NULL; + kfree(bslab); out: mutex_unlock(&bio_slab_lock); } -unsigned int bvec_nr_vecs(unsigned short idx) +void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs) { - return bvec_slabs[--idx].nr_vecs; -} - -void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) -{ - if (!idx) - return; - idx--; + BIO_BUG_ON(nr_vecs > BIO_MAX_PAGES); - BIO_BUG_ON(idx >= BVEC_POOL_NR); - - if (idx == BVEC_POOL_MAX) { + if (nr_vecs == BIO_MAX_PAGES) mempool_free(bv, pool); - } else { - struct biovec_slab *bvs = bvec_slabs + idx; + else if (nr_vecs > BIO_INLINE_VECS) + kmem_cache_free(biovec_slab(nr_vecs)->slab, bv); +} - kmem_cache_free(bvs->slab, bv); - } +/* + * Make the first allocation restricted and don't dump info on allocation + * failures, since we'll fall back to the mempool in case of failure. + */ +static inline gfp_t bvec_alloc_gfp(gfp_t gfp) +{ + return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) | + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; } -struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, - mempool_t *pool) +struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, + gfp_t gfp_mask) { - struct bio_vec *bvl; + struct biovec_slab *bvs = biovec_slab(*nr_vecs); - /* - * see comment near bvec_array define! - */ - switch (nr) { - case 1: - *idx = 0; - break; - case 2 ... 4: - *idx = 1; - break; - case 5 ... 16: - *idx = 2; - break; - case 17 ... 64: - *idx = 3; - break; - case 65 ... 128: - *idx = 4; - break; - case 129 ... BIO_MAX_PAGES: - *idx = 5; - break; - default: + if (WARN_ON_ONCE(!bvs)) return NULL; - } /* - * idx now points to the pool we want to allocate from. only the - * 1-vec entry pool is mempool backed. + * Upgrade the nr_vecs request to take full advantage of the allocation. + * We also rely on this in the bvec_free path. */ - if (*idx == BVEC_POOL_MAX) { -fallback: - bvl = mempool_alloc(pool, gfp_mask); - } else { - struct biovec_slab *bvs = bvec_slabs + *idx; - gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); + *nr_vecs = bvs->nr_vecs; - /* - * Make this allocation restricted and don't dump info on - * allocation failures, since we'll fallback to the mempool - * in case of failure. - */ - __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + /* + * Try a slab allocation first for all smaller allocations. If that + * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool. + * The mempool is sized to handle up to BIO_MAX_PAGES entries. + */ + if (*nr_vecs < BIO_MAX_PAGES) { + struct bio_vec *bvl; - /* - * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM - * is set, retry with the 1-entry mempool - */ - bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); - if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) { - *idx = BVEC_POOL_MAX; - goto fallback; - } + bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask)); + if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) + return bvl; + *nr_vecs = BIO_MAX_PAGES; } - (*idx)++; - return bvl; + return mempool_alloc(pool, gfp_mask); } void bio_uninit(struct bio *bio) @@ -255,7 +223,7 @@ static void bio_free(struct bio *bio) bio_uninit(bio); if (bs) { - bvec_free(&bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio)); + bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs); /* * If we have front padding, adjust the bio pointer before freeing @@ -299,12 +267,8 @@ EXPORT_SYMBOL(bio_init); */ void bio_reset(struct bio *bio) { - unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); - bio_uninit(bio); - memset(bio, 0, BIO_RESET_BYTES); - bio->bi_flags = flags; atomic_set(&bio->__bi_remaining, 1); } EXPORT_SYMBOL(bio_reset); @@ -405,122 +369,97 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * @nr_iovecs: number of iovecs to pre-allocate * @bs: the bio_set to allocate from. * - * Description: - * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is - * backed by the @bs's mempool. + * Allocate a bio from the mempools in @bs. * - * When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will - * always be able to allocate a bio. This is due to the mempool guarantees. - * To make this work, callers must never allocate more than 1 bio at a time - * from this pool. Callers that need to allocate more than 1 bio must always - * submit the previously allocated bio for IO before attempting to allocate - * a new one. Failure to do so can cause deadlocks under memory pressure. + * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to + * allocate a bio. This is due to the mempool guarantees. To make this work, + * callers must never allocate more than 1 bio at a time from the general pool. + * Callers that need to allocate more than 1 bio must always submit the + * previously allocated bio for IO before attempting to allocate a new one. + * Failure to do so can cause deadlocks under memory pressure. * - * Note that when running under submit_bio_noacct() (i.e. any block - * driver), bios are not submitted until after you return - see the code in - * submit_bio_noacct() that converts recursion into iteration, to prevent - * stack overflows. + * Note that when running under submit_bio_noacct() (i.e. any block driver), + * bios are not submitted until after you return - see the code in + * submit_bio_noacct() that converts recursion into iteration, to prevent + * stack overflows. * - * This would normally mean allocating multiple bios under - * submit_bio_noacct() would be susceptible to deadlocks, but we have - * deadlock avoidance code that resubmits any blocked bios from a rescuer - * thread. + * This would normally mean allocating multiple bios under submit_bio_noacct() + * would be susceptible to deadlocks, but we have + * deadlock avoidance code that resubmits any blocked bios from a rescuer + * thread. * - * However, we do not guarantee forward progress for allocations from other - * mempools. Doing multiple allocations from the same mempool under - * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad - * for per bio allocations. + * However, we do not guarantee forward progress for allocations from other + * mempools. Doing multiple allocations from the same mempool under + * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad + * for per bio allocations. * - * RETURNS: - * Pointer to new bio on success, NULL on failure. + * Returns: Pointer to new bio on success, NULL on failure. */ -struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, +struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs, struct bio_set *bs) { gfp_t saved_gfp = gfp_mask; - unsigned front_pad; - unsigned inline_vecs; - struct bio_vec *bvl = NULL; struct bio *bio; void *p; - if (!bs) { - if (nr_iovecs > UIO_MAXIOV) - return NULL; - - p = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); - front_pad = 0; - inline_vecs = nr_iovecs; - } else { - /* should not use nobvec bioset for nr_iovecs > 0 */ - if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && - nr_iovecs > 0)) - return NULL; - /* - * submit_bio_noacct() converts recursion to iteration; this - * means if we're running beneath it, any bios we allocate and - * submit will not be submitted (and thus freed) until after we - * return. - * - * This exposes us to a potential deadlock if we allocate - * multiple bios from the same bio_set() while running - * underneath submit_bio_noacct(). If we were to allocate - * multiple bios (say a stacking block driver that was splitting - * bios), we would deadlock if we exhausted the mempool's - * reserve. - * - * We solve this, and guarantee forward progress, with a rescuer - * workqueue per bio_set. If we go to allocate and there are - * bios on current->bio_list, we first try the allocation - * without __GFP_DIRECT_RECLAIM; if that fails, we punt those - * bios we would be blocking to the rescuer workqueue before - * we retry with the original gfp_flags. - */ - - if (current->bio_list && - (!bio_list_empty(¤t->bio_list[0]) || - !bio_list_empty(¤t->bio_list[1])) && - bs->rescue_workqueue) - gfp_mask &= ~__GFP_DIRECT_RECLAIM; + /* should not use nobvec bioset for nr_iovecs > 0 */ + if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_iovecs > 0)) + return NULL; + /* + * submit_bio_noacct() converts recursion to iteration; this means if + * we're running beneath it, any bios we allocate and submit will not be + * submitted (and thus freed) until after we return. + * + * This exposes us to a potential deadlock if we allocate multiple bios + * from the same bio_set() while running underneath submit_bio_noacct(). + * If we were to allocate multiple bios (say a stacking block driver + * that was splitting bios), we would deadlock if we exhausted the + * mempool's reserve. + * + * We solve this, and guarantee forward progress, with a rescuer + * workqueue per bio_set. If we go to allocate and there are bios on + * current->bio_list, we first try the allocation without + * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be + * blocking to the rescuer workqueue before we retry with the original + * gfp_flags. + */ + if (current->bio_list && + (!bio_list_empty(¤t->bio_list[0]) || + !bio_list_empty(¤t->bio_list[1])) && + bs->rescue_workqueue) + gfp_mask &= ~__GFP_DIRECT_RECLAIM; + + p = mempool_alloc(&bs->bio_pool, gfp_mask); + if (!p && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; p = mempool_alloc(&bs->bio_pool, gfp_mask); - if (!p && gfp_mask != saved_gfp) { - punt_bios_to_rescuer(bs); - gfp_mask = saved_gfp; - p = mempool_alloc(&bs->bio_pool, gfp_mask); - } - - front_pad = bs->front_pad; - inline_vecs = BIO_INLINE_VECS; } - if (unlikely(!p)) return NULL; - bio = p + front_pad; - bio_init(bio, NULL, 0); + bio = p + bs->front_pad; + if (nr_iovecs > BIO_INLINE_VECS) { + struct bio_vec *bvl = NULL; - if (nr_iovecs > inline_vecs) { - unsigned long idx = 0; - - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); + bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask); if (!bvl && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); + bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask); } - if (unlikely(!bvl)) goto err_free; - bio->bi_flags |= idx << BVEC_POOL_OFFSET; + bio_init(bio, bvl, nr_iovecs); } else if (nr_iovecs) { - bvl = bio->bi_inline_vecs; + bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS); + } else { + bio_init(bio, NULL, 0); } bio->bi_pool = bs; - bio->bi_max_vecs = nr_iovecs; - bio->bi_io_vec = bvl; return bio; err_free: @@ -529,6 +468,31 @@ err_free: } EXPORT_SYMBOL(bio_alloc_bioset); +/** + * bio_kmalloc - kmalloc a bio for I/O + * @gfp_mask: the GFP_* mask given to the slab allocator + * @nr_iovecs: number of iovecs to pre-allocate + * + * Use kmalloc to allocate and initialize a bio. + * + * Returns: Pointer to new bio on success, NULL on failure. + */ +struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs) +{ + struct bio *bio; + + if (nr_iovecs > UIO_MAXIOV) + return NULL; + + bio = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); + if (unlikely(!bio)) + return NULL; + bio_init(bio, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs); + bio->bi_pool = NULL; + return bio; +} +EXPORT_SYMBOL(bio_kmalloc); + void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) { unsigned long flags; @@ -607,16 +571,7 @@ void bio_truncate(struct bio *bio, unsigned new_size) */ void guard_bio_eod(struct bio *bio) { - sector_t maxsector; - struct block_device *part; - - rcu_read_lock(); - part = __disk_get_part(bio->bi_disk, bio->bi_partno); - if (part) - maxsector = bdev_nr_sectors(part); - else - maxsector = get_capacity(bio->bi_disk); - rcu_read_unlock(); + sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); if (!maxsector) return; @@ -673,17 +628,18 @@ EXPORT_SYMBOL(bio_put); */ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) { - BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio)); + WARN_ON_ONCE(bio->bi_pool && bio->bi_max_vecs); /* - * most users will be overriding ->bi_disk with a new target, + * most users will be overriding ->bi_bdev with a new target, * so we don't set nor calculate new physical/hw segment counts here */ - bio->bi_disk = bio_src->bi_disk; - bio->bi_partno = bio_src->bi_partno; + bio->bi_bdev = bio_src->bi_bdev; bio_set_flag(bio, BIO_CLONED); if (bio_flagged(bio_src, BIO_THROTTLED)) bio_set_flag(bio, BIO_THROTTLED); + if (bio_flagged(bio_src, BIO_REMAPPED)) + bio_set_flag(bio, BIO_REMAPPED); bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; @@ -730,7 +686,7 @@ EXPORT_SYMBOL(bio_clone_fast); const char *bio_devname(struct bio *bio, char *buf) { - return disk_name(bio->bi_disk, bio->bi_partno, buf); + return bdevname(bio->bi_bdev, buf); } EXPORT_SYMBOL(bio_devname); @@ -870,7 +826,7 @@ EXPORT_SYMBOL(bio_add_pc_page); int bio_add_zone_append_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; bool same_page = false; if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND)) @@ -993,21 +949,18 @@ void bio_release_pages(struct bio *bio, bool mark_dirty) } EXPORT_SYMBOL_GPL(bio_release_pages); -static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) +static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) { - const struct bio_vec *bv = iter->bvec; - unsigned int len; - size_t size; - - if (WARN_ON_ONCE(iter->iov_offset > bv->bv_len)) - return -EINVAL; - - len = min_t(size_t, bv->bv_len - iter->iov_offset, iter->count); - size = bio_add_page(bio, bv->bv_page, len, - bv->bv_offset + iter->iov_offset); - if (unlikely(size != len)) - return -EINVAL; - iov_iter_advance(iter, size); + WARN_ON_ONCE(bio->bi_max_vecs); + + bio->bi_vcnt = iter->nr_segs; + bio->bi_io_vec = (struct bio_vec *)iter->bvec; + bio->bi_iter.bi_bvec_done = iter->iov_offset; + bio->bi_iter.bi_size = iter->count; + bio_set_flag(bio, BIO_NO_PAGE_REF); + bio_set_flag(bio, BIO_CLONED); + + iov_iter_advance(iter, iter->count); return 0; } @@ -1070,7 +1023,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) { unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; unsigned int max_append_sectors = queue_max_zone_append_sectors(q); struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; @@ -1121,41 +1074,40 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) * This takes either an iterator pointing to user memory, or one pointing to * kernel pages (BVEC iterator). If we're adding user pages, we pin them and * map them into the kernel. On IO completion, the caller should put those - * pages. If we're adding kernel pages, and the caller told us it's safe to - * do so, we just have to add the pages to the bio directly. We don't grab an - * extra reference to those pages (the user should already have that), and we - * don't put the page on IO completion. The caller needs to check if the bio is - * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be - * released. + * pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided + * bvecs rather than copying them. Hence anyone issuing kiocb based IO needs + * to ensure the bvecs and pages stay referenced until the submitted I/O is + * completed by a call to ->ki_complete() or returns with an error other than + * -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF + * on IO completion. If it isn't, then pages should be released. * * The function tries, but does not guarantee, to pin as many pages as * fit into the bio, or are requested in @iter, whatever is smaller. If * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. + * + * It's intended for direct IO, so doesn't do PSI tracking, the caller is + * responsible for setting BIO_WORKINGSET if necessary. */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { - const bool is_bvec = iov_iter_is_bvec(iter); - int ret; + int ret = 0; - if (WARN_ON_ONCE(bio->bi_vcnt)) - return -EINVAL; + if (iov_iter_is_bvec(iter)) { + if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) + return -EINVAL; + return bio_iov_bvec_set(bio, iter); + } do { - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - if (WARN_ON_ONCE(is_bvec)) - return -EINVAL; + if (bio_op(bio) == REQ_OP_ZONE_APPEND) ret = __bio_iov_append_get_pages(bio, iter); - } else { - if (is_bvec) - ret = __bio_iov_bvec_add_pages(bio, iter); - else - ret = __bio_iov_iter_get_pages(bio, iter); - } + else + ret = __bio_iov_iter_get_pages(bio, iter); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); - if (is_bvec) - bio_set_flag(bio, BIO_NO_PAGE_REF); + /* don't account direct I/O as memory stall */ + bio_clear_flag(bio, BIO_WORKINGSET); return bio->bi_vcnt ? 0 : ret; } EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); @@ -1178,7 +1130,8 @@ static void submit_bio_wait_endio(struct bio *bio) */ int submit_bio_wait(struct bio *bio) { - DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map); + DECLARE_COMPLETION_ONSTACK_MAP(done, + bio->bi_bdev->bd_disk->lockdep_map); unsigned long hang_check; bio->bi_private = &done; @@ -1455,8 +1408,8 @@ again: if (!bio_integrity_endio(bio)) return; - if (bio->bi_disk) - rq_qos_done_bio(bio->bi_disk->queue, bio); + if (bio->bi_bdev) + rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio); /* * Need to have a real endio function for chained bios, otherwise @@ -1471,8 +1424,8 @@ again: goto again; } - if (bio->bi_disk && bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_complete(bio->bi_disk->queue, bio); + if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { + trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio); bio_clear_flag(bio, BIO_TRACE_COMPLETION); } @@ -1559,7 +1512,7 @@ EXPORT_SYMBOL_GPL(bio_trim); */ int biovec_init_pool(mempool_t *pool, int pool_entries) { - struct biovec_slab *bp = bvec_slabs + BVEC_POOL_MAX; + struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1; return mempool_init_slab_pool(pool, pool_entries, bp->slab); } @@ -1612,15 +1565,17 @@ int bioset_init(struct bio_set *bs, unsigned int front_pad, int flags) { - unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); - bs->front_pad = front_pad; + if (flags & BIOSET_NEED_BVECS) + bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); + else + bs->back_pad = 0; spin_lock_init(&bs->rescue_lock); bio_list_init(&bs->rescue_list); INIT_WORK(&bs->rescue_work, bio_alloc_rescue); - bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); + bs->bio_slab = bio_find_or_create_slab(bs); if (!bs->bio_slab) return -ENOMEM; @@ -1663,39 +1618,19 @@ int bioset_init_from_src(struct bio_set *bs, struct bio_set *src) } EXPORT_SYMBOL(bioset_init_from_src); -static void __init biovec_init_slabs(void) +static int __init init_bio(void) { int i; - for (i = 0; i < BVEC_POOL_NR; i++) { - int size; - struct biovec_slab *bvs = bvec_slabs + i; + bio_integrity_init(); - if (bvs->nr_vecs <= BIO_INLINE_VECS) { - bvs->slab = NULL; - continue; - } + for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) { + struct biovec_slab *bvs = bvec_slabs + i; - size = bvs->nr_vecs * sizeof(struct bio_vec); - bvs->slab = kmem_cache_create(bvs->name, size, 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + bvs->slab = kmem_cache_create(bvs->name, + bvs->nr_vecs * sizeof(struct bio_vec), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); } -} - -static int __init init_bio(void) -{ - bio_slab_max = 2; - bio_slab_nr = 0; - bio_slabs = kcalloc(bio_slab_max, sizeof(struct bio_slab), - GFP_KERNEL); - - BUILD_BUG_ON(BIO_FLAG_LAST > BVEC_POOL_OFFSET); - - if (!bio_slabs) - panic("bio: can't allocate bios\n"); - - bio_integrity_init(); - biovec_init_slabs(); if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS)) panic("bio: can't allocate bios\n"); |