summaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to 'block')
-rw-r--r--block/Kconfig14
-rw-r--r--block/Kconfig.iosched6
-rw-r--r--block/Makefile4
-rw-r--r--block/bfq-iosched.c17
-rw-r--r--block/bfq-iosched.h6
-rw-r--r--block/bfq-wf2q.c6
-rw-r--r--block/bio-integrity.c21
-rw-r--r--block/bio.c231
-rw-r--r--block/blk-cgroup.c153
-rw-r--r--block/blk-core.c21
-rw-r--r--block/blk-crypto.c2
-rw-r--r--block/blk-flush.c13
-rw-r--r--block/blk-integrity.c12
-rw-r--r--block/blk-iocost.c42
-rw-r--r--block/blk-iolatency.c44
-rw-r--r--block/blk-map.c2
-rw-r--r--block/blk-merge.c20
-rw-r--r--block/blk-mq-sched.c17
-rw-r--r--block/blk-mq-sysfs.c55
-rw-r--r--block/blk-mq.c48
-rw-r--r--block/blk-settings.c34
-rw-r--r--block/blk-sysfs.c35
-rw-r--r--block/blk-throttle.c32
-rw-r--r--block/blk-wbt.c8
-rw-r--r--block/blk-zoned.c6
-rw-r--r--block/blk.h33
-rw-r--r--block/bounce.c39
-rw-r--r--block/cmdline-parser.c255
-rw-r--r--block/disk-events.c69
-rw-r--r--block/elevator.c7
-rw-r--r--block/genhd.c386
-rw-r--r--block/holder.c174
-rw-r--r--block/ioctl.c16
-rw-r--r--block/ioprio.c9
-rw-r--r--block/kyber-iosched.c2
-rw-r--r--block/mq-deadline-cgroup.c126
-rw-r--r--block/mq-deadline-cgroup.h114
-rw-r--r--block/mq-deadline.c (renamed from block/mq-deadline-main.c)121
-rw-r--r--block/partitions/Kconfig1
-rw-r--r--block/partitions/acorn.c4
-rw-r--r--block/partitions/aix.c20
-rw-r--r--block/partitions/amiga.c7
-rw-r--r--block/partitions/atari.c4
-rw-r--r--block/partitions/check.h2
-rw-r--r--block/partitions/cmdline.c273
-rw-r--r--block/partitions/core.c73
-rw-r--r--block/partitions/efi.c48
-rw-r--r--block/partitions/ibm.c4
-rw-r--r--block/partitions/ldm.c20
-rw-r--r--block/partitions/mac.c2
-rw-r--r--block/partitions/msdos.c6
-rw-r--r--block/partitions/sgi.c5
-rw-r--r--block/partitions/sun.c5
-rw-r--r--block/t10-pi.c16
54 files changed, 1295 insertions, 1395 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 97c1d999b920..8e28ae7718bd 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -94,16 +94,6 @@ config BLK_DEV_THROTTLING_LOW
Note, this is an experimental interface and could be changed someday.
-config BLK_CMDLINE_PARSER
- bool "Block device command line partition parser"
- help
- Enabling this option allows you to specify the partition layout from
- the kernel boot args. This is typically of use for embedded devices
- which don't otherwise have any standardized method for listing the
- partitions on a block device.
-
- See Documentation/block/cmdline-partition.rst for more information.
-
config BLK_WBT
bool "Enable support for block device writeback throttling"
help
@@ -231,4 +221,8 @@ config BLK_MQ_RDMA
config BLK_PM
def_bool BLOCK && PM
+# do not use in new code
+config BLOCK_HOLDER_DEPRECATED
+ bool
+
source "block/Kconfig.iosched"
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 64053d67a97b..2f2158e05a91 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -9,12 +9,6 @@ config MQ_IOSCHED_DEADLINE
help
MQ version of the deadline IO scheduler.
-config MQ_IOSCHED_DEADLINE_CGROUP
- tristate
- default y
- depends on MQ_IOSCHED_DEADLINE
- depends on BLK_CGROUP
-
config MQ_IOSCHED_KYBER
tristate "Kyber I/O scheduler"
default y
diff --git a/block/Makefile b/block/Makefile
index 640afba070fd..6cf40276d3cf 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -21,13 +21,10 @@ obj-$(CONFIG_BLK_CGROUP_IOPRIO) += blk-ioprio.o
obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o
obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
-mq-deadline-y += mq-deadline-main.o
-mq-deadline-$(CONFIG_MQ_IOSCHED_DEADLINE_CGROUP)+= mq-deadline-cgroup.o
obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
obj-$(CONFIG_IOSCHED_BFQ) += bfq.o
-obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o
obj-$(CONFIG_BLK_DEV_INTEGRITY_T10) += t10-pi.o
obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o
@@ -41,3 +38,4 @@ obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
obj-$(CONFIG_BLK_PM) += blk-pm.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o blk-crypto.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o
+obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 727955918563..480e1a134859 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -2361,6 +2361,9 @@ static int bfq_request_merge(struct request_queue *q, struct request **req,
__rq = bfq_find_rq_fmerge(bfqd, bio, q);
if (__rq && elv_bio_merge_ok(__rq, bio)) {
*req = __rq;
+
+ if (blk_discard_mergable(__rq))
+ return ELEVATOR_DISCARD_MERGE;
return ELEVATOR_FRONT_MERGE;
}
@@ -2505,7 +2508,7 @@ void bfq_end_wr_async_queues(struct bfq_data *bfqd,
int i, j;
for (i = 0; i < 2; i++)
- for (j = 0; j < IOPRIO_BE_NR; j++)
+ for (j = 0; j < IOPRIO_NR_LEVELS; j++)
if (bfqg->async_bfqq[i][j])
bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
if (bfqg->async_idle_bfqq)
@@ -5266,8 +5269,8 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
switch (ioprio_class) {
default:
pr_err("bdi %s: bfq: bad prio class %d\n",
- bdi_dev_name(bfqq->bfqd->queue->backing_dev_info),
- ioprio_class);
+ bdi_dev_name(bfqq->bfqd->queue->disk->bdi),
+ ioprio_class);
fallthrough;
case IOPRIO_CLASS_NONE:
/*
@@ -5290,10 +5293,10 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
break;
}
- if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
+ if (bfqq->new_ioprio >= IOPRIO_NR_LEVELS) {
pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
bfqq->new_ioprio);
- bfqq->new_ioprio = IOPRIO_BE_NR;
+ bfqq->new_ioprio = IOPRIO_NR_LEVELS - 1;
}
bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
@@ -5408,7 +5411,7 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
case IOPRIO_CLASS_RT:
return &bfqg->async_bfqq[0][ioprio];
case IOPRIO_CLASS_NONE:
- ioprio = IOPRIO_NORM;
+ ioprio = IOPRIO_BE_NORM;
fallthrough;
case IOPRIO_CLASS_BE:
return &bfqg->async_bfqq[1][ioprio];
@@ -6822,7 +6825,7 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
int i, j;
for (i = 0; i < 2; i++)
- for (j = 0; j < IOPRIO_BE_NR; j++)
+ for (j = 0; j < IOPRIO_NR_LEVELS; j++)
__bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 99c2a3cb081e..a73488eec8a4 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -931,7 +931,7 @@ struct bfq_group {
void *bfqd;
- struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
+ struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS];
struct bfq_queue *async_idle_bfqq;
struct bfq_entity *my_entity;
@@ -948,15 +948,13 @@ struct bfq_group {
struct bfq_entity entity;
struct bfq_sched_data sched_data;
- struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
+ struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS];
struct bfq_queue *async_idle_bfqq;
struct rb_root rq_pos_tree;
};
#endif
-struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
-
/* --------------- main algorithm interface ----------------- */
#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 7a462df71f68..b74cc0da118e 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -505,7 +505,7 @@ static void bfq_active_insert(struct bfq_service_tree *st,
*/
unsigned short bfq_ioprio_to_weight(int ioprio)
{
- return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
+ return (IOPRIO_NR_LEVELS - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
}
/**
@@ -514,12 +514,12 @@ unsigned short bfq_ioprio_to_weight(int ioprio)
*
* To preserve as much as possible the old only-ioprio user interface,
* 0 is used as an escape ioprio value for weights (numerically) equal or
- * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
+ * larger than IOPRIO_NR_LEVELS * BFQ_WEIGHT_CONVERSION_COEFF.
*/
static unsigned short bfq_weight_to_ioprio(int weight)
{
return max_t(int, 0,
- IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight);
+ IOPRIO_NR_LEVELS * BFQ_WEIGHT_CONVERSION_COEFF - weight);
}
static void bfq_get_entity(struct bfq_entity *entity)
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 4b4eb8964a6f..6b47cddbbca1 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -104,8 +104,7 @@ void bio_integrity_free(struct bio *bio)
struct bio_set *bs = bio->bi_pool;
if (bip->bip_flags & BIP_BLOCK_INTEGRITY)
- kfree(page_address(bip->bip_vec->bv_page) +
- bip->bip_vec->bv_offset);
+ kfree(bvec_virt(bip->bip_vec));
__bio_integrity_free(bs, bip);
bio->bi_integrity = NULL;
@@ -163,27 +162,23 @@ static blk_status_t bio_integrity_process(struct bio *bio,
struct bio_vec bv;
struct bio_integrity_payload *bip = bio_integrity(bio);
blk_status_t ret = BLK_STS_OK;
- void *prot_buf = page_address(bip->bip_vec->bv_page) +
- bip->bip_vec->bv_offset;
iter.disk_name = bio->bi_bdev->bd_disk->disk_name;
iter.interval = 1 << bi->interval_exp;
iter.seed = proc_iter->bi_sector;
- iter.prot_buf = prot_buf;
+ iter.prot_buf = bvec_virt(bip->bip_vec);
__bio_for_each_segment(bv, bio, bviter, *proc_iter) {
- void *kaddr = kmap_atomic(bv.bv_page);
+ void *kaddr = bvec_kmap_local(&bv);
- iter.data_buf = kaddr + bv.bv_offset;
+ iter.data_buf = kaddr;
iter.data_size = bv.bv_len;
-
ret = proc_fn(&iter);
- if (ret) {
- kunmap_atomic(kaddr);
- return ret;
- }
+ kunmap_local(kaddr);
+
+ if (ret)
+ break;
- kunmap_atomic(kaddr);
}
return ret;
}
diff --git a/block/bio.c b/block/bio.c
index 1fab762e079b..e16849f46b0e 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -25,6 +25,11 @@
#include "blk.h"
#include "blk-rq-qos.h"
+struct bio_alloc_cache {
+ struct bio_list free_list;
+ unsigned int nr;
+};
+
static struct biovec_slab {
int nr_vecs;
char *name;
@@ -246,12 +251,40 @@ static void bio_free(struct bio *bio)
void bio_init(struct bio *bio, struct bio_vec *table,
unsigned short max_vecs)
{
- memset(bio, 0, sizeof(*bio));
+ bio->bi_next = NULL;
+ bio->bi_bdev = NULL;
+ bio->bi_opf = 0;
+ bio->bi_flags = 0;
+ bio->bi_ioprio = 0;
+ bio->bi_write_hint = 0;
+ bio->bi_status = 0;
+ bio->bi_iter.bi_sector = 0;
+ bio->bi_iter.bi_size = 0;
+ bio->bi_iter.bi_idx = 0;
+ bio->bi_iter.bi_bvec_done = 0;
+ bio->bi_end_io = NULL;
+ bio->bi_private = NULL;
+#ifdef CONFIG_BLK_CGROUP
+ bio->bi_blkg = NULL;
+ bio->bi_issue.value = 0;
+#ifdef CONFIG_BLK_CGROUP_IOCOST
+ bio->bi_iocost_cost = 0;
+#endif
+#endif
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+ bio->bi_crypt_context = NULL;
+#endif
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+ bio->bi_integrity = NULL;
+#endif
+ bio->bi_vcnt = 0;
+
atomic_set(&bio->__bi_remaining, 1);
atomic_set(&bio->__bi_cnt, 1);
- bio->bi_io_vec = table;
bio->bi_max_vecs = max_vecs;
+ bio->bi_io_vec = table;
+ bio->bi_pool = NULL;
}
EXPORT_SYMBOL(bio_init);
@@ -495,16 +528,11 @@ EXPORT_SYMBOL(bio_kmalloc);
void zero_fill_bio(struct bio *bio)
{
- unsigned long flags;
struct bio_vec bv;
struct bvec_iter iter;
- bio_for_each_segment(bv, bio, iter) {
- char *data = bvec_kmap_irq(&bv, &flags);
- memset(data, 0, bv.bv_len);
- flush_dcache_page(bv.bv_page);
- bvec_kunmap_irq(data, &flags);
- }
+ bio_for_each_segment(bv, bio, iter)
+ memzero_bvec(&bv);
}
EXPORT_SYMBOL(zero_fill_bio);
@@ -591,6 +619,53 @@ void guard_bio_eod(struct bio *bio)
bio_truncate(bio, maxsector << 9);
}
+#define ALLOC_CACHE_MAX 512
+#define ALLOC_CACHE_SLACK 64
+
+static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
+ unsigned int nr)
+{
+ unsigned int i = 0;
+ struct bio *bio;
+
+ while ((bio = bio_list_pop(&cache->free_list)) != NULL) {
+ cache->nr--;
+ bio_free(bio);
+ if (++i == nr)
+ break;
+ }
+}
+
+static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node)
+{
+ struct bio_set *bs;
+
+ bs = hlist_entry_safe(node, struct bio_set, cpuhp_dead);
+ if (bs->cache) {
+ struct bio_alloc_cache *cache = per_cpu_ptr(bs->cache, cpu);
+
+ bio_alloc_cache_prune(cache, -1U);
+ }
+ return 0;
+}
+
+static void bio_alloc_cache_destroy(struct bio_set *bs)
+{
+ int cpu;
+
+ if (!bs->cache)
+ return;
+
+ cpuhp_state_remove_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead);
+ for_each_possible_cpu(cpu) {
+ struct bio_alloc_cache *cache;
+
+ cache = per_cpu_ptr(bs->cache, cpu);
+ bio_alloc_cache_prune(cache, -1U);
+ }
+ free_percpu(bs->cache);
+}
+
/**
* bio_put - release a reference to a bio
* @bio: bio to release reference to
@@ -601,16 +676,23 @@ void guard_bio_eod(struct bio *bio)
**/
void bio_put(struct bio *bio)
{
- if (!bio_flagged(bio, BIO_REFFED))
- bio_free(bio);
- else {
+ if (unlikely(bio_flagged(bio, BIO_REFFED))) {
BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
+ if (!atomic_dec_and_test(&bio->__bi_cnt))
+ return;
+ }
- /*
- * last put frees it
- */
- if (atomic_dec_and_test(&bio->__bi_cnt))
- bio_free(bio);
+ if (bio_flagged(bio, BIO_PERCPU_CACHE)) {
+ struct bio_alloc_cache *cache;
+
+ bio_uninit(bio);
+ cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
+ bio_list_add_head(&cache->free_list, bio);
+ if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK)
+ bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK);
+ put_cpu();
+ } else {
+ bio_free(bio);
}
}
EXPORT_SYMBOL(bio_put);
@@ -979,6 +1061,14 @@ static int bio_iov_bvec_set_append(struct bio *bio, struct iov_iter *iter)
return 0;
}
+static void bio_put_pages(struct page **pages, size_t size, size_t off)
+{
+ size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
+
+ for (i = 0; i < nr; i++)
+ put_page(pages[i]);
+}
+
#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *))
/**
@@ -1023,8 +1113,10 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
if (same_page)
put_page(page);
} else {
- if (WARN_ON_ONCE(bio_full(bio, len)))
- return -EINVAL;
+ if (WARN_ON_ONCE(bio_full(bio, len))) {
+ bio_put_pages(pages + i, left, offset);
+ return -EINVAL;
+ }
__bio_add_page(bio, page, len, offset);
}
offset = 0;
@@ -1069,6 +1161,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
len = min_t(size_t, PAGE_SIZE - offset, left);
if (bio_add_hw_page(q, bio, page, len, offset,
max_append_sectors, &same_page) != len) {
+ bio_put_pages(pages + i, left, offset);
ret = -EINVAL;
break;
}
@@ -1191,27 +1284,15 @@ EXPORT_SYMBOL(bio_advance);
void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
struct bio *src, struct bvec_iter *src_iter)
{
- struct bio_vec src_bv, dst_bv;
- void *src_p, *dst_p;
- unsigned bytes;
-
while (src_iter->bi_size && dst_iter->bi_size) {
- src_bv = bio_iter_iovec(src, *src_iter);
- dst_bv = bio_iter_iovec(dst, *dst_iter);
-
- bytes = min(src_bv.bv_len, dst_bv.bv_len);
-
- src_p = kmap_atomic(src_bv.bv_page);
- dst_p = kmap_atomic(dst_bv.bv_page);
-
- memcpy(dst_p + dst_bv.bv_offset,
- src_p + src_bv.bv_offset,
- bytes);
+ struct bio_vec src_bv = bio_iter_iovec(src, *src_iter);
+ struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter);
+ unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len);
+ void *src_buf;
- kunmap_atomic(dst_p);
- kunmap_atomic(src_p);
-
- flush_dcache_page(dst_bv.bv_page);
+ src_buf = bvec_kmap_local(&src_bv);
+ memcpy_to_bvec(&dst_bv, src_buf);
+ kunmap_local(src_buf);
bio_advance_iter_single(src, src_iter, bytes);
bio_advance_iter_single(dst, dst_iter, bytes);
@@ -1463,12 +1544,15 @@ EXPORT_SYMBOL(bio_split);
* @bio: bio to trim
* @offset: number of sectors to trim from the front of @bio
* @size: size we want to trim @bio to, in sectors
+ *
+ * This function is typically used for bios that are cloned and submitted
+ * to the underlying device in parts.
*/
-void bio_trim(struct bio *bio, int offset, int size)
+void bio_trim(struct bio *bio, sector_t offset, sector_t size)
{
- /* 'bio' is a cloned bio which we need to trim to match
- * the given offset and size.
- */
+ if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS ||
+ offset + size > bio->bi_iter.bi_size))
+ return;
size <<= 9;
if (offset == 0 && size == bio->bi_iter.bi_size)
@@ -1479,7 +1563,6 @@ void bio_trim(struct bio *bio, int offset, int size)
if (bio_integrity(bio))
bio_integrity_trim(bio);
-
}
EXPORT_SYMBOL_GPL(bio_trim);
@@ -1502,6 +1585,7 @@ int biovec_init_pool(mempool_t *pool, int pool_entries)
*/
void bioset_exit(struct bio_set *bs)
{
+ bio_alloc_cache_destroy(bs);
if (bs->rescue_workqueue)
destroy_workqueue(bs->rescue_workqueue);
bs->rescue_workqueue = NULL;
@@ -1563,12 +1647,18 @@ int bioset_init(struct bio_set *bs,
biovec_init_pool(&bs->bvec_pool, pool_size))
goto bad;
- if (!(flags & BIOSET_NEED_RESCUER))
- return 0;
-
- bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
- if (!bs->rescue_workqueue)
- goto bad;
+ if (flags & BIOSET_NEED_RESCUER) {
+ bs->rescue_workqueue = alloc_workqueue("bioset",
+ WQ_MEM_RECLAIM, 0);
+ if (!bs->rescue_workqueue)
+ goto bad;
+ }
+ if (flags & BIOSET_PERCPU_CACHE) {
+ bs->cache = alloc_percpu(struct bio_alloc_cache);
+ if (!bs->cache)
+ goto bad;
+ cpuhp_state_add_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead);
+ }
return 0;
bad:
@@ -1595,6 +1685,46 @@ int bioset_init_from_src(struct bio_set *bs, struct bio_set *src)
}
EXPORT_SYMBOL(bioset_init_from_src);
+/**
+ * bio_alloc_kiocb - Allocate a bio from bio_set based on kiocb
+ * @kiocb: kiocb describing the IO
+ * @nr_iovecs: number of iovecs to pre-allocate
+ * @bs: bio_set to allocate from
+ *
+ * Description:
+ * Like @bio_alloc_bioset, but pass in the kiocb. The kiocb is only
+ * used to check if we should dip into the per-cpu bio_set allocation
+ * cache. The allocation uses GFP_KERNEL internally. On return, the
+ * bio is marked BIO_PERCPU_CACHEABLE, and the final put of the bio
+ * MUST be done from process context, not hard/soft IRQ.
+ *
+ */
+struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs,
+ struct bio_set *bs)
+{
+ struct bio_alloc_cache *cache;
+ struct bio *bio;
+
+ if (!(kiocb->ki_flags & IOCB_ALLOC_CACHE) || nr_vecs > BIO_INLINE_VECS)
+ return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs);
+
+ cache = per_cpu_ptr(bs->cache, get_cpu());
+ bio = bio_list_pop(&cache->free_list);
+ if (bio) {
+ cache->nr--;
+ put_cpu();
+ bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs);
+ bio->bi_pool = bs;
+ bio_set_flag(bio, BIO_PERCPU_CACHE);
+ return bio;
+ }
+ put_cpu();
+ bio = bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs);
+ bio_set_flag(bio, BIO_PERCPU_CACHE);
+ return bio;
+}
+EXPORT_SYMBOL_GPL(bio_alloc_kiocb);
+
static int __init init_bio(void)
{
int i;
@@ -1609,6 +1739,9 @@ static int __init init_bio(void)
SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
}
+ cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL,
+ bio_cpu_dead);
+
if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS))
panic("bio: can't allocate bios\n");
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 575d7a2e7203..3c88a79a319b 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -489,10 +489,9 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
const char *blkg_dev_name(struct blkcg_gq *blkg)
{
- /* some drivers (floppy) instantiate a queue w/o disk registered */
- if (blkg->q->backing_dev_info->dev)
- return bdi_dev_name(blkg->q->backing_dev_info);
- return NULL;
+ if (!blkg->q->disk || !blkg->q->disk->bdi->dev)
+ return NULL;
+ return bdi_dev_name(blkg->q->disk->bdi);
}
/**
@@ -790,6 +789,7 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
struct blkcg_gq *parent = blkg->parent;
struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
struct blkg_iostat cur, delta;
+ unsigned long flags;
unsigned int seq;
/* fetch the current per-cpu values */
@@ -799,21 +799,21 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
} while (u64_stats_fetch_retry(&bisc->sync, seq));
/* propagate percpu delta to global */
- u64_stats_update_begin(&blkg->iostat.sync);
+ flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
blkg_iostat_set(&delta, &cur);
blkg_iostat_sub(&delta, &bisc->last);
blkg_iostat_add(&blkg->iostat.cur, &delta);
blkg_iostat_add(&bisc->last, &delta);
- u64_stats_update_end(&blkg->iostat.sync);
+ u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
/* propagate global delta to parent (unless that's root) */
if (parent && parent->parent) {
- u64_stats_update_begin(&parent->iostat.sync);
+ flags = u64_stats_update_begin_irqsave(&parent->iostat.sync);
blkg_iostat_set(&delta, &blkg->iostat.cur);
blkg_iostat_sub(&delta, &blkg->iostat.last);
blkg_iostat_add(&parent->iostat.cur, &delta);
blkg_iostat_add(&blkg->iostat.last, &delta);
- u64_stats_update_end(&parent->iostat.sync);
+ u64_stats_update_end_irqrestore(&parent->iostat.sync, flags);
}
}
@@ -848,6 +848,7 @@ static void blkcg_fill_root_iostats(void)
memset(&tmp, 0, sizeof(tmp));
for_each_possible_cpu(cpu) {
struct disk_stats *cpu_dkstats;
+ unsigned long flags;
cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu);
tmp.ios[BLKG_IOSTAT_READ] +=
@@ -864,104 +865,86 @@ static void blkcg_fill_root_iostats(void)
tmp.bytes[BLKG_IOSTAT_DISCARD] +=
cpu_dkstats->sectors[STAT_DISCARD] << 9;
- u64_stats_update_begin(&blkg->iostat.sync);
+ flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
blkg_iostat_set(&blkg->iostat.cur, &tmp);
- u64_stats_update_end(&blkg->iostat.sync);
+ u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}
}
}
-static int blkcg_print_stat(struct seq_file *sf, void *v)
+static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s)
{
- struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
- struct blkcg_gq *blkg;
-
- if (!seq_css(sf)->parent)
- blkcg_fill_root_iostats();
- else
- cgroup_rstat_flush(blkcg->css.cgroup);
-
- rcu_read_lock();
-
- hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
- struct blkg_iostat_set *bis = &blkg->iostat;
- const char *dname;
- char *buf;
- u64 rbytes, wbytes, rios, wios, dbytes, dios;
- size_t size = seq_get_buf(sf, &buf), off = 0;
- int i;
- bool has_stats = false;
- unsigned seq;
+ struct blkg_iostat_set *bis = &blkg->iostat;
+ u64 rbytes, wbytes, rios, wios, dbytes, dios;
+ bool has_stats = false;
+ const char *dname;
+ unsigned seq;
+ int i;
- spin_lock_irq(&blkg->q->queue_lock);
+ if (!blkg->online)
+ return;
- if (!blkg->online)
- goto skip;
+ dname = blkg_dev_name(blkg);
+ if (!dname)
+ return;
- dname = blkg_dev_name(blkg);
- if (!dname)
- goto skip;
+ seq_printf(s, "%s ", dname);
- /*
- * Hooray string manipulation, count is the size written NOT
- * INCLUDING THE \0, so size is now count+1 less than what we
- * had before, but we want to start writing the next bit from
- * the \0 so we only add count to buf.
- */
- off += scnprintf(buf+off, size-off, "%s ", dname);
+ do {
+ seq = u64_stats_fetch_begin(&bis->sync);
+
+ rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
+ wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
+ dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
+ rios = bis->cur.ios[BLKG_IOSTAT_READ];
+ wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
+ dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
+ } while (u64_stats_fetch_retry(&bis->sync, seq));
+
+ if (rbytes || wbytes || rios || wios) {
+ has_stats = true;
+ seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
+ rbytes, wbytes, rios, wios,
+ dbytes, dios);
+ }
- do {
- seq = u64_stats_fetch_begin(&bis->sync);
+ if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
+ has_stats = true;
+ seq_printf(s, " use_delay=%d delay_nsec=%llu",
+ atomic_read(&blkg->use_delay),
+ atomic64_read(&blkg->delay_nsec));
+ }
- rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
- wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
- dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
- rios = bis->cur.ios[BLKG_IOSTAT_READ];
- wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
- dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
- } while (u64_stats_fetch_retry(&bis->sync, seq));
+ for (i = 0; i < BLKCG_MAX_POLS; i++) {
+ struct blkcg_policy *pol = blkcg_policy[i];
- if (rbytes || wbytes || rios || wios) {
- has_stats = true;
- off += scnprintf(buf+off, size-off,
- "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
- rbytes, wbytes, rios, wios,
- dbytes, dios);
- }
+ if (!blkg->pd[i] || !pol->pd_stat_fn)
+ continue;
- if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
+ if (pol->pd_stat_fn(blkg->pd[i], s))
has_stats = true;
- off += scnprintf(buf+off, size-off,
- " use_delay=%d delay_nsec=%llu",
- atomic_read(&blkg->use_delay),
- (unsigned long long)atomic64_read(&blkg->delay_nsec));
- }
+ }
- for (i = 0; i < BLKCG_MAX_POLS; i++) {
- struct blkcg_policy *pol = blkcg_policy[i];
- size_t written;
+ if (has_stats)
+ seq_printf(s, "\n");
+}
- if (!blkg->pd[i] || !pol->pd_stat_fn)
- continue;
+static int blkcg_print_stat(struct seq_file *sf, void *v)
+{
+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+ struct blkcg_gq *blkg;
- written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
- if (written)
- has_stats = true;
- off += written;
- }
+ if (!seq_css(sf)->parent)
+ blkcg_fill_root_iostats();
+ else
+ cgroup_rstat_flush(blkcg->css.cgroup);
- if (has_stats) {
- if (off < size - 1) {
- off += scnprintf(buf+off, size-off, "\n");
- seq_commit(sf, off);
- } else {
- seq_commit(sf, -1);
- }
- }
- skip:
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+ spin_lock_irq(&blkg->q->queue_lock);
+ blkcg_print_one_stat(blkg, sf);
spin_unlock_irq(&blkg->q->queue_lock);
}
-
rcu_read_unlock();
return 0;
}
diff --git a/block/blk-core.c b/block/blk-core.c
index 04477697ee4b..5454db2fa263 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -14,7 +14,6 @@
*/
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
@@ -122,7 +121,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->internal_tag = BLK_MQ_NO_TAG;
rq->start_time_ns = ktime_get_ns();
rq->part = NULL;
- refcount_set(&rq->ref, 1);
blk_crypto_rq_set_defaults(rq);
}
EXPORT_SYMBOL(blk_rq_init);
@@ -394,10 +392,7 @@ void blk_cleanup_queue(struct request_queue *q)
/* for synchronous bio-based driver finish in-flight integrity i/o */
blk_flush_integrity();
- /* @q won't process any more request, flush async actions */
- del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer);
blk_sync_queue(q);
-
if (queue_is_mq(q))
blk_mq_exit_queue(q);
@@ -534,20 +529,14 @@ struct request_queue *blk_alloc_queue(int node_id)
if (ret)
goto fail_id;
- q->backing_dev_info = bdi_alloc(node_id);
- if (!q->backing_dev_info)
- goto fail_split;
-
q->stats = blk_alloc_queue_stats();
if (!q->stats)
- goto fail_stats;
+ goto fail_split;
q->node = node_id;
atomic_set(&q->nr_active_requests_shared_sbitmap, 0);
- timer_setup(&q->backing_dev_info->laptop_mode_wb_timer,
- laptop_mode_timer_fn, 0);
timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
INIT_WORK(&q->timeout_work, blk_timeout_work);
INIT_LIST_HEAD(&q->icq_list);
@@ -572,7 +561,7 @@ struct request_queue *blk_alloc_queue(int node_id)
if (percpu_ref_init(&q->q_usage_counter,
blk_queue_usage_counter_release,
PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
- goto fail_bdi;
+ goto fail_stats;
if (blkcg_init_queue(q))
goto fail_ref;
@@ -585,10 +574,8 @@ struct request_queue *blk_alloc_queue(int node_id)
fail_ref:
percpu_ref_exit(&q->q_usage_counter);
-fail_bdi:
- blk_free_queue_stats(q->stats);
fail_stats:
- bdi_put(q->backing_dev_info);
+ blk_free_queue_stats(q->stats);
fail_split:
bioset_exit(&q->bio_split);
fail_id:
@@ -834,7 +821,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
}
if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
- bio->bi_opf &= ~REQ_HIPRI;
+ bio_clear_hipri(bio);
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
diff --git a/block/blk-crypto.c b/block/blk-crypto.c
index c5bdaafffa29..103c2e2d50d6 100644
--- a/block/blk-crypto.c
+++ b/block/blk-crypto.c
@@ -332,7 +332,7 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
if (mode->keysize == 0)
return -EINVAL;
- if (dun_bytes == 0 || dun_bytes > BLK_CRYPTO_MAX_IV_SIZE)
+ if (dun_bytes == 0 || dun_bytes > mode->ivsize)
return -EINVAL;
if (!is_power_of_2(data_unit_size))
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 1002f6c58181..4201728bf3a5 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -262,6 +262,11 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
}
+bool is_flush_rq(struct request *rq)
+{
+ return rq->end_io == flush_end_io;
+}
+
/**
* blk_kick_flush - consider issuing flush request
* @q: request_queue being kicked
@@ -329,6 +334,14 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
flush_rq->rq_flags |= RQF_FLUSH_SEQ;
flush_rq->rq_disk = first_rq->rq_disk;
flush_rq->end_io = flush_end_io;
+ /*
+ * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one
+ * implied in refcount_inc_not_zero() called from
+ * blk_mq_find_and_get_req(), which orders WRITE/READ flush_rq->ref
+ * and READ flush_rq->end_io
+ */
+ smp_wmb();
+ refcount_set(&flush_rq->ref, 1);
blk_flush_queue_rq(flush_rq, false);
}
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 410da060d1f5..69a12177dfb6 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -431,13 +431,15 @@ void blk_integrity_unregister(struct gendisk *disk)
}
EXPORT_SYMBOL(blk_integrity_unregister);
-void blk_integrity_add(struct gendisk *disk)
+int blk_integrity_add(struct gendisk *disk)
{
- if (kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype,
- &disk_to_dev(disk)->kobj, "%s", "integrity"))
- return;
+ int ret;
- kobject_uevent(&disk->integrity_kobj, KOBJ_ADD);
+ ret = kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype,
+ &disk_to_dev(disk)->kobj, "%s", "integrity");
+ if (!ret)
+ kobject_uevent(&disk->integrity_kobj, KOBJ_ADD);
+ return ret;
}
void blk_integrity_del(struct gendisk *disk)
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index c2d6bc88d3f1..b3880e4ba22a 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -1440,16 +1440,17 @@ static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
return -1;
iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost);
+ wait->committed = true;
/*
* autoremove_wake_function() removes the wait entry only when it
- * actually changed the task state. We want the wait always
- * removed. Remove explicitly and use default_wake_function().
+ * actually changed the task state. We want the wait always removed.
+ * Remove explicitly and use default_wake_function(). Note that the
+ * order of operations is important as finish_wait() tests whether
+ * @wq_entry is removed without grabbing the lock.
*/
- list_del_init(&wq_entry->entry);
- wait->committed = true;
-
default_wake_function(wq_entry, mode, flags, key);
+ list_del_init_careful(&wq_entry->entry);
return 0;
}
@@ -2987,34 +2988,29 @@ static void ioc_pd_free(struct blkg_policy_data *pd)
kfree(iocg);
}
-static size_t ioc_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size)
+static bool ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
{
struct ioc_gq *iocg = pd_to_iocg(pd);
struct ioc *ioc = iocg->ioc;
- size_t pos = 0;
if (!ioc->enabled)
- return 0;
+ return false;
if (iocg->level == 0) {
unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
ioc->vtime_base_rate * 10000,
VTIME_PER_USEC);
- pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u",
- vp10k / 100, vp10k % 100);
+ seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100);
}
- pos += scnprintf(buf + pos, size - pos, " cost.usage=%llu",
- iocg->last_stat.usage_us);
+ seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us);
if (blkcg_debug_stats)
- pos += scnprintf(buf + pos, size - pos,
- " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
- iocg->last_stat.wait_us,
- iocg->last_stat.indebt_us,
- iocg->last_stat.indelay_us);
-
- return pos;
+ seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
+ iocg->last_stat.wait_us,
+ iocg->last_stat.indebt_us,
+ iocg->last_stat.indelay_us);
+ return true;
}
static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
@@ -3060,19 +3056,19 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
return -EINVAL;
- spin_lock(&blkcg->lock);
+ spin_lock_irq(&blkcg->lock);
iocc->dfl_weight = v * WEIGHT_ONE;
hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
struct ioc_gq *iocg = blkg_to_iocg(blkg);
if (iocg) {
- spin_lock_irq(&iocg->ioc->lock);
+ spin_lock(&iocg->ioc->lock);
ioc_now(iocg->ioc, &now);
weight_updated(iocg, &now);
- spin_unlock_irq(&iocg->ioc->lock);
+ spin_unlock(&iocg->ioc->lock);
}
}
- spin_unlock(&blkcg->lock);
+ spin_unlock_irq(&blkcg->lock);
return nbytes;
}
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 81be0096411d..c0545f9da549 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -833,7 +833,11 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
enable = iolatency_set_min_lat_nsec(blkg, lat_val);
if (enable) {
- WARN_ON_ONCE(!blk_get_queue(blkg->q));
+ if (!blk_get_queue(blkg->q)) {
+ ret = -ENODEV;
+ goto out;
+ }
+
blkg_get(blkg);
}
@@ -886,8 +890,7 @@ static int iolatency_print_limit(struct seq_file *sf, void *v)
return 0;
}
-static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf,
- size_t size)
+static bool iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s)
{
struct latency_stat stat;
int cpu;
@@ -902,39 +905,40 @@ static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf,
preempt_enable();
if (iolat->rq_depth.max_depth == UINT_MAX)
- return scnprintf(buf, size, " missed=%llu total=%llu depth=max",
- (unsigned long long)stat.ps.missed,
- (unsigned long long)stat.ps.total);
- return scnprintf(buf, size, " missed=%llu total=%llu depth=%u",
- (unsigned long long)stat.ps.missed,
- (unsigned long long)stat.ps.total,
- iolat->rq_depth.max_depth);
+ seq_printf(s, " missed=%llu total=%llu depth=max",
+ (unsigned long long)stat.ps.missed,
+ (unsigned long long)stat.ps.total);
+ else
+ seq_printf(s, " missed=%llu total=%llu depth=%u",
+ (unsigned long long)stat.ps.missed,
+ (unsigned long long)stat.ps.total,
+ iolat->rq_depth.max_depth);
+ return true;
}
-static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
- size_t size)
+static bool iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
{
struct iolatency_grp *iolat = pd_to_lat(pd);
unsigned long long avg_lat;
unsigned long long cur_win;
if (!blkcg_debug_stats)
- return 0;
+ return false;
if (iolat->ssd)
- return iolatency_ssd_stat(iolat, buf, size);
+ return iolatency_ssd_stat(iolat, s);
avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
if (iolat->rq_depth.max_depth == UINT_MAX)
- return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu",
- avg_lat, cur_win);
-
- return scnprintf(buf, size, " depth=%u avg_lat=%llu win=%llu",
- iolat->rq_depth.max_depth, avg_lat, cur_win);
+ seq_printf(s, " depth=max avg_lat=%llu win=%llu",
+ avg_lat, cur_win);
+ else
+ seq_printf(s, " depth=%u avg_lat=%llu win=%llu",
+ iolat->rq_depth.max_depth, avg_lat, cur_win);
+ return true;
}
-
static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp,
struct request_queue *q,
struct blkcg *blkcg)
diff --git a/block/blk-map.c b/block/blk-map.c
index 3743158ddaeb..d1448aaad980 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -400,7 +400,7 @@ static void bio_copy_kern_endio_read(struct bio *bio)
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, iter_all) {
- memcpy(p, page_address(bvec->bv_page), bvec->bv_len);
+ memcpy_from_bvec(p, bvec);
p += bvec->bv_len;
}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index a11b3b53717e..7a5c81c02c80 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -285,7 +285,7 @@ split:
* iopoll in direct IO routine. Given performance gain of iopoll for
* big IO can be trival, disable iopoll when split needed.
*/
- bio->bi_opf &= ~REQ_HIPRI;
+ bio_clear_hipri(bio);
return bio_split(bio, sectors, GFP_NOIO, bs);
}
@@ -348,6 +348,8 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
trace_block_split(split, (*bio)->bi_iter.bi_sector);
submit_bio_noacct(*bio);
*bio = split;
+
+ blk_throtl_charge_bio_split(*bio);
}
}
@@ -705,22 +707,6 @@ static void blk_account_io_merge_request(struct request *req)
}
}
-/*
- * Two cases of handling DISCARD merge:
- * If max_discard_segments > 1, the driver takes every bio
- * as a range and send them to controller together. The ranges
- * needn't to be contiguous.
- * Otherwise, the bios/requests will be handled as same as
- * others which should be contiguous.
- */
-static inline bool blk_discard_mergable(struct request *req)
-{
- if (req_op(req) == REQ_OP_DISCARD &&
- queue_max_discard_segments(req->q) > 1)
- return true;
- return false;
-}
-
static enum elv_merge blk_try_req_merge(struct request *req,
struct request *next)
{
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index c838d81ac058..0f006cabfd91 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -515,17 +515,6 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
percpu_ref_put(&q->q_usage_counter);
}
-static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
- struct blk_mq_hw_ctx *hctx,
- unsigned int hctx_idx)
-{
- if (hctx->sched_tags) {
- blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
- blk_mq_free_rq_map(hctx->sched_tags, set->flags);
- hctx->sched_tags = NULL;
- }
-}
-
static int blk_mq_sched_alloc_tags(struct request_queue *q,
struct blk_mq_hw_ctx *hctx,
unsigned int hctx_idx)
@@ -539,8 +528,10 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q,
return -ENOMEM;
ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
- if (ret)
- blk_mq_sched_free_tags(set, hctx, hctx_idx);
+ if (ret) {
+ blk_mq_free_rq_map(hctx->sched_tags, set->flags);
+ hctx->sched_tags = NULL;
+ }
return ret;
}
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 7b52e7657b2d..253c857cba47 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -45,60 +45,12 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
kfree(hctx);
}
-struct blk_mq_ctx_sysfs_entry {
- struct attribute attr;
- ssize_t (*show)(struct blk_mq_ctx *, char *);
- ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t);
-};
-
struct blk_mq_hw_ctx_sysfs_entry {
struct attribute attr;
ssize_t (*show)(struct blk_mq_hw_ctx *, char *);
ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t);
};
-static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr,
- char *page)
-{
- struct blk_mq_ctx_sysfs_entry *entry;
- struct blk_mq_ctx *ctx;
- struct request_queue *q;
- ssize_t res;
-
- entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
- ctx = container_of(kobj, struct blk_mq_ctx, kobj);
- q = ctx->queue;
-
- if (!entry->show)
- return -EIO;
-
- mutex_lock(&q->sysfs_lock);
- res = entry->show(ctx, page);
- mutex_unlock(&q->sysfs_lock);
- return res;
-}
-
-static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr,
- const char *page, size_t length)
-{
- struct blk_mq_ctx_sysfs_entry *entry;
- struct blk_mq_ctx *ctx;
- struct request_queue *q;
- ssize_t res;
-
- entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
- ctx = container_of(kobj, struct blk_mq_ctx, kobj);
- q = ctx->queue;
-
- if (!entry->store)
- return -EIO;
-
- mutex_lock(&q->sysfs_lock);
- res = entry->store(ctx, page, length);
- mutex_unlock(&q->sysfs_lock);
- return res;
-}
-
static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj,
struct attribute *attr, char *page)
{
@@ -198,23 +150,16 @@ static struct attribute *default_hw_ctx_attrs[] = {
};
ATTRIBUTE_GROUPS(default_hw_ctx);
-static const struct sysfs_ops blk_mq_sysfs_ops = {
- .show = blk_mq_sysfs_show,
- .store = blk_mq_sysfs_store,
-};
-
static const struct sysfs_ops blk_mq_hw_sysfs_ops = {
.show = blk_mq_hw_sysfs_show,
.store = blk_mq_hw_sysfs_store,
};
static struct kobj_type blk_mq_ktype = {
- .sysfs_ops = &blk_mq_sysfs_ops,
.release = blk_mq_sysfs_release,
};
static struct kobj_type blk_mq_ctx_ktype = {
- .sysfs_ops = &blk_mq_sysfs_ops,
.release = blk_mq_ctx_sysfs_release,
};
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 495f508c6300..65d3a63aecc6 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -525,7 +525,7 @@ void blk_mq_free_request(struct request *rq)
__blk_mq_dec_active_requests(hctx);
if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
- laptop_io_completion(q->backing_dev_info);
+ laptop_io_completion(q->disk->bdi);
rq_qos_done(q, rq);
@@ -606,7 +606,7 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq)
* This is probably worse than completing the request on a different
* cache domain.
*/
- if (force_irqthreads)
+ if (force_irqthreads())
return false;
/* same CPU or cache domain? Complete locally */
@@ -911,7 +911,7 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
void blk_mq_put_rq_ref(struct request *rq)
{
- if (is_flush_rq(rq, rq->mq_hctx))
+ if (is_flush_rq(rq))
rq->end_io(rq, 0);
else if (refcount_dec_and_test(&rq->ref))
__blk_mq_free_request(rq);
@@ -923,34 +923,14 @@ static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
unsigned long *next = priv;
/*
- * Just do a quick check if it is expired before locking the request in
- * so we're not unnecessarilly synchronizing across CPUs.
- */
- if (!blk_mq_req_expired(rq, next))
- return true;
-
- /*
- * We have reason to believe the request may be expired. Take a
- * reference on the request to lock this request lifetime into its
- * currently allocated context to prevent it from being reallocated in
- * the event the completion by-passes this timeout handler.
- *
- * If the reference was already released, then the driver beat the
- * timeout handler to posting a natural completion.
- */
- if (!refcount_inc_not_zero(&rq->ref))
- return true;
-
- /*
- * The request is now locked and cannot be reallocated underneath the
- * timeout handler's processing. Re-verify this exact request is truly
- * expired; if it is not expired, then the request was completed and
- * reallocated as a new request.
+ * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
+ * be reallocated underneath the timeout handler's processing, then
+ * the expire check is reliable. If the request is not expired, then
+ * it was completed and reallocated as a new request after returning
+ * from blk_mq_check_expired().
*/
if (blk_mq_req_expired(rq, next))
blk_mq_rq_timed_out(rq, reserved);
-
- blk_mq_put_rq_ref(rq);
return true;
}
@@ -2994,10 +2974,12 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared)
int i;
queue_for_each_hw_ctx(q, hctx, i) {
- if (shared)
+ if (shared) {
hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
- else
+ } else {
+ blk_mq_tag_idle(hctx);
hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
+ }
}
}
@@ -3133,7 +3115,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
}
EXPORT_SYMBOL(blk_mq_init_queue);
-struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata)
+struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
+ struct lock_class_key *lkclass)
{
struct request_queue *q;
struct gendisk *disk;
@@ -3142,12 +3125,11 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata)
if (IS_ERR(q))
return ERR_CAST(q);
- disk = __alloc_disk_node(0, set->numa_node);
+ disk = __alloc_disk_node(q, set->numa_node, lkclass);
if (!disk) {
blk_cleanup_queue(q);
return ERR_PTR(-ENOMEM);
}
- disk->queue = q;
return disk;
}
EXPORT_SYMBOL(__blk_mq_alloc_disk);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 902c40d67120..a7c857ad7d10 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -8,6 +8,7 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/pagemap.h>
+#include <linux/backing-dev-defs.h>
#include <linux/gcd.h>
#include <linux/lcm.h>
#include <linux/jiffies.h>
@@ -140,7 +141,9 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
limits->logical_block_size >> SECTOR_SHIFT);
limits->max_sectors = max_sectors;
- q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9);
+ if (!q->disk)
+ return;
+ q->disk->bdi->io_pages = max_sectors >> (PAGE_SHIFT - 9);
}
EXPORT_SYMBOL(blk_queue_max_hw_sectors);
@@ -380,18 +383,19 @@ void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset)
}
EXPORT_SYMBOL(blk_queue_alignment_offset);
-void blk_queue_update_readahead(struct request_queue *q)
+void disk_update_readahead(struct gendisk *disk)
{
+ struct request_queue *q = disk->queue;
+
/*
* For read-ahead of large files to be effective, we need to read ahead
* at least twice the optimal I/O size.
*/
- q->backing_dev_info->ra_pages =
+ disk->bdi->ra_pages =
max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
- q->backing_dev_info->io_pages =
- queue_max_sectors(q) >> (PAGE_SHIFT - 9);
+ disk->bdi->io_pages = queue_max_sectors(q) >> (PAGE_SHIFT - 9);
}
-EXPORT_SYMBOL_GPL(blk_queue_update_readahead);
+EXPORT_SYMBOL_GPL(disk_update_readahead);
/**
* blk_limits_io_min - set minimum request size for a device
@@ -471,7 +475,9 @@ EXPORT_SYMBOL(blk_limits_io_opt);
void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
{
blk_limits_io_opt(&q->limits, opt);
- q->backing_dev_info->ra_pages =
+ if (!q->disk)
+ return;
+ q->disk->bdi->ra_pages =
max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
}
EXPORT_SYMBOL(blk_queue_io_opt);
@@ -661,17 +667,11 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
struct request_queue *t = disk->queue;
if (blk_stack_limits(&t->limits, &bdev_get_queue(bdev)->limits,
- get_start_sect(bdev) + (offset >> 9)) < 0) {
- char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE];
-
- disk_name(disk, 0, top);
- bdevname(bdev, bottom);
-
- printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n",
- top, bottom);
- }
+ get_start_sect(bdev) + (offset >> 9)) < 0)
+ pr_notice("%s: Warning: Device %pg is misaligned\n",
+ disk->disk_name, bdev);
- blk_queue_update_readahead(disk->queue);
+ disk_update_readahead(disk);
}
EXPORT_SYMBOL(disk_stack_limits);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 370d83c18057..614d9d47de36 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -88,9 +88,11 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
static ssize_t queue_ra_show(struct request_queue *q, char *page)
{
- unsigned long ra_kb = q->backing_dev_info->ra_pages <<
- (PAGE_SHIFT - 10);
+ unsigned long ra_kb;
+ if (!q->disk)
+ return -EINVAL;
+ ra_kb = q->disk->bdi->ra_pages << (PAGE_SHIFT - 10);
return queue_var_show(ra_kb, page);
}
@@ -98,13 +100,14 @@ static ssize_t
queue_ra_store(struct request_queue *q, const char *page, size_t count)
{
unsigned long ra_kb;
- ssize_t ret = queue_var_store(&ra_kb, page, count);
+ ssize_t ret;
+ if (!q->disk)
+ return -EINVAL;
+ ret = queue_var_store(&ra_kb, page, count);
if (ret < 0)
return ret;
-
- q->backing_dev_info->ra_pages = ra_kb >> (PAGE_SHIFT - 10);
-
+ q->disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10);
return ret;
}
@@ -251,7 +254,8 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
spin_lock_irq(&q->queue_lock);
q->limits.max_sectors = max_sectors_kb << 1;
- q->backing_dev_info->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10);
+ if (q->disk)
+ q->disk->bdi->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10);
spin_unlock_irq(&q->queue_lock);
return ret;
@@ -766,13 +770,6 @@ static void blk_exit_queue(struct request_queue *q)
* e.g. blkcg_print_blkgs() to crash.
*/
blkcg_exit_queue(q);
-
- /*
- * Since the cgroup code may dereference the @q->backing_dev_info
- * pointer, only decrease its reference count after having removed the
- * association with the block cgroup controller.
- */
- bdi_put(q->backing_dev_info);
}
/**
@@ -859,15 +856,6 @@ int blk_register_queue(struct gendisk *disk)
struct device *dev = disk_to_dev(disk);
struct request_queue *q = disk->queue;
- if (WARN_ON(!q))
- return -ENXIO;
-
- WARN_ONCE(blk_queue_registered(q),
- "%s is registering an already registered queue\n",
- kobject_name(&dev->kobj));
-
- blk_queue_update_readahead(q);
-
ret = blk_trace_init_sysfs(dev);
if (ret)
return ret;
@@ -941,7 +929,6 @@ unlock:
return ret;
}
-EXPORT_SYMBOL_GPL(blk_register_queue);
/**
* blk_unregister_queue - counterpart of blk_register_queue()
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index b1b22d863bdf..55c49015e533 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -178,6 +178,9 @@ struct throtl_grp {
unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
unsigned long bio_cnt_reset_time;
+ atomic_t io_split_cnt[2];
+ atomic_t last_io_split_cnt[2];
+
struct blkg_rwstat stat_bytes;
struct blkg_rwstat stat_ios;
};
@@ -777,6 +780,8 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0;
+ atomic_set(&tg->io_split_cnt[rw], 0);
+
/*
* Previous slice has expired. We must have trimmed it after last
* bio dispatch. That means since start of last slice, we never used
@@ -799,6 +804,9 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
tg->io_disp[rw] = 0;
tg->slice_start[rw] = jiffies;
tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
+
+ atomic_set(&tg->io_split_cnt[rw], 0);
+
throtl_log(&tg->service_queue,
"[%c] new slice start=%lu end=%lu jiffies=%lu",
rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -1031,6 +1039,9 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
jiffies + tg->td->throtl_slice);
}
+ if (iops_limit != UINT_MAX)
+ tg->io_disp[rw] += atomic_xchg(&tg->io_split_cnt[rw], 0);
+
if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) &&
tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) {
if (wait)
@@ -2052,12 +2063,14 @@ static void throtl_downgrade_check(struct throtl_grp *tg)
}
if (tg->iops[READ][LIMIT_LOW]) {
+ tg->last_io_disp[READ] += atomic_xchg(&tg->last_io_split_cnt[READ], 0);
iops = tg->last_io_disp[READ] * HZ / elapsed_time;
if (iops >= tg->iops[READ][LIMIT_LOW])
tg->last_low_overflow_time[READ] = now;
}
if (tg->iops[WRITE][LIMIT_LOW]) {
+ tg->last_io_disp[WRITE] += atomic_xchg(&tg->last_io_split_cnt[WRITE], 0);
iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
if (iops >= tg->iops[WRITE][LIMIT_LOW])
tg->last_low_overflow_time[WRITE] = now;
@@ -2176,6 +2189,25 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
}
#endif
+void blk_throtl_charge_bio_split(struct bio *bio)
+{
+ struct blkcg_gq *blkg = bio->bi_blkg;
+ struct throtl_grp *parent = blkg_to_tg(blkg);
+ struct throtl_service_queue *parent_sq;
+ bool rw = bio_data_dir(bio);
+
+ do {
+ if (!parent->has_rules[rw])
+ break;
+
+ atomic_inc(&parent->io_split_cnt[rw]);
+ atomic_inc(&parent->last_io_split_cnt[rw]);
+
+ parent_sq = parent->service_queue.parent_sq;
+ parent = sq_to_tg(parent_sq);
+ } while (parent);
+}
+
bool blk_throtl_bio(struct bio *bio)
{
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 3ed71b8da887..874c1c37bf0c 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -97,7 +97,7 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
*/
static bool wb_recent_wait(struct rq_wb *rwb)
{
- struct bdi_writeback *wb = &rwb->rqos.q->backing_dev_info->wb;
+ struct bdi_writeback *wb = &rwb->rqos.q->disk->bdi->wb;
return time_before(jiffies, wb->dirty_sleep + HZ);
}
@@ -234,7 +234,7 @@ enum {
static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
{
- struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info;
+ struct backing_dev_info *bdi = rwb->rqos.q->disk->bdi;
struct rq_depth *rqd = &rwb->rq_depth;
u64 thislat;
@@ -287,7 +287,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
{
- struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info;
+ struct backing_dev_info *bdi = rwb->rqos.q->disk->bdi;
struct rq_depth *rqd = &rwb->rq_depth;
trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec,
@@ -359,7 +359,7 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
status = latency_exceeded(rwb, cb->stat);
- trace_wbt_timer(rwb->rqos.q->backing_dev_info, status, rqd->scale_step,
+ trace_wbt_timer(rwb->rqos.q->disk->bdi, status, rqd->scale_step,
inflight);
/*
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 86fce751bb17..1d0c76c18fc5 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -360,9 +360,6 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
if (!blk_queue_is_zoned(q))
return -ENOTTY;
- if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
-
if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
return -EFAULT;
@@ -421,9 +418,6 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
if (!blk_queue_is_zoned(q))
return -ENOTTY;
- if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
-
if (!(mode & FMODE_WRITE))
return -EBADF;
diff --git a/block/blk.h b/block/blk.h
index 4b885c0f6708..8c96b0c90c48 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -44,11 +44,7 @@ static inline void __blk_get_queue(struct request_queue *q)
kobject_get(&q->kobj);
}
-static inline bool
-is_flush_rq(struct request *req, struct blk_mq_hw_ctx *hctx)
-{
- return hctx->fq->flush_rq == req;
-}
+bool is_flush_rq(struct request *req);
struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
gfp_t flags);
@@ -132,7 +128,7 @@ static inline bool integrity_req_gap_front_merge(struct request *req,
bip_next->bip_vec[0].bv_offset);
}
-void blk_integrity_add(struct gendisk *);
+int blk_integrity_add(struct gendisk *disk);
void blk_integrity_del(struct gendisk *);
#else /* CONFIG_BLK_DEV_INTEGRITY */
static inline bool blk_integrity_merge_rq(struct request_queue *rq,
@@ -166,8 +162,9 @@ static inline bool bio_integrity_endio(struct bio *bio)
static inline void bio_integrity_free(struct bio *bio)
{
}
-static inline void blk_integrity_add(struct gendisk *disk)
+static inline int blk_integrity_add(struct gendisk *disk)
{
+ return 0;
}
static inline void blk_integrity_del(struct gendisk *disk)
{
@@ -293,11 +290,13 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
extern int blk_throtl_init(struct request_queue *q);
extern void blk_throtl_exit(struct request_queue *q);
extern void blk_throtl_register_queue(struct request_queue *q);
+extern void blk_throtl_charge_bio_split(struct bio *bio);
bool blk_throtl_bio(struct bio *bio);
#else /* CONFIG_BLK_DEV_THROTTLING */
static inline int blk_throtl_init(struct request_queue *q) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { }
static inline void blk_throtl_register_queue(struct request_queue *q) { }
+static inline void blk_throtl_charge_bio_split(struct bio *bio) { }
static inline bool blk_throtl_bio(struct bio *bio) { return false; }
#endif /* CONFIG_BLK_DEV_THROTTLING */
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
@@ -344,15 +343,14 @@ static inline void blk_queue_clear_zone_settings(struct request_queue *q) {}
int blk_alloc_ext_minor(void);
void blk_free_ext_minor(unsigned int minor);
-char *disk_name(struct gendisk *hd, int partno, char *buf);
#define ADDPART_FLAG_NONE 0
#define ADDPART_FLAG_RAID 1
#define ADDPART_FLAG_WHOLEDISK 2
-int bdev_add_partition(struct block_device *bdev, int partno,
- sector_t start, sector_t length);
-int bdev_del_partition(struct block_device *bdev, int partno);
-int bdev_resize_partition(struct block_device *bdev, int partno,
- sector_t start, sector_t length);
+int bdev_add_partition(struct gendisk *disk, int partno, sector_t start,
+ sector_t length);
+int bdev_del_partition(struct gendisk *disk, int partno);
+int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start,
+ sector_t length);
int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
@@ -360,7 +358,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct request_queue *blk_alloc_queue(int node_id);
-void disk_alloc_events(struct gendisk *disk);
+int disk_alloc_events(struct gendisk *disk);
void disk_add_events(struct gendisk *disk);
void disk_del_events(struct gendisk *disk);
void disk_release_events(struct gendisk *disk);
@@ -368,4 +366,11 @@ extern struct device_attribute dev_attr_events;
extern struct device_attribute dev_attr_events_async;
extern struct device_attribute dev_attr_events_poll_msecs;
+static inline void bio_clear_hipri(struct bio *bio)
+{
+ /* can't support alloc cache if we turn off polling */
+ bio_clear_flag(bio, BIO_PERCPU_CACHE);
+ bio->bi_opf &= ~REQ_HIPRI;
+}
+
#endif /* BLK_INTERNAL_H */
diff --git a/block/bounce.c b/block/bounce.c
index 94081e013c58..05fc7148489d 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -68,25 +68,12 @@ static __init int init_emergency_pool(void)
__initcall(init_emergency_pool);
/*
- * highmem version, map in to vec
- */
-static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
-{
- unsigned char *vto;
-
- vto = kmap_atomic(to->bv_page);
- memcpy(vto + to->bv_offset, vfrom, to->bv_len);
- kunmap_atomic(vto);
-}
-
-/*
* Simple bounce buffer support for highmem pages. Depending on the
* queue gfp mask set, *to may or may not be a highmem page. kmap it
* always, it will do the Right Thing
*/
static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
{
- unsigned char *vfrom;
struct bio_vec tovec, fromvec;
struct bvec_iter iter;
/*
@@ -104,11 +91,8 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
* been modified by the block layer, so use the original
* copy, bounce_copy_vec already uses tovec->bv_len
*/
- vfrom = page_address(fromvec.bv_page) +
- tovec.bv_offset;
-
- bounce_copy_vec(&tovec, vfrom);
- flush_dcache_page(tovec.bv_page);
+ memcpy_to_bvec(&tovec, page_address(fromvec.bv_page) +
+ tovec.bv_offset);
}
bio_advance_iter(from, &from_iter, tovec.bv_len);
}
@@ -255,24 +239,19 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
* because the 'bio' is single-page bvec.
*/
for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) {
- struct page *page = to->bv_page;
+ struct page *bounce_page;
- if (!PageHighMem(page))
+ if (!PageHighMem(to->bv_page))
continue;
- to->bv_page = mempool_alloc(&page_pool, GFP_NOIO);
- inc_zone_page_state(to->bv_page, NR_BOUNCE);
+ bounce_page = mempool_alloc(&page_pool, GFP_NOIO);
+ inc_zone_page_state(bounce_page, NR_BOUNCE);
if (rw == WRITE) {
- char *vto, *vfrom;
-
- flush_dcache_page(page);
-
- vto = page_address(to->bv_page) + to->bv_offset;
- vfrom = kmap_atomic(page) + to->bv_offset;
- memcpy(vto, vfrom, to->bv_len);
- kunmap_atomic(vfrom);
+ flush_dcache_page(to->bv_page);
+ memcpy_from_bvec(page_address(bounce_page), to);
}
+ to->bv_page = bounce_page;
}
trace_block_bio_bounce(*bio_orig);
diff --git a/block/cmdline-parser.c b/block/cmdline-parser.c
deleted file mode 100644
index f2a14571882b..000000000000
--- a/block/cmdline-parser.c
+++ /dev/null
@@ -1,255 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Parse command line, get partition information
- *
- * Written by Cai Zhiyong <caizhiyong@huawei.com>
- *
- */
-#include <linux/export.h>
-#include <linux/cmdline-parser.h>
-
-static int parse_subpart(struct cmdline_subpart **subpart, char *partdef)
-{
- int ret = 0;
- struct cmdline_subpart *new_subpart;
-
- *subpart = NULL;
-
- new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL);
- if (!new_subpart)
- return -ENOMEM;
-
- if (*partdef == '-') {
- new_subpart->size = (sector_t)(~0ULL);
- partdef++;
- } else {
- new_subpart->size = (sector_t)memparse(partdef, &partdef);
- if (new_subpart->size < (sector_t)PAGE_SIZE) {
- pr_warn("cmdline partition size is invalid.");
- ret = -EINVAL;
- goto fail;
- }
- }
-
- if (*partdef == '@') {
- partdef++;
- new_subpart->from = (sector_t)memparse(partdef, &partdef);
- } else {
- new_subpart->from = (sector_t)(~0ULL);
- }
-
- if (*partdef == '(') {
- int length;
- char *next = strchr(++partdef, ')');
-
- if (!next) {
- pr_warn("cmdline partition format is invalid.");
- ret = -EINVAL;
- goto fail;
- }
-
- length = min_t(int, next - partdef,
- sizeof(new_subpart->name) - 1);
- strncpy(new_subpart->name, partdef, length);
- new_subpart->name[length] = '\0';
-
- partdef = ++next;
- } else
- new_subpart->name[0] = '\0';
-
- new_subpart->flags = 0;
-
- if (!strncmp(partdef, "ro", 2)) {
- new_subpart->flags |= PF_RDONLY;
- partdef += 2;
- }
-
- if (!strncmp(partdef, "lk", 2)) {
- new_subpart->flags |= PF_POWERUP_LOCK;
- partdef += 2;
- }
-
- *subpart = new_subpart;
- return 0;
-fail:
- kfree(new_subpart);
- return ret;
-}
-
-static void free_subpart(struct cmdline_parts *parts)
-{
- struct cmdline_subpart *subpart;
-
- while (parts->subpart) {
- subpart = parts->subpart;
- parts->subpart = subpart->next_subpart;
- kfree(subpart);
- }
-}
-
-static int parse_parts(struct cmdline_parts **parts, const char *bdevdef)
-{
- int ret = -EINVAL;
- char *next;
- int length;
- struct cmdline_subpart **next_subpart;
- struct cmdline_parts *newparts;
- char buf[BDEVNAME_SIZE + 32 + 4];
-
- *parts = NULL;
-
- newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL);
- if (!newparts)
- return -ENOMEM;
-
- next = strchr(bdevdef, ':');
- if (!next) {
- pr_warn("cmdline partition has no block device.");
- goto fail;
- }
-
- length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1);
- strncpy(newparts->name, bdevdef, length);
- newparts->name[length] = '\0';
- newparts->nr_subparts = 0;
-
- next_subpart = &newparts->subpart;
-
- while (next && *(++next)) {
- bdevdef = next;
- next = strchr(bdevdef, ',');
-
- length = (!next) ? (sizeof(buf) - 1) :
- min_t(int, next - bdevdef, sizeof(buf) - 1);
-
- strncpy(buf, bdevdef, length);
- buf[length] = '\0';
-
- ret = parse_subpart(next_subpart, buf);
- if (ret)
- goto fail;
-
- newparts->nr_subparts++;
- next_subpart = &(*next_subpart)->next_subpart;
- }
-
- if (!newparts->subpart) {
- pr_warn("cmdline partition has no valid partition.");
- ret = -EINVAL;
- goto fail;
- }
-
- *parts = newparts;
-
- return 0;
-fail:
- free_subpart(newparts);
- kfree(newparts);
- return ret;
-}
-
-void cmdline_parts_free(struct cmdline_parts **parts)
-{
- struct cmdline_parts *next_parts;
-
- while (*parts) {
- next_parts = (*parts)->next_parts;
- free_subpart(*parts);
- kfree(*parts);
- *parts = next_parts;
- }
-}
-EXPORT_SYMBOL(cmdline_parts_free);
-
-int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline)
-{
- int ret;
- char *buf;
- char *pbuf;
- char *next;
- struct cmdline_parts **next_parts;
-
- *parts = NULL;
-
- next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- next_parts = parts;
-
- while (next && *pbuf) {
- next = strchr(pbuf, ';');
- if (next)
- *next = '\0';
-
- ret = parse_parts(next_parts, pbuf);
- if (ret)
- goto fail;
-
- if (next)
- pbuf = ++next;
-
- next_parts = &(*next_parts)->next_parts;
- }
-
- if (!*parts) {
- pr_warn("cmdline partition has no valid partition.");
- ret = -EINVAL;
- goto fail;
- }
-
- ret = 0;
-done:
- kfree(buf);
- return ret;
-
-fail:
- cmdline_parts_free(parts);
- goto done;
-}
-EXPORT_SYMBOL(cmdline_parts_parse);
-
-struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts,
- const char *bdev)
-{
- while (parts && strncmp(bdev, parts->name, sizeof(parts->name)))
- parts = parts->next_parts;
- return parts;
-}
-EXPORT_SYMBOL(cmdline_parts_find);
-
-/*
- * add_part()
- * 0 success.
- * 1 can not add so many partitions.
- */
-int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size,
- int slot,
- int (*add_part)(int, struct cmdline_subpart *, void *),
- void *param)
-{
- sector_t from = 0;
- struct cmdline_subpart *subpart;
-
- for (subpart = parts->subpart; subpart;
- subpart = subpart->next_subpart, slot++) {
- if (subpart->from == (sector_t)(~0ULL))
- subpart->from = from;
- else
- from = subpart->from;
-
- if (from >= disk_size)
- break;
-
- if (subpart->size > (disk_size - from))
- subpart->size = disk_size - from;
-
- from += subpart->size;
-
- if (add_part(slot, subpart, param))
- break;
- }
-
- return slot;
-}
-EXPORT_SYMBOL(cmdline_parts_set);
diff --git a/block/disk-events.c b/block/disk-events.c
index a75931ff5da4..8d5496e7592a 100644
--- a/block/disk-events.c
+++ b/block/disk-events.c
@@ -163,15 +163,31 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask)
spin_unlock_irq(&ev->lock);
}
+/*
+ * Tell userland about new events. Only the events listed in @disk->events are
+ * reported, and only if DISK_EVENT_FLAG_UEVENT is set. Otherwise, events are
+ * processed internally but never get reported to userland.
+ */
+static void disk_event_uevent(struct gendisk *disk, unsigned int events)
+{
+ char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
+ int nr_events = 0, i;
+
+ for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
+ if (events & disk->events & (1 << i))
+ envp[nr_events++] = disk_uevents[i];
+
+ if (nr_events)
+ kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
+}
+
static void disk_check_events(struct disk_events *ev,
unsigned int *clearing_ptr)
{
struct gendisk *disk = ev->disk;
- char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
unsigned int clearing = *clearing_ptr;
unsigned int events;
unsigned long intv;
- int nr_events = 0, i;
/* check events */
events = disk->fops->check_events(disk, clearing);
@@ -190,19 +206,11 @@ static void disk_check_events(struct disk_events *ev,
spin_unlock_irq(&ev->lock);
- /*
- * Tell userland about new events. Only the events listed in
- * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT
- * is set. Otherwise, events are processed internally but never
- * get reported to userland.
- */
- for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
- if ((events & disk->events & (1 << i)) &&
- (disk->event_flags & DISK_EVENT_FLAG_UEVENT))
- envp[nr_events++] = disk_uevents[i];
+ if (events & DISK_EVENT_MEDIA_CHANGE)
+ inc_diskseq(disk);
- if (nr_events)
- kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
+ if (disk->event_flags & DISK_EVENT_FLAG_UEVENT)
+ disk_event_uevent(disk, events);
}
/**
@@ -281,6 +289,32 @@ bool bdev_check_media_change(struct block_device *bdev)
}
EXPORT_SYMBOL(bdev_check_media_change);
+/**
+ * disk_force_media_change - force a media change event
+ * @disk: the disk which will raise the event
+ * @events: the events to raise
+ *
+ * Generate uevents for the disk. If DISK_EVENT_MEDIA_CHANGE is present,
+ * attempt to free all dentries and inodes and invalidates all block
+ * device page cache entries in that case.
+ *
+ * Returns %true if DISK_EVENT_MEDIA_CHANGE was raised, or %false if not.
+ */
+bool disk_force_media_change(struct gendisk *disk, unsigned int events)
+{
+ disk_event_uevent(disk, events);
+
+ if (!(events & DISK_EVENT_MEDIA_CHANGE))
+ return false;
+
+ if (__invalidate_device(disk->part0, true))
+ pr_warn("VFS: busy inodes on changed media %s\n",
+ disk->disk_name);
+ set_bit(GD_NEED_PART_SCAN, &disk->state);
+ return true;
+}
+EXPORT_SYMBOL_GPL(disk_force_media_change);
+
/*
* Separate this part out so that a different pointer for clearing_ptr can be
* passed in for disk_clear_events.
@@ -410,17 +444,17 @@ module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
/*
* disk_{alloc|add|del|release}_events - initialize and destroy disk_events.
*/
-void disk_alloc_events(struct gendisk *disk)
+int disk_alloc_events(struct gendisk *disk)
{
struct disk_events *ev;
if (!disk->fops->check_events || !disk->events)
- return;
+ return 0;
ev = kzalloc(sizeof(*ev), GFP_KERNEL);
if (!ev) {
pr_warn("%s: failed to initialize events\n", disk->disk_name);
- return;
+ return -ENOMEM;
}
INIT_LIST_HEAD(&ev->node);
@@ -432,6 +466,7 @@ void disk_alloc_events(struct gendisk *disk)
INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
disk->ev = ev;
+ return 0;
}
void disk_add_events(struct gendisk *disk)
diff --git a/block/elevator.c b/block/elevator.c
index 52ada14cfe45..ff45d8388f48 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -336,6 +336,9 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req,
__rq = elv_rqhash_find(q, bio->bi_iter.bi_sector);
if (__rq && elv_bio_merge_ok(__rq, bio)) {
*req = __rq;
+
+ if (blk_discard_mergable(__rq))
+ return ELEVATOR_DISCARD_MERGE;
return ELEVATOR_BACK_MERGE;
}
@@ -630,6 +633,9 @@ static inline bool elv_support_iosched(struct request_queue *q)
*/
static struct elevator_type *elevator_get_default(struct request_queue *q)
{
+ if (q->tag_set && q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT)
+ return NULL;
+
if (q->nr_hw_queues != 1 &&
!blk_mq_is_sbitmap_shared(q->tag_set->flags))
return NULL;
@@ -702,7 +708,6 @@ void elevator_init_mq(struct request_queue *q)
elevator_put(e);
}
}
-EXPORT_SYMBOL_GPL(elevator_init_mq); /* only for dm-rq */
/*
* switch to new_e io scheduler. be careful not to introduce deadlocks -
diff --git a/block/genhd.c b/block/genhd.c
index af4d2ab4a633..567549a011d1 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -29,6 +29,23 @@
static struct kobject *block_depr;
+/*
+ * Unique, monotonically increasing sequential number associated with block
+ * devices instances (i.e. incremented each time a device is attached).
+ * Associating uevents with block devices in userspace is difficult and racy:
+ * the uevent netlink socket is lossy, and on slow and overloaded systems has
+ * a very high latency.
+ * Block devices do not have exclusive owners in userspace, any process can set
+ * one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0
+ * can be reused again and again).
+ * A userspace process setting up a block device and watching for its events
+ * cannot thus reliably tell whether an event relates to the device it just set
+ * up or another earlier instance with the same name.
+ * This sequential number allows userspace processes to solve this problem, and
+ * uniquely associate an uevent to the lifetime to a device.
+ */
+static atomic64_t diskseq;
+
/* for extended dynamic devt allocation, currently only one major is used */
#define NR_EXT_DEVT (1 << MINORBITS)
static DEFINE_IDA(ext_devt_ida);
@@ -60,7 +77,8 @@ bool set_capacity_and_notify(struct gendisk *disk, sector_t size)
* initial capacity during probing.
*/
if (size == capacity ||
- (disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP)
+ !disk_live(disk) ||
+ (disk->flags & GENHD_FL_HIDDEN))
return false;
pr_info("%s: detected capacity change from %lld to %lld\n",
@@ -78,11 +96,17 @@ bool set_capacity_and_notify(struct gendisk *disk, sector_t size)
EXPORT_SYMBOL_GPL(set_capacity_and_notify);
/*
- * Format the device name of the indicated disk into the supplied buffer and
- * return a pointer to that same buffer for convenience.
+ * Format the device name of the indicated block device into the supplied buffer
+ * and return a pointer to that same buffer for convenience.
+ *
+ * Note: do not use this in new code, use the %pg specifier to sprintf and
+ * printk insted.
*/
-char *disk_name(struct gendisk *hd, int partno, char *buf)
+const char *bdevname(struct block_device *bdev, char *buf)
{
+ struct gendisk *hd = bdev->bd_disk;
+ int partno = bdev->bd_partno;
+
if (!partno)
snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
@@ -92,11 +116,6 @@ char *disk_name(struct gendisk *hd, int partno, char *buf)
return buf;
}
-
-const char *bdevname(struct block_device *bdev, char *buf)
-{
- return disk_name(bdev->bd_disk, bdev->bd_partno, buf);
-}
EXPORT_SYMBOL(bdevname);
static void part_stat_read_all(struct block_device *part,
@@ -294,54 +313,19 @@ void unregister_blkdev(unsigned int major, const char *name)
EXPORT_SYMBOL(unregister_blkdev);
-/**
- * blk_mangle_minor - scatter minor numbers apart
- * @minor: minor number to mangle
- *
- * Scatter consecutively allocated @minor number apart if MANGLE_DEVT
- * is enabled. Mangling twice gives the original value.
- *
- * RETURNS:
- * Mangled value.
- *
- * CONTEXT:
- * Don't care.
- */
-static int blk_mangle_minor(int minor)
-{
-#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT
- int i;
-
- for (i = 0; i < MINORBITS / 2; i++) {
- int low = minor & (1 << i);
- int high = minor & (1 << (MINORBITS - 1 - i));
- int distance = MINORBITS - 1 - 2 * i;
-
- minor ^= low | high; /* clear both bits */
- low <<= distance; /* swap the positions */
- high >>= distance;
- minor |= low | high; /* and set */
- }
-#endif
- return minor;
-}
-
int blk_alloc_ext_minor(void)
{
int idx;
idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT, GFP_KERNEL);
- if (idx < 0) {
- if (idx == -ENOSPC)
- return -EBUSY;
- return idx;
- }
- return blk_mangle_minor(idx);
+ if (idx == -ENOSPC)
+ return -EBUSY;
+ return idx;
}
void blk_free_ext_minor(unsigned int minor)
{
- ida_free(&ext_devt_ida, blk_mangle_minor(minor));
+ ida_free(&ext_devt_ida, minor);
}
static char *bdevt_str(dev_t devt, char *buf)
@@ -390,78 +374,20 @@ static void disk_scan_partitions(struct gendisk *disk)
blkdev_put(bdev, FMODE_READ);
}
-static void register_disk(struct device *parent, struct gendisk *disk,
- const struct attribute_group **groups)
-{
- struct device *ddev = disk_to_dev(disk);
- int err;
-
- ddev->parent = parent;
-
- dev_set_name(ddev, "%s", disk->disk_name);
-
- /* delay uevents, until we scanned partition table */
- dev_set_uevent_suppress(ddev, 1);
-
- if (groups) {
- WARN_ON(ddev->groups);
- ddev->groups = groups;
- }
- if (device_add(ddev))
- return;
- if (!sysfs_deprecated) {
- err = sysfs_create_link(block_depr, &ddev->kobj,
- kobject_name(&ddev->kobj));
- if (err) {
- device_del(ddev);
- return;
- }
- }
-
- /*
- * avoid probable deadlock caused by allocating memory with
- * GFP_KERNEL in runtime_resume callback of its all ancestor
- * devices
- */
- pm_runtime_set_memalloc_noio(ddev, true);
-
- disk->part0->bd_holder_dir =
- kobject_create_and_add("holders", &ddev->kobj);
- disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
-
- if (disk->flags & GENHD_FL_HIDDEN)
- return;
-
- disk_scan_partitions(disk);
-
- /* announce the disk and partitions after all partitions are created */
- dev_set_uevent_suppress(ddev, 0);
- disk_uevent(disk, KOBJ_ADD);
-
- if (disk->queue->backing_dev_info->dev) {
- err = sysfs_create_link(&ddev->kobj,
- &disk->queue->backing_dev_info->dev->kobj,
- "bdi");
- WARN_ON(err);
- }
-}
-
/**
- * __device_add_disk - add disk information to kernel list
+ * device_add_disk - add disk information to kernel list
* @parent: parent device for the disk
* @disk: per-device partitioning information
* @groups: Additional per-device sysfs groups
- * @register_queue: register the queue if set to true
*
* This function registers the partitioning information in @disk
* with the kernel.
- *
- * FIXME: error handling
*/
-static void __device_add_disk(struct device *parent, struct gendisk *disk,
- const struct attribute_group **groups,
- bool register_queue)
+int device_add_disk(struct device *parent, struct gendisk *disk,
+ const struct attribute_group **groups)
+
{
+ struct device *ddev = disk_to_dev(disk);
int ret;
/*
@@ -470,8 +396,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
* elevator if one is needed, that is, for devices requesting queue
* registration.
*/
- if (register_queue)
- elevator_init_mq(disk->queue);
+ elevator_init_mq(disk->queue);
/*
* If the driver provides an explicit major number it also must provide
@@ -481,7 +406,8 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
* and all partitions from the extended dev_t space.
*/
if (disk->major) {
- WARN_ON(!disk->minors);
+ if (WARN_ON(!disk->minors))
+ return -EINVAL;
if (disk->minors > DISK_MAX_PARTS) {
pr_err("block: can't allocate more than %d partitions\n",
@@ -489,21 +415,65 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
disk->minors = DISK_MAX_PARTS;
}
} else {
- WARN_ON(disk->minors);
+ if (WARN_ON(disk->minors))
+ return -EINVAL;
ret = blk_alloc_ext_minor();
- if (ret < 0) {
- WARN_ON(1);
- return;
- }
+ if (ret < 0)
+ return ret;
disk->major = BLOCK_EXT_MAJOR;
- disk->first_minor = MINOR(ret);
+ disk->first_minor = ret;
disk->flags |= GENHD_FL_EXT_DEVT;
}
- disk->flags |= GENHD_FL_UP;
+ ret = disk_alloc_events(disk);
+ if (ret)
+ goto out_free_ext_minor;
+
+ /* delay uevents, until we scanned partition table */
+ dev_set_uevent_suppress(ddev, 1);
+
+ ddev->parent = parent;
+ ddev->groups = groups;
+ dev_set_name(ddev, "%s", disk->disk_name);
+ if (!(disk->flags & GENHD_FL_HIDDEN))
+ ddev->devt = MKDEV(disk->major, disk->first_minor);
+ ret = device_add(ddev);
+ if (ret)
+ goto out_disk_release_events;
+ if (!sysfs_deprecated) {
+ ret = sysfs_create_link(block_depr, &ddev->kobj,
+ kobject_name(&ddev->kobj));
+ if (ret)
+ goto out_device_del;
+ }
+
+ /*
+ * avoid probable deadlock caused by allocating memory with
+ * GFP_KERNEL in runtime_resume callback of its all ancestor
+ * devices
+ */
+ pm_runtime_set_memalloc_noio(ddev, true);
+
+ ret = blk_integrity_add(disk);
+ if (ret)
+ goto out_del_block_link;
+
+ disk->part0->bd_holder_dir =
+ kobject_create_and_add("holders", &ddev->kobj);
+ if (!disk->part0->bd_holder_dir)
+ goto out_del_integrity;
+ disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
+ if (!disk->slave_dir)
+ goto out_put_holder_dir;
+
+ ret = bd_register_pending_holders(disk);
+ if (ret < 0)
+ goto out_put_slave_dir;
- disk_alloc_events(disk);
+ ret = blk_register_queue(disk);
+ if (ret)
+ goto out_put_slave_dir;
if (disk->flags & GENHD_FL_HIDDEN) {
/*
@@ -513,48 +483,56 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
disk->flags |= GENHD_FL_NO_PART_SCAN;
} else {
- struct backing_dev_info *bdi = disk->queue->backing_dev_info;
- struct device *dev = disk_to_dev(disk);
-
- /* Register BDI before referencing it from bdev */
- dev->devt = MKDEV(disk->major, disk->first_minor);
- ret = bdi_register(bdi, "%u:%u",
+ ret = bdi_register(disk->bdi, "%u:%u",
disk->major, disk->first_minor);
- WARN_ON(ret);
- bdi_set_owner(bdi, dev);
- bdev_add(disk->part0, dev->devt);
- }
- register_disk(parent, disk, groups);
- if (register_queue)
- blk_register_queue(disk);
+ if (ret)
+ goto out_unregister_queue;
+ bdi_set_owner(disk->bdi, ddev);
+ ret = sysfs_create_link(&ddev->kobj,
+ &disk->bdi->dev->kobj, "bdi");
+ if (ret)
+ goto out_unregister_bdi;
- /*
- * Take an extra ref on queue which will be put on disk_release()
- * so that it sticks around as long as @disk is there.
- */
- if (blk_get_queue(disk->queue))
- set_bit(GD_QUEUE_REF, &disk->state);
- else
- WARN_ON_ONCE(1);
+ bdev_add(disk->part0, ddev->devt);
+ disk_scan_partitions(disk);
- disk_add_events(disk);
- blk_integrity_add(disk);
-}
+ /*
+ * Announce the disk and partitions after all partitions are
+ * created. (for hidden disks uevents remain suppressed forever)
+ */
+ dev_set_uevent_suppress(ddev, 0);
+ disk_uevent(disk, KOBJ_ADD);
+ }
-void device_add_disk(struct device *parent, struct gendisk *disk,
- const struct attribute_group **groups)
+ disk_update_readahead(disk);
+ disk_add_events(disk);
+ return 0;
-{
- __device_add_disk(parent, disk, groups, true);
+out_unregister_bdi:
+ if (!(disk->flags & GENHD_FL_HIDDEN))
+ bdi_unregister(disk->bdi);
+out_unregister_queue:
+ blk_unregister_queue(disk);
+out_put_slave_dir:
+ kobject_put(disk->slave_dir);
+out_put_holder_dir:
+ kobject_put(disk->part0->bd_holder_dir);
+out_del_integrity:
+ blk_integrity_del(disk);
+out_del_block_link:
+ if (!sysfs_deprecated)
+ sysfs_remove_link(block_depr, dev_name(ddev));
+out_device_del:
+ device_del(ddev);
+out_disk_release_events:
+ disk_release_events(disk);
+out_free_ext_minor:
+ if (disk->major == BLOCK_EXT_MAJOR)
+ blk_free_ext_minor(disk->first_minor);
+ return WARN_ON_ONCE(ret); /* keep until all callers handle errors */
}
EXPORT_SYMBOL(device_add_disk);
-void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk)
-{
- __device_add_disk(parent, disk, NULL, false);
-}
-EXPORT_SYMBOL(device_add_disk_no_queue_reg);
-
/**
* del_gendisk - remove the gendisk
* @disk: the struct gendisk to remove
@@ -578,26 +556,20 @@ void del_gendisk(struct gendisk *disk)
{
might_sleep();
- if (WARN_ON_ONCE(!disk->queue))
+ if (WARN_ON_ONCE(!disk_live(disk) && !(disk->flags & GENHD_FL_HIDDEN)))
return;
blk_integrity_del(disk);
disk_del_events(disk);
mutex_lock(&disk->open_mutex);
- disk->flags &= ~GENHD_FL_UP;
+ remove_inode_hash(disk->part0->bd_inode);
blk_drop_partitions(disk);
mutex_unlock(&disk->open_mutex);
fsync_bdev(disk->part0);
__invalidate_device(disk->part0, true);
- /*
- * Unhash the bdev inode for this device so that it can't be looked
- * up any more even if openers still hold references to it.
- */
- remove_inode_hash(disk->part0->bd_inode);
-
set_capacity(disk, 0);
if (!(disk->flags & GENHD_FL_HIDDEN)) {
@@ -607,7 +579,7 @@ void del_gendisk(struct gendisk *disk)
* Unregister bdi before releasing device numbers (as they can
* get reused and we'd get clashes in sysfs).
*/
- bdi_unregister(disk->queue->backing_dev_info);
+ bdi_unregister(disk->bdi);
}
blk_unregister_queue(disk);
@@ -683,7 +655,6 @@ void __init printk_all_partitions(void)
while ((dev = class_dev_iter_next(&iter))) {
struct gendisk *disk = dev_to_disk(dev);
struct block_device *part;
- char name_buf[BDEVNAME_SIZE];
char devt_buf[BDEVT_SIZE];
unsigned long idx;
@@ -703,11 +674,10 @@ void __init printk_all_partitions(void)
xa_for_each(&disk->part_tbl, idx, part) {
if (!bdev_nr_sectors(part))
continue;
- printk("%s%s %10llu %s %s",
+ printk("%s%s %10llu %pg %s",
bdev_is_partition(part) ? " " : "",
bdevt_str(part->bd_dev, devt_buf),
- bdev_nr_sectors(part) >> 1,
- disk_name(disk, part->bd_partno, name_buf),
+ bdev_nr_sectors(part) >> 1, part,
part->bd_meta_info ?
part->bd_meta_info->uuid : "");
if (bdev_is_partition(part))
@@ -785,7 +755,6 @@ static int show_partition(struct seq_file *seqf, void *v)
struct gendisk *sgp = v;
struct block_device *part;
unsigned long idx;
- char buf[BDEVNAME_SIZE];
/* Don't show non-partitionable removeable devices or empty devices */
if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
@@ -798,10 +767,9 @@ static int show_partition(struct seq_file *seqf, void *v)
xa_for_each(&sgp->part_tbl, idx, part) {
if (!bdev_nr_sectors(part))
continue;
- seq_printf(seqf, "%4d %7d %10llu %s\n",
+ seq_printf(seqf, "%4d %7d %10llu %pg\n",
MAJOR(part->bd_dev), MINOR(part->bd_dev),
- bdev_nr_sectors(part) >> 1,
- disk_name(sgp, part->bd_partno, buf));
+ bdev_nr_sectors(part) >> 1, part);
}
rcu_read_unlock();
return 0;
@@ -968,6 +936,14 @@ static ssize_t disk_discard_alignment_show(struct device *dev,
return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
}
+static ssize_t diskseq_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ return sprintf(buf, "%llu\n", disk->diskseq);
+}
+
static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
@@ -980,6 +956,7 @@ static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL);
static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
+static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL);
#ifdef CONFIG_FAIL_MAKE_REQUEST
ssize_t part_fail_show(struct device *dev,
@@ -1025,6 +1002,7 @@ static struct attribute *disk_attrs[] = {
&dev_attr_events.attr,
&dev_attr_events_async.attr,
&dev_attr_events_poll_msecs.attr,
+ &dev_attr_diskseq.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
@@ -1074,18 +1052,24 @@ static void disk_release(struct device *dev)
might_sleep();
- if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR)
- blk_free_ext_minor(MINOR(dev->devt));
disk_release_events(disk);
kfree(disk->random);
xa_destroy(&disk->part_tbl);
- bdput(disk->part0);
- if (test_bit(GD_QUEUE_REF, &disk->state) && disk->queue)
- blk_put_queue(disk->queue);
- kfree(disk);
+ disk->queue->disk = NULL;
+ blk_put_queue(disk->queue);
+ iput(disk->part0->bd_inode); /* frees the disk */
}
+
+static int block_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq);
+}
+
struct class block_class = {
.name = "block",
+ .dev_uevent = block_uevent,
};
static char *block_devnode(struct device *dev, umode_t *mode,
@@ -1117,7 +1101,6 @@ static int diskstats_show(struct seq_file *seqf, void *v)
{
struct gendisk *gp = v;
struct block_device *hd;
- char buf[BDEVNAME_SIZE];
unsigned int inflight;
struct disk_stats stat;
unsigned long idx;
@@ -1140,15 +1123,14 @@ static int diskstats_show(struct seq_file *seqf, void *v)
else
inflight = part_in_flight(hd);
- seq_printf(seqf, "%4d %7d %s "
+ seq_printf(seqf, "%4d %7d %pg "
"%lu %lu %lu %u "
"%lu %lu %lu %u "
"%u %u %u "
"%lu %lu %lu %u "
"%lu %u"
"\n",
- MAJOR(hd->bd_dev), MINOR(hd->bd_dev),
- disk_name(gp, hd->bd_partno, buf),
+ MAJOR(hd->bd_dev), MINOR(hd->bd_dev), hd,
stat.ios[STAT_READ],
stat.merges[STAT_READ],
stat.sectors[STAT_READ],
@@ -1240,17 +1222,25 @@ dev_t blk_lookup_devt(const char *name, int partno)
return devt;
}
-struct gendisk *__alloc_disk_node(int minors, int node_id)
+struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
+ struct lock_class_key *lkclass)
{
struct gendisk *disk;
+ if (!blk_get_queue(q))
+ return NULL;
+
disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
if (!disk)
- return NULL;
+ goto out_put_queue;
+
+ disk->bdi = bdi_alloc(node_id);
+ if (!disk->bdi)
+ goto out_free_disk;
disk->part0 = bdev_alloc(disk, 0);
if (!disk->part0)
- goto out_free_disk;
+ goto out_free_bdi;
disk->node_id = node_id;
mutex_init(&disk->open_mutex);
@@ -1258,23 +1248,33 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
goto out_destroy_part_tbl;
- disk->minors = minors;
rand_initialize_disk(disk);
disk_to_dev(disk)->class = &block_class;
disk_to_dev(disk)->type = &disk_type;
device_initialize(disk_to_dev(disk));
+ inc_diskseq(disk);
+ disk->queue = q;
+ q->disk = disk;
+ lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0);
+#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
+ INIT_LIST_HEAD(&disk->slave_bdevs);
+#endif
return disk;
out_destroy_part_tbl:
xa_destroy(&disk->part_tbl);
- bdput(disk->part0);
+ iput(disk->part0->bd_inode);
+out_free_bdi:
+ bdi_put(disk->bdi);
out_free_disk:
kfree(disk);
+out_put_queue:
+ blk_put_queue(q);
return NULL;
}
EXPORT_SYMBOL(__alloc_disk_node);
-struct gendisk *__blk_alloc_disk(int node)
+struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
{
struct request_queue *q;
struct gendisk *disk;
@@ -1283,12 +1283,11 @@ struct gendisk *__blk_alloc_disk(int node)
if (!q)
return NULL;
- disk = __alloc_disk_node(0, node);
+ disk = __alloc_disk_node(q, node, lkclass);
if (!disk) {
blk_cleanup_queue(q);
return NULL;
}
- disk->queue = q;
return disk;
}
EXPORT_SYMBOL(__blk_alloc_disk);
@@ -1363,3 +1362,8 @@ int bdev_read_only(struct block_device *bdev)
return bdev->bd_read_only || get_disk_ro(bdev->bd_disk);
}
EXPORT_SYMBOL(bdev_read_only);
+
+void inc_diskseq(struct gendisk *disk)
+{
+ disk->diskseq = atomic64_inc_return(&diskseq);
+}
diff --git a/block/holder.c b/block/holder.c
new file mode 100644
index 000000000000..9dc084182337
--- /dev/null
+++ b/block/holder.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/genhd.h>
+
+struct bd_holder_disk {
+ struct list_head list;
+ struct block_device *bdev;
+ int refcnt;
+};
+
+static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
+ struct gendisk *disk)
+{
+ struct bd_holder_disk *holder;
+
+ list_for_each_entry(holder, &disk->slave_bdevs, list)
+ if (holder->bdev == bdev)
+ return holder;
+ return NULL;
+}
+
+static int add_symlink(struct kobject *from, struct kobject *to)
+{
+ return sysfs_create_link(from, to, kobject_name(to));
+}
+
+static void del_symlink(struct kobject *from, struct kobject *to)
+{
+ sysfs_remove_link(from, kobject_name(to));
+}
+
+static int __link_disk_holder(struct block_device *bdev, struct gendisk *disk)
+{
+ int ret;
+
+ ret = add_symlink(disk->slave_dir, bdev_kobj(bdev));
+ if (ret)
+ return ret;
+ ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
+ if (ret)
+ del_symlink(disk->slave_dir, bdev_kobj(bdev));
+ return ret;
+}
+
+/**
+ * bd_link_disk_holder - create symlinks between holding disk and slave bdev
+ * @bdev: the claimed slave bdev
+ * @disk: the holding disk
+ *
+ * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
+ *
+ * This functions creates the following sysfs symlinks.
+ *
+ * - from "slaves" directory of the holder @disk to the claimed @bdev
+ * - from "holders" directory of the @bdev to the holder @disk
+ *
+ * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
+ * passed to bd_link_disk_holder(), then:
+ *
+ * /sys/block/dm-0/slaves/sda --> /sys/block/sda
+ * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
+ *
+ * The caller must have claimed @bdev before calling this function and
+ * ensure that both @bdev and @disk are valid during the creation and
+ * lifetime of these symlinks.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
+{
+ struct bd_holder_disk *holder;
+ int ret = 0;
+
+ mutex_lock(&disk->open_mutex);
+
+ WARN_ON_ONCE(!bdev->bd_holder);
+
+ /* FIXME: remove the following once add_disk() handles errors */
+ if (WARN_ON(!bdev->bd_holder_dir))
+ goto out_unlock;
+
+ holder = bd_find_holder_disk(bdev, disk);
+ if (holder) {
+ holder->refcnt++;
+ goto out_unlock;
+ }
+
+ holder = kzalloc(sizeof(*holder), GFP_KERNEL);
+ if (!holder) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ INIT_LIST_HEAD(&holder->list);
+ holder->bdev = bdev;
+ holder->refcnt = 1;
+ if (disk->slave_dir) {
+ ret = __link_disk_holder(bdev, disk);
+ if (ret) {
+ kfree(holder);
+ goto out_unlock;
+ }
+ }
+
+ list_add(&holder->list, &disk->slave_bdevs);
+ /*
+ * del_gendisk drops the initial reference to bd_holder_dir, so we need
+ * to keep our own here to allow for cleanup past that point.
+ */
+ kobject_get(bdev->bd_holder_dir);
+
+out_unlock:
+ mutex_unlock(&disk->open_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(bd_link_disk_holder);
+
+static void __unlink_disk_holder(struct block_device *bdev,
+ struct gendisk *disk)
+{
+ del_symlink(disk->slave_dir, bdev_kobj(bdev));
+ del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
+}
+
+/**
+ * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
+ * @bdev: the calimed slave bdev
+ * @disk: the holding disk
+ *
+ * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
+ *
+ * CONTEXT:
+ * Might sleep.
+ */
+void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
+{
+ struct bd_holder_disk *holder;
+
+ mutex_lock(&disk->open_mutex);
+ holder = bd_find_holder_disk(bdev, disk);
+ if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
+ if (disk->slave_dir)
+ __unlink_disk_holder(bdev, disk);
+ kobject_put(bdev->bd_holder_dir);
+ list_del_init(&holder->list);
+ kfree(holder);
+ }
+ mutex_unlock(&disk->open_mutex);
+}
+EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
+
+int bd_register_pending_holders(struct gendisk *disk)
+{
+ struct bd_holder_disk *holder;
+ int ret;
+
+ mutex_lock(&disk->open_mutex);
+ list_for_each_entry(holder, &disk->slave_bdevs, list) {
+ ret = __link_disk_holder(holder->bdev, disk);
+ if (ret)
+ goto out_undo;
+ }
+ mutex_unlock(&disk->open_mutex);
+ return 0;
+
+out_undo:
+ list_for_each_entry_continue_reverse(holder, &disk->slave_bdevs, list)
+ __unlink_disk_holder(holder->bdev, disk);
+ mutex_unlock(&disk->open_mutex);
+ return ret;
+}
diff --git a/block/ioctl.c b/block/ioctl.c
index 24beec9ca9c9..eb0491e90b9a 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -16,6 +16,7 @@
static int blkpg_do_ioctl(struct block_device *bdev,
struct blkpg_partition __user *upart, int op)
{
+ struct gendisk *disk = bdev->bd_disk;
struct blkpg_partition p;
long long start, length;
@@ -30,7 +31,7 @@ static int blkpg_do_ioctl(struct block_device *bdev,
return -EINVAL;
if (op == BLKPG_DEL_PARTITION)
- return bdev_del_partition(bdev, p.pno);
+ return bdev_del_partition(disk, p.pno);
start = p.start >> SECTOR_SHIFT;
length = p.length >> SECTOR_SHIFT;
@@ -40,9 +41,9 @@ static int blkpg_do_ioctl(struct block_device *bdev,
/* check if partition is aligned to blocksize */
if (p.start & (bdev_logical_block_size(bdev) - 1))
return -EINVAL;
- return bdev_add_partition(bdev, p.pno, start, length);
+ return bdev_add_partition(disk, p.pno, start, length);
case BLKPG_RESIZE_PARTITION:
- return bdev_resize_partition(bdev, p.pno, start, length);
+ return bdev_resize_partition(disk, p.pno, start, length);
default:
return -EINVAL;
}
@@ -469,6 +470,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
BLKDEV_DISCARD_SECURE);
case BLKZEROOUT:
return blk_ioctl_zeroout(bdev, mode, arg);
+ case BLKGETDISKSEQ:
+ return put_u64(argp, bdev->bd_disk->diskseq);
case BLKREPORTZONE:
return blkdev_report_zones_ioctl(bdev, mode, cmd, arg);
case BLKRESETZONE:
@@ -504,7 +507,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
case BLKFRASET:
if(!capable(CAP_SYS_ADMIN))
return -EACCES;
- bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_SIZE;
+ bdev->bd_disk->bdi->ra_pages = (arg * 512) / PAGE_SIZE;
return 0;
case BLKRRPART:
return blkdev_reread_part(bdev, mode);
@@ -554,7 +557,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case BLKFRAGET:
if (!argp)
return -EINVAL;
- return put_long(argp, (bdev->bd_bdi->ra_pages*PAGE_SIZE) / 512);
+ return put_long(argp,
+ (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512);
case BLKGETSIZE:
size = i_size_read(bdev->bd_inode);
if ((size >> 9) > ~0UL)
@@ -626,7 +630,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
if (!argp)
return -EINVAL;
return compat_put_long(argp,
- (bdev->bd_bdi->ra_pages * PAGE_SIZE) / 512);
+ (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512);
case BLKGETSIZE:
size = i_size_read(bdev->bd_inode);
if ((size >> 9) > ~0UL)
diff --git a/block/ioprio.c b/block/ioprio.c
index bee628f9f1b2..0e4ff245f2bf 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -74,9 +74,8 @@ int ioprio_check_cap(int ioprio)
fallthrough;
/* rt has prio field too */
case IOPRIO_CLASS_BE:
- if (data >= IOPRIO_BE_NR || data < 0)
+ if (data >= IOPRIO_NR_LEVELS || data < 0)
return -EINVAL;
-
break;
case IOPRIO_CLASS_IDLE:
break;
@@ -171,7 +170,7 @@ static int get_task_ioprio(struct task_struct *p)
ret = security_task_getioprio(p);
if (ret)
goto out;
- ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM);
+ ret = IOPRIO_DEFAULT;
task_lock(p);
if (p->io_context)
ret = p->io_context->ioprio;
@@ -183,9 +182,9 @@ out:
int ioprio_best(unsigned short aprio, unsigned short bprio)
{
if (!ioprio_valid(aprio))
- aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
+ aprio = IOPRIO_DEFAULT;
if (!ioprio_valid(bprio))
- bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
+ bprio = IOPRIO_DEFAULT;
return min(aprio, bprio);
}
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 81e3279ecd57..15a8be57203d 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -596,13 +596,13 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
struct list_head *head = &kcq->rq_list[sched_domain];
spin_lock(&kcq->lock);
+ trace_block_rq_insert(rq);
if (at_head)
list_move(&rq->queuelist, head);
else
list_move_tail(&rq->queuelist, head);
sbitmap_set_bit(&khd->kcq_map[sched_domain],
rq->mq_ctx->index_hw[hctx->type]);
- trace_block_rq_insert(rq);
spin_unlock(&kcq->lock);
}
}
diff --git a/block/mq-deadline-cgroup.c b/block/mq-deadline-cgroup.c
deleted file mode 100644
index 3b4bfddec39f..000000000000
--- a/block/mq-deadline-cgroup.c
+++ /dev/null
@@ -1,126 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/blk-cgroup.h>
-#include <linux/ioprio.h>
-
-#include "mq-deadline-cgroup.h"
-
-static struct blkcg_policy dd_blkcg_policy;
-
-static struct blkcg_policy_data *dd_cpd_alloc(gfp_t gfp)
-{
- struct dd_blkcg *pd;
-
- pd = kzalloc(sizeof(*pd), gfp);
- if (!pd)
- return NULL;
- pd->stats = alloc_percpu_gfp(typeof(*pd->stats),
- GFP_KERNEL | __GFP_ZERO);
- if (!pd->stats) {
- kfree(pd);
- return NULL;
- }
- return &pd->cpd;
-}
-
-static void dd_cpd_free(struct blkcg_policy_data *cpd)
-{
- struct dd_blkcg *dd_blkcg = container_of(cpd, typeof(*dd_blkcg), cpd);
-
- free_percpu(dd_blkcg->stats);
- kfree(dd_blkcg);
-}
-
-static struct dd_blkcg *dd_blkcg_from_pd(struct blkg_policy_data *pd)
-{
- return container_of(blkcg_to_cpd(pd->blkg->blkcg, &dd_blkcg_policy),
- struct dd_blkcg, cpd);
-}
-
-/*
- * Convert an association between a block cgroup and a request queue into a
- * pointer to the mq-deadline information associated with a (blkcg, queue) pair.
- */
-struct dd_blkcg *dd_blkcg_from_bio(struct bio *bio)
-{
- struct blkg_policy_data *pd;
-
- pd = blkg_to_pd(bio->bi_blkg, &dd_blkcg_policy);
- if (!pd)
- return NULL;
-
- return dd_blkcg_from_pd(pd);
-}
-
-static size_t dd_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size)
-{
- static const char *const prio_class_name[] = {
- [IOPRIO_CLASS_NONE] = "NONE",
- [IOPRIO_CLASS_RT] = "RT",
- [IOPRIO_CLASS_BE] = "BE",
- [IOPRIO_CLASS_IDLE] = "IDLE",
- };
- struct dd_blkcg *blkcg = dd_blkcg_from_pd(pd);
- int res = 0;
- u8 prio;
-
- for (prio = 0; prio < ARRAY_SIZE(blkcg->stats->stats); prio++)
- res += scnprintf(buf + res, size - res,
- " [%s] dispatched=%u inserted=%u merged=%u",
- prio_class_name[prio],
- ddcg_sum(blkcg, dispatched, prio) +
- ddcg_sum(blkcg, merged, prio) -
- ddcg_sum(blkcg, completed, prio),
- ddcg_sum(blkcg, inserted, prio) -
- ddcg_sum(blkcg, completed, prio),
- ddcg_sum(blkcg, merged, prio));
-
- return res;
-}
-
-static struct blkg_policy_data *dd_pd_alloc(gfp_t gfp, struct request_queue *q,
- struct blkcg *blkcg)
-{
- struct dd_blkg *pd;
-
- pd = kzalloc(sizeof(*pd), gfp);
- if (!pd)
- return NULL;
- return &pd->pd;
-}
-
-static void dd_pd_free(struct blkg_policy_data *pd)
-{
- struct dd_blkg *dd_blkg = container_of(pd, typeof(*dd_blkg), pd);
-
- kfree(dd_blkg);
-}
-
-static struct blkcg_policy dd_blkcg_policy = {
- .cpd_alloc_fn = dd_cpd_alloc,
- .cpd_free_fn = dd_cpd_free,
-
- .pd_alloc_fn = dd_pd_alloc,
- .pd_free_fn = dd_pd_free,
- .pd_stat_fn = dd_pd_stat,
-};
-
-int dd_activate_policy(struct request_queue *q)
-{
- return blkcg_activate_policy(q, &dd_blkcg_policy);
-}
-
-void dd_deactivate_policy(struct request_queue *q)
-{
- blkcg_deactivate_policy(q, &dd_blkcg_policy);
-}
-
-int __init dd_blkcg_init(void)
-{
- return blkcg_policy_register(&dd_blkcg_policy);
-}
-
-void __exit dd_blkcg_exit(void)
-{
- blkcg_policy_unregister(&dd_blkcg_policy);
-}
diff --git a/block/mq-deadline-cgroup.h b/block/mq-deadline-cgroup.h
deleted file mode 100644
index 0143fd74f3ce..000000000000
--- a/block/mq-deadline-cgroup.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#if !defined(_MQ_DEADLINE_CGROUP_H_)
-#define _MQ_DEADLINE_CGROUP_H_
-
-#include <linux/blk-cgroup.h>
-
-struct request_queue;
-
-/**
- * struct io_stats_per_prio - I/O statistics per I/O priority class.
- * @inserted: Number of inserted requests.
- * @merged: Number of merged requests.
- * @dispatched: Number of dispatched requests.
- * @completed: Number of I/O completions.
- */
-struct io_stats_per_prio {
- local_t inserted;
- local_t merged;
- local_t dispatched;
- local_t completed;
-};
-
-/* I/O statistics per I/O cgroup per I/O priority class (IOPRIO_CLASS_*). */
-struct blkcg_io_stats {
- struct io_stats_per_prio stats[4];
-};
-
-/**
- * struct dd_blkcg - Per cgroup data.
- * @cpd: blkcg_policy_data structure.
- * @stats: I/O statistics.
- */
-struct dd_blkcg {
- struct blkcg_policy_data cpd; /* must be the first member */
- struct blkcg_io_stats __percpu *stats;
-};
-
-/*
- * Count one event of type 'event_type' and with I/O priority class
- * 'prio_class'.
- */
-#define ddcg_count(ddcg, event_type, prio_class) do { \
-if (ddcg) { \
- struct blkcg_io_stats *io_stats = get_cpu_ptr((ddcg)->stats); \
- \
- BUILD_BUG_ON(!__same_type((ddcg), struct dd_blkcg *)); \
- BUILD_BUG_ON(!__same_type((prio_class), u8)); \
- local_inc(&io_stats->stats[(prio_class)].event_type); \
- put_cpu_ptr(io_stats); \
-} \
-} while (0)
-
-/*
- * Returns the total number of ddcg_count(ddcg, event_type, prio_class) calls
- * across all CPUs. No locking or barriers since it is fine if the returned
- * sum is slightly outdated.
- */
-#define ddcg_sum(ddcg, event_type, prio) ({ \
- unsigned int cpu; \
- u32 sum = 0; \
- \
- BUILD_BUG_ON(!__same_type((ddcg), struct dd_blkcg *)); \
- BUILD_BUG_ON(!__same_type((prio), u8)); \
- for_each_present_cpu(cpu) \
- sum += local_read(&per_cpu_ptr((ddcg)->stats, cpu)-> \
- stats[(prio)].event_type); \
- sum; \
-})
-
-#ifdef CONFIG_BLK_CGROUP
-
-/**
- * struct dd_blkg - Per (cgroup, request queue) data.
- * @pd: blkg_policy_data structure.
- */
-struct dd_blkg {
- struct blkg_policy_data pd; /* must be the first member */
-};
-
-struct dd_blkcg *dd_blkcg_from_bio(struct bio *bio);
-int dd_activate_policy(struct request_queue *q);
-void dd_deactivate_policy(struct request_queue *q);
-int __init dd_blkcg_init(void);
-void __exit dd_blkcg_exit(void);
-
-#else /* CONFIG_BLK_CGROUP */
-
-static inline struct dd_blkcg *dd_blkcg_from_bio(struct bio *bio)
-{
- return NULL;
-}
-
-static inline int dd_activate_policy(struct request_queue *q)
-{
- return 0;
-}
-
-static inline void dd_deactivate_policy(struct request_queue *q)
-{
-}
-
-static inline int dd_blkcg_init(void)
-{
- return 0;
-}
-
-static inline void dd_blkcg_exit(void)
-{
-}
-
-#endif /* CONFIG_BLK_CGROUP */
-
-#endif /* _MQ_DEADLINE_CGROUP_H_ */
diff --git a/block/mq-deadline-main.c b/block/mq-deadline.c
index 6f612e6dc82b..3c3693c34f06 100644
--- a/block/mq-deadline-main.c
+++ b/block/mq-deadline.c
@@ -25,18 +25,12 @@
#include "blk-mq-debugfs.h"
#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
-#include "mq-deadline-cgroup.h"
/*
* See Documentation/block/deadline-iosched.rst
*/
static const int read_expire = HZ / 2; /* max time before a read is submitted. */
static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
-/*
- * Time after which to dispatch lower priority requests even if higher
- * priority requests are pending.
- */
-static const int aging_expire = 10 * HZ;
static const int writes_starved = 2; /* max times reads can starve a write */
static const int fifo_batch = 16; /* # of sequential requests treated as one
by the above parameters. For throughput. */
@@ -57,6 +51,14 @@ enum dd_prio {
enum { DD_PRIO_COUNT = 3 };
+/* I/O statistics per I/O priority. */
+struct io_stats_per_prio {
+ local_t inserted;
+ local_t merged;
+ local_t dispatched;
+ local_t completed;
+};
+
/* I/O statistics for all I/O priorities (enum dd_prio). */
struct io_stats {
struct io_stats_per_prio stats[DD_PRIO_COUNT];
@@ -79,9 +81,6 @@ struct deadline_data {
* run time data
*/
- /* Request queue that owns this data structure. */
- struct request_queue *queue;
-
struct dd_per_prio per_prio[DD_PRIO_COUNT];
/* Data direction of latest dispatched request. */
@@ -99,7 +98,6 @@ struct deadline_data {
int writes_starved;
int front_merges;
u32 async_depth;
- int aging_expire;
spinlock_t lock;
spinlock_t zone_lock;
@@ -234,10 +232,8 @@ static void dd_merged_requests(struct request_queue *q, struct request *req,
struct deadline_data *dd = q->elevator->elevator_data;
const u8 ioprio_class = dd_rq_ioclass(next);
const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
- struct dd_blkcg *blkcg = next->elv.priv[0];
dd_count(dd, merged, prio);
- ddcg_count(blkcg, merged, ioprio_class);
/*
* if next expires before rq, assign its expire time to rq
@@ -367,15 +363,13 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
/*
* deadline_dispatch_requests selects the best request according to
- * read/write expire, fifo_batch, etc and with a start time <= @latest.
+ * read/write expire, fifo_batch, etc
*/
static struct request *__dd_dispatch_request(struct deadline_data *dd,
- struct dd_per_prio *per_prio,
- u64 latest_start_ns)
+ struct dd_per_prio *per_prio)
{
struct request *rq, *next_rq;
enum dd_data_dir data_dir;
- struct dd_blkcg *blkcg;
enum dd_prio prio;
u8 ioprio_class;
@@ -384,8 +378,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
if (!list_empty(&per_prio->dispatch)) {
rq = list_first_entry(&per_prio->dispatch, struct request,
queuelist);
- if (rq->start_time_ns > latest_start_ns)
- return NULL;
list_del_init(&rq->queuelist);
goto done;
}
@@ -463,8 +455,6 @@ dispatch_find_request:
dd->batching = 0;
dispatch_request:
- if (rq->start_time_ns > latest_start_ns)
- return NULL;
/*
* rq is the selected appropriate request.
*/
@@ -474,8 +464,6 @@ done:
ioprio_class = dd_rq_ioclass(rq);
prio = ioprio_class_to_prio[ioprio_class];
dd_count(dd, dispatched, prio);
- blkcg = rq->elv.priv[0];
- ddcg_count(blkcg, dispatched, ioprio_class);
/*
* If the request needs its target zone locked, do it.
*/
@@ -495,32 +483,15 @@ done:
static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
- const u64 now_ns = ktime_get_ns();
- struct request *rq = NULL;
+ struct request *rq;
enum dd_prio prio;
spin_lock(&dd->lock);
- /*
- * Start with dispatching requests whose deadline expired more than
- * aging_expire jiffies ago.
- */
- for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) {
- rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now_ns -
- jiffies_to_nsecs(dd->aging_expire));
- if (rq)
- goto unlock;
- }
- /*
- * Next, dispatch requests in priority order. Ignore lower priority
- * requests if any higher priority requests are pending.
- */
for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
- rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now_ns);
- if (rq || dd_queued(dd, prio))
+ rq = __dd_dispatch_request(dd, &dd->per_prio[prio]);
+ if (rq)
break;
}
-
-unlock:
spin_unlock(&dd->lock);
return rq;
@@ -569,8 +540,6 @@ static void dd_exit_sched(struct elevator_queue *e)
struct deadline_data *dd = e->elevator_data;
enum dd_prio prio;
- dd_deactivate_policy(dd->queue);
-
for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
struct dd_per_prio *per_prio = &dd->per_prio[prio];
@@ -584,7 +553,7 @@ static void dd_exit_sched(struct elevator_queue *e)
}
/*
- * Initialize elevator private data (deadline_data) and associate with blkcg.
+ * initialize elevator private data (deadline_data).
*/
static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
{
@@ -593,12 +562,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
enum dd_prio prio;
int ret = -ENOMEM;
- /*
- * Initialization would be very tricky if the queue is not frozen,
- * hence the warning statement below.
- */
- WARN_ON_ONCE(!percpu_ref_is_zero(&q->q_usage_counter));
-
eq = elevator_alloc(q, e);
if (!eq)
return ret;
@@ -614,8 +577,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
if (!dd->stats)
goto free_dd;
- dd->queue = q;
-
for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
struct dd_per_prio *per_prio = &dd->per_prio[prio];
@@ -631,21 +592,12 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
dd->front_merges = 1;
dd->last_dir = DD_WRITE;
dd->fifo_batch = fifo_batch;
- dd->aging_expire = aging_expire;
spin_lock_init(&dd->lock);
spin_lock_init(&dd->zone_lock);
- ret = dd_activate_policy(q);
- if (ret)
- goto free_stats;
-
- ret = 0;
q->elevator = eq;
return 0;
-free_stats:
- free_percpu(dd->stats);
-
free_dd:
kfree(dd);
@@ -677,6 +629,8 @@ static int dd_request_merge(struct request_queue *q, struct request **rq,
if (elv_bio_merge_ok(__rq, bio)) {
*rq = __rq;
+ if (blk_discard_mergable(__rq))
+ return ELEVATOR_DISCARD_MERGE;
return ELEVATOR_FRONT_MERGE;
}
}
@@ -718,7 +672,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio);
struct dd_per_prio *per_prio;
enum dd_prio prio;
- struct dd_blkcg *blkcg;
LIST_HEAD(free);
lockdep_assert_held(&dd->lock);
@@ -729,18 +682,9 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
*/
blk_req_zone_write_unlock(rq);
- /*
- * If a block cgroup has been associated with the submitter and if an
- * I/O priority has been set in the associated block cgroup, use the
- * lowest of the cgroup priority and the request priority for the
- * request. If no priority has been set in the request, use the cgroup
- * priority.
- */
prio = ioprio_class_to_prio[ioprio_class];
dd_count(dd, inserted, prio);
- blkcg = dd_blkcg_from_bio(rq->bio);
- ddcg_count(blkcg, inserted, ioprio_class);
- rq->elv.priv[0] = blkcg;
+ rq->elv.priv[0] = (void *)(uintptr_t)1;
if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
blk_mq_free_requests(&free);
@@ -815,13 +759,18 @@ static void dd_finish_request(struct request *rq)
{
struct request_queue *q = rq->q;
struct deadline_data *dd = q->elevator->elevator_data;
- struct dd_blkcg *blkcg = rq->elv.priv[0];
const u8 ioprio_class = dd_rq_ioclass(rq);
const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
struct dd_per_prio *per_prio = &dd->per_prio[prio];
- dd_count(dd, completed, prio);
- ddcg_count(blkcg, completed, ioprio_class);
+ /*
+ * The block layer core may call dd_finish_request() without having
+ * called dd_insert_requests(). Hence only update statistics for
+ * requests for which dd_insert_requests() has been called. See also
+ * blk_mq_request_bypass_insert().
+ */
+ if (rq->elv.priv[0])
+ dd_count(dd, completed, prio);
if (blk_queue_is_zoned(q)) {
unsigned long flags;
@@ -866,7 +815,6 @@ static ssize_t __FUNC(struct elevator_queue *e, char *page) \
#define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR))
SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]);
SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
-SHOW_JIFFIES(deadline_aging_expire_show, dd->aging_expire);
SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
SHOW_INT(deadline_front_merges_show, dd->front_merges);
SHOW_INT(deadline_async_depth_show, dd->front_merges);
@@ -896,7 +844,6 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)
STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies)
STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX);
STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
-STORE_JIFFIES(deadline_aging_expire_store, &dd->aging_expire, 0, INT_MAX);
STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
STORE_INT(deadline_async_depth_store, &dd->front_merges, 1, INT_MAX);
@@ -915,7 +862,6 @@ static struct elv_fs_entry deadline_attrs[] = {
DD_ATTR(front_merges),
DD_ATTR(async_depth),
DD_ATTR(fifo_batch),
- DD_ATTR(aging_expire),
__ATTR_NULL
};
@@ -1144,26 +1090,11 @@ MODULE_ALIAS("mq-deadline-iosched");
static int __init deadline_init(void)
{
- int ret;
-
- ret = elv_register(&mq_deadline);
- if (ret)
- goto out;
- ret = dd_blkcg_init();
- if (ret)
- goto unreg;
-
-out:
- return ret;
-
-unreg:
- elv_unregister(&mq_deadline);
- goto out;
+ return elv_register(&mq_deadline);
}
static void __exit deadline_exit(void)
{
- dd_blkcg_exit();
elv_unregister(&mq_deadline);
}
diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig
index 6e2a649669e5..278593b8e4e9 100644
--- a/block/partitions/Kconfig
+++ b/block/partitions/Kconfig
@@ -264,7 +264,6 @@ config SYSV68_PARTITION
config CMDLINE_PARTITION
bool "Command line partition support" if PARTITION_ADVANCED
- select BLK_CMDLINE_PARSER
help
Say Y here if you want to read the partition table from bootargs.
The format for the command line is just like mtdparts.
diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c
index c64c57b958bf..2c381c694c57 100644
--- a/block/partitions/acorn.c
+++ b/block/partitions/acorn.c
@@ -275,7 +275,7 @@ int adfspart_check_ADFS(struct parsed_partitions *state)
/*
* Work out start of non-adfs partition.
*/
- nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect;
+ nr_sects = get_capacity(state->disk) - start_sect;
if (start_sect) {
switch (id) {
@@ -540,7 +540,7 @@ int adfspart_check_EESOX(struct parsed_partitions *state)
if (i != 0) {
sector_t size;
- size = get_capacity(state->bdev->bd_disk);
+ size = get_capacity(state->disk);
put_partition(state, slot++, start, size - start);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
}
diff --git a/block/partitions/aix.c b/block/partitions/aix.c
index c7b4fd1a4a97..85f4b967565e 100644
--- a/block/partitions/aix.c
+++ b/block/partitions/aix.c
@@ -67,29 +67,13 @@ struct pvd {
#define LVM_MAXLVS 256
/**
- * last_lba(): return number of last logical block of device
- * @bdev: block device
- *
- * Description: Returns last LBA value on success, 0 on error.
- * This is stored (by sd and ide-geometry) in
- * the part[0] entry for this disk, and is the number of
- * physical sectors available on the disk.
- */
-static u64 last_lba(struct block_device *bdev)
-{
- if (!bdev || !bdev->bd_inode)
- return 0;
- return (bdev->bd_inode->i_size >> 9) - 1ULL;
-}
-
-/**
* read_lba(): Read bytes from disk, starting at given LBA
* @state
* @lba
* @buffer
* @count
*
- * Description: Reads @count bytes from @state->bdev into @buffer.
+ * Description: Reads @count bytes from @state->disk into @buffer.
* Returns number of bytes read on success, 0 on error.
*/
static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer,
@@ -97,7 +81,7 @@ static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer,
{
size_t totalreadcount = 0;
- if (!buffer || lba + count / 512 > last_lba(state->bdev))
+ if (!buffer || lba + count / 512 > get_capacity(state->disk) - 1ULL)
return 0;
while (count) {
diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c
index 9526491d9aed..5c8624e26a54 100644
--- a/block/partitions/amiga.c
+++ b/block/partitions/amiga.c
@@ -34,7 +34,6 @@ int amiga_partition(struct parsed_partitions *state)
int start_sect, nr_sects, blk, part, res = 0;
int blksize = 1; /* Multiplier for disk block size */
int slot = 1;
- char b[BDEVNAME_SIZE];
for (blk = 0; ; blk++, put_dev_sector(sect)) {
if (blk == RDB_ALLOCATION_LIMIT)
@@ -42,7 +41,7 @@ int amiga_partition(struct parsed_partitions *state)
data = read_part_sector(state, blk, &sect);
if (!data) {
pr_err("Dev %s: unable to read RDB block %d\n",
- bdevname(state->bdev, b), blk);
+ state->disk->disk_name, blk);
res = -1;
goto rdb_done;
}
@@ -64,7 +63,7 @@ int amiga_partition(struct parsed_partitions *state)
}
pr_err("Dev %s: RDB in block %d has bad checksum\n",
- bdevname(state->bdev, b), blk);
+ state->disk->disk_name, blk);
}
/* blksize is blocks per 512 byte standard block */
@@ -84,7 +83,7 @@ int amiga_partition(struct parsed_partitions *state)
data = read_part_sector(state, blk, &sect);
if (!data) {
pr_err("Dev %s: unable to read partition block %d\n",
- bdevname(state->bdev, b), blk);
+ state->disk->disk_name, blk);
res = -1;
goto rdb_done;
}
diff --git a/block/partitions/atari.c b/block/partitions/atari.c
index 2305840c8522..da5994175416 100644
--- a/block/partitions/atari.c
+++ b/block/partitions/atari.c
@@ -47,7 +47,7 @@ int atari_partition(struct parsed_partitions *state)
* ATARI partition scheme supports 512 lba only. If this is not
* the case, bail early to avoid miscalculating hd_size.
*/
- if (bdev_logical_block_size(state->bdev) != 512)
+ if (queue_logical_block_size(state->disk->queue) != 512)
return 0;
rs = read_part_sector(state, 0, &sect);
@@ -55,7 +55,7 @@ int atari_partition(struct parsed_partitions *state)
return -1;
/* Verify this is an Atari rootsector: */
- hd_size = state->bdev->bd_inode->i_size >> 9;
+ hd_size = get_capacity(state->disk);
if (!VALID_PARTITION(&rs->part[0], hd_size) &&
!VALID_PARTITION(&rs->part[1], hd_size) &&
!VALID_PARTITION(&rs->part[2], hd_size) &&
diff --git a/block/partitions/check.h b/block/partitions/check.h
index c577e9ee67f0..d5b28e309d64 100644
--- a/block/partitions/check.h
+++ b/block/partitions/check.h
@@ -9,7 +9,7 @@
* description.
*/
struct parsed_partitions {
- struct block_device *bdev;
+ struct gendisk *disk;
char name[BDEVNAME_SIZE];
struct {
sector_t from;
diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c
index 8f545c36cde4..1af610f0ba8c 100644
--- a/block/partitions/cmdline.c
+++ b/block/partitions/cmdline.c
@@ -14,20 +14,248 @@
* For further information, see "Documentation/block/cmdline-partition.rst"
*
*/
+#include <linux/blkdev.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "check.h"
-#include <linux/cmdline-parser.h>
-#include "check.h"
+/* partition flags */
+#define PF_RDONLY 0x01 /* Device is read only */
+#define PF_POWERUP_LOCK 0x02 /* Always locked after reset */
+
+struct cmdline_subpart {
+ char name[BDEVNAME_SIZE]; /* partition name, such as 'rootfs' */
+ sector_t from;
+ sector_t size;
+ int flags;
+ struct cmdline_subpart *next_subpart;
+};
+
+struct cmdline_parts {
+ char name[BDEVNAME_SIZE]; /* block device, such as 'mmcblk0' */
+ unsigned int nr_subparts;
+ struct cmdline_subpart *subpart;
+ struct cmdline_parts *next_parts;
+};
+
+static int parse_subpart(struct cmdline_subpart **subpart, char *partdef)
+{
+ int ret = 0;
+ struct cmdline_subpart *new_subpart;
+
+ *subpart = NULL;
+
+ new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL);
+ if (!new_subpart)
+ return -ENOMEM;
+
+ if (*partdef == '-') {
+ new_subpart->size = (sector_t)(~0ULL);
+ partdef++;
+ } else {
+ new_subpart->size = (sector_t)memparse(partdef, &partdef);
+ if (new_subpart->size < (sector_t)PAGE_SIZE) {
+ pr_warn("cmdline partition size is invalid.");
+ ret = -EINVAL;
+ goto fail;
+ }
+ }
+
+ if (*partdef == '@') {
+ partdef++;
+ new_subpart->from = (sector_t)memparse(partdef, &partdef);
+ } else {
+ new_subpart->from = (sector_t)(~0ULL);
+ }
+
+ if (*partdef == '(') {
+ int length;
+ char *next = strchr(++partdef, ')');
+
+ if (!next) {
+ pr_warn("cmdline partition format is invalid.");
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ length = min_t(int, next - partdef,
+ sizeof(new_subpart->name) - 1);
+ strncpy(new_subpart->name, partdef, length);
+ new_subpart->name[length] = '\0';
+
+ partdef = ++next;
+ } else
+ new_subpart->name[0] = '\0';
+
+ new_subpart->flags = 0;
+
+ if (!strncmp(partdef, "ro", 2)) {
+ new_subpart->flags |= PF_RDONLY;
+ partdef += 2;
+ }
+
+ if (!strncmp(partdef, "lk", 2)) {
+ new_subpart->flags |= PF_POWERUP_LOCK;
+ partdef += 2;
+ }
+
+ *subpart = new_subpart;
+ return 0;
+fail:
+ kfree(new_subpart);
+ return ret;
+}
+
+static void free_subpart(struct cmdline_parts *parts)
+{
+ struct cmdline_subpart *subpart;
+
+ while (parts->subpart) {
+ subpart = parts->subpart;
+ parts->subpart = subpart->next_subpart;
+ kfree(subpart);
+ }
+}
+
+static int parse_parts(struct cmdline_parts **parts, const char *bdevdef)
+{
+ int ret = -EINVAL;
+ char *next;
+ int length;
+ struct cmdline_subpart **next_subpart;
+ struct cmdline_parts *newparts;
+ char buf[BDEVNAME_SIZE + 32 + 4];
+
+ *parts = NULL;
+
+ newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL);
+ if (!newparts)
+ return -ENOMEM;
+
+ next = strchr(bdevdef, ':');
+ if (!next) {
+ pr_warn("cmdline partition has no block device.");
+ goto fail;
+ }
+
+ length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1);
+ strncpy(newparts->name, bdevdef, length);
+ newparts->name[length] = '\0';
+ newparts->nr_subparts = 0;
+
+ next_subpart = &newparts->subpart;
+
+ while (next && *(++next)) {
+ bdevdef = next;
+ next = strchr(bdevdef, ',');
+
+ length = (!next) ? (sizeof(buf) - 1) :
+ min_t(int, next - bdevdef, sizeof(buf) - 1);
+
+ strncpy(buf, bdevdef, length);
+ buf[length] = '\0';
+
+ ret = parse_subpart(next_subpart, buf);
+ if (ret)
+ goto fail;
+
+ newparts->nr_subparts++;
+ next_subpart = &(*next_subpart)->next_subpart;
+ }
+
+ if (!newparts->subpart) {
+ pr_warn("cmdline partition has no valid partition.");
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ *parts = newparts;
+
+ return 0;
+fail:
+ free_subpart(newparts);
+ kfree(newparts);
+ return ret;
+}
+
+static void cmdline_parts_free(struct cmdline_parts **parts)
+{
+ struct cmdline_parts *next_parts;
+
+ while (*parts) {
+ next_parts = (*parts)->next_parts;
+ free_subpart(*parts);
+ kfree(*parts);
+ *parts = next_parts;
+ }
+}
+
+static int cmdline_parts_parse(struct cmdline_parts **parts,
+ const char *cmdline)
+{
+ int ret;
+ char *buf;
+ char *pbuf;
+ char *next;
+ struct cmdline_parts **next_parts;
+
+ *parts = NULL;
+
+ next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ next_parts = parts;
+
+ while (next && *pbuf) {
+ next = strchr(pbuf, ';');
+ if (next)
+ *next = '\0';
+
+ ret = parse_parts(next_parts, pbuf);
+ if (ret)
+ goto fail;
+
+ if (next)
+ pbuf = ++next;
+
+ next_parts = &(*next_parts)->next_parts;
+ }
+
+ if (!*parts) {
+ pr_warn("cmdline partition has no valid partition.");
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ ret = 0;
+done:
+ kfree(buf);
+ return ret;
+
+fail:
+ cmdline_parts_free(parts);
+ goto done;
+}
+
+static struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts,
+ const char *bdev)
+{
+ while (parts && strncmp(bdev, parts->name, sizeof(parts->name)))
+ parts = parts->next_parts;
+ return parts;
+}
static char *cmdline;
static struct cmdline_parts *bdev_parts;
-static int add_part(int slot, struct cmdline_subpart *subpart, void *param)
+static int add_part(int slot, struct cmdline_subpart *subpart,
+ struct parsed_partitions *state)
{
int label_min;
struct partition_meta_info *info;
char tmp[sizeof(info->volname) + 4];
- struct parsed_partitions *state = (struct parsed_partitions *)param;
if (slot >= state->limit)
return 1;
@@ -50,6 +278,35 @@ static int add_part(int slot, struct cmdline_subpart *subpart, void *param)
return 0;
}
+static int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size,
+ struct parsed_partitions *state)
+{
+ sector_t from = 0;
+ struct cmdline_subpart *subpart;
+ int slot = 1;
+
+ for (subpart = parts->subpart; subpart;
+ subpart = subpart->next_subpart, slot++) {
+ if (subpart->from == (sector_t)(~0ULL))
+ subpart->from = from;
+ else
+ from = subpart->from;
+
+ if (from >= disk_size)
+ break;
+
+ if (subpart->size > (disk_size - from))
+ subpart->size = disk_size - from;
+
+ from += subpart->size;
+
+ if (add_part(slot, subpart, state))
+ break;
+ }
+
+ return slot;
+}
+
static int __init cmdline_parts_setup(char *s)
{
cmdline = s;
@@ -123,7 +380,6 @@ static void cmdline_parts_verifier(int slot, struct parsed_partitions *state)
int cmdline_partition(struct parsed_partitions *state)
{
sector_t disk_size;
- char bdev[BDEVNAME_SIZE];
struct cmdline_parts *parts;
if (cmdline) {
@@ -140,14 +396,13 @@ int cmdline_partition(struct parsed_partitions *state)
if (!bdev_parts)
return 0;
- bdevname(state->bdev, bdev);
- parts = cmdline_parts_find(bdev_parts, bdev);
+ parts = cmdline_parts_find(bdev_parts, state->disk->disk_name);
if (!parts)
return 0;
- disk_size = get_capacity(state->bdev->bd_disk) << 9;
+ disk_size = get_capacity(state->disk) << 9;
- cmdline_parts_set(parts, disk_size, 1, add_part, (void *)state);
+ cmdline_parts_set(parts, disk_size, state);
cmdline_parts_verifier(1, state);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 4230d4f71879..58c4c362c94f 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -135,8 +135,8 @@ static struct parsed_partitions *check_partition(struct gendisk *hd)
}
state->pp_buf[0] = '\0';
- state->bdev = hd->part0;
- disk_name(hd, 0, state->name);
+ state->disk = hd;
+ snprintf(state->name, BDEVNAME_SIZE, "%s", hd->disk_name);
snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name);
if (isdigit(state->name[strlen(state->name)-1]))
sprintf(state->name, "p");
@@ -259,9 +259,8 @@ static const struct attribute_group *part_attr_groups[] = {
static void part_release(struct device *dev)
{
- if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR)
- blk_free_ext_minor(MINOR(dev->devt));
- bdput(dev_to_bdev(dev));
+ put_disk(dev_to_bdev(dev)->bd_disk);
+ iput(dev_to_bdev(dev)->bd_inode);
}
static int part_uevent(struct device *dev, struct kobj_uevent_env *env)
@@ -281,12 +280,10 @@ struct device_type part_type = {
.uevent = part_uevent,
};
-/*
- * Must be called either with open_mutex held, before a disk can be opened or
- * after all disk users are gone.
- */
static void delete_partition(struct block_device *part)
{
+ lockdep_assert_held(&part->bd_disk->open_mutex);
+
fsync_bdev(part);
__invalidate_device(part, true);
@@ -351,20 +348,17 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
if (xa_load(&disk->part_tbl, partno))
return ERR_PTR(-EBUSY);
+ /* ensure we always have a reference to the whole disk */
+ get_device(disk_to_dev(disk));
+
+ err = -ENOMEM;
bdev = bdev_alloc(disk, partno);
if (!bdev)
- return ERR_PTR(-ENOMEM);
+ goto out_put_disk;
bdev->bd_start_sect = start;
bdev_set_nr_sectors(bdev, len);
- if (info) {
- err = -ENOMEM;
- bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL);
- if (!bdev->bd_meta_info)
- goto out_bdput;
- }
-
pdev = &bdev->bd_device;
dname = dev_name(ddev);
if (isdigit(dname[strlen(dname) - 1]))
@@ -388,6 +382,13 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
}
pdev->devt = devt;
+ if (info) {
+ err = -ENOMEM;
+ bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL);
+ if (!bdev->bd_meta_info)
+ goto out_put;
+ }
+
/* delay uevent until 'holders' subdir is created */
dev_set_uevent_suppress(pdev, 1);
err = device_add(pdev);
@@ -417,14 +418,13 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
kobject_uevent(&pdev->kobj, KOBJ_ADD);
return bdev;
-out_bdput:
- bdput(bdev);
- return ERR_PTR(err);
out_del:
kobject_put(bdev->bd_holder_dir);
device_del(pdev);
out_put:
put_device(pdev);
+out_put_disk:
+ put_disk(disk);
return ERR_PTR(err);
}
@@ -449,15 +449,14 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start,
return overlap;
}
-int bdev_add_partition(struct block_device *bdev, int partno,
- sector_t start, sector_t length)
+int bdev_add_partition(struct gendisk *disk, int partno, sector_t start,
+ sector_t length)
{
struct block_device *part;
- struct gendisk *disk = bdev->bd_disk;
int ret;
mutex_lock(&disk->open_mutex);
- if (!(disk->flags & GENHD_FL_UP)) {
+ if (!disk_live(disk)) {
ret = -ENXIO;
goto out;
}
@@ -475,13 +474,13 @@ out:
return ret;
}
-int bdev_del_partition(struct block_device *bdev, int partno)
+int bdev_del_partition(struct gendisk *disk, int partno)
{
struct block_device *part = NULL;
int ret = -ENXIO;
- mutex_lock(&bdev->bd_disk->open_mutex);
- part = xa_load(&bdev->bd_disk->part_tbl, partno);
+ mutex_lock(&disk->open_mutex);
+ part = xa_load(&disk->part_tbl, partno);
if (!part)
goto out_unlock;
@@ -492,18 +491,18 @@ int bdev_del_partition(struct block_device *bdev, int partno)
delete_partition(part);
ret = 0;
out_unlock:
- mutex_unlock(&bdev->bd_disk->open_mutex);
+ mutex_unlock(&disk->open_mutex);
return ret;
}
-int bdev_resize_partition(struct block_device *bdev, int partno,
- sector_t start, sector_t length)
+int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start,
+ sector_t length)
{
struct block_device *part = NULL;
int ret = -ENXIO;
- mutex_lock(&bdev->bd_disk->open_mutex);
- part = xa_load(&bdev->bd_disk->part_tbl, partno);
+ mutex_lock(&disk->open_mutex);
+ part = xa_load(&disk->part_tbl, partno);
if (!part)
goto out_unlock;
@@ -512,14 +511,14 @@ int bdev_resize_partition(struct block_device *bdev, int partno,
goto out_unlock;
ret = -EBUSY;
- if (partition_overlaps(bdev->bd_disk, start, length, partno))
+ if (partition_overlaps(disk, start, length, partno))
goto out_unlock;
bdev_set_nr_sectors(part, length);
ret = 0;
out_unlock:
- mutex_unlock(&bdev->bd_disk->open_mutex);
+ mutex_unlock(&disk->open_mutex);
return ret;
}
@@ -667,7 +666,7 @@ int bdev_disk_changed(struct gendisk *disk, bool invalidate)
lockdep_assert_held(&disk->open_mutex);
- if (!(disk->flags & GENHD_FL_UP))
+ if (!disk_live(disk))
return -ENXIO;
rescan:
@@ -715,10 +714,10 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed);
void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p)
{
- struct address_space *mapping = state->bdev->bd_inode->i_mapping;
+ struct address_space *mapping = state->disk->part0->bd_inode->i_mapping;
struct page *page;
- if (n >= get_capacity(state->bdev->bd_disk)) {
+ if (n >= get_capacity(state->disk)) {
state->access_beyond_eod = true;
return NULL;
}
diff --git a/block/partitions/efi.c b/block/partitions/efi.c
index e2716792ecc1..7ca5c4c374d4 100644
--- a/block/partitions/efi.c
+++ b/block/partitions/efi.c
@@ -124,19 +124,17 @@ efi_crc32(const void *buf, unsigned long len)
/**
* last_lba(): return number of last logical block of device
- * @bdev: block device
+ * @disk: block device
*
* Description: Returns last LBA value on success, 0 on error.
* This is stored (by sd and ide-geometry) in
* the part[0] entry for this disk, and is the number of
* physical sectors available on the disk.
*/
-static u64 last_lba(struct block_device *bdev)
+static u64 last_lba(struct gendisk *disk)
{
- if (!bdev || !bdev->bd_inode)
- return 0;
- return div_u64(bdev->bd_inode->i_size,
- bdev_logical_block_size(bdev)) - 1ULL;
+ return div_u64(disk->part0->bd_inode->i_size,
+ queue_logical_block_size(disk->queue)) - 1ULL;
}
static inline int pmbr_part_valid(gpt_mbr_record *part)
@@ -231,17 +229,17 @@ done:
* @buffer: destination buffer
* @count: bytes to read
*
- * Description: Reads @count bytes from @state->bdev into @buffer.
+ * Description: Reads @count bytes from @state->disk into @buffer.
* Returns number of bytes read on success, 0 on error.
*/
static size_t read_lba(struct parsed_partitions *state,
u64 lba, u8 *buffer, size_t count)
{
size_t totalreadcount = 0;
- struct block_device *bdev = state->bdev;
- sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
+ sector_t n = lba *
+ (queue_logical_block_size(state->disk->queue) / 512);
- if (!buffer || lba > last_lba(bdev))
+ if (!buffer || lba > last_lba(state->disk))
return 0;
while (count) {
@@ -302,14 +300,14 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
* @lba: the Logical Block Address of the partition table
*
* Description: returns GPT header on success, NULL on error. Allocates
- * and fills a GPT header starting at @ from @state->bdev.
+ * and fills a GPT header starting at @ from @state->disk.
* Note: remember to free gpt when finished with it.
*/
static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
u64 lba)
{
gpt_header *gpt;
- unsigned ssz = bdev_logical_block_size(state->bdev);
+ unsigned ssz = queue_logical_block_size(state->disk->queue);
gpt = kmalloc(ssz, GFP_KERNEL);
if (!gpt)
@@ -356,10 +354,10 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
/* Check the GUID Partition Table header size is too big */
if (le32_to_cpu((*gpt)->header_size) >
- bdev_logical_block_size(state->bdev)) {
+ queue_logical_block_size(state->disk->queue)) {
pr_debug("GUID Partition Table Header size is too large: %u > %u\n",
le32_to_cpu((*gpt)->header_size),
- bdev_logical_block_size(state->bdev));
+ queue_logical_block_size(state->disk->queue));
goto fail;
}
@@ -395,7 +393,7 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
/* Check the first_usable_lba and last_usable_lba are
* within the disk.
*/
- lastlba = last_lba(state->bdev);
+ lastlba = last_lba(state->disk);
if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) {
pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n",
(unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
@@ -587,13 +585,15 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
gpt_header *pgpt = NULL, *agpt = NULL;
gpt_entry *pptes = NULL, *aptes = NULL;
legacy_mbr *legacymbr;
- sector_t total_sectors = i_size_read(state->bdev->bd_inode) >> 9;
+ struct gendisk *disk = state->disk;
+ const struct block_device_operations *fops = disk->fops;
+ sector_t total_sectors = get_capacity(state->disk);
u64 lastlba;
if (!ptes)
return 0;
- lastlba = last_lba(state->bdev);
+ lastlba = last_lba(state->disk);
if (!force_gpt) {
/* This will be added to the EFI Spec. per Intel after v1.02. */
legacymbr = kzalloc(sizeof(*legacymbr), GFP_KERNEL);
@@ -621,6 +621,16 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
if (!good_agpt && force_gpt)
good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes);
+ if (!good_agpt && force_gpt && fops->alternative_gpt_sector) {
+ sector_t agpt_sector;
+ int err;
+
+ err = fops->alternative_gpt_sector(disk, &agpt_sector);
+ if (!err)
+ good_agpt = is_gpt_valid(state, agpt_sector,
+ &agpt, &aptes);
+ }
+
/* The obviously unsuccessful case */
if (!good_pgpt && !good_agpt)
goto fail;
@@ -705,7 +715,7 @@ int efi_partition(struct parsed_partitions *state)
gpt_header *gpt = NULL;
gpt_entry *ptes = NULL;
u32 i;
- unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
+ unsigned ssz = queue_logical_block_size(state->disk->queue) / 512;
if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
kfree(gpt);
@@ -722,7 +732,7 @@ int efi_partition(struct parsed_partitions *state)
u64 size = le64_to_cpu(ptes[i].ending_lba) -
le64_to_cpu(ptes[i].starting_lba) + 1ULL;
- if (!is_pte_valid(&ptes[i], last_lba(state->bdev)))
+ if (!is_pte_valid(&ptes[i], last_lba(state->disk)))
continue;
put_partition(state, i+1, start * ssz, size * ssz);
diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c
index 4b044e620d35..9bca396aef4a 100644
--- a/block/partitions/ibm.c
+++ b/block/partitions/ibm.c
@@ -290,8 +290,8 @@ static int find_cms1_partitions(struct parsed_partitions *state,
int ibm_partition(struct parsed_partitions *state)
{
int (*fn)(struct gendisk *disk, dasd_information2_t *info);
- struct block_device *bdev = state->bdev;
- struct gendisk *disk = bdev->bd_disk;
+ struct gendisk *disk = state->disk;
+ struct block_device *bdev = disk->part0;
int blocksize, res;
loff_t i_size, offset, size;
dasd_information2_t *info;
diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c
index cc86534c80ad..27f6c7d9c776 100644
--- a/block/partitions/ldm.c
+++ b/block/partitions/ldm.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
* ldm - Support for Windows Logical Disk Manager (Dynamic Disks)
*
* Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org>
@@ -304,7 +304,7 @@ static bool ldm_validate_privheads(struct parsed_partitions *state,
}
}
- num_sects = state->bdev->bd_inode->i_size >> 9;
+ num_sects = get_capacity(state->disk);
if ((ph[0]->config_start > num_sects) ||
((ph[0]->config_start + ph[0]->config_size) > num_sects)) {
@@ -339,11 +339,11 @@ out:
/**
* ldm_validate_tocblocks - Validate the table of contents and its backups
* @state: Partition check state including device holding the LDM Database
- * @base: Offset, into @state->bdev, of the database
+ * @base: Offset, into @state->disk, of the database
* @ldb: Cache of the database structures
*
* Find and compare the four tables of contents of the LDM Database stored on
- * @state->bdev and return the parsed information into @toc1.
+ * @state->disk and return the parsed information into @toc1.
*
* The offsets and sizes of the configs are range-checked against a privhead.
*
@@ -486,8 +486,8 @@ out:
* only likely to happen if the underlying device is strange. If that IS
* the case we should return zero to let someone else try.
*
- * Return: 'true' @state->bdev is a dynamic disk
- * 'false' @state->bdev is not a dynamic disk, or an error occurred
+ * Return: 'true' @state->disk is a dynamic disk
+ * 'false' @state->disk is not a dynamic disk, or an error occurred
*/
static bool ldm_validate_partition_table(struct parsed_partitions *state)
{
@@ -1340,7 +1340,7 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
/**
* ldm_get_vblks - Read the on-disk database of VBLKs into memory
* @state: Partition check state including device holding the LDM Database
- * @base: Offset, into @state->bdev, of the database
+ * @base: Offset, into @state->disk, of the database
* @ldb: Cache of the database structures
*
* To use the information from the VBLKs, they need to be read from the disk,
@@ -1432,10 +1432,10 @@ static void ldm_free_vblks (struct list_head *lh)
* example, if the device is hda, we would have: hda1: LDM database, hda2, hda3,
* and so on: the actual data containing partitions.
*
- * Return: 1 Success, @state->bdev is a dynamic disk and we handled it
- * 0 Success, @state->bdev is not a dynamic disk
+ * Return: 1 Success, @state->disk is a dynamic disk and we handled it
+ * 0 Success, @state->disk is not a dynamic disk
* -1 An error occurred before enough information had been read
- * Or @state->bdev is a dynamic disk, but it may be corrupted
+ * Or @state->disk is a dynamic disk, but it may be corrupted
*/
int ldm_partition(struct parsed_partitions *state)
{
diff --git a/block/partitions/mac.c b/block/partitions/mac.c
index b6095335636c..7b521df00a39 100644
--- a/block/partitions/mac.c
+++ b/block/partitions/mac.c
@@ -133,7 +133,7 @@ int mac_partition(struct parsed_partitions *state)
}
#ifdef CONFIG_PPC_PMAC
if (found_root_goodness)
- note_bootable_part(state->bdev->bd_dev, found_root,
+ note_bootable_part(state->disk->part0->bd_dev, found_root,
found_root_goodness);
#endif
diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c
index f5102596a984..b5d5c229cc3b 100644
--- a/block/partitions/msdos.c
+++ b/block/partitions/msdos.c
@@ -135,11 +135,12 @@ static void parse_extended(struct parsed_partitions *state,
Sector sect;
unsigned char *data;
sector_t this_sector, this_size;
- sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
+ sector_t sector_size;
int loopct = 0; /* number of links followed
without finding a data partition */
int i;
+ sector_size = queue_logical_block_size(state->disk->queue) / 512;
this_sector = first_sector;
this_size = first_size;
@@ -579,7 +580,7 @@ static struct {
int msdos_partition(struct parsed_partitions *state)
{
- sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
+ sector_t sector_size;
Sector sect;
unsigned char *data;
struct msdos_partition *p;
@@ -587,6 +588,7 @@ int msdos_partition(struct parsed_partitions *state)
int slot;
u32 disksig;
+ sector_size = queue_logical_block_size(state->disk->queue) / 512;
data = read_part_sector(state, 0, &sect);
if (!data)
return -1;
diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c
index 4273f1bb0515..9cc6b8c1eea4 100644
--- a/block/partitions/sgi.c
+++ b/block/partitions/sgi.c
@@ -43,7 +43,6 @@ int sgi_partition(struct parsed_partitions *state)
Sector sect;
struct sgi_disklabel *label;
struct sgi_partition *p;
- char b[BDEVNAME_SIZE];
label = read_part_sector(state, 0, &sect);
if (!label)
@@ -52,7 +51,7 @@ int sgi_partition(struct parsed_partitions *state)
magic = label->magic_mushroom;
if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) {
/*printk("Dev %s SGI disklabel: bad magic %08x\n",
- bdevname(bdev, b), be32_to_cpu(magic));*/
+ state->disk->disk_name, be32_to_cpu(magic));*/
put_dev_sector(sect);
return 0;
}
@@ -63,7 +62,7 @@ int sgi_partition(struct parsed_partitions *state)
}
if(csum) {
printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n",
- bdevname(state->bdev, b));
+ state->disk->disk_name);
put_dev_sector(sect);
return 0;
}
diff --git a/block/partitions/sun.c b/block/partitions/sun.c
index 47dc53eccf77..ddf9e6def4b2 100644
--- a/block/partitions/sun.c
+++ b/block/partitions/sun.c
@@ -65,7 +65,6 @@ int sun_partition(struct parsed_partitions *state)
} * label;
struct sun_partition *p;
unsigned long spc;
- char b[BDEVNAME_SIZE];
int use_vtoc;
int nparts;
@@ -76,7 +75,7 @@ int sun_partition(struct parsed_partitions *state)
p = label->partitions;
if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) {
/* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n",
- bdevname(bdev, b), be16_to_cpu(label->magic)); */
+ state->disk->disk_name, be16_to_cpu(label->magic)); */
put_dev_sector(sect);
return 0;
}
@@ -86,7 +85,7 @@ int sun_partition(struct parsed_partitions *state)
csum ^= *ush--;
if (csum) {
printk("Dev %s Sun disklabel: Csum bad, label corrupted\n",
- bdevname(state->bdev, b));
+ state->disk->disk_name);
put_dev_sector(sect);
return 0;
}
diff --git a/block/t10-pi.c b/block/t10-pi.c
index d910534b3a41..00c203b2a921 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -147,11 +147,10 @@ static void t10_pi_type1_prepare(struct request *rq)
break;
bip_for_each_vec(iv, bip, iter) {
- void *p, *pmap;
unsigned int j;
+ void *p;
- pmap = kmap_atomic(iv.bv_page);
- p = pmap + iv.bv_offset;
+ p = bvec_kmap_local(&iv);
for (j = 0; j < iv.bv_len; j += tuple_sz) {
struct t10_pi_tuple *pi = p;
@@ -161,8 +160,7 @@ static void t10_pi_type1_prepare(struct request *rq)
ref_tag++;
p += tuple_sz;
}
-
- kunmap_atomic(pmap);
+ kunmap_local(p);
}
bip->bip_flags |= BIP_MAPPED_INTEGRITY;
@@ -195,11 +193,10 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
struct bvec_iter iter;
bip_for_each_vec(iv, bip, iter) {
- void *p, *pmap;
unsigned int j;
+ void *p;
- pmap = kmap_atomic(iv.bv_page);
- p = pmap + iv.bv_offset;
+ p = bvec_kmap_local(&iv);
for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) {
struct t10_pi_tuple *pi = p;
@@ -210,8 +207,7 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
intervals--;
p += tuple_sz;
}
-
- kunmap_atomic(pmap);
+ kunmap_local(p);
}
}
}