summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/locking.rst3
-rw-r--r--block/Kconfig2
-rw-r--r--block/bfq-iosched.c9
-rw-r--r--block/blk-cgroup.c32
-rw-r--r--block/blk-core.c239
-rw-r--r--block/blk-integrity.c4
-rw-r--r--block/blk-iocost.c1547
-rw-r--r--block/blk-map.c177
-rw-r--r--block/blk-merge.c201
-rw-r--r--block/blk-mq-debugfs.c11
-rw-r--r--block/blk-mq-sched.c124
-rw-r--r--block/blk-mq-sched.h3
-rw-r--r--block/blk-mq-tag.c152
-rw-r--r--block/blk-mq-tag.h56
-rw-r--r--block/blk-mq.c86
-rw-r--r--block/blk-mq.h76
-rw-r--r--block/blk-settings.c40
-rw-r--r--block/blk-sysfs.c277
-rw-r--r--block/blk-throttle.c59
-rw-r--r--block/blk.h25
-rw-r--r--block/bsg-lib.c2
-rw-r--r--block/genhd.c156
-rw-r--r--block/ioctl.c29
-rw-r--r--block/ioprio.c2
-rw-r--r--block/kyber-iosched.c6
-rw-r--r--block/mq-deadline.c6
-rw-r--r--block/partitions/core.c27
-rw-r--r--block/scsi_ioctl.c2
-rw-r--r--drivers/block/amiflop.c2
-rw-r--r--drivers/block/aoe/aoeblk.c3
-rw-r--r--drivers/block/aoe/aoecmd.c4
-rw-r--r--drivers/block/ataflop.c7
-rw-r--r--drivers/block/brd.c1
-rw-r--r--drivers/block/drbd/drbd_nl.c16
-rw-r--r--drivers/block/floppy.c8
-rw-r--r--drivers/block/loop.c4
-rw-r--r--drivers/block/nbd.c15
-rw-r--r--drivers/block/paride/pcd.c2
-rw-r--r--drivers/block/pktcdvd.c94
-rw-r--r--drivers/block/rbd.c4
-rw-r--r--drivers/block/rnbd/rnbd-clt.c12
-rw-r--r--drivers/block/swim.c22
-rw-r--r--drivers/block/swim3.c4
-rw-r--r--drivers/block/virtio_blk.c4
-rw-r--r--drivers/block/xsysace.c26
-rw-r--r--drivers/block/zram/zram_drv.c30
-rw-r--r--drivers/cdrom/gdrom.c2
-rw-r--r--drivers/char/raw.c56
-rw-r--r--drivers/ide/ide-cd.c16
-rw-r--r--drivers/ide/ide-disk.c5
-rw-r--r--drivers/ide/ide-floppy.c2
-rw-r--r--drivers/ide/ide-gd.c48
-rw-r--r--drivers/md/bcache/request.c10
-rw-r--r--drivers/md/bcache/super.c5
-rw-r--r--drivers/md/dm-raid.c2
-rw-r--r--drivers/md/dm-table.c9
-rw-r--r--drivers/md/dm.c15
-rw-r--r--drivers/md/md-cluster.c6
-rw-r--r--drivers/md/md-linear.c2
-rw-r--r--drivers/md/md.c20
-rw-r--r--drivers/md/md.h2
-rw-r--r--drivers/md/raid0.c16
-rw-r--r--drivers/md/raid10.c46
-rw-r--r--drivers/md/raid5.c31
-rw-r--r--drivers/mmc/core/queue.c3
-rw-r--r--drivers/mtd/mtdcore.c2
-rw-r--r--drivers/nvdimm/blk.c3
-rw-r--r--drivers/nvdimm/btt.c5
-rw-r--r--drivers/nvdimm/bus.c9
-rw-r--r--drivers/nvdimm/nd.h2
-rw-r--r--drivers/nvdimm/pmem.c4
-rw-r--r--drivers/nvme/host/core.c53
-rw-r--r--drivers/nvme/host/multipath.c10
-rw-r--r--drivers/nvme/host/nvme.h13
-rw-r--r--drivers/s390/block/dasd_genhd.c15
-rw-r--r--drivers/s390/block/dasd_ioctl.c9
-rw-r--r--drivers/scsi/iscsi_tcp.c4
-rw-r--r--drivers/scsi/sd.c13
-rw-r--r--drivers/scsi/sr.c36
-rw-r--r--fs/9p/vfs_file.c2
-rw-r--r--fs/9p/vfs_super.c6
-rw-r--r--fs/afs/super.c1
-rw-r--r--fs/block_dev.c175
-rw-r--r--fs/btrfs/disk-io.c2
-rw-r--r--fs/buffer.c16
-rw-r--r--fs/fs-writeback.c7
-rw-r--r--fs/fuse/inode.c4
-rw-r--r--fs/namei.c4
-rw-r--r--fs/nfs/super.c9
-rw-r--r--fs/ocfs2/cluster/heartbeat.c28
-rw-r--r--fs/super.c2
-rw-r--r--fs/ubifs/super.c2
-rw-r--r--fs/vboxsf/super.c2
-rw-r--r--include/linux/backing-dev.h78
-rw-r--r--include/linux/blk-mq.h15
-rw-r--r--include/linux/blk_types.h7
-rw-r--r--include/linux/blkdev.h44
-rw-r--r--include/linux/fs.h2
-rw-r--r--include/linux/genhd.h15
-rw-r--r--include/linux/ide.h2
-rw-r--r--include/linux/suspend.h4
-rw-r--r--include/linux/swap.h3
-rw-r--r--include/trace/events/iocost.h26
-rw-r--r--include/uapi/linux/capability.h2
-rw-r--r--kernel/power/swap.c21
-rw-r--r--kernel/power/user.c26
-rw-r--r--kernel/trace/blktrace.c2
-rw-r--r--mm/backing-dev.c14
-rw-r--r--mm/filemap.c4
-rw-r--r--mm/memcontrol.c2
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/migrate.c2
-rw-r--r--mm/mmap.c2
-rw-r--r--mm/page-writeback.c18
-rw-r--r--mm/page_io.c18
-rw-r--r--mm/swapfile.c49
-rw-r--r--tools/cgroup/iocost_monitor.py54
117 files changed, 2663 insertions, 2094 deletions
diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 64f94a18d97e..c0f2c7586531 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -488,9 +488,6 @@ getgeo: no
swap_slot_free_notify: no (see below)
======================= ===================
-unlock_native_capacity and revalidate_disk are called only from
-check_disk_change().
-
swap_slot_free_notify is called with swap_lock and sometimes the page lock
held.
diff --git a/block/Kconfig b/block/Kconfig
index bbad5e8bbffe..a2297edfdde8 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -161,8 +161,6 @@ config BLK_WBT_MQ
depends on BLK_WBT
help
Enable writeback throttling by default on multiqueue devices.
- Multiqueue currently doesn't have support for IO scheduling,
- enabling this option is recommended.
config BLK_DEBUG_FS
bool "Block layer debugging information in debugfs"
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index fa98470df3f0..9e81d1052091 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -4640,6 +4640,9 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
{
struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+ if (!atomic_read(&hctx->elevator_queued))
+ return false;
+
/*
* Avoiding lock: a race on bfqd->busy_queues should cause at
* most a call to dispatch for nothing
@@ -5554,6 +5557,7 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
rq = list_first_entry(list, struct request, queuelist);
list_del_init(&rq->queuelist);
bfq_insert_request(hctx, rq, at_head);
+ atomic_inc(&hctx->elevator_queued);
}
}
@@ -5921,6 +5925,7 @@ static void bfq_finish_requeue_request(struct request *rq)
bfq_completed_request(bfqq, bfqd);
bfq_finish_requeue_request_body(bfqq);
+ atomic_dec(&rq->mq_hctx->elevator_queued);
spin_unlock_irqrestore(&bfqd->lock, flags);
} else {
@@ -6360,8 +6365,8 @@ static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
struct blk_mq_tags *tags = hctx->sched_tags;
unsigned int min_shallow;
- min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags);
- sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow);
+ min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags);
+ sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow);
}
static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index c195365c9817..f9b55614d67d 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -119,6 +119,8 @@ static void blkg_async_bio_workfn(struct work_struct *work)
async_bio_work);
struct bio_list bios = BIO_EMPTY_LIST;
struct bio *bio;
+ struct blk_plug plug;
+ bool need_plug = false;
/* as long as there are pending bios, @blkg can't go away */
spin_lock_bh(&blkg->async_bio_lock);
@@ -126,8 +128,15 @@ static void blkg_async_bio_workfn(struct work_struct *work)
bio_list_init(&blkg->async_bios);
spin_unlock_bh(&blkg->async_bio_lock);
+ /* start plug only when bio_list contains at least 2 bios */
+ if (bios.head && bios.head->bi_next) {
+ need_plug = true;
+ blk_start_plug(&plug);
+ }
while ((bio = bio_list_pop(&bios)))
submit_bio(bio);
+ if (need_plug)
+ blk_finish_plug(&plug);
}
/**
@@ -1613,16 +1622,24 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{
unsigned long pflags;
+ bool clamp;
u64 now = ktime_to_ns(ktime_get());
u64 exp;
u64 delay_nsec = 0;
int tok;
while (blkg->parent) {
- if (atomic_read(&blkg->use_delay)) {
+ int use_delay = atomic_read(&blkg->use_delay);
+
+ if (use_delay) {
+ u64 this_delay;
+
blkcg_scale_delay(blkg, now);
- delay_nsec = max_t(u64, delay_nsec,
- atomic64_read(&blkg->delay_nsec));
+ this_delay = atomic64_read(&blkg->delay_nsec);
+ if (this_delay > delay_nsec) {
+ delay_nsec = this_delay;
+ clamp = use_delay > 0;
+ }
}
blkg = blkg->parent;
}
@@ -1634,10 +1651,13 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
* Let's not sleep for all eternity if we've amassed a huge delay.
* Swapping or metadata IO can accumulate 10's of seconds worth of
* delay, and we want userspace to be able to do _something_ so cap the
- * delays at 1 second. If there's 10's of seconds worth of delay then
- * the tasks will be delayed for 1 second for every syscall.
+ * delays at 0.25s. If there's 10's of seconds worth of delay then the
+ * tasks will be delayed for 0.25 second for every syscall. If
+ * blkcg_set_delay() was used as indicated by negative use_delay, the
+ * caller is responsible for regulating the range.
*/
- delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
+ if (clamp)
+ delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
if (use_memdelay)
psi_memstall_enter(&pflags);
diff --git a/block/blk-core.c b/block/blk-core.c
index 10c08ac50697..1cc4fa6bc7fe 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -116,8 +116,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->__sector = (sector_t) -1;
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
- rq->tag = -1;
- rq->internal_tag = -1;
+ rq->tag = BLK_MQ_NO_TAG;
+ rq->internal_tag = BLK_MQ_NO_TAG;
rq->start_time_ns = ktime_get_ns();
rq->part = NULL;
refcount_set(&rq->ref, 1);
@@ -538,11 +538,10 @@ struct request_queue *blk_alloc_queue(int node_id)
if (!q->stats)
goto fail_stats;
- q->backing_dev_info->ra_pages = VM_READAHEAD_PAGES;
- q->backing_dev_info->io_pages = VM_READAHEAD_PAGES;
- q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
q->node = node_id;
+ atomic_set(&q->nr_active_requests_shared_sbitmap, 0);
+
timer_setup(&q->backing_dev_info->laptop_mode_wb_timer,
laptop_mode_timer_fn, 0);
timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
@@ -643,162 +642,6 @@ void blk_put_request(struct request *req)
}
EXPORT_SYMBOL(blk_put_request);
-static void blk_account_io_merge_bio(struct request *req)
-{
- if (!blk_do_io_stat(req))
- return;
-
- part_stat_lock();
- part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
- part_stat_unlock();
-}
-
-bool bio_attempt_back_merge(struct request *req, struct bio *bio,
- unsigned int nr_segs)
-{
- const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
-
- if (!ll_back_merge_fn(req, bio, nr_segs))
- return false;
-
- trace_block_bio_backmerge(req->q, req, bio);
- rq_qos_merge(req->q, req, bio);
-
- if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
- blk_rq_set_mixed_merge(req);
-
- req->biotail->bi_next = bio;
- req->biotail = bio;
- req->__data_len += bio->bi_iter.bi_size;
-
- bio_crypt_free_ctx(bio);
-
- blk_account_io_merge_bio(req);
- return true;
-}
-
-bool bio_attempt_front_merge(struct request *req, struct bio *bio,
- unsigned int nr_segs)
-{
- const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
-
- if (!ll_front_merge_fn(req, bio, nr_segs))
- return false;
-
- trace_block_bio_frontmerge(req->q, req, bio);
- rq_qos_merge(req->q, req, bio);
-
- if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
- blk_rq_set_mixed_merge(req);
-
- bio->bi_next = req->bio;
- req->bio = bio;
-
- req->__sector = bio->bi_iter.bi_sector;
- req->__data_len += bio->bi_iter.bi_size;
-
- bio_crypt_do_front_merge(req, bio);
-
- blk_account_io_merge_bio(req);
- return true;
-}
-
-bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
- struct bio *bio)
-{
- unsigned short segments = blk_rq_nr_discard_segments(req);
-
- if (segments >= queue_max_discard_segments(q))
- goto no_merge;
- if (blk_rq_sectors(req) + bio_sectors(bio) >
- blk_rq_get_max_sectors(req, blk_rq_pos(req)))
- goto no_merge;
-
- rq_qos_merge(q, req, bio);
-
- req->biotail->bi_next = bio;
- req->biotail = bio;
- req->__data_len += bio->bi_iter.bi_size;
- req->nr_phys_segments = segments + 1;
-
- blk_account_io_merge_bio(req);
- return true;
-no_merge:
- req_set_nomerge(q, req);
- return false;
-}
-
-/**
- * blk_attempt_plug_merge - try to merge with %current's plugged list
- * @q: request_queue new bio is being queued at
- * @bio: new bio being queued
- * @nr_segs: number of segments in @bio
- * @same_queue_rq: pointer to &struct request that gets filled in when
- * another request associated with @q is found on the plug list
- * (optional, may be %NULL)
- *
- * Determine whether @bio being queued on @q can be merged with a request
- * on %current's plugged list. Returns %true if merge was successful,
- * otherwise %false.
- *
- * Plugging coalesces IOs from the same issuer for the same purpose without
- * going through @q->queue_lock. As such it's more of an issuing mechanism
- * than scheduling, and the request, while may have elvpriv data, is not
- * added on the elevator at this point. In addition, we don't have
- * reliable access to the elevator outside queue lock. Only check basic
- * merging parameters without querying the elevator.
- *
- * Caller must ensure !blk_queue_nomerges(q) beforehand.
- */
-bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
- unsigned int nr_segs, struct request **same_queue_rq)
-{
- struct blk_plug *plug;
- struct request *rq;
- struct list_head *plug_list;
-
- plug = blk_mq_plug(q, bio);
- if (!plug)
- return false;
-
- plug_list = &plug->mq_list;
-
- list_for_each_entry_reverse(rq, plug_list, queuelist) {
- bool merged = false;
-
- if (rq->q == q && same_queue_rq) {
- /*
- * Only blk-mq multiple hardware queues case checks the
- * rq in the same queue, there should be only one such
- * rq in a queue
- **/
- *same_queue_rq = rq;
- }
-
- if (rq->q != q || !blk_rq_merge_ok(rq, bio))
- continue;
-
- switch (blk_try_merge(rq, bio)) {
- case ELEVATOR_BACK_MERGE:
- merged = bio_attempt_back_merge(rq, bio, nr_segs);
- break;
- case ELEVATOR_FRONT_MERGE:
- merged = bio_attempt_front_merge(rq, bio, nr_segs);
- break;
- case ELEVATOR_DISCARD_MERGE:
- merged = bio_attempt_discard_merge(q, rq, bio);
- break;
- default:
- break;
- }
-
- if (merged)
- return true;
- }
-
- return false;
-}
-
static void handle_bad_sector(struct bio *bio, sector_t maxsector)
{
char b[BDEVNAME_SIZE];
@@ -1301,14 +1144,28 @@ EXPORT_SYMBOL(submit_bio);
* limits when retrying requests on other queues. Those requests need
* to be checked against the new queue limits again during dispatch.
*/
-static int blk_cloned_rq_check_limits(struct request_queue *q,
+static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q,
struct request *rq)
{
- if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, req_op(rq))) {
+ unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
+
+ if (blk_rq_sectors(rq) > max_sectors) {
+ /*
+ * SCSI device does not have a good way to return if
+ * Write Same/Zero is actually supported. If a device rejects
+ * a non-read/write command (discard, write same,etc.) the
+ * low-level device driver will set the relevant queue limit to
+ * 0 to prevent blk-lib from issuing more of the offending
+ * operations. Commands queued prior to the queue limit being
+ * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O
+ * errors being propagated to upper layers.
+ */
+ if (max_sectors == 0)
+ return BLK_STS_NOTSUPP;
+
printk(KERN_ERR "%s: over max size limit. (%u > %u)\n",
- __func__, blk_rq_sectors(rq),
- blk_queue_get_max_sectors(q, req_op(rq)));
- return -EIO;
+ __func__, blk_rq_sectors(rq), max_sectors);
+ return BLK_STS_IOERR;
}
/*
@@ -1321,10 +1178,10 @@ static int blk_cloned_rq_check_limits(struct request_queue *q,
if (rq->nr_phys_segments > queue_max_segments(q)) {
printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n",
__func__, rq->nr_phys_segments, queue_max_segments(q));
- return -EIO;
+ return BLK_STS_IOERR;
}
- return 0;
+ return BLK_STS_OK;
}
/**
@@ -1334,8 +1191,11 @@ static int blk_cloned_rq_check_limits(struct request_queue *q,
*/
blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
{
- if (blk_cloned_rq_check_limits(q, rq))
- return BLK_STS_IOERR;
+ blk_status_t ret;
+
+ ret = blk_cloned_rq_check_limits(q, rq);
+ if (ret != BLK_STS_OK)
+ return ret;
if (rq->rq_disk &&
should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
@@ -1461,10 +1321,9 @@ void blk_account_io_start(struct request *rq)
part_stat_unlock();
}
-unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
- unsigned int op)
+static unsigned long __part_start_io_acct(struct hd_struct *part,
+ unsigned int sectors, unsigned int op)
{
- struct hd_struct *part = &disk->part0;
const int sgrp = op_stat_group(op);
unsigned long now = READ_ONCE(jiffies);
@@ -1477,12 +1336,26 @@ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
return now;
}
+
+unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part,
+ struct bio *bio)
+{
+ *part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector);
+
+ return __part_start_io_acct(*part, bio_sectors(bio), bio_op(bio));
+}
+EXPORT_SYMBOL_GPL(part_start_io_acct);
+
+unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
+ unsigned int op)
+{
+ return __part_start_io_acct(&disk->part0, sectors, op);
+}
EXPORT_SYMBOL(disk_start_io_acct);
-void disk_end_io_acct(struct gendisk *disk, unsigned int op,
- unsigned long start_time)
+static void __part_end_io_acct(struct hd_struct *part, unsigned int op,
+ unsigned long start_time)
{
- struct hd_struct *part = &disk->part0;
const int sgrp = op_stat_group(op);
unsigned long now = READ_ONCE(jiffies);
unsigned long duration = now - start_time;
@@ -1493,6 +1366,20 @@ void disk_end_io_acct(struct gendisk *disk, unsigned int op,
part_stat_local_dec(part, in_flight[op_is_write(op)]);
part_stat_unlock();
}
+
+void part_end_io_acct(struct hd_struct *part, struct bio *bio,
+ unsigned long start_time)
+{
+ __part_end_io_acct(part, bio_op(bio), start_time);
+ hd_struct_put(part);
+}
+EXPORT_SYMBOL_GPL(part_end_io_acct);
+
+void disk_end_io_acct(struct gendisk *disk, unsigned int op,
+ unsigned long start_time)
+{
+ __part_end_io_acct(&disk->part0, op, start_time);
+}
EXPORT_SYMBOL(disk_end_io_acct);
/*
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index c03705cbb9c9..2b36a8f9b813 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -408,7 +408,7 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
bi->tuple_size = template->tuple_size;
bi->tag_size = template->tag_size;
- disk->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
+ blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
if (disk->queue->ksm) {
@@ -428,7 +428,7 @@ EXPORT_SYMBOL(blk_integrity_register);
*/
void blk_integrity_unregister(struct gendisk *disk)
{
- disk->queue->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES;
+ blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, disk->queue);
memset(&disk->queue->integrity, 0, sizeof(struct blk_integrity));
}
EXPORT_SYMBOL(blk_integrity_unregister);
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index d37b55db2409..ef9476fca1d8 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -68,7 +68,7 @@
* gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest,
* 12.5% each. The distribution mechanism only cares about these flattened
* shares. They're called hweights (hierarchical weights) and always add
- * upto 1 (HWEIGHT_WHOLE).
+ * upto 1 (WEIGHT_ONE).
*
* A given cgroup's vtime runs slower in inverse proportion to its hweight.
* For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
@@ -179,6 +179,8 @@
#include <linux/parser.h>
#include <linux/sched/signal.h>
#include <linux/blk-cgroup.h>
+#include <asm/local.h>
+#include <asm/local64.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
#include "blk-wbt.h"
@@ -215,36 +217,21 @@ enum {
MAX_PERIOD = USEC_PER_SEC,
/*
- * A cgroup's vtime can run 50% behind the device vtime, which
+ * iocg->vtime is targeted at 50% behind the device vtime, which
* serves as its IO credit buffer. Surplus weight adjustment is
* immediately canceled if the vtime margin runs below 10%.
*/
- MARGIN_PCT = 50,
- INUSE_MARGIN_PCT = 10,
+ MARGIN_MIN_PCT = 10,
+ MARGIN_LOW_PCT = 20,
+ MARGIN_TARGET_PCT = 50,
- /* Have some play in waitq timer operations */
- WAITQ_TIMER_MARGIN_PCT = 5,
+ INUSE_ADJ_STEP_PCT = 25,
- /*
- * vtime can wrap well within a reasonable uptime when vrate is
- * consistently raised. Don't trust recorded cgroup vtime if the
- * period counter indicates that it's older than 5mins.
- */
- VTIME_VALID_DUR = 300 * USEC_PER_SEC,
-
- /*
- * Remember the past three non-zero usages and use the max for
- * surplus calculation. Three slots guarantee that we remember one
- * full period usage from the last active stretch even after
- * partial deactivation and re-activation periods. Don't start
- * giving away weight before collecting two data points to prevent
- * hweight adjustments based on one partial activation period.
- */
- NR_USAGE_SLOTS = 3,
- MIN_VALID_USAGES = 2,
+ /* Have some play in timer operations */
+ TIMER_SLACK_PCT = 1,
/* 1/64k is granular enough and can easily be handled w/ u32 */
- HWEIGHT_WHOLE = 1 << 16,
+ WEIGHT_ONE = 1 << 16,
/*
* As vtime is used to calculate the cost of each IO, it needs to
@@ -275,16 +262,40 @@ enum {
/* unbusy hysterisis */
UNBUSY_THR_PCT = 75,
- /* don't let cmds which take a very long time pin lagging for too long */
- MAX_LAGGING_PERIODS = 10,
+ /*
+ * The effect of delay is indirect and non-linear and a huge amount of
+ * future debt can accumulate abruptly while unthrottled. Linearly scale
+ * up delay as debt is going up and then let it decay exponentially.
+ * This gives us quick ramp ups while delay is accumulating and long
+ * tails which can help reducing the frequency of debt explosions on
+ * unthrottle. The parameters are experimentally determined.
+ *
+ * The delay mechanism provides adequate protection and behavior in many
+ * cases. However, this is far from ideal and falls shorts on both
+ * fronts. The debtors are often throttled too harshly costing a
+ * significant level of fairness and possibly total work while the
+ * protection against their impacts on the system can be choppy and
+ * unreliable.
+ *
+ * The shortcoming primarily stems from the fact that, unlike for page
+ * cache, the kernel doesn't have well-defined back-pressure propagation
+ * mechanism and policies for anonymous memory. Fully addressing this
+ * issue will likely require substantial improvements in the area.
+ */
+ MIN_DELAY_THR_PCT = 500,
+ MAX_DELAY_THR_PCT = 25000,
+ MIN_DELAY = 250,
+ MAX_DELAY = 250 * USEC_PER_MSEC,
/*
- * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
- * donate the surplus.
+ * Halve debts if total usage keeps staying under 25% w/o any shortages
+ * for over 100ms.
*/
- SURPLUS_SCALE_PCT = 125, /* * 125% */
- SURPLUS_SCALE_ABS = HWEIGHT_WHOLE / 50, /* + 2% */
- SURPLUS_MIN_ADJ_DELTA = HWEIGHT_WHOLE / 33, /* 3% */
+ DEBT_BUSY_USAGE_PCT = 25,
+ DEBT_REDUCTION_IDLE_DUR = 100 * USEC_PER_MSEC,
+
+ /* don't let cmds which take a very long time pin lagging for too long */
+ MAX_LAGGING_PERIODS = 10,
/* switch iff the conditions are met for longer than this */
AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
@@ -372,9 +383,15 @@ struct ioc_params {
u32 too_slow_vrate_pct;
};
+struct ioc_margins {
+ s64 min;
+ s64 low;
+ s64 target;
+};
+
struct ioc_missed {
- u32 nr_met;
- u32 nr_missed;
+ local_t nr_met;
+ local_t nr_missed;
u32 last_met;
u32 last_missed;
};
@@ -382,7 +399,7 @@ struct ioc_missed {
struct ioc_pcpu_stat {
struct ioc_missed missed[2];
- u64 rq_wait_ns;
+ local64_t rq_wait_ns;
u64 last_rq_wait_ns;
};
@@ -393,8 +410,9 @@ struct ioc {
bool enabled;
struct ioc_params params;
+ struct ioc_margins margins;
u32 period_us;
- u32 margin_us;
+ u32 timer_slack_ns;
u64 vrate_min;
u64 vrate_max;
@@ -405,18 +423,22 @@ struct ioc {
enum ioc_running running;
atomic64_t vtime_rate;
+ u64 vtime_base_rate;
+ s64 vtime_err;
seqcount_spinlock_t period_seqcount;
- u32 period_at; /* wallclock starttime */
+ u64 period_at; /* wallclock starttime */
u64 period_at_vtime; /* vtime starttime */
atomic64_t cur_period; /* inc'd each period */
int busy_level; /* saturation history */
- u64 inuse_margin_vtime;
bool weights_updated;
atomic_t hweight_gen; /* for lazy hweights */
+ /* the last time debt cancel condition wasn't met */
+ u64 debt_busy_at;
+
u64 autop_too_fast_at;
u64 autop_too_slow_at;
int autop_idx;
@@ -424,6 +446,17 @@ struct ioc {
bool user_cost_model:1;
};
+struct iocg_pcpu_stat {
+ local64_t abs_vusage;
+};
+
+struct iocg_stat {
+ u64 usage_us;
+ u64 wait_us;
+ u64 indebt_us;
+ u64 indelay_us;
+};
+
/* per device-cgroup pair */
struct ioc_gq {
struct blkg_policy_data pd;
@@ -443,12 +476,17 @@ struct ioc_gq {
*
* `last_inuse` remembers `inuse` while an iocg is idle to persist
* surplus adjustments.
+ *
+ * `inuse` may be adjusted dynamically during period. `saved_*` are used
+ * to determine and track adjustments.
*/
u32 cfg_weight;
u32 weight;
u32 active;
u32 inuse;
+
u32 last_inuse;
+ s64 saved_margin;
sector_t cursor; /* to detect randio */
@@ -461,14 +499,14 @@ struct ioc_gq {
* `vtime_done` is the same but progressed on completion rather
* than issue. The delta behind `vtime` represents the cost of
* currently in-flight IOs.
- *
- * `last_vtime` is used to remember `vtime` at the end of the last
- * period to calculate utilization.
*/
atomic64_t vtime;
atomic64_t done_vtime;
u64 abs_vdebt;
- u64 last_vtime;
+
+ /* current delay in effect and when it started */
+ u64 delay;
+ u64 delay_at;
/*
* The period this iocg was last active in. Used for deactivation
@@ -477,21 +515,35 @@ struct ioc_gq {
atomic64_t active_period;
struct list_head active_list;
- /* see __propagate_active_weight() and current_hweight() for details */
+ /* see __propagate_weights() and current_hweight() for details */
u64 child_active_sum;
u64 child_inuse_sum;
+ u64 child_adjusted_sum;
int hweight_gen;
u32 hweight_active;
u32 hweight_inuse;
- bool has_surplus;
+ u32 hweight_donating;
+ u32 hweight_after_donation;
+
+ struct list_head walk_list;
+ struct list_head surplus_list;
struct wait_queue_head waitq;
struct hrtimer waitq_timer;
- struct hrtimer delay_timer;
- /* usage is recorded as fractions of HWEIGHT_WHOLE */
- int usage_idx;
- u32 usages[NR_USAGE_SLOTS];
+ /* timestamp at the latest activation */
+ u64 activated_at;
+
+ /* statistics */
+ struct iocg_pcpu_stat __percpu *pcpu_stat;
+ struct iocg_stat local_stat;
+ struct iocg_stat desc_stat;
+ struct iocg_stat last_stat;
+ u64 last_stat_abs_vusage;
+ u64 usage_delta_us;
+ u64 wait_since;
+ u64 indebt_since;
+ u64 indelay_since;
/* this iocg's depth in the hierarchy and ancestors including self */
int level;
@@ -506,7 +558,7 @@ struct ioc_cgrp {
struct ioc_now {
u64 now_ns;
- u32 now;
+ u64 now;
u64 vnow;
u64 vrate;
};
@@ -656,7 +708,7 @@ static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
*/
static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
{
- return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
+ return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
}
/*
@@ -664,18 +716,56 @@ static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
*/
static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
{
- return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE);
+ return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
}
-static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
+static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
+ u64 abs_cost, u64 cost)
{
+ struct iocg_pcpu_stat *gcs;
+
bio->bi_iocost_cost = cost;
atomic64_add(cost, &iocg->vtime);
+
+ gcs = get_cpu_ptr(iocg->pcpu_stat);
+ local64_add(abs_cost, &gcs->abs_vusage);
+ put_cpu_ptr(gcs);
+}
+
+static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
+{
+ if (lock_ioc) {
+ spin_lock_irqsave(&iocg->ioc->lock, *flags);
+ spin_lock(&iocg->waitq.lock);
+ } else {
+ spin_lock_irqsave(&iocg->waitq.lock, *flags);
+ }
+}
+
+static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
+{
+ if (unlock_ioc) {
+ spin_unlock(&iocg->waitq.lock);
+ spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
+ } else {
+ spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
+ }
}
#define CREATE_TRACE_POINTS
#include <trace/events/iocost.h>
+static void ioc_refresh_margins(struct ioc *ioc)
+{
+ struct ioc_margins *margins = &ioc->margins;
+ u32 period_us = ioc->period_us;
+ u64 vrate = ioc->vtime_base_rate;
+
+ margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
+ margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
+ margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
+}
+
/* latency Qos params changed, update period_us and all the dependent params */
static void ioc_refresh_period_us(struct ioc *ioc)
{
@@ -709,9 +799,10 @@ static void ioc_refresh_period_us(struct ioc *ioc)
/* calculate dependent params */
ioc->period_us = period_us;
- ioc->margin_us = period_us * MARGIN_PCT / 100;
- ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
- period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
+ ioc->timer_slack_ns = div64_u64(
+ (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT,
+ 100);
+ ioc_refresh_margins(ioc);
}
static int ioc_autop_idx(struct ioc *ioc)
@@ -738,8 +829,7 @@ static int ioc_autop_idx(struct ioc *ioc)
return idx;
/* step up/down based on the vrate */
- vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
- VTIME_PER_USEC);
+ vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
now_ns = ktime_get_ns();
if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
@@ -847,6 +937,43 @@ static bool ioc_refresh_params(struct ioc *ioc, bool force)
return true;
}
+/*
+ * When an iocg accumulates too much vtime or gets deactivated, we throw away
+ * some vtime, which lowers the overall device utilization. As the exact amount
+ * which is being thrown away is known, we can compensate by accelerating the
+ * vrate accordingly so that the extra vtime generated in the current period
+ * matches what got lost.
+ */
+static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now)
+{
+ s64 pleft = ioc->period_at + ioc->period_us - now->now;
+ s64 vperiod = ioc->period_us * ioc->vtime_base_rate;
+ s64 vcomp, vcomp_min, vcomp_max;
+
+ lockdep_assert_held(&ioc->lock);
+
+ /* we need some time left in this period */
+ if (pleft <= 0)
+ goto done;
+
+ /*
+ * Calculate how much vrate should be adjusted to offset the error.
+ * Limit the amount of adjustment and deduct the adjusted amount from
+ * the error.
+ */
+ vcomp = -div64_s64(ioc->vtime_err, pleft);
+ vcomp_min = -(ioc->vtime_base_rate >> 1);
+ vcomp_max = ioc->vtime_base_rate;
+ vcomp = clamp(vcomp, vcomp_min, vcomp_max);
+
+ ioc->vtime_err += vcomp * pleft;
+
+ atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp);
+done:
+ /* bound how much error can accumulate */
+ ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod);
+}
+
/* take a snapshot of the current [v]time and vrate */
static void ioc_now(struct ioc *ioc, struct ioc_now *now)
{
@@ -886,16 +1013,25 @@ static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
/*
* Update @iocg's `active` and `inuse` to @active and @inuse, update level
- * weight sums and propagate upwards accordingly.
+ * weight sums and propagate upwards accordingly. If @save, the current margin
+ * is saved to be used as reference for later inuse in-period adjustments.
*/
-static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
+static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
+ bool save, struct ioc_now *now)
{
struct ioc *ioc = iocg->ioc;
int lvl;
lockdep_assert_held(&ioc->lock);
- inuse = min(active, inuse);
+ inuse = clamp_t(u32, inuse, 1, active);
+
+ iocg->last_inuse = iocg->inuse;
+ if (save)
+ iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime);
+
+ if (active == iocg->active && inuse == iocg->inuse)
+ return;
for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
struct ioc_gq *parent = iocg->ancestors[lvl];
@@ -933,7 +1069,7 @@ static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse
ioc->weights_updated = true;
}
-static void commit_active_weights(struct ioc *ioc)
+static void commit_weights(struct ioc *ioc)
{
lockdep_assert_held(&ioc->lock);
@@ -945,10 +1081,11 @@ static void commit_active_weights(struct ioc *ioc)
}
}
-static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
+static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
+ bool save, struct ioc_now *now)
{
- __propagate_active_weight(iocg, active, inuse);
- commit_active_weights(iocg->ioc);
+ __propagate_weights(iocg, active, inuse, save, now);
+ commit_weights(iocg->ioc);
}
static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
@@ -964,9 +1101,9 @@ static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep
goto out;
/*
- * Paired with wmb in commit_active_weights(). If we saw the
- * updated hweight_gen, all the weight updates from
- * __propagate_active_weight() are visible too.
+ * Paired with wmb in commit_weights(). If we saw the updated
+ * hweight_gen, all the weight updates from __propagate_weights() are
+ * visible too.
*
* We can race with weight updates during calculation and get it
* wrong. However, hweight_gen would have changed and a future
@@ -975,12 +1112,12 @@ static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep
*/
smp_rmb();
- hwa = hwi = HWEIGHT_WHOLE;
+ hwa = hwi = WEIGHT_ONE;
for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
struct ioc_gq *parent = iocg->ancestors[lvl];
struct ioc_gq *child = iocg->ancestors[lvl + 1];
- u32 active_sum = READ_ONCE(parent->child_active_sum);
- u32 inuse_sum = READ_ONCE(parent->child_inuse_sum);
+ u64 active_sum = READ_ONCE(parent->child_active_sum);
+ u64 inuse_sum = READ_ONCE(parent->child_inuse_sum);
u32 active = READ_ONCE(child->active);
u32 inuse = READ_ONCE(child->inuse);
@@ -988,11 +1125,11 @@ static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep
if (!active_sum || !inuse_sum)
continue;
- active_sum = max(active, active_sum);
- hwa = hwa * active / active_sum; /* max 16bits * 10000 */
+ active_sum = max_t(u64, active, active_sum);
+ hwa = div64_u64((u64)hwa * active, active_sum);
- inuse_sum = max(inuse, inuse_sum);
- hwi = hwi * inuse / inuse_sum; /* max 16bits * 10000 */
+ inuse_sum = max_t(u64, inuse, inuse_sum);
+ hwi = div64_u64((u64)hwi * inuse, inuse_sum);
}
iocg->hweight_active = max_t(u32, hwa, 1);
@@ -1005,7 +1142,33 @@ out:
*hw_inusep = iocg->hweight_inuse;
}
-static void weight_updated(struct ioc_gq *iocg)
+/*
+ * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the
+ * other weights stay unchanged.
+ */
+static u32 current_hweight_max(struct ioc_gq *iocg)
+{
+ u32 hwm = WEIGHT_ONE;
+ u32 inuse = iocg->active;
+ u64 child_inuse_sum;
+ int lvl;
+
+ lockdep_assert_held(&iocg->ioc->lock);
+
+ for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
+ struct ioc_gq *parent = iocg->ancestors[lvl];
+ struct ioc_gq *child = iocg->ancestors[lvl + 1];
+
+ child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse;
+ hwm = div64_u64((u64)hwm * inuse, child_inuse_sum);
+ inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum,
+ parent->child_active_sum);
+ }
+
+ return max_t(u32, hwm, 1);
+}
+
+static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now)
{
struct ioc *ioc = iocg->ioc;
struct blkcg_gq *blkg = iocg_to_blkg(iocg);
@@ -1016,16 +1179,15 @@ static void weight_updated(struct ioc_gq *iocg)
weight = iocg->cfg_weight ?: iocc->dfl_weight;
if (weight != iocg->weight && iocg->active)
- propagate_active_weight(iocg, weight,
- DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight));
+ propagate_weights(iocg, weight, iocg->inuse, true, now);
iocg->weight = weight;
}
static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
{
struct ioc *ioc = iocg->ioc;
- u64 last_period, cur_period, max_period_delta;
- u64 vtime, vmargin, vmin;
+ u64 last_period, cur_period;
+ u64 vtime, vtarget;
int i;
/*
@@ -1064,22 +1226,15 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
goto fail_unlock;
/*
- * vtime may wrap when vrate is raised substantially due to
- * underestimated IO costs. Look at the period and ignore its
- * vtime if the iocg has been idle for too long. Also, cap the
- * budget it can start with to the margin.
+ * Always start with the target budget. On deactivation, we throw away
+ * anything above it.
*/
- max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
+ vtarget = now->vnow - ioc->margins.target;
vtime = atomic64_read(&iocg->vtime);
- vmargin = ioc->margin_us * now->vrate;
- vmin = now->vnow - vmargin;
- if (last_period + max_period_delta < cur_period ||
- time_before64(vtime, vmin)) {
- atomic64_add(vmin - vtime, &iocg->vtime);
- atomic64_add(vmin - vtime, &iocg->done_vtime);
- vtime = vmin;
- }
+ atomic64_add(vtarget - vtime, &iocg->vtime);
+ atomic64_add(vtarget - vtime, &iocg->done_vtime);
+ vtime = vtarget;
/*
* Activate, propagate weight and start period timer if not
@@ -1088,16 +1243,18 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
*/
iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
list_add(&iocg->active_list, &ioc->active_iocgs);
- propagate_active_weight(iocg, iocg->weight,
- iocg->last_inuse ?: iocg->weight);
+
+ propagate_weights(iocg, iocg->weight,
+ iocg->last_inuse ?: iocg->weight, true, now);
TRACE_IOCG_PATH(iocg_activate, iocg, now,
last_period, cur_period, vtime);
- iocg->last_vtime = vtime;
+ iocg->activated_at = now->now;
if (ioc->running == IOC_IDLE) {
ioc->running = IOC_RUNNING;
+ ioc->debt_busy_at = now->now;
ioc_start_period(ioc, now);
}
@@ -1110,6 +1267,110 @@ fail_unlock:
return false;
}
+static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
+{
+ struct ioc *ioc = iocg->ioc;
+ struct blkcg_gq *blkg = iocg_to_blkg(iocg);
+ u64 tdelta, delay, new_delay;
+ s64 vover, vover_pct;
+ u32 hwa;
+
+ lockdep_assert_held(&iocg->waitq.lock);
+
+ /* calculate the current delay in effect - 1/2 every second */
+ tdelta = now->now - iocg->delay_at;
+ if (iocg->delay)
+ delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC);
+ else
+ delay = 0;
+
+ /* calculate the new delay from the debt amount */
+ current_hweight(iocg, &hwa, NULL);
+ vover = atomic64_read(&iocg->vtime) +
+ abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow;
+ vover_pct = div64_s64(100 * vover,
+ ioc->period_us * ioc->vtime_base_rate);
+
+ if (vover_pct <= MIN_DELAY_THR_PCT)
+ new_delay = 0;
+ else if (vover_pct >= MAX_DELAY_THR_PCT)
+ new_delay = MAX_DELAY;
+ else
+ new_delay = MIN_DELAY +
+ div_u64((MAX_DELAY - MIN_DELAY) *
+ (vover_pct - MIN_DELAY_THR_PCT),
+ MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT);
+
+ /* pick the higher one and apply */
+ if (new_delay > delay) {
+ iocg->delay = new_delay;
+ iocg->delay_at = now->now;
+ delay = new_delay;
+ }
+
+ if (delay >= MIN_DELAY) {
+ if (!iocg->indelay_since)
+ iocg->indelay_since = now->now;
+ blkcg_set_delay(blkg, delay * NSEC_PER_USEC);
+ return true;
+ } else {
+ if (iocg->indelay_since) {
+ iocg->local_stat.indelay_us += now->now - iocg->indelay_since;
+ iocg->indelay_since = 0;
+ }
+ iocg->delay = 0;
+ blkcg_clear_delay(blkg);
+ return false;
+ }
+}
+
+static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost,
+ struct ioc_now *now)
+{
+ struct iocg_pcpu_stat *gcs;
+
+ lockdep_assert_held(&iocg->ioc->lock);
+ lockdep_assert_held(&iocg->waitq.lock);
+ WARN_ON_ONCE(list_empty(&iocg->active_list));
+
+ /*
+ * Once in debt, debt handling owns inuse. @iocg stays at the minimum
+ * inuse donating all of it share to others until its debt is paid off.
+ */
+ if (!iocg->abs_vdebt && abs_cost) {
+ iocg->indebt_since = now->now;
+ propagate_weights(iocg, iocg->active, 0, false, now);
+ }
+
+ iocg->abs_vdebt += abs_cost;
+
+ gcs = get_cpu_ptr(iocg->pcpu_stat);
+ local64_add(abs_cost, &gcs->abs_vusage);
+ put_cpu_ptr(gcs);
+}
+
+static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay,
+ struct ioc_now *now)
+{
+ lockdep_assert_held(&iocg->ioc->lock);
+ lockdep_assert_held(&iocg->waitq.lock);
+
+ /* make sure that nobody messed with @iocg */
+ WARN_ON_ONCE(list_empty(&iocg->active_list));
+ WARN_ON_ONCE(iocg->inuse > 1);
+
+ iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt);
+
+ /* if debt is paid in full, restore inuse */
+ if (!iocg->abs_vdebt) {
+ iocg->local_stat.indebt_us += now->now - iocg->indebt_since;
+ iocg->indebt_since = 0;
+
+ propagate_weights(iocg, iocg->active, iocg->last_inuse,
+ false, now);
+ }
+}
+
static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
int flags, void *key)
{
@@ -1122,7 +1383,7 @@ static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
if (ctx->vbudget < 0)
return -1;
- iocg_commit_bio(ctx->iocg, wait->bio, cost);
+ iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost);
/*
* autoremove_wake_function() removes the wait entry only when it
@@ -1136,132 +1397,106 @@ static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
return 0;
}
-static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
+/*
+ * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters
+ * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
+ * addition to iocg->waitq.lock.
+ */
+static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt,
+ struct ioc_now *now)
{
struct ioc *ioc = iocg->ioc;
struct iocg_wake_ctx ctx = { .iocg = iocg };
- u64 margin_ns = (u64)(ioc->period_us *
- WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
- u64 vdebt, vshortage, expires, oexpires;
+ u64 vshortage, expires, oexpires;
s64 vbudget;
- u32 hw_inuse;
+ u32 hwa;
lockdep_assert_held(&iocg->waitq.lock);
- current_hweight(iocg, NULL, &hw_inuse);
+ current_hweight(iocg, &hwa, NULL);
vbudget = now->vnow - atomic64_read(&iocg->vtime);
/* pay off debt */
- vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
- if (vdebt && vbudget > 0) {
- u64 delta = min_t(u64, vbudget, vdebt);
- u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
- iocg->abs_vdebt);
+ if (pay_debt && iocg->abs_vdebt && vbudget > 0) {
+ u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa);
+ u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt);
+ u64 vpay = abs_cost_to_cost(abs_vpay, hwa);
+
+ lockdep_assert_held(&ioc->lock);
+
+ atomic64_add(vpay, &iocg->vtime);
+ atomic64_add(vpay, &iocg->done_vtime);
+ iocg_pay_debt(iocg, abs_vpay, now);
+ vbudget -= vpay;
+ }
+
+ if (iocg->abs_vdebt || iocg->delay)
+ iocg_kick_delay(iocg, now);
- atomic64_add(delta, &iocg->vtime);
- atomic64_add(delta, &iocg->done_vtime);
- iocg->abs_vdebt -= abs_delta;
+ /*
+ * Debt can still be outstanding if we haven't paid all yet or the
+ * caller raced and called without @pay_debt. Shouldn't wake up waiters
+ * under debt. Make sure @vbudget reflects the outstanding amount and is
+ * not positive.
+ */
+ if (iocg->abs_vdebt) {
+ s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa);
+ vbudget = min_t(s64, 0, vbudget - vdebt);
}
/*
- * Wake up the ones which are due and see how much vtime we'll need
- * for the next one.
+ * Wake up the ones which are due and see how much vtime we'll need for
+ * the next one. As paying off debt restores hw_inuse, it must be read
+ * after the above debt payment.
*/
- ctx.hw_inuse = hw_inuse;
- ctx.vbudget = vbudget - vdebt;
+ ctx.vbudget = vbudget;
+ current_hweight(iocg, NULL, &ctx.hw_inuse);
+
__wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
- if (!waitqueue_active(&iocg->waitq))
+
+ if (!waitqueue_active(&iocg->waitq)) {
+ if (iocg->wait_since) {
+ iocg->local_stat.wait_us += now->now - iocg->wait_since;
+ iocg->wait_since = 0;
+ }
return;
+ }
+
+ if (!iocg->wait_since)
+ iocg->wait_since = now->now;
+
if (WARN_ON_ONCE(ctx.vbudget >= 0))
return;
- /* determine next wakeup, add a quarter margin to guarantee chunking */
+ /* determine next wakeup, add a timer margin to guarantee chunking */
vshortage = -ctx.vbudget;
expires = now->now_ns +
- DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
- expires += margin_ns / 4;
+ DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) *
+ NSEC_PER_USEC;
+ expires += ioc->timer_slack_ns;
/* if already active and close enough, don't bother */
oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
if (hrtimer_is_queued(&iocg->waitq_timer) &&
- abs(oexpires - expires) <= margin_ns / 4)
+ abs(oexpires - expires) <= ioc->timer_slack_ns)
return;
hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
- margin_ns / 4, HRTIMER_MODE_ABS);
+ ioc->timer_slack_ns, HRTIMER_MODE_ABS);
}
static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
{
struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
+ bool pay_debt = READ_ONCE(iocg->abs_vdebt);
struct ioc_now now;
unsigned long flags;
ioc_now(iocg->ioc, &now);
- spin_lock_irqsave(&iocg->waitq.lock, flags);
- iocg_kick_waitq(iocg, &now);
- spin_unlock_irqrestore(&iocg->waitq.lock, flags);
-
- return HRTIMER_NORESTART;
-}
-
-static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
-{
- struct ioc *ioc = iocg->ioc;
- struct blkcg_gq *blkg = iocg_to_blkg(iocg);
- u64 vtime = atomic64_read(&iocg->vtime);
- u64 vmargin = ioc->margin_us * now->vrate;
- u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
- u64 delta_ns, expires, oexpires;
- u32 hw_inuse;
-
- lockdep_assert_held(&iocg->waitq.lock);
-
- /* debt-adjust vtime */
- current_hweight(iocg, NULL, &hw_inuse);
- vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
-
- /*
- * Clear or maintain depending on the overage. Non-zero vdebt is what
- * guarantees that @iocg is online and future iocg_kick_delay() will
- * clear use_delay. Don't leave it on when there's no vdebt.
- */
- if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
- blkcg_clear_delay(blkg);
- return false;
- }
- if (!atomic_read(&blkg->use_delay) &&
- time_before_eq64(vtime, now->vnow + vmargin))
- return false;
-
- /* use delay */
- delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow,
- now->vrate) * NSEC_PER_USEC;
- blkcg_set_delay(blkg, delta_ns);
- expires = now->now_ns + delta_ns;
-
- /* if already active and close enough, don't bother */
- oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
- if (hrtimer_is_queued(&iocg->delay_timer) &&
- abs(oexpires - expires) <= margin_ns / 4)
- return true;
-
- hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
- margin_ns / 4, HRTIMER_MODE_ABS);
- return true;
-}
-
-static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
-{
- struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
- struct ioc_now now;
- unsigned long flags;
-
- spin_lock_irqsave(&iocg->waitq.lock, flags);
- ioc_now(iocg->ioc, &now);
- iocg_kick_delay(iocg, &now);
- spin_unlock_irqrestore(&iocg->waitq.lock, flags);
+ iocg_lock(iocg, pay_debt, &flags);
+ iocg_kick_waitq(iocg, pay_debt, &now);
+ iocg_unlock(iocg, pay_debt, &flags);
return HRTIMER_NORESTART;
}
@@ -1278,8 +1513,8 @@ static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p
u64 this_rq_wait_ns;
for (rw = READ; rw <= WRITE; rw++) {
- u32 this_met = READ_ONCE(stat->missed[rw].nr_met);
- u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed);
+ u32 this_met = local_read(&stat->missed[rw].nr_met);
+ u32 this_missed = local_read(&stat->missed[rw].nr_missed);
nr_met[rw] += this_met - stat->missed[rw].last_met;
nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
@@ -1287,7 +1522,7 @@ static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p
stat->missed[rw].last_missed = this_missed;
}
- this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns);
+ this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
stat->last_rq_wait_ns = this_rq_wait_ns;
}
@@ -1322,18 +1557,426 @@ static bool iocg_is_idle(struct ioc_gq *iocg)
return true;
}
-/* returns usage with margin added if surplus is large enough */
-static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
+/*
+ * Call this function on the target leaf @iocg's to build pre-order traversal
+ * list of all the ancestors in @inner_walk. The inner nodes are linked through
+ * ->walk_list and the caller is responsible for dissolving the list after use.
+ */
+static void iocg_build_inner_walk(struct ioc_gq *iocg,
+ struct list_head *inner_walk)
{
- /* add margin */
- usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
- usage += SURPLUS_SCALE_ABS;
+ int lvl;
- /* don't bother if the surplus is too small */
- if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
- return 0;
+ WARN_ON_ONCE(!list_empty(&iocg->walk_list));
+
+ /* find the first ancestor which hasn't been visited yet */
+ for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
+ if (!list_empty(&iocg->ancestors[lvl]->walk_list))
+ break;
+ }
+
+ /* walk down and visit the inner nodes to get pre-order traversal */
+ while (++lvl <= iocg->level - 1) {
+ struct ioc_gq *inner = iocg->ancestors[lvl];
+
+ /* record traversal order */
+ list_add_tail(&inner->walk_list, inner_walk);
+ }
+}
+
+/* collect per-cpu counters and propagate the deltas to the parent */
+static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now)
+{
+ struct ioc *ioc = iocg->ioc;
+ struct iocg_stat new_stat;
+ u64 abs_vusage = 0;
+ u64 vusage_delta;
+ int cpu;
+
+ lockdep_assert_held(&iocg->ioc->lock);
+
+ /* collect per-cpu counters */
+ for_each_possible_cpu(cpu) {
+ abs_vusage += local64_read(
+ per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu));
+ }
+ vusage_delta = abs_vusage - iocg->last_stat_abs_vusage;
+ iocg->last_stat_abs_vusage = abs_vusage;
+
+ iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate);
+ iocg->local_stat.usage_us += iocg->usage_delta_us;
+
+ /* propagate upwards */
+ new_stat.usage_us =
+ iocg->local_stat.usage_us + iocg->desc_stat.usage_us;
+ new_stat.wait_us =
+ iocg->local_stat.wait_us + iocg->desc_stat.wait_us;
+ new_stat.indebt_us =
+ iocg->local_stat.indebt_us + iocg->desc_stat.indebt_us;
+ new_stat.indelay_us =
+ iocg->local_stat.indelay_us + iocg->desc_stat.indelay_us;
+
+ /* propagate the deltas to the parent */
+ if (iocg->level > 0) {
+ struct iocg_stat *parent_stat =
+ &iocg->ancestors[iocg->level - 1]->desc_stat;
+
+ parent_stat->usage_us +=
+ new_stat.usage_us - iocg->last_stat.usage_us;
+ parent_stat->wait_us +=
+ new_stat.wait_us - iocg->last_stat.wait_us;
+ parent_stat->indebt_us +=
+ new_stat.indebt_us - iocg->last_stat.indebt_us;
+ parent_stat->indelay_us +=
+ new_stat.indelay_us - iocg->last_stat.indelay_us;
+ }
+
+ iocg->last_stat = new_stat;
+}
- return usage;
+/* get stat counters ready for reading on all active iocgs */
+static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now)
+{
+ LIST_HEAD(inner_walk);
+ struct ioc_gq *iocg, *tiocg;
+
+ /* flush leaves and build inner node walk list */
+ list_for_each_entry(iocg, target_iocgs, active_list) {
+ iocg_flush_stat_one(iocg, now);
+ iocg_build_inner_walk(iocg, &inner_walk);
+ }
+
+ /* keep flushing upwards by walking the inner list backwards */
+ list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) {
+ iocg_flush_stat_one(iocg, now);
+ list_del_init(&iocg->walk_list);
+ }
+}
+
+/*
+ * Determine what @iocg's hweight_inuse should be after donating unused
+ * capacity. @hwm is the upper bound and used to signal no donation. This
+ * function also throws away @iocg's excess budget.
+ */
+static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm,
+ u32 usage, struct ioc_now *now)
+{
+ struct ioc *ioc = iocg->ioc;
+ u64 vtime = atomic64_read(&iocg->vtime);
+ s64 excess, delta, target, new_hwi;
+
+ /* debt handling owns inuse for debtors */
+ if (iocg->abs_vdebt)
+ return 1;
+
+ /* see whether minimum margin requirement is met */
+ if (waitqueue_active(&iocg->waitq) ||
+ time_after64(vtime, now->vnow - ioc->margins.min))
+ return hwm;
+
+ /* throw away excess above target */
+ excess = now->vnow - vtime - ioc->margins.target;
+ if (excess > 0) {
+ atomic64_add(excess, &iocg->vtime);
+ atomic64_add(excess, &iocg->done_vtime);
+ vtime += excess;
+ ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE);
+ }
+
+ /*
+ * Let's say the distance between iocg's and device's vtimes as a
+ * fraction of period duration is delta. Assuming that the iocg will
+ * consume the usage determined above, we want to determine new_hwi so
+ * that delta equals MARGIN_TARGET at the end of the next period.
+ *
+ * We need to execute usage worth of IOs while spending the sum of the
+ * new budget (1 - MARGIN_TARGET) and the leftover from the last period
+ * (delta):
+ *
+ * usage = (1 - MARGIN_TARGET + delta) * new_hwi
+ *
+ * Therefore, the new_hwi is:
+ *
+ * new_hwi = usage / (1 - MARGIN_TARGET + delta)
+ */
+ delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime),
+ now->vnow - ioc->period_at_vtime);
+ target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100;
+ new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta);
+
+ return clamp_t(s64, new_hwi, 1, hwm);
+}
+
+/*
+ * For work-conservation, an iocg which isn't using all of its share should
+ * donate the leftover to other iocgs. There are two ways to achieve this - 1.
+ * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight.
+ *
+ * #1 is mathematically simpler but has the drawback of requiring synchronous
+ * global hweight_inuse updates when idle iocg's get activated or inuse weights
+ * change due to donation snapbacks as it has the possibility of grossly
+ * overshooting what's allowed by the model and vrate.
+ *
+ * #2 is inherently safe with local operations. The donating iocg can easily
+ * snap back to higher weights when needed without worrying about impacts on
+ * other nodes as the impacts will be inherently correct. This also makes idle
+ * iocg activations safe. The only effect activations have is decreasing
+ * hweight_inuse of others, the right solution to which is for those iocgs to
+ * snap back to higher weights.
+ *
+ * So, we go with #2. The challenge is calculating how each donating iocg's
+ * inuse should be adjusted to achieve the target donation amounts. This is done
+ * using Andy's method described in the following pdf.
+ *
+ * https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo
+ *
+ * Given the weights and target after-donation hweight_inuse values, Andy's
+ * method determines how the proportional distribution should look like at each
+ * sibling level to maintain the relative relationship between all non-donating
+ * pairs. To roughly summarize, it divides the tree into donating and
+ * non-donating parts, calculates global donation rate which is used to
+ * determine the target hweight_inuse for each node, and then derives per-level
+ * proportions.
+ *
+ * The following pdf shows that global distribution calculated this way can be
+ * achieved by scaling inuse weights of donating leaves and propagating the
+ * adjustments upwards proportionally.
+ *
+ * https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE
+ *
+ * Combining the above two, we can determine how each leaf iocg's inuse should
+ * be adjusted to achieve the target donation.
+ *
+ * https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN
+ *
+ * The inline comments use symbols from the last pdf.
+ *
+ * b is the sum of the absolute budgets in the subtree. 1 for the root node.
+ * f is the sum of the absolute budgets of non-donating nodes in the subtree.
+ * t is the sum of the absolute budgets of donating nodes in the subtree.
+ * w is the weight of the node. w = w_f + w_t
+ * w_f is the non-donating portion of w. w_f = w * f / b
+ * w_b is the donating portion of w. w_t = w * t / b
+ * s is the sum of all sibling weights. s = Sum(w) for siblings
+ * s_f and s_t are the non-donating and donating portions of s.
+ *
+ * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g.
+ * w_pt is the donating portion of the parent's weight and w'_pt the same value
+ * after adjustments. Subscript r denotes the root node's values.
+ */
+static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now)
+{
+ LIST_HEAD(over_hwa);
+ LIST_HEAD(inner_walk);
+ struct ioc_gq *iocg, *tiocg, *root_iocg;
+ u32 after_sum, over_sum, over_target, gamma;
+
+ /*
+ * It's pretty unlikely but possible for the total sum of
+ * hweight_after_donation's to be higher than WEIGHT_ONE, which will
+ * confuse the following calculations. If such condition is detected,
+ * scale down everyone over its full share equally to keep the sum below
+ * WEIGHT_ONE.
+ */
+ after_sum = 0;
+ over_sum = 0;
+ list_for_each_entry(iocg, surpluses, surplus_list) {
+ u32 hwa;
+
+ current_hweight(iocg, &hwa, NULL);
+ after_sum += iocg->hweight_after_donation;
+
+ if (iocg->hweight_after_donation > hwa) {
+ over_sum += iocg->hweight_after_donation;
+ list_add(&iocg->walk_list, &over_hwa);
+ }
+ }
+
+ if (after_sum >= WEIGHT_ONE) {
+ /*
+ * The delta should be deducted from the over_sum, calculate
+ * target over_sum value.
+ */
+ u32 over_delta = after_sum - (WEIGHT_ONE - 1);
+ WARN_ON_ONCE(over_sum <= over_delta);
+ over_target = over_sum - over_delta;
+ } else {
+ over_target = 0;
+ }
+
+ list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) {
+ if (over_target)
+ iocg->hweight_after_donation =
+ div_u64((u64)iocg->hweight_after_donation *
+ over_target, over_sum);
+ list_del_init(&iocg->walk_list);
+ }
+
+ /*
+ * Build pre-order inner node walk list and prepare for donation
+ * adjustment calculations.
+ */
+ list_for_each_entry(iocg, surpluses, surplus_list) {
+ iocg_build_inner_walk(iocg, &inner_walk);
+ }
+
+ root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list);
+ WARN_ON_ONCE(root_iocg->level > 0);
+
+ list_for_each_entry(iocg, &inner_walk, walk_list) {
+ iocg->child_adjusted_sum = 0;
+ iocg->hweight_donating = 0;
+ iocg->hweight_after_donation = 0;
+ }
+
+ /*
+ * Propagate the donating budget (b_t) and after donation budget (b'_t)
+ * up the hierarchy.
+ */
+ list_for_each_entry(iocg, surpluses, surplus_list) {
+ struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
+
+ parent->hweight_donating += iocg->hweight_donating;
+ parent->hweight_after_donation += iocg->hweight_after_donation;
+ }
+
+ list_for_each_entry_reverse(iocg, &inner_walk, walk_list) {
+ if (iocg->level > 0) {
+ struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
+
+ parent->hweight_donating += iocg->hweight_donating;
+ parent->hweight_after_donation += iocg->hweight_after_donation;
+ }
+ }
+
+ /*
+ * Calculate inner hwa's (b) and make sure the donation values are
+ * within the accepted ranges as we're doing low res calculations with
+ * roundups.
+ */
+ list_for_each_entry(iocg, &inner_walk, walk_list) {
+ if (iocg->level) {
+ struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
+
+ iocg->hweight_active = DIV64_U64_ROUND_UP(
+ (u64)parent->hweight_active * iocg->active,
+ parent->child_active_sum);
+
+ }
+
+ iocg->hweight_donating = min(iocg->hweight_donating,
+ iocg->hweight_active);
+ iocg->hweight_after_donation = min(iocg->hweight_after_donation,
+ iocg->hweight_donating - 1);
+ if (WARN_ON_ONCE(iocg->hweight_active <= 1 ||
+ iocg->hweight_donating <= 1 ||
+ iocg->hweight_after_donation == 0)) {
+ pr_warn("iocg: invalid donation weights in ");
+ pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup);
+ pr_cont(": active=%u donating=%u after=%u\n",
+ iocg->hweight_active, iocg->hweight_donating,
+ iocg->hweight_after_donation);
+ }
+ }
+
+ /*
+ * Calculate the global donation rate (gamma) - the rate to adjust
+ * non-donating budgets by.
+ *
+ * No need to use 64bit multiplication here as the first operand is
+ * guaranteed to be smaller than WEIGHT_ONE (1<<16).
+ *
+ * We know that there are beneficiary nodes and the sum of the donating
+ * hweights can't be whole; however, due to the round-ups during hweight
+ * calculations, root_iocg->hweight_donating might still end up equal to
+ * or greater than whole. Limit the range when calculating the divider.
+ *
+ * gamma = (1 - t_r') / (1 - t_r)
+ */
+ gamma = DIV_ROUND_UP(
+ (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE,
+ WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1));
+
+ /*
+ * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner
+ * nodes.
+ */
+ list_for_each_entry(iocg, &inner_walk, walk_list) {
+ struct ioc_gq *parent;
+ u32 inuse, wpt, wptp;
+ u64 st, sf;
+
+ if (iocg->level == 0) {
+ /* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */
+ iocg->child_adjusted_sum = DIV64_U64_ROUND_UP(
+ iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating),
+ WEIGHT_ONE - iocg->hweight_after_donation);
+ continue;
+ }
+
+ parent = iocg->ancestors[iocg->level - 1];
+
+ /* b' = gamma * b_f + b_t' */
+ iocg->hweight_inuse = DIV64_U64_ROUND_UP(
+ (u64)gamma * (iocg->hweight_active - iocg->hweight_donating),
+ WEIGHT_ONE) + iocg->hweight_after_donation;
+
+ /* w' = s' * b' / b'_p */
+ inuse = DIV64_U64_ROUND_UP(
+ (u64)parent->child_adjusted_sum * iocg->hweight_inuse,
+ parent->hweight_inuse);
+
+ /* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */
+ st = DIV64_U64_ROUND_UP(
+ iocg->child_active_sum * iocg->hweight_donating,
+ iocg->hweight_active);
+ sf = iocg->child_active_sum - st;
+ wpt = DIV64_U64_ROUND_UP(
+ (u64)iocg->active * iocg->hweight_donating,
+ iocg->hweight_active);
+ wptp = DIV64_U64_ROUND_UP(
+ (u64)inuse * iocg->hweight_after_donation,
+ iocg->hweight_inuse);
+
+ iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt);
+ }
+
+ /*
+ * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and
+ * we can finally determine leaf adjustments.
+ */
+ list_for_each_entry(iocg, surpluses, surplus_list) {
+ struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
+ u32 inuse;
+
+ /*
+ * In-debt iocgs participated in the donation calculation with
+ * the minimum target hweight_inuse. Configuring inuse
+ * accordingly would work fine but debt handling expects
+ * @iocg->inuse stay at the minimum and we don't wanna
+ * interfere.
+ */
+ if (iocg->abs_vdebt) {
+ WARN_ON_ONCE(iocg->inuse > 1);
+ continue;
+ }
+
+ /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */
+ inuse = DIV64_U64_ROUND_UP(
+ parent->child_adjusted_sum * iocg->hweight_after_donation,
+ parent->hweight_inuse);
+
+ TRACE_IOCG_PATH(inuse_transfer, iocg, now,
+ iocg->inuse, inuse,
+ iocg->hweight_inuse,
+ iocg->hweight_after_donation);
+
+ __propagate_weights(iocg, iocg->active, inuse, true, now);
+ }
+
+ /* walk list should be dissolved after use */
+ list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list)
+ list_del_init(&iocg->walk_list);
}
static void ioc_timer_fn(struct timer_list *timer)
@@ -1341,12 +1984,14 @@ static void ioc_timer_fn(struct timer_list *timer)
struct ioc *ioc = container_of(timer, struct ioc, timer);
struct ioc_gq *iocg, *tiocg;
struct ioc_now now;
- int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
+ LIST_HEAD(surpluses);
+ int nr_debtors = 0, nr_shortages = 0, nr_lagging = 0;
+ u64 usage_us_sum = 0;
u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
u32 missed_ppm[2], rq_wait_pct;
u64 period_vtime;
- int prev_busy_level, i;
+ int prev_busy_level;
/* how were the latencies during the period? */
ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
@@ -1370,30 +2015,71 @@ static void ioc_timer_fn(struct timer_list *timer)
*/
list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
- !iocg_is_idle(iocg))
+ !iocg->delay && !iocg_is_idle(iocg))
continue;
spin_lock(&iocg->waitq.lock);
- if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
+ /* flush wait and indebt stat deltas */
+ if (iocg->wait_since) {
+ iocg->local_stat.wait_us += now.now - iocg->wait_since;
+ iocg->wait_since = now.now;
+ }
+ if (iocg->indebt_since) {
+ iocg->local_stat.indebt_us +=
+ now.now - iocg->indebt_since;
+ iocg->indebt_since = now.now;
+ }
+ if (iocg->indelay_since) {
+ iocg->local_stat.indelay_us +=
+ now.now - iocg->indelay_since;
+ iocg->indelay_since = now.now;
+ }
+
+ if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt ||
+ iocg->delay) {
/* might be oversleeping vtime / hweight changes, kick */
- iocg_kick_waitq(iocg, &now);
- iocg_kick_delay(iocg, &now);
+ iocg_kick_waitq(iocg, true, &now);
+ if (iocg->abs_vdebt)
+ nr_debtors++;
} else if (iocg_is_idle(iocg)) {
/* no waiter and idle, deactivate */
- iocg->last_inuse = iocg->inuse;
- __propagate_active_weight(iocg, 0, 0);
+ u64 vtime = atomic64_read(&iocg->vtime);
+ s64 excess;
+
+ /*
+ * @iocg has been inactive for a full duration and will
+ * have a high budget. Account anything above target as
+ * error and throw away. On reactivation, it'll start
+ * with the target budget.
+ */
+ excess = now.vnow - vtime - ioc->margins.target;
+ if (excess > 0) {
+ u32 old_hwi;
+
+ current_hweight(iocg, NULL, &old_hwi);
+ ioc->vtime_err -= div64_u64(excess * old_hwi,
+ WEIGHT_ONE);
+ }
+
+ __propagate_weights(iocg, 0, 0, false, &now);
list_del_init(&iocg->active_list);
}
spin_unlock(&iocg->waitq.lock);
}
- commit_active_weights(ioc);
+ commit_weights(ioc);
+
+ /*
+ * Wait and indebt stat are flushed above and the donation calculation
+ * below needs updated usage stat. Let's bring stat up-to-date.
+ */
+ iocg_flush_stat(&ioc->active_iocgs, &now);
- /* calc usages and see whether some weights need to be moved around */
+ /* calc usage and see whether some weights need to be moved around */
list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
- u64 vdone, vtime, vusage, vmargin, vmin;
- u32 hw_active, hw_inuse, usage;
+ u64 vdone, vtime, usage_us, usage_dur;
+ u32 usage, hw_active, hw_inuse;
/*
* Collect unused and wind vtime closer to vnow to prevent
@@ -1417,116 +2103,105 @@ static void ioc_timer_fn(struct timer_list *timer)
time_before64(vdone, now.vnow - period_vtime))
nr_lagging++;
- if (waitqueue_active(&iocg->waitq))
- vusage = now.vnow - iocg->last_vtime;
- else if (time_before64(iocg->last_vtime, vtime))
- vusage = vtime - iocg->last_vtime;
- else
- vusage = 0;
-
- iocg->last_vtime += vusage;
/*
- * Factor in in-flight vtime into vusage to avoid
- * high-latency completions appearing as idle. This should
- * be done after the above ->last_time adjustment.
+ * Determine absolute usage factoring in in-flight IOs to avoid
+ * high-latency completions appearing as idle.
*/
- vusage = max(vusage, vtime - vdone);
-
- /* calculate hweight based usage ratio and record */
- if (vusage) {
- usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
- period_vtime);
- iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
- iocg->usages[iocg->usage_idx] = usage;
- } else {
- usage = 0;
+ usage_us = iocg->usage_delta_us;
+ usage_us_sum += usage_us;
+
+ if (vdone != vtime) {
+ u64 inflight_us = DIV64_U64_ROUND_UP(
+ cost_to_abs_cost(vtime - vdone, hw_inuse),
+ ioc->vtime_base_rate);
+ usage_us = max(usage_us, inflight_us);
}
+ /* convert to hweight based usage ratio */
+ if (time_after64(iocg->activated_at, ioc->period_at))
+ usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
+ else
+ usage_dur = max_t(u64, now.now - ioc->period_at, 1);
+
+ usage = clamp_t(u32,
+ DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
+ usage_dur),
+ 1, WEIGHT_ONE);
+
/* see whether there's surplus vtime */
- vmargin = ioc->margin_us * now.vrate;
- vmin = now.vnow - vmargin;
-
- iocg->has_surplus = false;
-
- if (!waitqueue_active(&iocg->waitq) &&
- time_before64(vtime, vmin)) {
- u64 delta = vmin - vtime;
-
- /* throw away surplus vtime */
- atomic64_add(delta, &iocg->vtime);
- atomic64_add(delta, &iocg->done_vtime);
- iocg->last_vtime += delta;
- /* if usage is sufficiently low, maybe it can donate */
- if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
- iocg->has_surplus = true;
- nr_surpluses++;
- }
- } else if (hw_inuse < hw_active) {
- u32 new_hwi, new_inuse;
+ WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
+ if (hw_inuse < hw_active ||
+ (!waitqueue_active(&iocg->waitq) &&
+ time_before64(vtime, now.vnow - ioc->margins.low))) {
+ u32 hwa, old_hwi, hwm, new_hwi;
- /* was donating but might need to take back some */
- if (waitqueue_active(&iocg->waitq)) {
- new_hwi = hw_active;
+ /*
+ * Already donating or accumulated enough to start.
+ * Determine the donation amount.
+ */
+ current_hweight(iocg, &hwa, &old_hwi);
+ hwm = current_hweight_max(iocg);
+ new_hwi = hweight_after_donation(iocg, old_hwi, hwm,
+ usage, &now);
+ if (new_hwi < hwm) {
+ iocg->hweight_donating = hwa;
+ iocg->hweight_after_donation = new_hwi;
+ list_add(&iocg->surplus_list, &surpluses);
} else {
- new_hwi = max(hw_inuse,
- usage * SURPLUS_SCALE_PCT / 100 +
- SURPLUS_SCALE_ABS);
- }
+ TRACE_IOCG_PATH(inuse_shortage, iocg, &now,
+ iocg->inuse, iocg->active,
+ iocg->hweight_inuse, new_hwi);
- new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
- hw_inuse);
- new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
-
- if (new_inuse > iocg->inuse) {
- TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
- iocg->inuse, new_inuse,
- hw_inuse, new_hwi);
- __propagate_active_weight(iocg, iocg->weight,
- new_inuse);
+ __propagate_weights(iocg, iocg->active,
+ iocg->active, true, &now);
+ nr_shortages++;
}
} else {
- /* genuninely out of vtime */
+ /* genuinely short on vtime */
nr_shortages++;
}
}
- if (!nr_shortages || !nr_surpluses)
- goto skip_surplus_transfers;
+ if (!list_empty(&surpluses) && nr_shortages)
+ transfer_surpluses(&surpluses, &now);
- /* there are both shortages and surpluses, transfer surpluses */
- list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
- u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
- int nr_valid = 0;
+ commit_weights(ioc);
- if (!iocg->has_surplus)
- continue;
+ /* surplus list should be dissolved after use */
+ list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list)
+ list_del_init(&iocg->surplus_list);
- /* base the decision on max historical usage */
- for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
- if (iocg->usages[i]) {
- usage = max(usage, iocg->usages[i]);
- nr_valid++;
+ /*
+ * A low weight iocg can amass a large amount of debt, for example, when
+ * anonymous memory gets reclaimed aggressively. If the system has a lot
+ * of memory paired with a slow IO device, the debt can span multiple
+ * seconds or more. If there are no other subsequent IO issuers, the
+ * in-debt iocg may end up blocked paying its debt while the IO device
+ * is idle.
+ *
+ * The following protects against such pathological cases. If the device
+ * has been sufficiently idle for a substantial amount of time, the
+ * debts are halved. The criteria are on the conservative side as we
+ * want to resolve the rare extreme cases without impacting regular
+ * operation by forgiving debts too readily.
+ */
+ if (nr_shortages ||
+ div64_u64(100 * usage_us_sum, now.now - ioc->period_at) >=
+ DEBT_BUSY_USAGE_PCT)
+ ioc->debt_busy_at = now.now;
+
+ if (nr_debtors &&
+ now.now - ioc->debt_busy_at >= DEBT_REDUCTION_IDLE_DUR) {
+ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
+ if (iocg->abs_vdebt) {
+ spin_lock(&iocg->waitq.lock);
+ iocg->abs_vdebt /= 2;
+ iocg_kick_waitq(iocg, true, &now);
+ spin_unlock(&iocg->waitq.lock);
}
}
- if (nr_valid < MIN_VALID_USAGES)
- continue;
-
- current_hweight(iocg, &hw_active, &hw_inuse);
- new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
- if (!new_hwi)
- continue;
-
- new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
- hw_inuse);
- if (new_inuse < iocg->inuse) {
- TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
- iocg->inuse, new_inuse,
- hw_inuse, new_hwi);
- __propagate_active_weight(iocg, iocg->weight, new_inuse);
- }
+ ioc->debt_busy_at = now.now;
}
-skip_surplus_transfers:
- commit_active_weights(ioc);
/*
* If q is getting clogged or we're missing too much, we're issuing
@@ -1554,11 +2229,9 @@ skip_surplus_transfers:
/*
* If there are IOs spanning multiple periods, wait
- * them out before pushing the device harder. If
- * there are surpluses, let redistribution work it
- * out first.
+ * them out before pushing the device harder.
*/
- if (!nr_lagging && !nr_surpluses)
+ if (!nr_lagging)
ioc->busy_level--;
} else {
/*
@@ -1577,7 +2250,7 @@ skip_surplus_transfers:
ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
- u64 vrate = atomic64_read(&ioc->vtime_rate);
+ u64 vrate = ioc->vtime_base_rate;
u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
/* rq_wait signal is always reliable, ignore user vrate_min */
@@ -1612,16 +2285,14 @@ skip_surplus_transfers:
}
trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
- nr_lagging, nr_shortages,
- nr_surpluses);
+ nr_lagging, nr_shortages);
- atomic64_set(&ioc->vtime_rate, vrate);
- ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
- ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
+ ioc->vtime_base_rate = vrate;
+ ioc_refresh_margins(ioc);
} else if (ioc->busy_level != prev_busy_level || nr_lagging) {
trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
missed_ppm, rq_wait_pct, nr_lagging,
- nr_shortages, nr_surpluses);
+ nr_shortages);
}
ioc_refresh_params(ioc, false);
@@ -1637,11 +2308,74 @@ skip_surplus_transfers:
ioc_start_period(ioc, &now);
} else {
ioc->busy_level = 0;
+ ioc->vtime_err = 0;
ioc->running = IOC_IDLE;
}
+
+ ioc_refresh_vrate(ioc, &now);
+ }
+
+ spin_unlock_irq(&ioc->lock);
+}
+
+static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
+ u64 abs_cost, struct ioc_now *now)
+{
+ struct ioc *ioc = iocg->ioc;
+ struct ioc_margins *margins = &ioc->margins;
+ u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi;
+ u32 hwi, adj_step;
+ s64 margin;
+ u64 cost, new_inuse;
+
+ current_hweight(iocg, NULL, &hwi);
+ old_hwi = hwi;
+ cost = abs_cost_to_cost(abs_cost, hwi);
+ margin = now->vnow - vtime - cost;
+
+ /* debt handling owns inuse for debtors */
+ if (iocg->abs_vdebt)
+ return cost;
+
+ /*
+ * We only increase inuse during period and do so iff the margin has
+ * deteriorated since the previous adjustment.
+ */
+ if (margin >= iocg->saved_margin || margin >= margins->low ||
+ iocg->inuse == iocg->active)
+ return cost;
+
+ spin_lock_irq(&ioc->lock);
+
+ /* we own inuse only when @iocg is in the normal active state */
+ if (iocg->abs_vdebt || list_empty(&iocg->active_list)) {
+ spin_unlock_irq(&ioc->lock);
+ return cost;
}
+ /*
+ * Bump up inuse till @abs_cost fits in the existing budget.
+ * adj_step must be determined after acquiring ioc->lock - we might
+ * have raced and lost to another thread for activation and could
+ * be reading 0 iocg->active before ioc->lock which will lead to
+ * infinite loop.
+ */
+ new_inuse = iocg->inuse;
+ adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100);
+ do {
+ new_inuse = new_inuse + adj_step;
+ propagate_weights(iocg, iocg->active, new_inuse, true, now);
+ current_hweight(iocg, NULL, &hwi);
+ cost = abs_cost_to_cost(abs_cost, hwi);
+ } while (time_after64(vtime + cost, now->vnow) &&
+ iocg->inuse != iocg->active);
+
spin_unlock_irq(&ioc->lock);
+
+ TRACE_IOCG_PATH(inuse_adjust, iocg, now,
+ old_inuse, iocg->inuse, old_hwi, hwi);
+
+ return cost;
}
static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
@@ -1725,38 +2459,25 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
struct ioc_gq *iocg = blkg_to_iocg(blkg);
struct ioc_now now;
struct iocg_wait wait;
- u32 hw_active, hw_inuse;
u64 abs_cost, cost, vtime;
+ bool use_debt, ioc_locked;
+ unsigned long flags;
/* bypass IOs if disabled or for root cgroup */
if (!ioc->enabled || !iocg->level)
return;
- /* always activate so that even 0 cost IOs get protected to some level */
- if (!iocg_activate(iocg, &now))
- return;
-
/* calculate the absolute vtime cost */
abs_cost = calc_vtime_cost(bio, iocg, false);
if (!abs_cost)
return;
- iocg->cursor = bio_end_sector(bio);
+ if (!iocg_activate(iocg, &now))
+ return;
+ iocg->cursor = bio_end_sector(bio);
vtime = atomic64_read(&iocg->vtime);
- current_hweight(iocg, &hw_active, &hw_inuse);
-
- if (hw_inuse < hw_active &&
- time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
- TRACE_IOCG_PATH(inuse_reset, iocg, &now,
- iocg->inuse, iocg->weight, hw_inuse, hw_active);
- spin_lock_irq(&ioc->lock);
- propagate_active_weight(iocg, iocg->weight, iocg->weight);
- spin_unlock_irq(&ioc->lock);
- current_hweight(iocg, &hw_active, &hw_inuse);
- }
-
- cost = abs_cost_to_cost(abs_cost, hw_inuse);
+ cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
/*
* If no one's waiting and within budget, issue right away. The
@@ -1765,21 +2486,32 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
*/
if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
time_before_eq64(vtime + cost, now.vnow)) {
- iocg_commit_bio(iocg, bio, cost);
+ iocg_commit_bio(iocg, bio, abs_cost, cost);
return;
}
/*
- * We activated above but w/o any synchronization. Deactivation is
- * synchronized with waitq.lock and we won't get deactivated as long
- * as we're waiting or has debt, so we're good if we're activated
- * here. In the unlikely case that we aren't, just issue the IO.
+ * We're over budget. This can be handled in two ways. IOs which may
+ * cause priority inversions are punted to @ioc->aux_iocg and charged as
+ * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling
+ * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine
+ * whether debt handling is needed and acquire locks accordingly.
*/
- spin_lock_irq(&iocg->waitq.lock);
+ use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
+ ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
+retry_lock:
+ iocg_lock(iocg, ioc_locked, &flags);
+ /*
+ * @iocg must stay activated for debt and waitq handling. Deactivation
+ * is synchronized against both ioc->lock and waitq.lock and we won't
+ * get deactivated as long as we're waiting or has debt, so we're good
+ * if we're activated here. In the unlikely cases that we aren't, just
+ * issue the IO.
+ */
if (unlikely(list_empty(&iocg->active_list))) {
- spin_unlock_irq(&iocg->waitq.lock);
- iocg_commit_bio(iocg, bio, cost);
+ iocg_unlock(iocg, ioc_locked, &flags);
+ iocg_commit_bio(iocg, bio, abs_cost, cost);
return;
}
@@ -1800,15 +2532,26 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
* clear them and leave @iocg inactive w/ dangling use_delay heavily
* penalizing the cgroup and its descendants.
*/
- if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
- iocg->abs_vdebt += abs_cost;
+ if (use_debt) {
+ iocg_incur_debt(iocg, abs_cost, &now);
if (iocg_kick_delay(iocg, &now))
blkcg_schedule_throttle(rqos->q,
(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
- spin_unlock_irq(&iocg->waitq.lock);
+ iocg_unlock(iocg, ioc_locked, &flags);
return;
}
+ /* guarantee that iocgs w/ waiters have maximum inuse */
+ if (!iocg->abs_vdebt && iocg->inuse != iocg->active) {
+ if (!ioc_locked) {
+ iocg_unlock(iocg, false, &flags);
+ ioc_locked = true;
+ goto retry_lock;
+ }
+ propagate_weights(iocg, iocg->active, iocg->active, true,
+ &now);
+ }
+
/*
* Append self to the waitq and schedule the wakeup timer if we're
* the first waiter. The timer duration is calculated based on the
@@ -1829,9 +2572,9 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
wait.committed = false; /* will be set true by waker */
__add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
- iocg_kick_waitq(iocg, &now);
+ iocg_kick_waitq(iocg, ioc_locked, &now);
- spin_unlock_irq(&iocg->waitq.lock);
+ iocg_unlock(iocg, ioc_locked, &flags);
while (true) {
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -1851,8 +2594,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
struct ioc *ioc = iocg->ioc;
sector_t bio_end = bio_end_sector(bio);
struct ioc_now now;
- u32 hw_inuse;
- u64 abs_cost, cost;
+ u64 vtime, abs_cost, cost;
unsigned long flags;
/* bypass if disabled or for root cgroup */
@@ -1864,8 +2606,9 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
return;
ioc_now(ioc, &now);
- current_hweight(iocg, NULL, &hw_inuse);
- cost = abs_cost_to_cost(abs_cost, hw_inuse);
+
+ vtime = atomic64_read(&iocg->vtime);
+ cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
/* update cursor if backmerging into the request at the cursor */
if (blk_rq_pos(rq) < bio_end &&
@@ -1878,7 +2621,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
*/
if (rq->bio && rq->bio->bi_iocost_cost &&
time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
- iocg_commit_bio(iocg, bio, cost);
+ iocg_commit_bio(iocg, bio, abs_cost, cost);
return;
}
@@ -1887,14 +2630,20 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
* be for the vast majority of cases. See debt handling in
* ioc_rqos_throttle() for details.
*/
- spin_lock_irqsave(&iocg->waitq.lock, flags);
+ spin_lock_irqsave(&ioc->lock, flags);
+ spin_lock(&iocg->waitq.lock);
+
if (likely(!list_empty(&iocg->active_list))) {
- iocg->abs_vdebt += abs_cost;
- iocg_kick_delay(iocg, &now);
+ iocg_incur_debt(iocg, abs_cost, &now);
+ if (iocg_kick_delay(iocg, &now))
+ blkcg_schedule_throttle(rqos->q,
+ (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
} else {
- iocg_commit_bio(iocg, bio, cost);
+ iocg_commit_bio(iocg, bio, abs_cost, cost);
}
- spin_unlock_irqrestore(&iocg->waitq.lock, flags);
+
+ spin_unlock(&iocg->waitq.lock);
+ spin_unlock_irqrestore(&ioc->lock, flags);
}
static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
@@ -1908,6 +2657,7 @@ static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
{
struct ioc *ioc = rqos_to_ioc(rqos);
+ struct ioc_pcpu_stat *ccs;
u64 on_q_ns, rq_wait_ns, size_nsec;
int pidx, rw;
@@ -1931,13 +2681,17 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
+ ccs = get_cpu_ptr(ioc->pcpu_stat);
+
if (on_q_ns <= size_nsec ||
on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
- this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met);
+ local_inc(&ccs->missed[rw].nr_met);
else
- this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed);
+ local_inc(&ccs->missed[rw].nr_missed);
+
+ local64_add(rq_wait_ns, &ccs->rq_wait_ns);
- this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns);
+ put_cpu_ptr(ccs);
}
static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
@@ -1977,7 +2731,7 @@ static int blk_iocost_init(struct request_queue *q)
{
struct ioc *ioc;
struct rq_qos *rqos;
- int ret;
+ int i, cpu, ret;
ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
if (!ioc)
@@ -1989,6 +2743,16 @@ static int blk_iocost_init(struct request_queue *q)
return -ENOMEM;
}
+ for_each_possible_cpu(cpu) {
+ struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);
+
+ for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
+ local_set(&ccs->missed[i].nr_met, 0);
+ local_set(&ccs->missed[i].nr_missed, 0);
+ }
+ local64_set(&ccs->rq_wait_ns, 0);
+ }
+
rqos = &ioc->rqos;
rqos->id = RQ_QOS_COST;
rqos->ops = &ioc_rqos_ops;
@@ -1999,6 +2763,7 @@ static int blk_iocost_init(struct request_queue *q)
INIT_LIST_HEAD(&ioc->active_iocgs);
ioc->running = IOC_IDLE;
+ ioc->vtime_base_rate = VTIME_PER_USEC;
atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
ioc->period_at = ktime_to_us(ktime_get());
@@ -2029,7 +2794,7 @@ static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
if (!iocc)
return NULL;
- iocc->dfl_weight = CGROUP_WEIGHT_DFL;
+ iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE;
return &iocc->cpd;
}
@@ -2048,6 +2813,12 @@ static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
if (!iocg)
return NULL;
+ iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp);
+ if (!iocg->pcpu_stat) {
+ kfree(iocg);
+ return NULL;
+ }
+
return &iocg->pd;
}
@@ -2067,14 +2838,14 @@ static void ioc_pd_init(struct blkg_policy_data *pd)
atomic64_set(&iocg->done_vtime, now.vnow);
atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
INIT_LIST_HEAD(&iocg->active_list);
- iocg->hweight_active = HWEIGHT_WHOLE;
- iocg->hweight_inuse = HWEIGHT_WHOLE;
+ INIT_LIST_HEAD(&iocg->walk_list);
+ INIT_LIST_HEAD(&iocg->surplus_list);
+ iocg->hweight_active = WEIGHT_ONE;
+ iocg->hweight_inuse = WEIGHT_ONE;
init_waitqueue_head(&iocg->waitq);
hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
iocg->waitq_timer.function = iocg_waitq_timer_fn;
- hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
- iocg->delay_timer.function = iocg_delay_timer_fn;
iocg->level = blkg->blkcg->css.cgroup->level;
@@ -2084,7 +2855,7 @@ static void ioc_pd_init(struct blkg_policy_data *pd)
}
spin_lock_irqsave(&ioc->lock, flags);
- weight_updated(iocg);
+ weight_updated(iocg, &now);
spin_unlock_irqrestore(&ioc->lock, flags);
}
@@ -2096,18 +2867,56 @@ static void ioc_pd_free(struct blkg_policy_data *pd)
if (ioc) {
spin_lock_irqsave(&ioc->lock, flags);
+
if (!list_empty(&iocg->active_list)) {
- propagate_active_weight(iocg, 0, 0);
+ struct ioc_now now;
+
+ ioc_now(ioc, &now);
+ propagate_weights(iocg, 0, 0, false, &now);
list_del_init(&iocg->active_list);
}
+
+ WARN_ON_ONCE(!list_empty(&iocg->walk_list));
+ WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
+
spin_unlock_irqrestore(&ioc->lock, flags);
hrtimer_cancel(&iocg->waitq_timer);
- hrtimer_cancel(&iocg->delay_timer);
}
+ free_percpu(iocg->pcpu_stat);
kfree(iocg);
}
+static size_t ioc_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size)
+{
+ struct ioc_gq *iocg = pd_to_iocg(pd);
+ struct ioc *ioc = iocg->ioc;
+ size_t pos = 0;
+
+ if (!ioc->enabled)
+ return 0;
+
+ if (iocg->level == 0) {
+ unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
+ ioc->vtime_base_rate * 10000,
+ VTIME_PER_USEC);
+ pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u",
+ vp10k / 100, vp10k % 100);
+ }
+
+ pos += scnprintf(buf + pos, size - pos, " cost.usage=%llu",
+ iocg->last_stat.usage_us);
+
+ if (blkcg_debug_stats)
+ pos += scnprintf(buf + pos, size - pos,
+ " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
+ iocg->last_stat.wait_us,
+ iocg->last_stat.indebt_us,
+ iocg->last_stat.indelay_us);
+
+ return pos;
+}
+
static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
int off)
{
@@ -2115,7 +2924,7 @@ static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
struct ioc_gq *iocg = pd_to_iocg(pd);
if (dname && iocg->cfg_weight)
- seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight);
+ seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE);
return 0;
}
@@ -2125,7 +2934,7 @@ static int ioc_weight_show(struct seq_file *sf, void *v)
struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
- seq_printf(sf, "default %u\n", iocc->dfl_weight);
+ seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE);
blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
&blkcg_policy_iocost, seq_cft(sf)->private, false);
return 0;
@@ -2137,6 +2946,7 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
struct blkcg *blkcg = css_to_blkcg(of_css(of));
struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
struct blkg_conf_ctx ctx;
+ struct ioc_now now;
struct ioc_gq *iocg;
u32 v;
int ret;
@@ -2151,13 +2961,14 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
return -EINVAL;
spin_lock(&blkcg->lock);
- iocc->dfl_weight = v;
+ iocc->dfl_weight = v * WEIGHT_ONE;
hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
struct ioc_gq *iocg = blkg_to_iocg(blkg);
if (iocg) {
spin_lock_irq(&iocg->ioc->lock);
- weight_updated(iocg);
+ ioc_now(iocg->ioc, &now);
+ weight_updated(iocg, &now);
spin_unlock_irq(&iocg->ioc->lock);
}
}
@@ -2182,8 +2993,9 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
}
spin_lock(&iocg->ioc->lock);
- iocg->cfg_weight = v;
- weight_updated(iocg);
+ iocg->cfg_weight = v * WEIGHT_ONE;
+ ioc_now(iocg->ioc, &now);
+ weight_updated(iocg, &now);
spin_unlock(&iocg->ioc->lock);
blkg_conf_finish(&ctx);
@@ -2521,6 +3333,7 @@ static struct blkcg_policy blkcg_policy_iocost = {
.pd_alloc_fn = ioc_pd_alloc,
.pd_init_fn = ioc_pd_init,
.pd_free_fn = ioc_pd_free,
+ .pd_stat_fn = ioc_pd_stat,
};
static int __init ioc_init(void)
diff --git a/block/blk-map.c b/block/blk-map.c
index 6e804892d5ec..21630dccac62 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -12,7 +12,8 @@
#include "blk.h"
struct bio_map_data {
- int is_our_pages;
+ bool is_our_pages : 1;
+ bool is_null_mapped : 1;
struct iov_iter iter;
struct iovec iov[];
};
@@ -108,7 +109,7 @@ static int bio_uncopy_user(struct bio *bio)
struct bio_map_data *bmd = bio->bi_private;
int ret = 0;
- if (!bio_flagged(bio, BIO_NULL_MAPPED)) {
+ if (!bmd->is_null_mapped) {
/*
* if we're in a workqueue, the request is orphaned, so
* don't copy into a random user address space, just free
@@ -126,24 +127,12 @@ static int bio_uncopy_user(struct bio *bio)
return ret;
}
-/**
- * bio_copy_user_iov - copy user data to bio
- * @q: destination block queue
- * @map_data: pointer to the rq_map_data holding pages (if necessary)
- * @iter: iovec iterator
- * @gfp_mask: memory allocation flags
- *
- * Prepares and returns a bio for indirect user io, bouncing data
- * to/from kernel pages as necessary. Must be paired with
- * call bio_uncopy_user() on io completion.
- */
-static struct bio *bio_copy_user_iov(struct request_queue *q,
- struct rq_map_data *map_data, struct iov_iter *iter,
- gfp_t gfp_mask)
+static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data,
+ struct iov_iter *iter, gfp_t gfp_mask)
{
struct bio_map_data *bmd;
struct page *page;
- struct bio *bio;
+ struct bio *bio, *bounce_bio;
int i = 0, ret;
int nr_pages;
unsigned int len = iter->count;
@@ -151,14 +140,15 @@ static struct bio *bio_copy_user_iov(struct request_queue *q,
bmd = bio_alloc_map_data(iter, gfp_mask);
if (!bmd)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
/*
* We need to do a deep copy of the iov_iter including the iovecs.
* The caller provided iov might point to an on-stack or otherwise
* shortlived one.
*/
- bmd->is_our_pages = map_data ? 0 : 1;
+ bmd->is_our_pages = !map_data;
+ bmd->is_null_mapped = (map_data && map_data->null_mapped);
nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
if (nr_pages > BIO_MAX_PAGES)
@@ -168,8 +158,7 @@ static struct bio *bio_copy_user_iov(struct request_queue *q,
bio = bio_kmalloc(gfp_mask, nr_pages);
if (!bio)
goto out_bmd;
-
- ret = 0;
+ bio->bi_opf |= req_op(rq);
if (map_data) {
nr_pages = 1 << map_data->page_order;
@@ -186,7 +175,7 @@ static struct bio *bio_copy_user_iov(struct request_queue *q,
if (map_data) {
if (i == map_data->nr_entries * nr_pages) {
ret = -ENOMEM;
- break;
+ goto cleanup;
}
page = map_data->pages[i / nr_pages];
@@ -194,14 +183,14 @@ static struct bio *bio_copy_user_iov(struct request_queue *q,
i++;
} else {
- page = alloc_page(q->bounce_gfp | gfp_mask);
+ page = alloc_page(rq->q->bounce_gfp | gfp_mask);
if (!page) {
ret = -ENOMEM;
- break;
+ goto cleanup;
}
}
- if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) {
+ if (bio_add_pc_page(rq->q, bio, page, bytes, offset) < bytes) {
if (!map_data)
__free_page(page);
break;
@@ -211,9 +200,6 @@ static struct bio *bio_copy_user_iov(struct request_queue *q,
offset = 0;
}
- if (ret)
- goto cleanup;
-
if (map_data)
map_data->offset += bio->bi_iter.bi_size;
@@ -233,41 +219,42 @@ static struct bio *bio_copy_user_iov(struct request_queue *q,
}
bio->bi_private = bmd;
- if (map_data && map_data->null_mapped)
- bio_set_flag(bio, BIO_NULL_MAPPED);
- return bio;
+
+ bounce_bio = bio;
+ ret = blk_rq_append_bio(rq, &bounce_bio);
+ if (ret)
+ goto cleanup;
+
+ /*
+ * We link the bounce buffer in and could have to traverse it later, so
+ * we have to get a ref to prevent it from being freed
+ */
+ bio_get(bounce_bio);
+ return 0;
cleanup:
if (!map_data)
bio_free_pages(bio);
bio_put(bio);
out_bmd:
kfree(bmd);
- return ERR_PTR(ret);
+ return ret;
}
-/**
- * bio_map_user_iov - map user iovec into bio
- * @q: the struct request_queue for the bio
- * @iter: iovec iterator
- * @gfp_mask: memory allocation flags
- *
- * Map the user space address into a bio suitable for io to a block
- * device. Returns an error pointer in case of error.
- */
-static struct bio *bio_map_user_iov(struct request_queue *q,
- struct iov_iter *iter, gfp_t gfp_mask)
+static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
+ gfp_t gfp_mask)
{
- unsigned int max_sectors = queue_max_hw_sectors(q);
- int j;
- struct bio *bio;
+ unsigned int max_sectors = queue_max_hw_sectors(rq->q);
+ struct bio *bio, *bounce_bio;
int ret;
+ int j;
if (!iov_iter_count(iter))
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES));
if (!bio)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
+ bio->bi_opf |= req_op(rq);
while (iov_iter_count(iter)) {
struct page **pages;
@@ -283,7 +270,7 @@ static struct bio *bio_map_user_iov(struct request_queue *q,
npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE);
- if (unlikely(offs & queue_dma_alignment(q))) {
+ if (unlikely(offs & queue_dma_alignment(rq->q))) {
ret = -EINVAL;
j = 0;
} else {
@@ -295,7 +282,7 @@ static struct bio *bio_map_user_iov(struct request_queue *q,
if (n > bytes)
n = bytes;
- if (!bio_add_hw_page(q, bio, page, n, offs,
+ if (!bio_add_hw_page(rq->q, bio, page, n, offs,
max_sectors, &same_page)) {
if (same_page)
put_page(page);
@@ -319,21 +306,31 @@ static struct bio *bio_map_user_iov(struct request_queue *q,
break;
}
- bio_set_flag(bio, BIO_USER_MAPPED);
-
/*
- * subtle -- if bio_map_user_iov() ended up bouncing a bio,
- * it would normally disappear when its bi_end_io is run.
- * however, we need it for the unmap, so grab an extra
- * reference to it
+ * Subtle: if we end up needing to bounce a bio, it would normally
+ * disappear when its bi_end_io is run. However, we need the original
+ * bio for the unmap, so grab an extra reference to it
*/
bio_get(bio);
- return bio;
+ bounce_bio = bio;
+ ret = blk_rq_append_bio(rq, &bounce_bio);
+ if (ret)
+ goto out_put_orig;
+
+ /*
+ * We link the bounce buffer in and could have to traverse it
+ * later, so we have to get a ref to prevent it from being freed
+ */
+ bio_get(bounce_bio);
+ return 0;
+
+ out_put_orig:
+ bio_put(bio);
out_unmap:
bio_release_pages(bio, false);
bio_put(bio);
- return ERR_PTR(ret);
+ return ret;
}
/**
@@ -557,55 +554,6 @@ int blk_rq_append_bio(struct request *rq, struct bio **bio)
}
EXPORT_SYMBOL(blk_rq_append_bio);
-static int __blk_rq_unmap_user(struct bio *bio)
-{
- int ret = 0;
-
- if (bio) {
- if (bio_flagged(bio, BIO_USER_MAPPED))
- bio_unmap_user(bio);
- else
- ret = bio_uncopy_user(bio);
- }
-
- return ret;
-}
-
-static int __blk_rq_map_user_iov(struct request *rq,
- struct rq_map_data *map_data, struct iov_iter *iter,
- gfp_t gfp_mask, bool copy)
-{
- struct request_queue *q = rq->q;
- struct bio *bio, *orig_bio;
- int ret;
-
- if (copy)
- bio = bio_copy_user_iov(q, map_data, iter, gfp_mask);
- else
- bio = bio_map_user_iov(q, iter, gfp_mask);
-
- if (IS_ERR(bio))
- return PTR_ERR(bio);
-
- bio->bi_opf &= ~REQ_OP_MASK;
- bio->bi_opf |= req_op(rq);
-
- orig_bio = bio;
-
- /*
- * We link the bounce buffer in and could have to traverse it
- * later so we have to get a ref to prevent it from being freed
- */
- ret = blk_rq_append_bio(rq, &bio);
- if (ret) {
- __blk_rq_unmap_user(orig_bio);
- return ret;
- }
- bio_get(bio);
-
- return 0;
-}
-
/**
* blk_rq_map_user_iov - map user data to a request, for passthrough requests
* @q: request queue where request should be inserted
@@ -649,7 +597,10 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
i = *iter;
do {
- ret =__blk_rq_map_user_iov(rq, map_data, &i, gfp_mask, copy);
+ if (copy)
+ ret = bio_copy_user_iov(rq, map_data, &i, gfp_mask);
+ else
+ ret = bio_map_user_iov(rq, &i, gfp_mask);
if (ret)
goto unmap_rq;
if (!bio)
@@ -700,9 +651,13 @@ int blk_rq_unmap_user(struct bio *bio)
if (unlikely(bio_flagged(bio, BIO_BOUNCED)))
mapped_bio = bio->bi_private;
- ret2 = __blk_rq_unmap_user(mapped_bio);
- if (ret2 && !ret)
- ret = ret2;
+ if (bio->bi_private) {
+ ret2 = bio_uncopy_user(mapped_bio);
+ if (ret2 && !ret)
+ ret = ret2;
+ } else {
+ bio_unmap_user(mapped_bio);
+ }
mapped_bio = bio;
bio = bio->bi_next;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index f685d633bcc9..6ed715835d45 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -11,6 +11,7 @@
#include <trace/events/block.h>
#include "blk.h"
+#include "blk-rq-qos.h"
static inline bool bio_will_gap(struct request_queue *q,
struct request *prev_rq, struct bio *prev, struct bio *next)
@@ -895,3 +896,203 @@ enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
return ELEVATOR_FRONT_MERGE;
return ELEVATOR_NO_MERGE;
}
+
+static void blk_account_io_merge_bio(struct request *req)
+{
+ if (!blk_do_io_stat(req))
+ return;
+
+ part_stat_lock();
+ part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
+ part_stat_unlock();
+}
+
+enum bio_merge_status bio_attempt_back_merge(struct request *req,
+ struct bio *bio,
+ unsigned int nr_segs)
+{
+ const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
+
+ if (!ll_back_merge_fn(req, bio, nr_segs))
+ return BIO_MERGE_FAILED;
+
+ trace_block_bio_backmerge(req->q, req, bio);
+ rq_qos_merge(req->q, req, bio);
+
+ if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
+ blk_rq_set_mixed_merge(req);
+
+ req->biotail->bi_next = bio;
+ req->biotail = bio;
+ req->__data_len += bio->bi_iter.bi_size;
+
+ bio_crypt_free_ctx(bio);
+
+ blk_account_io_merge_bio(req);
+ return BIO_MERGE_OK;
+}
+
+enum bio_merge_status bio_attempt_front_merge(struct request *req,
+ struct bio *bio,
+ unsigned int nr_segs)
+{
+ const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
+
+ if (!ll_front_merge_fn(req, bio, nr_segs))
+ return BIO_MERGE_FAILED;
+
+ trace_block_bio_frontmerge(req->q, req, bio);
+ rq_qos_merge(req->q, req, bio);
+
+ if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
+ blk_rq_set_mixed_merge(req);
+
+ bio->bi_next = req->bio;
+ req->bio = bio;
+
+ req->__sector = bio->bi_iter.bi_sector;
+ req->__data_len += bio->bi_iter.bi_size;
+
+ bio_crypt_do_front_merge(req, bio);
+
+ blk_account_io_merge_bio(req);
+ return BIO_MERGE_OK;
+}
+
+enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q,
+ struct request *req,
+ struct bio *bio)
+{
+ unsigned short segments = blk_rq_nr_discard_segments(req);
+
+ if (segments >= queue_max_discard_segments(q))
+ goto no_merge;
+ if (blk_rq_sectors(req) + bio_sectors(bio) >
+ blk_rq_get_max_sectors(req, blk_rq_pos(req)))
+ goto no_merge;
+
+ rq_qos_merge(q, req, bio);
+
+ req->biotail->bi_next = bio;
+ req->biotail = bio;
+ req->__data_len += bio->bi_iter.bi_size;
+ req->nr_phys_segments = segments + 1;
+
+ blk_account_io_merge_bio(req);
+ return BIO_MERGE_OK;
+no_merge:
+ req_set_nomerge(q, req);
+ return BIO_MERGE_FAILED;
+}
+
+static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
+ struct request *rq,
+ struct bio *bio,
+ unsigned int nr_segs,
+ bool sched_allow_merge)
+{
+ if (!blk_rq_merge_ok(rq, bio))
+ return BIO_MERGE_NONE;
+
+ switch (blk_try_merge(rq, bio)) {
+ case ELEVATOR_BACK_MERGE:
+ if (!sched_allow_merge || blk_mq_sched_allow_merge(q, rq, bio))
+ return bio_attempt_back_merge(rq, bio, nr_segs);
+ break;
+ case ELEVATOR_FRONT_MERGE:
+ if (!sched_allow_merge || blk_mq_sched_allow_merge(q, rq, bio))
+ return bio_attempt_front_merge(rq, bio, nr_segs);
+ break;
+ case ELEVATOR_DISCARD_MERGE:
+ return bio_attempt_discard_merge(q, rq, bio);
+ default:
+ return BIO_MERGE_NONE;
+ }
+
+ return BIO_MERGE_FAILED;
+}
+
+/**
+ * blk_attempt_plug_merge - try to merge with %current's plugged list
+ * @q: request_queue new bio is being queued at
+ * @bio: new bio being queued
+ * @nr_segs: number of segments in @bio
+ * @same_queue_rq: pointer to &struct request that gets filled in when
+ * another request associated with @q is found on the plug list
+ * (optional, may be %NULL)
+ *
+ * Determine whether @bio being queued on @q can be merged with a request
+ * on %current's plugged list. Returns %true if merge was successful,
+ * otherwise %false.
+ *
+ * Plugging coalesces IOs from the same issuer for the same purpose without
+ * going through @q->queue_lock. As such it's more of an issuing mechanism
+ * than scheduling, and the request, while may have elvpriv data, is not
+ * added on the elevator at this point. In addition, we don't have
+ * reliable access to the elevator outside queue lock. Only check basic
+ * merging parameters without querying the elevator.
+ *
+ * Caller must ensure !blk_queue_nomerges(q) beforehand.
+ */
+bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
+ unsigned int nr_segs, struct request **same_queue_rq)
+{
+ struct blk_plug *plug;
+ struct request *rq;
+ struct list_head *plug_list;
+
+ plug = blk_mq_plug(q, bio);
+ if (!plug)
+ return false;
+
+ plug_list = &plug->mq_list;
+
+ list_for_each_entry_reverse(rq, plug_list, queuelist) {
+ if (rq->q == q && same_queue_rq) {
+ /*
+ * Only blk-mq multiple hardware queues case checks the
+ * rq in the same queue, there should be only one such
+ * rq in a queue
+ **/
+ *same_queue_rq = rq;
+ }
+
+ if (rq->q != q)
+ continue;
+
+ if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
+ BIO_MERGE_OK)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Iterate list of requests and see if we can merge this bio with any
+ * of them.
+ */
+bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
+ struct bio *bio, unsigned int nr_segs)
+{
+ struct request *rq;
+ int checked = 8;
+
+ list_for_each_entry_reverse(rq, list, queuelist) {
+ if (!checked--)
+ break;
+
+ switch (blk_attempt_bio_merge(q, rq, bio, nr_segs, true)) {
+ case BIO_MERGE_NONE:
+ continue;
+ case BIO_MERGE_OK:
+ return true;
+ case BIO_MERGE_FAILED:
+ return false;
+ }
+
+ }
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(blk_bio_list_merge);
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 3f09bcb8a6fd..3094542e12ae 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -116,6 +116,7 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(SAME_FORCE),
QUEUE_FLAG_NAME(DEAD),
QUEUE_FLAG_NAME(INIT_DONE),
+ QUEUE_FLAG_NAME(STABLE_WRITES),
QUEUE_FLAG_NAME(POLL),
QUEUE_FLAG_NAME(WC),
QUEUE_FLAG_NAME(FUA),
@@ -240,7 +241,7 @@ static const char *const alloc_policy_name[] = {
#define HCTX_FLAG_NAME(name) [ilog2(BLK_MQ_F_##name)] = #name
static const char *const hctx_flag_name[] = {
HCTX_FLAG_NAME(SHOULD_MERGE),
- HCTX_FLAG_NAME(TAG_SHARED),
+ HCTX_FLAG_NAME(TAG_QUEUE_SHARED),
HCTX_FLAG_NAME(BLOCKING),
HCTX_FLAG_NAME(NO_SCHED),
HCTX_FLAG_NAME(STACKING),
@@ -452,11 +453,11 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m,
atomic_read(&tags->active_queues));
seq_puts(m, "\nbitmap_tags:\n");
- sbitmap_queue_show(&tags->bitmap_tags, m);
+ sbitmap_queue_show(tags->bitmap_tags, m);
if (tags->nr_reserved_tags) {
seq_puts(m, "\nbreserved_tags:\n");
- sbitmap_queue_show(&tags->breserved_tags, m);
+ sbitmap_queue_show(tags->breserved_tags, m);
}
}
@@ -487,7 +488,7 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m)
if (res)
goto out;
if (hctx->tags)
- sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m);
+ sbitmap_bitmap_show(&hctx->tags->bitmap_tags->sb, m);
mutex_unlock(&q->sysfs_lock);
out:
@@ -521,7 +522,7 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m)
if (res)
goto out;
if (hctx->sched_tags)
- sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m);
+ sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags->sb, m);
mutex_unlock(&q->sysfs_lock);
out:
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index d2790e5b06d1..3e9596738852 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -18,21 +18,6 @@
#include "blk-mq-tag.h"
#include "blk-wbt.h"
-void blk_mq_sched_free_hctx_data(struct request_queue *q,
- void (*exit)(struct blk_mq_hw_ctx *))
-{
- struct blk_mq_hw_ctx *hctx;
- int i;
-
- queue_for_each_hw_ctx(q, hctx, i) {
- if (exit && hctx->sched_data)
- exit(hctx);
- kfree(hctx->sched_data);
- hctx->sched_data = NULL;
- }
-}
-EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
-
void blk_mq_sched_assign_ioc(struct request *rq)
{
struct request_queue *q = rq->q;
@@ -368,7 +353,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
case ELEVATOR_BACK_MERGE:
if (!blk_mq_sched_allow_merge(q, rq, bio))
return false;
- if (!bio_attempt_back_merge(rq, bio, nr_segs))
+ if (bio_attempt_back_merge(rq, bio, nr_segs) != BIO_MERGE_OK)
return false;
*merged_request = attempt_back_merge(q, rq);
if (!*merged_request)
@@ -377,86 +362,20 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
case ELEVATOR_FRONT_MERGE:
if (!blk_mq_sched_allow_merge(q, rq, bio))
return false;
- if (!bio_attempt_front_merge(rq, bio, nr_segs))
+ if (bio_attempt_front_merge(rq, bio, nr_segs) != BIO_MERGE_OK)
return false;
*merged_request = attempt_front_merge(q, rq);
if (!*merged_request)
elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
return true;
case ELEVATOR_DISCARD_MERGE:
- return bio_attempt_discard_merge(q, rq, bio);
+ return bio_attempt_discard_merge(q, rq, bio) == BIO_MERGE_OK;
default:
return false;
}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
-/*
- * Iterate list of requests and see if we can merge this bio with any
- * of them.
- */
-bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
- struct bio *bio, unsigned int nr_segs)
-{
- struct request *rq;
- int checked = 8;
-
- list_for_each_entry_reverse(rq, list, queuelist) {
- bool merged = false;
-
- if (!checked--)
- break;
-
- if (!blk_rq_merge_ok(rq, bio))
- continue;
-
- switch (blk_try_merge(rq, bio)) {
- case ELEVATOR_BACK_MERGE:
- if (blk_mq_sched_allow_merge(q, rq, bio))
- merged = bio_attempt_back_merge(rq, bio,
- nr_segs);
- break;
- case ELEVATOR_FRONT_MERGE:
- if (blk_mq_sched_allow_merge(q, rq, bio))
- merged = bio_attempt_front_merge(rq, bio,
- nr_segs);
- break;
- case ELEVATOR_DISCARD_MERGE:
- merged = bio_attempt_discard_merge(q, rq, bio);
- break;
- default:
- continue;
- }
-
- return merged;
- }
-
- return false;
-}
-EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge);
-
-/*
- * Reverse check our software queue for entries that we could potentially
- * merge with. Currently includes a hand-wavy stop count of 8, to not spend
- * too much time checking for merges.
- */
-static bool blk_mq_attempt_merge(struct request_queue *q,
- struct blk_mq_hw_ctx *hctx,
- struct blk_mq_ctx *ctx, struct bio *bio,
- unsigned int nr_segs)
-{
- enum hctx_type type = hctx->type;
-
- lockdep_assert_held(&ctx->lock);
-
- if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) {
- ctx->rq_merged++;
- return true;
- }
-
- return false;
-}
-
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs)
{
@@ -470,14 +389,24 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
return e->type->ops.bio_merge(hctx, bio, nr_segs);
type = hctx->type;
- if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
- !list_empty_careful(&ctx->rq_lists[type])) {
- /* default per sw-queue merge */
- spin_lock(&ctx->lock);
- ret = blk_mq_attempt_merge(q, hctx, ctx, bio, nr_segs);
- spin_unlock(&ctx->lock);
+ if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) ||
+ list_empty_careful(&ctx->rq_lists[type]))
+ return false;
+
+ /* default per sw-queue merge */
+ spin_lock(&ctx->lock);
+ /*
+ * Reverse check our software queue for entries that we could
+ * potentially merge with. Currently includes a hand-wavy stop
+ * count of 8, to not spend too much time checking for merges.
+ */
+ if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) {
+ ctx->rq_merged++;
+ ret = true;
}
+ spin_unlock(&ctx->lock);
+
return ret;
}
@@ -531,7 +460,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head,
goto run;
}
- WARN_ON(e && (rq->tag != -1));
+ WARN_ON(e && (rq->tag != BLK_MQ_NO_TAG));
if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) {
/*
@@ -616,9 +545,11 @@ static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx,
unsigned int hctx_idx)
{
+ unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
+
if (hctx->sched_tags) {
blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
- blk_mq_free_rq_map(hctx->sched_tags);
+ blk_mq_free_rq_map(hctx->sched_tags, flags);
hctx->sched_tags = NULL;
}
}
@@ -628,10 +559,12 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q,
unsigned int hctx_idx)
{
struct blk_mq_tag_set *set = q->tag_set;
+ /* Clear HCTX_SHARED so tags are init'ed */
+ unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
int ret;
hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
- set->reserved_tags);
+ set->reserved_tags, flags);
if (!hctx->sched_tags)
return -ENOMEM;
@@ -649,8 +582,11 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q)
int i;
queue_for_each_hw_ctx(q, hctx, i) {
+ /* Clear HCTX_SHARED so tags are freed */
+ unsigned int flags = hctx->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
+
if (hctx->sched_tags) {
- blk_mq_free_rq_map(hctx->sched_tags);
+ blk_mq_free_rq_map(hctx->sched_tags, flags);
hctx->sched_tags = NULL;
}
}
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index e81ca1bf6e10..0476360f05f1 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -5,9 +5,6 @@
#include "blk-mq.h"
#include "blk-mq-tag.h"
-void blk_mq_sched_free_hctx_data(struct request_queue *q,
- void (*exit)(struct blk_mq_hw_ctx *));
-
void blk_mq_sched_assign_ioc(struct request *rq);
void blk_mq_sched_request_inserted(struct request *rq);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 32d82e23b095..aacf10decdbd 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -23,9 +23,18 @@
*/
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
- if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
- !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
- atomic_inc(&hctx->tags->active_queues);
+ if (blk_mq_is_sbitmap_shared(hctx->flags)) {
+ struct request_queue *q = hctx->queue;
+ struct blk_mq_tag_set *set = q->tag_set;
+
+ if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
+ !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
+ atomic_inc(&set->active_queues_shared_sbitmap);
+ } else {
+ if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
+ !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+ atomic_inc(&hctx->tags->active_queues);
+ }
return true;
}
@@ -35,9 +44,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
*/
void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
{
- sbitmap_queue_wake_all(&tags->bitmap_tags);
+ sbitmap_queue_wake_all(tags->bitmap_tags);
if (include_reserve)
- sbitmap_queue_wake_all(&tags->breserved_tags);
+ sbitmap_queue_wake_all(tags->breserved_tags);
}
/*
@@ -47,11 +56,19 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
struct blk_mq_tags *tags = hctx->tags;
-
- if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
- return;
-
- atomic_dec(&tags->active_queues);
+ struct request_queue *q = hctx->queue;
+ struct blk_mq_tag_set *set = q->tag_set;
+
+ if (blk_mq_is_sbitmap_shared(hctx->flags)) {
+ if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,
+ &q->queue_flags))
+ return;
+ atomic_dec(&set->active_queues_shared_sbitmap);
+ } else {
+ if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+ return;
+ atomic_dec(&tags->active_queues);
+ }
blk_mq_tag_wakeup_all(tags, false);
}
@@ -59,7 +76,8 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
struct sbitmap_queue *bt)
{
- if (!data->q->elevator && !hctx_may_queue(data->hctx, bt))
+ if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) &&
+ !hctx_may_queue(data->hctx, bt))
return BLK_MQ_NO_TAG;
if (data->shallow_depth)
@@ -82,10 +100,10 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
WARN_ON_ONCE(1);
return BLK_MQ_NO_TAG;
}
- bt = &tags->breserved_tags;
+ bt = tags->breserved_tags;
tag_offset = 0;
} else {
- bt = &tags->bitmap_tags;
+ bt = tags->bitmap_tags;
tag_offset = tags->nr_reserved_tags;
}
@@ -131,9 +149,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
data->ctx);
tags = blk_mq_tags_from_data(data);
if (data->flags & BLK_MQ_REQ_RESERVED)
- bt = &tags->breserved_tags;
+ bt = tags->breserved_tags;
else
- bt = &tags->bitmap_tags;
+ bt = tags->bitmap_tags;
/*
* If destination hw queue is changed, fake wake up on
@@ -167,10 +185,10 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
const int real_tag = tag - tags->nr_reserved_tags;
BUG_ON(real_tag >= tags->nr_tags);
- sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu);
+ sbitmap_queue_clear(tags->bitmap_tags, real_tag, ctx->cpu);
} else {
BUG_ON(tag >= tags->nr_reserved_tags);
- sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu);
+ sbitmap_queue_clear(tags->breserved_tags, tag, ctx->cpu);
}
}
@@ -197,7 +215,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
* We can hit rq == NULL here, because the tagging functions
* test and set the bit before assigning ->rqs[].
*/
- if (rq && rq->q == hctx->queue)
+ if (rq && rq->q == hctx->queue && rq->mq_hctx == hctx)
return iter_data->fn(hctx, rq, iter_data->data, reserved);
return true;
}
@@ -298,9 +316,9 @@ static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED);
if (tags->nr_reserved_tags)
- bt_tags_for_each(tags, &tags->breserved_tags, fn, priv,
+ bt_tags_for_each(tags, tags->breserved_tags, fn, priv,
flags | BT_TAG_ITER_RESERVED);
- bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, flags);
+ bt_tags_for_each(tags, tags->bitmap_tags, fn, priv, flags);
}
/**
@@ -416,8 +434,8 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
continue;
if (tags->nr_reserved_tags)
- bt_for_each(hctx, &tags->breserved_tags, fn, priv, true);
- bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false);
+ bt_for_each(hctx, tags->breserved_tags, fn, priv, true);
+ bt_for_each(hctx, tags->bitmap_tags, fn, priv, false);
}
blk_queue_exit(q);
}
@@ -429,30 +447,64 @@ static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
node);
}
-static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
- int node, int alloc_policy)
+static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
+ int node, int alloc_policy)
{
unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;
- if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node))
- goto free_tags;
- if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, round_robin,
- node))
+ if (bt_alloc(&tags->__bitmap_tags, depth, round_robin, node))
+ return -ENOMEM;
+ if (bt_alloc(&tags->__breserved_tags, tags->nr_reserved_tags,
+ round_robin, node))
goto free_bitmap_tags;
- return tags;
+ tags->bitmap_tags = &tags->__bitmap_tags;
+ tags->breserved_tags = &tags->__breserved_tags;
+
+ return 0;
free_bitmap_tags:
- sbitmap_queue_free(&tags->bitmap_tags);
-free_tags:
- kfree(tags);
- return NULL;
+ sbitmap_queue_free(&tags->__bitmap_tags);
+ return -ENOMEM;
+}
+
+int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags)
+{
+ unsigned int depth = set->queue_depth - set->reserved_tags;
+ int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
+ bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;
+ int i, node = set->numa_node;
+
+ if (bt_alloc(&set->__bitmap_tags, depth, round_robin, node))
+ return -ENOMEM;
+ if (bt_alloc(&set->__breserved_tags, set->reserved_tags,
+ round_robin, node))
+ goto free_bitmap_tags;
+
+ for (i = 0; i < set->nr_hw_queues; i++) {
+ struct blk_mq_tags *tags = set->tags[i];
+
+ tags->bitmap_tags = &set->__bitmap_tags;
+ tags->breserved_tags = &set->__breserved_tags;
+ }
+
+ return 0;
+free_bitmap_tags:
+ sbitmap_queue_free(&set->__bitmap_tags);
+ return -ENOMEM;
+}
+
+void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set)
+{
+ sbitmap_queue_free(&set->__bitmap_tags);
+ sbitmap_queue_free(&set->__breserved_tags);
}
struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
unsigned int reserved_tags,
- int node, int alloc_policy)
+ int node, unsigned int flags)
{
+ int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags);
struct blk_mq_tags *tags;
if (total_tags > BLK_MQ_TAG_MAX) {
@@ -467,13 +519,22 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
tags->nr_tags = total_tags;
tags->nr_reserved_tags = reserved_tags;
- return blk_mq_init_bitmap_tags(tags, node, alloc_policy);
+ if (flags & BLK_MQ_F_TAG_HCTX_SHARED)
+ return tags;
+
+ if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) {
+ kfree(tags);
+ return NULL;
+ }
+ return tags;
}
-void blk_mq_free_tags(struct blk_mq_tags *tags)
+void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags)
{
- sbitmap_queue_free(&tags->bitmap_tags);
- sbitmap_queue_free(&tags->breserved_tags);
+ if (!(flags & BLK_MQ_F_TAG_HCTX_SHARED)) {
+ sbitmap_queue_free(tags->bitmap_tags);
+ sbitmap_queue_free(tags->breserved_tags);
+ }
kfree(tags);
}
@@ -492,6 +553,8 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
*/
if (tdepth > tags->nr_tags) {
struct blk_mq_tag_set *set = hctx->queue->tag_set;
+ /* Only sched tags can grow, so clear HCTX_SHARED flag */
+ unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
struct blk_mq_tags *new;
bool ret;
@@ -506,30 +569,35 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
return -EINVAL;
new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
- tags->nr_reserved_tags);
+ tags->nr_reserved_tags, flags);
if (!new)
return -ENOMEM;
ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
if (ret) {
- blk_mq_free_rq_map(new);
+ blk_mq_free_rq_map(new, flags);
return -ENOMEM;
}
blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
- blk_mq_free_rq_map(*tagsptr);
+ blk_mq_free_rq_map(*tagsptr, flags);
*tagsptr = new;
} else {
/*
* Don't need (or can't) update reserved tags here, they
* remain static and should never need resizing.
*/
- sbitmap_queue_resize(&tags->bitmap_tags,
+ sbitmap_queue_resize(tags->bitmap_tags,
tdepth - tags->nr_reserved_tags);
}
return 0;
}
+void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size)
+{
+ sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags);
+}
+
/**
* blk_mq_unique_tag() - return a tag that is unique queue-wide
* @rq: request for which to compute a unique tag
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index b1acac518c4e..7d3e6b333a4a 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -2,8 +2,6 @@
#ifndef INT_BLK_MQ_TAG_H
#define INT_BLK_MQ_TAG_H
-#include "blk-mq.h"
-
/*
* Tag address space map.
*/
@@ -13,17 +11,25 @@ struct blk_mq_tags {
atomic_t active_queues;
- struct sbitmap_queue bitmap_tags;
- struct sbitmap_queue breserved_tags;
+ struct sbitmap_queue *bitmap_tags;
+ struct sbitmap_queue *breserved_tags;
+
+ struct sbitmap_queue __bitmap_tags;
+ struct sbitmap_queue __breserved_tags;
struct request **rqs;
struct request **static_rqs;
struct list_head page_list;
};
+extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
+ unsigned int reserved_tags,
+ int node, unsigned int flags);
+extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags);
-extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, int alloc_policy);
-extern void blk_mq_free_tags(struct blk_mq_tags *tags);
+extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set,
+ unsigned int flags);
+extern void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set);
extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
@@ -31,6 +37,9 @@ extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
struct blk_mq_tags **tags,
unsigned int depth, bool can_grow);
+extern void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set,
+ unsigned int size);
+
extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
void *priv);
@@ -56,7 +65,7 @@ extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
- if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
+ if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
return false;
return __blk_mq_tag_busy(hctx);
@@ -64,43 +73,12 @@ static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
- if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
+ if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
return;
__blk_mq_tag_idle(hctx);
}
-/*
- * For shared tag users, we track the number of currently active users
- * and attempt to provide a fair share of the tag depth for each of them.
- */
-static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
- struct sbitmap_queue *bt)
-{
- unsigned int depth, users;
-
- if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
- return true;
- if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
- return true;
-
- /*
- * Don't try dividing an ant
- */
- if (bt->sb.depth == 1)
- return true;
-
- users = atomic_read(&hctx->tags->active_queues);
- if (!users)
- return true;
-
- /*
- * Allow at least some tags
- */
- depth = max((bt->sb.depth + users - 1) / users, 4U);
- return atomic_read(&hctx->nr_active) < depth;
-}
-
static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
unsigned int tag)
{
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b3d2785eefe9..e04b759add75 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -519,7 +519,7 @@ void blk_mq_free_request(struct request *rq)
ctx->rq_completed[rq_is_sync(rq)]++;
if (rq->rq_flags & RQF_MQ_INFLIGHT)
- atomic_dec(&hctx->nr_active);
+ __blk_mq_dec_active_requests(hctx);
if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
laptop_io_completion(q->backing_dev_info);
@@ -1096,19 +1096,20 @@ static inline unsigned int queued_to_index(unsigned int queued)
static bool __blk_mq_get_driver_tag(struct request *rq)
{
- struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
+ struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags;
unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
int tag;
blk_mq_tag_busy(rq->mq_hctx);
if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
- bt = &rq->mq_hctx->tags->breserved_tags;
+ bt = rq->mq_hctx->tags->breserved_tags;
tag_offset = 0;
+ } else {
+ if (!hctx_may_queue(rq->mq_hctx, bt))
+ return false;
}
- if (!hctx_may_queue(rq->mq_hctx, bt))
- return false;
tag = __sbitmap_queue_get(bt);
if (tag == BLK_MQ_NO_TAG)
return false;
@@ -1124,10 +1125,10 @@ static bool blk_mq_get_driver_tag(struct request *rq)
if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
return false;
- if ((hctx->flags & BLK_MQ_F_TAG_SHARED) &&
+ if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
!(rq->rq_flags & RQF_MQ_INFLIGHT)) {
rq->rq_flags |= RQF_MQ_INFLIGHT;
- atomic_inc(&hctx->nr_active);
+ __blk_mq_inc_active_requests(hctx);
}
hctx->tags->rqs[rq->tag] = rq;
return true;
@@ -1145,7 +1146,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
struct sbitmap_queue *sbq;
list_del_init(&wait->entry);
- sbq = &hctx->tags->bitmap_tags;
+ sbq = hctx->tags->bitmap_tags;
atomic_dec(&sbq->ws_active);
}
spin_unlock(&hctx->dispatch_wait_lock);
@@ -1163,12 +1164,12 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
- struct sbitmap_queue *sbq = &hctx->tags->bitmap_tags;
+ struct sbitmap_queue *sbq = hctx->tags->bitmap_tags;
struct wait_queue_head *wq;
wait_queue_entry_t *wait;
bool ret;
- if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) {
+ if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
blk_mq_sched_mark_restart_hctx(hctx);
/*
@@ -1420,7 +1421,7 @@ out:
bool needs_restart;
/* For non-shared tags, the RESTART check will suffice */
bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
- (hctx->flags & BLK_MQ_F_TAG_SHARED);
+ (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED);
bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET;
blk_mq_release_budgets(q, nr_budgets);
@@ -2296,20 +2297,21 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
}
}
-void blk_mq_free_rq_map(struct blk_mq_tags *tags)
+void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags)
{
kfree(tags->rqs);
tags->rqs = NULL;
kfree(tags->static_rqs);
tags->static_rqs = NULL;
- blk_mq_free_tags(tags);
+ blk_mq_free_tags(tags, flags);
}
struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
unsigned int hctx_idx,
unsigned int nr_tags,
- unsigned int reserved_tags)
+ unsigned int reserved_tags,
+ unsigned int flags)
{
struct blk_mq_tags *tags;
int node;
@@ -2318,8 +2320,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
if (node == NUMA_NO_NODE)
node = set->numa_node;
- tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
- BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
+ tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags);
if (!tags)
return NULL;
@@ -2327,7 +2328,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
node);
if (!tags->rqs) {
- blk_mq_free_tags(tags);
+ blk_mq_free_tags(tags, flags);
return NULL;
}
@@ -2336,7 +2337,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
node);
if (!tags->static_rqs) {
kfree(tags->rqs);
- blk_mq_free_tags(tags);
+ blk_mq_free_tags(tags, flags);
return NULL;
}
@@ -2660,6 +2661,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
goto free_hctx;
atomic_set(&hctx->nr_active, 0);
+ atomic_set(&hctx->elevator_queued, 0);
if (node == NUMA_NO_NODE)
node = set->numa_node;
hctx->numa_node = node;
@@ -2668,7 +2670,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
spin_lock_init(&hctx->lock);
INIT_LIST_HEAD(&hctx->dispatch);
hctx->queue = q;
- hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
+ hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
INIT_LIST_HEAD(&hctx->hctx_list);
@@ -2745,10 +2747,11 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
int hctx_idx)
{
+ unsigned int flags = set->flags;
int ret = 0;
set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
- set->queue_depth, set->reserved_tags);
+ set->queue_depth, set->reserved_tags, flags);
if (!set->tags[hctx_idx])
return false;
@@ -2757,7 +2760,7 @@ static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
if (!ret)
return true;
- blk_mq_free_rq_map(set->tags[hctx_idx]);
+ blk_mq_free_rq_map(set->tags[hctx_idx], flags);
set->tags[hctx_idx] = NULL;
return false;
}
@@ -2765,9 +2768,11 @@ static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
unsigned int hctx_idx)
{
+ unsigned int flags = set->flags;
+
if (set->tags && set->tags[hctx_idx]) {
blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
- blk_mq_free_rq_map(set->tags[hctx_idx]);
+ blk_mq_free_rq_map(set->tags[hctx_idx], flags);
set->tags[hctx_idx] = NULL;
}
}
@@ -2885,14 +2890,14 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared)
queue_for_each_hw_ctx(q, hctx, i) {
if (shared)
- hctx->flags |= BLK_MQ_F_TAG_SHARED;
+ hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
else
- hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
+ hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
}
}
-static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
- bool shared)
+static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
+ bool shared)
{
struct request_queue *q;
@@ -2913,9 +2918,9 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
list_del(&q->tag_set_list);
if (list_is_singular(&set->tag_list)) {
/* just transitioned to unshared */
- set->flags &= ~BLK_MQ_F_TAG_SHARED;
+ set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
/* update existing queue */
- blk_mq_update_tag_set_depth(set, false);
+ blk_mq_update_tag_set_shared(set, false);
}
mutex_unlock(&set->tag_list_lock);
INIT_LIST_HEAD(&q->tag_set_list);
@@ -2930,12 +2935,12 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
* Check to see if we're transitioning to shared (from 1 to 2 queues).
*/
if (!list_empty(&set->tag_list) &&
- !(set->flags & BLK_MQ_F_TAG_SHARED)) {
- set->flags |= BLK_MQ_F_TAG_SHARED;
+ !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
+ set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
/* update existing queue */
- blk_mq_update_tag_set_depth(set, true);
+ blk_mq_update_tag_set_shared(set, true);
}
- if (set->flags & BLK_MQ_F_TAG_SHARED)
+ if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
queue_set_hctx_shared(q, true);
list_add_tail(&q->tag_set_list, &set->tag_list);
@@ -3438,11 +3443,23 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (ret)
goto out_free_mq_map;
+ if (blk_mq_is_sbitmap_shared(set->flags)) {
+ atomic_set(&set->active_queues_shared_sbitmap, 0);
+
+ if (blk_mq_init_shared_sbitmap(set, set->flags)) {
+ ret = -ENOMEM;
+ goto out_free_mq_rq_maps;
+ }
+ }
+
mutex_init(&set->tag_list_lock);
INIT_LIST_HEAD(&set->tag_list);
return 0;
+out_free_mq_rq_maps:
+ for (i = 0; i < set->nr_hw_queues; i++)
+ blk_mq_free_map_and_requests(set, i);
out_free_mq_map:
for (i = 0; i < set->nr_maps; i++) {
kfree(set->map[i].mq_map);
@@ -3461,6 +3478,9 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
for (i = 0; i < set->nr_hw_queues; i++)
blk_mq_free_map_and_requests(set, i);
+ if (blk_mq_is_sbitmap_shared(set->flags))
+ blk_mq_exit_shared_sbitmap(set);
+
for (j = 0; j < set->nr_maps; j++) {
kfree(set->map[j].mq_map);
set->map[j].mq_map = NULL;
@@ -3497,6 +3517,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
if (!hctx->sched_tags) {
ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
false);
+ if (!ret && blk_mq_is_sbitmap_shared(set->flags))
+ blk_mq_tag_resize_shared_sbitmap(set, nr);
} else {
ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
nr, true);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 863a2f3346d4..a52703c98b77 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -53,11 +53,12 @@ struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
*/
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx);
-void blk_mq_free_rq_map(struct blk_mq_tags *tags);
+void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags);
struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
unsigned int hctx_idx,
unsigned int nr_tags,
- unsigned int reserved_tags);
+ unsigned int reserved_tags,
+ unsigned int flags);
int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx, unsigned int depth);
@@ -158,6 +159,11 @@ struct blk_mq_alloc_data {
struct blk_mq_hw_ctx *hctx;
};
+static inline bool blk_mq_is_sbitmap_shared(unsigned int flags)
+{
+ return flags & BLK_MQ_F_TAG_HCTX_SHARED;
+}
+
static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
{
if (data->q->elevator)
@@ -193,6 +199,28 @@ static inline bool blk_mq_get_dispatch_budget(struct request_queue *q)
return true;
}
+static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
+{
+ if (blk_mq_is_sbitmap_shared(hctx->flags))
+ atomic_inc(&hctx->queue->nr_active_requests_shared_sbitmap);
+ else
+ atomic_inc(&hctx->nr_active);
+}
+
+static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
+{
+ if (blk_mq_is_sbitmap_shared(hctx->flags))
+ atomic_dec(&hctx->queue->nr_active_requests_shared_sbitmap);
+ else
+ atomic_dec(&hctx->nr_active);
+}
+
+static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx)
+{
+ if (blk_mq_is_sbitmap_shared(hctx->flags))
+ return atomic_read(&hctx->queue->nr_active_requests_shared_sbitmap);
+ return atomic_read(&hctx->nr_active);
+}
static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
@@ -201,7 +229,7 @@ static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
if (rq->rq_flags & RQF_MQ_INFLIGHT) {
rq->rq_flags &= ~RQF_MQ_INFLIGHT;
- atomic_dec(&hctx->nr_active);
+ __blk_mq_dec_active_requests(hctx);
}
}
@@ -253,4 +281,46 @@ static inline struct blk_plug *blk_mq_plug(struct request_queue *q,
return NULL;
}
+/*
+ * For shared tag users, we track the number of currently active users
+ * and attempt to provide a fair share of the tag depth for each of them.
+ */
+static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
+ struct sbitmap_queue *bt)
+{
+ unsigned int depth, users;
+
+ if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
+ return true;
+
+ /*
+ * Don't try dividing an ant
+ */
+ if (bt->sb.depth == 1)
+ return true;
+
+ if (blk_mq_is_sbitmap_shared(hctx->flags)) {
+ struct request_queue *q = hctx->queue;
+ struct blk_mq_tag_set *set = q->tag_set;
+
+ if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &q->queue_flags))
+ return true;
+ users = atomic_read(&set->active_queues_shared_sbitmap);
+ } else {
+ if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+ return true;
+ users = atomic_read(&hctx->tags->active_queues);
+ }
+
+ if (!users)
+ return true;
+
+ /*
+ * Allow at least some tags
+ */
+ depth = max((bt->sb.depth + users - 1) / users, 4U);
+ return __blk_mq_active_requests(hctx) < depth;
+}
+
+
#endif
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 76a7e03bcd6c..4f6eb4bb1723 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -172,15 +172,13 @@ EXPORT_SYMBOL(blk_queue_max_hw_sectors);
*
* Description:
* If a driver doesn't want IOs to cross a given chunk size, it can set
- * this limit and prevent merging across chunks. Note that the chunk size
- * must currently be a power-of-2 in sectors. Also note that the block
- * layer must accept a page worth of data at any offset. So if the
- * crossing of chunks is a hard limitation in the driver, it must still be
- * prepared to split single page bios.
+ * this limit and prevent merging across chunks. Note that the block layer
+ * must accept a page worth of data at any offset. So if the crossing of
+ * chunks is a hard limitation in the driver, it must still be prepared
+ * to split single page bios.
**/
void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors)
{
- BUG_ON(!is_power_of_2(chunk_sectors));
q->limits.chunk_sectors = chunk_sectors;
}
EXPORT_SYMBOL(blk_queue_chunk_sectors);
@@ -374,6 +372,19 @@ void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset)
}
EXPORT_SYMBOL(blk_queue_alignment_offset);
+void blk_queue_update_readahead(struct request_queue *q)
+{
+ /*
+ * For read-ahead of large files to be effective, we need to read ahead
+ * at least twice the optimal I/O size.
+ */
+ q->backing_dev_info->ra_pages =
+ max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
+ q->backing_dev_info->io_pages =
+ queue_max_sectors(q) >> (PAGE_SHIFT - 9);
+}
+EXPORT_SYMBOL_GPL(blk_queue_update_readahead);
+
/**
* blk_limits_io_min - set minimum request size for a device
* @limits: the queue limits
@@ -452,6 +463,8 @@ EXPORT_SYMBOL(blk_limits_io_opt);
void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
{
blk_limits_io_opt(&q->limits, opt);
+ q->backing_dev_info->ra_pages =
+ max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
}
EXPORT_SYMBOL(blk_queue_io_opt);
@@ -534,6 +547,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->io_min = max(t->io_min, b->io_min);
t->io_opt = lcm_not_zero(t->io_opt, b->io_opt);
+ t->chunk_sectors = lcm_not_zero(t->chunk_sectors, b->chunk_sectors);
/* Physical block size a multiple of the logical block size? */
if (t->physical_block_size & (t->logical_block_size - 1)) {
@@ -556,6 +570,13 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
ret = -1;
}
+ /* chunk_sectors a multiple of the physical block size? */
+ if ((t->chunk_sectors << 9) & (t->physical_block_size - 1)) {
+ t->chunk_sectors = 0;
+ t->misaligned = 1;
+ ret = -1;
+ }
+
t->raid_partial_stripes_expensive =
max(t->raid_partial_stripes_expensive,
b->raid_partial_stripes_expensive);
@@ -594,10 +615,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->discard_granularity;
}
- if (b->chunk_sectors)
- t->chunk_sectors = min_not_zero(t->chunk_sectors,
- b->chunk_sectors);
-
t->zoned = max(t->zoned, b->zoned);
return ret;
}
@@ -629,8 +646,7 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
top, bottom);
}
- t->backing_dev_info->io_pages =
- t->limits.max_sectors >> (PAGE_SHIFT - 9);
+ blk_queue_update_readahead(disk->queue);
}
EXPORT_SYMBOL(disk_stack_limits);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 7dda709f3ccb..76b54c7750b0 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -260,14 +260,14 @@ static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
#define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \
static ssize_t \
-queue_show_##name(struct request_queue *q, char *page) \
+queue_##name##_show(struct request_queue *q, char *page) \
{ \
int bit; \
bit = test_bit(QUEUE_FLAG_##flag, &q->queue_flags); \
return queue_var_show(neg ? !bit : bit, page); \
} \
static ssize_t \
-queue_store_##name(struct request_queue *q, const char *page, size_t count) \
+queue_##name##_store(struct request_queue *q, const char *page, size_t count) \
{ \
unsigned long val; \
ssize_t ret; \
@@ -287,6 +287,7 @@ queue_store_##name(struct request_queue *q, const char *page, size_t count) \
QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1);
QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0);
QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0);
+QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0);
#undef QUEUE_SYSFS_BIT_FNS
static ssize_t queue_zoned_show(struct request_queue *q, char *page)
@@ -547,218 +548,73 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page)
return queue_var_show(blk_queue_dax(q), page);
}
-static struct queue_sysfs_entry queue_requests_entry = {
- .attr = {.name = "nr_requests", .mode = 0644 },
- .show = queue_requests_show,
- .store = queue_requests_store,
-};
-
-static struct queue_sysfs_entry queue_ra_entry = {
- .attr = {.name = "read_ahead_kb", .mode = 0644 },
- .show = queue_ra_show,
- .store = queue_ra_store,
-};
-
-static struct queue_sysfs_entry queue_max_sectors_entry = {
- .attr = {.name = "max_sectors_kb", .mode = 0644 },
- .show = queue_max_sectors_show,
- .store = queue_max_sectors_store,
-};
-
-static struct queue_sysfs_entry queue_max_hw_sectors_entry = {
- .attr = {.name = "max_hw_sectors_kb", .mode = 0444 },
- .show = queue_max_hw_sectors_show,
-};
-
-static struct queue_sysfs_entry queue_max_segments_entry = {
- .attr = {.name = "max_segments", .mode = 0444 },
- .show = queue_max_segments_show,
-};
-
-static struct queue_sysfs_entry queue_max_discard_segments_entry = {
- .attr = {.name = "max_discard_segments", .mode = 0444 },
- .show = queue_max_discard_segments_show,
-};
-
-static struct queue_sysfs_entry queue_max_integrity_segments_entry = {
- .attr = {.name = "max_integrity_segments", .mode = 0444 },
- .show = queue_max_integrity_segments_show,
-};
-
-static struct queue_sysfs_entry queue_max_segment_size_entry = {
- .attr = {.name = "max_segment_size", .mode = 0444 },
- .show = queue_max_segment_size_show,
-};
+#define QUEUE_RO_ENTRY(_prefix, _name) \
+static struct queue_sysfs_entry _prefix##_entry = { \
+ .attr = { .name = _name, .mode = 0444 }, \
+ .show = _prefix##_show, \
+};
+
+#define QUEUE_RW_ENTRY(_prefix, _name) \
+static struct queue_sysfs_entry _prefix##_entry = { \
+ .attr = { .name = _name, .mode = 0644 }, \
+ .show = _prefix##_show, \
+ .store = _prefix##_store, \
+};
+
+QUEUE_RW_ENTRY(queue_requests, "nr_requests");
+QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb");
+QUEUE_RW_ENTRY(queue_max_sectors, "max_sectors_kb");
+QUEUE_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb");
+QUEUE_RO_ENTRY(queue_max_segments, "max_segments");
+QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments");
+QUEUE_RO_ENTRY(queue_max_segment_size, "max_segment_size");
+QUEUE_RW_ENTRY(elv_iosched, "scheduler");
+
+QUEUE_RO_ENTRY(queue_logical_block_size, "logical_block_size");
+QUEUE_RO_ENTRY(queue_physical_block_size, "physical_block_size");
+QUEUE_RO_ENTRY(queue_chunk_sectors, "chunk_sectors");
+QUEUE_RO_ENTRY(queue_io_min, "minimum_io_size");
+QUEUE_RO_ENTRY(queue_io_opt, "optimal_io_size");
+
+QUEUE_RO_ENTRY(queue_max_discard_segments, "max_discard_segments");
+QUEUE_RO_ENTRY(queue_discard_granularity, "discard_granularity");
+QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes");
+QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes");
+QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
+
+QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
+QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes");
+QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
+
+QUEUE_RO_ENTRY(queue_zoned, "zoned");
+QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones");
+QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones");
+QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones");
+
+QUEUE_RW_ENTRY(queue_nomerges, "nomerges");
+QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity");
+QUEUE_RW_ENTRY(queue_poll, "io_poll");
+QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay");
+QUEUE_RW_ENTRY(queue_wc, "write_cache");
+QUEUE_RO_ENTRY(queue_fua, "fua");
+QUEUE_RO_ENTRY(queue_dax, "dax");
+QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout");
+QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
-static struct queue_sysfs_entry queue_iosched_entry = {
- .attr = {.name = "scheduler", .mode = 0644 },
- .show = elv_iosched_show,
- .store = elv_iosched_store,
-};
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time");
+#endif
+/* legacy alias for logical_block_size: */
static struct queue_sysfs_entry queue_hw_sector_size_entry = {
.attr = {.name = "hw_sector_size", .mode = 0444 },
.show = queue_logical_block_size_show,
};
-static struct queue_sysfs_entry queue_logical_block_size_entry = {
- .attr = {.name = "logical_block_size", .mode = 0444 },
- .show = queue_logical_block_size_show,
-};
-
-static struct queue_sysfs_entry queue_physical_block_size_entry = {
- .attr = {.name = "physical_block_size", .mode = 0444 },
- .show = queue_physical_block_size_show,
-};
-
-static struct queue_sysfs_entry queue_chunk_sectors_entry = {
- .attr = {.name = "chunk_sectors", .mode = 0444 },
- .show = queue_chunk_sectors_show,
-};
-
-static struct queue_sysfs_entry queue_io_min_entry = {
- .attr = {.name = "minimum_io_size", .mode = 0444 },
- .show = queue_io_min_show,
-};
-
-static struct queue_sysfs_entry queue_io_opt_entry = {
- .attr = {.name = "optimal_io_size", .mode = 0444 },
- .show = queue_io_opt_show,
-};
-
-static struct queue_sysfs_entry queue_discard_granularity_entry = {
- .attr = {.name = "discard_granularity", .mode = 0444 },
- .show = queue_discard_granularity_show,
-};
-
-static struct queue_sysfs_entry queue_discard_max_hw_entry = {
- .attr = {.name = "discard_max_hw_bytes", .mode = 0444 },
- .show = queue_discard_max_hw_show,
-};
-
-static struct queue_sysfs_entry queue_discard_max_entry = {
- .attr = {.name = "discard_max_bytes", .mode = 0644 },
- .show = queue_discard_max_show,
- .store = queue_discard_max_store,
-};
-
-static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
- .attr = {.name = "discard_zeroes_data", .mode = 0444 },
- .show = queue_discard_zeroes_data_show,
-};
-
-static struct queue_sysfs_entry queue_write_same_max_entry = {
- .attr = {.name = "write_same_max_bytes", .mode = 0444 },
- .show = queue_write_same_max_show,
-};
-
-static struct queue_sysfs_entry queue_write_zeroes_max_entry = {
- .attr = {.name = "write_zeroes_max_bytes", .mode = 0444 },
- .show = queue_write_zeroes_max_show,
-};
-
-static struct queue_sysfs_entry queue_zone_append_max_entry = {
- .attr = {.name = "zone_append_max_bytes", .mode = 0444 },
- .show = queue_zone_append_max_show,
-};
-
-static struct queue_sysfs_entry queue_nonrot_entry = {
- .attr = {.name = "rotational", .mode = 0644 },
- .show = queue_show_nonrot,
- .store = queue_store_nonrot,
-};
-
-static struct queue_sysfs_entry queue_zoned_entry = {
- .attr = {.name = "zoned", .mode = 0444 },
- .show = queue_zoned_show,
-};
-
-static struct queue_sysfs_entry queue_nr_zones_entry = {
- .attr = {.name = "nr_zones", .mode = 0444 },
- .show = queue_nr_zones_show,
-};
-
-static struct queue_sysfs_entry queue_max_open_zones_entry = {
- .attr = {.name = "max_open_zones", .mode = 0444 },
- .show = queue_max_open_zones_show,
-};
-
-static struct queue_sysfs_entry queue_max_active_zones_entry = {
- .attr = {.name = "max_active_zones", .mode = 0444 },
- .show = queue_max_active_zones_show,
-};
-
-static struct queue_sysfs_entry queue_nomerges_entry = {
- .attr = {.name = "nomerges", .mode = 0644 },
- .show = queue_nomerges_show,
- .store = queue_nomerges_store,
-};
-
-static struct queue_sysfs_entry queue_rq_affinity_entry = {
- .attr = {.name = "rq_affinity", .mode = 0644 },
- .show = queue_rq_affinity_show,
- .store = queue_rq_affinity_store,
-};
-
-static struct queue_sysfs_entry queue_iostats_entry = {
- .attr = {.name = "iostats", .mode = 0644 },
- .show = queue_show_iostats,
- .store = queue_store_iostats,
-};
-
-static struct queue_sysfs_entry queue_random_entry = {
- .attr = {.name = "add_random", .mode = 0644 },
- .show = queue_show_random,
- .store = queue_store_random,
-};
-
-static struct queue_sysfs_entry queue_poll_entry = {
- .attr = {.name = "io_poll", .mode = 0644 },
- .show = queue_poll_show,
- .store = queue_poll_store,
-};
-
-static struct queue_sysfs_entry queue_poll_delay_entry = {
- .attr = {.name = "io_poll_delay", .mode = 0644 },
- .show = queue_poll_delay_show,
- .store = queue_poll_delay_store,
-};
-
-static struct queue_sysfs_entry queue_wc_entry = {
- .attr = {.name = "write_cache", .mode = 0644 },
- .show = queue_wc_show,
- .store = queue_wc_store,
-};
-
-static struct queue_sysfs_entry queue_fua_entry = {
- .attr = {.name = "fua", .mode = 0444 },
- .show = queue_fua_show,
-};
-
-static struct queue_sysfs_entry queue_dax_entry = {
- .attr = {.name = "dax", .mode = 0444 },
- .show = queue_dax_show,
-};
-
-static struct queue_sysfs_entry queue_io_timeout_entry = {
- .attr = {.name = "io_timeout", .mode = 0644 },
- .show = queue_io_timeout_show,
- .store = queue_io_timeout_store,
-};
-
-static struct queue_sysfs_entry queue_wb_lat_entry = {
- .attr = {.name = "wbt_lat_usec", .mode = 0644 },
- .show = queue_wb_lat_show,
- .store = queue_wb_lat_store,
-};
-
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-static struct queue_sysfs_entry throtl_sample_time_entry = {
- .attr = {.name = "throttle_sample_time", .mode = 0644 },
- .show = blk_throtl_sample_time_show,
- .store = blk_throtl_sample_time_store,
-};
-#endif
+QUEUE_RW_ENTRY(queue_nonrot, "rotational");
+QUEUE_RW_ENTRY(queue_iostats, "iostats");
+QUEUE_RW_ENTRY(queue_random, "add_random");
+QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes");
static struct attribute *queue_attrs[] = {
&queue_requests_entry.attr,
@@ -769,7 +625,7 @@ static struct attribute *queue_attrs[] = {
&queue_max_discard_segments_entry.attr,
&queue_max_integrity_segments_entry.attr,
&queue_max_segment_size_entry.attr,
- &queue_iosched_entry.attr,
+ &elv_iosched_entry.attr,
&queue_hw_sector_size_entry.attr,
&queue_logical_block_size_entry.attr,
&queue_physical_block_size_entry.attr,
@@ -791,6 +647,7 @@ static struct attribute *queue_attrs[] = {
&queue_nomerges_entry.attr,
&queue_rq_affinity_entry.attr,
&queue_iostats_entry.attr,
+ &queue_stable_writes_entry.attr,
&queue_random_entry.attr,
&queue_poll_entry.attr,
&queue_wc_entry.attr,
@@ -800,7 +657,7 @@ static struct attribute *queue_attrs[] = {
&queue_poll_delay_entry.attr,
&queue_io_timeout_entry.attr,
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
- &throtl_sample_time_entry.attr,
+ &blk_throtl_sample_time_entry.attr,
#endif
NULL,
};
@@ -1000,6 +857,8 @@ int blk_register_queue(struct gendisk *disk)
percpu_ref_switch_to_percpu(&q->q_usage_counter);
}
+ blk_queue_update_readahead(q);
+
ret = blk_trace_init_sysfs(dev);
if (ret)
return ret;
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index fee3325edf27..36ba61c5cdbd 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -15,10 +15,10 @@
#include "blk-cgroup-rwstat.h"
/* Max dispatch from a group in 1 round */
-static int throtl_grp_quantum = 8;
+#define THROTL_GRP_QUANTUM 8
/* Total max dispatch from all groups in one round */
-static int throtl_quantum = 32;
+#define THROTL_QUANTUM 32
/* Throttling is performed over a slice and after that slice is renewed */
#define DFL_THROTL_SLICE_HD (HZ / 10)
@@ -150,7 +150,7 @@ struct throtl_grp {
/* user configured IOPS limits */
unsigned int iops_conf[2][LIMIT_CNT];
- /* Number of bytes disptached in current slice */
+ /* Number of bytes dispatched in current slice */
uint64_t bytes_disp[2];
/* Number of bio's dispatched in current slice */
unsigned int io_disp[2];
@@ -852,7 +852,7 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
/*
* A bio has been dispatched. Also adjust slice_end. It might happen
* that initially cgroup limit was very low resulting in high
- * slice_end, but later limit was bumped up and bio was dispached
+ * slice_end, but later limit was bumped up and bio was dispatched
* sooner, then we need to reduce slice_end. A high bogus slice_end
* is bad because it does not allow new slice to start.
*/
@@ -894,13 +894,19 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
}
static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
- unsigned long *wait)
+ u32 iops_limit, unsigned long *wait)
{
bool rw = bio_data_dir(bio);
unsigned int io_allowed;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
u64 tmp;
+ if (iops_limit == UINT_MAX) {
+ if (wait)
+ *wait = 0;
+ return true;
+ }
+
jiffy_elapsed = jiffies - tg->slice_start[rw];
/* Round up to the next throttle slice, wait time must be nonzero */
@@ -913,7 +919,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
* have been trimmed.
*/
- tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd;
+ tmp = (u64)iops_limit * jiffy_elapsed_rnd;
do_div(tmp, HZ);
if (tmp > UINT_MAX)
@@ -936,13 +942,19 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
}
static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
- unsigned long *wait)
+ u64 bps_limit, unsigned long *wait)
{
bool rw = bio_data_dir(bio);
u64 bytes_allowed, extra_bytes, tmp;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
unsigned int bio_size = throtl_bio_data_size(bio);
+ if (bps_limit == U64_MAX) {
+ if (wait)
+ *wait = 0;
+ return true;
+ }
+
jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
/* Slice has just started. Consider one slice interval */
@@ -951,7 +963,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
- tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd;
+ tmp = bps_limit * jiffy_elapsed_rnd;
do_div(tmp, HZ);
bytes_allowed = tmp;
@@ -963,7 +975,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
/* Calc approx time to dispatch */
extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed;
- jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw));
+ jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit);
if (!jiffy_wait)
jiffy_wait = 1;
@@ -987,6 +999,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
{
bool rw = bio_data_dir(bio);
unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
+ u64 bps_limit = tg_bps_limit(tg, rw);
+ u32 iops_limit = tg_iops_limit(tg, rw);
/*
* Currently whole state machine of group depends on first bio
@@ -998,8 +1012,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
/* If tg->bps = -1, then BW is unlimited */
- if (tg_bps_limit(tg, rw) == U64_MAX &&
- tg_iops_limit(tg, rw) == UINT_MAX) {
+ if (bps_limit == U64_MAX && iops_limit == UINT_MAX) {
if (wait)
*wait = 0;
return true;
@@ -1021,8 +1034,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
jiffies + tg->td->throtl_slice);
}
- if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
- tg_with_in_iops_limit(tg, bio, &iops_wait)) {
+ if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) &&
+ tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) {
if (wait)
*wait = 0;
return true;
@@ -1082,7 +1095,7 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
* If @tg doesn't currently have any bios queued in the same
* direction, queueing @bio can change when @tg should be
* dispatched. Mark that @tg was empty. This is automatically
- * cleaered on the next tg_update_disptime().
+ * cleared on the next tg_update_disptime().
*/
if (!sq->nr_queued[rw])
tg->flags |= THROTL_TG_WAS_EMPTY;
@@ -1175,8 +1188,8 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
{
struct throtl_service_queue *sq = &tg->service_queue;
unsigned int nr_reads = 0, nr_writes = 0;
- unsigned int max_nr_reads = throtl_grp_quantum*3/4;
- unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
+ unsigned int max_nr_reads = THROTL_GRP_QUANTUM * 3 / 4;
+ unsigned int max_nr_writes = THROTL_GRP_QUANTUM - max_nr_reads;
struct bio *bio;
/* Try to dispatch 75% READS and 25% WRITES */
@@ -1226,7 +1239,7 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
if (sq->nr_queued[0] || sq->nr_queued[1])
tg_update_disptime(tg);
- if (nr_disp >= throtl_quantum)
+ if (nr_disp >= THROTL_QUANTUM)
break;
}
@@ -1303,7 +1316,7 @@ again:
}
}
} else {
- /* reached the top-level, queue issueing */
+ /* reached the top-level, queue issuing */
queue_work(kthrotld_workqueue, &td->dispatch_work);
}
out_unlock:
@@ -1314,8 +1327,8 @@ out_unlock:
* blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
* @work: work item being executed
*
- * This function is queued for execution when bio's reach the bio_lists[]
- * of throtl_data->service_queue. Those bio's are ready and issued by this
+ * This function is queued for execution when bios reach the bio_lists[]
+ * of throtl_data->service_queue. Those bios are ready and issued by this
* function.
*/
static void blk_throtl_dispatch_work_fn(struct work_struct *work)
@@ -1428,8 +1441,8 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
* that a group's limit are dropped suddenly and we don't want to
* account recently dispatched IO with new low rate.
*/
- throtl_start_new_slice(tg, 0);
- throtl_start_new_slice(tg, 1);
+ throtl_start_new_slice(tg, READ);
+ throtl_start_new_slice(tg, WRITE);
if (tg->flags & THROTL_TG_PENDING) {
tg_update_disptime(tg);
@@ -2230,7 +2243,7 @@ again:
/*
* @bio passed through this layer without being throttled.
- * Climb up the ladder. If we''re already at the top, it
+ * Climb up the ladder. If we're already at the top, it
* can be executed directly.
*/
qn = &tg->qnode_on_parent[rw];
diff --git a/block/blk.h b/block/blk.h
index 49e2928a1632..c08762e10b04 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -29,6 +29,12 @@ struct blk_flush_queue {
spinlock_t mq_flush_lock;
};
+enum bio_merge_status {
+ BIO_MERGE_OK,
+ BIO_MERGE_NONE,
+ BIO_MERGE_FAILED,
+};
+
extern struct kmem_cache *blk_requestq_cachep;
extern struct kobj_type blk_queue_ktype;
extern struct ida blk_queue_ida;
@@ -169,14 +175,19 @@ static inline void blk_integrity_del(struct gendisk *disk)
unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req);
-bool bio_attempt_front_merge(struct request *req, struct bio *bio,
- unsigned int nr_segs);
-bool bio_attempt_back_merge(struct request *req, struct bio *bio,
- unsigned int nr_segs);
-bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
- struct bio *bio);
+enum bio_merge_status bio_attempt_front_merge(struct request *req,
+ struct bio *bio,
+ unsigned int nr_segs);
+enum bio_merge_status bio_attempt_back_merge(struct request *req,
+ struct bio *bio,
+ unsigned int nr_segs);
+enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q,
+ struct request *req,
+ struct bio *bio);
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **same_queue_rq);
+bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
+ struct bio *bio, unsigned int nr_segs);
void blk_account_io_start(struct request *req);
void blk_account_io_done(struct request *req, u64 now);
@@ -350,7 +361,7 @@ char *disk_name(struct gendisk *hd, int partno, char *buf);
#define ADDPART_FLAG_NONE 0
#define ADDPART_FLAG_RAID 1
#define ADDPART_FLAG_WHOLEDISK 2
-void delete_partition(struct gendisk *disk, struct hd_struct *part);
+void delete_partition(struct hd_struct *part);
int bdev_add_partition(struct block_device *bdev, int partno,
sector_t start, sector_t length);
int bdev_del_partition(struct block_device *bdev, int partno);
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index d185396d88bb..330fede77271 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -207,7 +207,7 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req)
BUG_ON(!req->nr_phys_segments);
- buf->sg_list = kzalloc(sz, GFP_KERNEL);
+ buf->sg_list = kmalloc(sz, GFP_KERNEL);
if (!buf->sg_list)
return -ENOMEM;
sg_init_table(buf->sg_list, req->nr_phys_segments);
diff --git a/block/genhd.c b/block/genhd.c
index 99c64641c314..05fb27cbb667 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -50,14 +50,13 @@ static void disk_release_events(struct gendisk *disk);
* zero and will not be set to zero
*/
void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size,
- bool revalidate)
+ bool update_bdev)
{
sector_t capacity = get_capacity(disk);
set_capacity(disk, size);
-
- if (revalidate)
- revalidate_disk(disk);
+ if (update_bdev)
+ revalidate_disk_size(disk, true);
if (capacity != size && capacity != 0 && size != 0) {
char *envp[] = { "RESIZE=1", NULL };
@@ -110,8 +109,7 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat)
}
}
-static unsigned int part_in_flight(struct request_queue *q,
- struct hd_struct *part)
+static unsigned int part_in_flight(struct hd_struct *part)
{
unsigned int inflight = 0;
int cpu;
@@ -126,8 +124,7 @@ static unsigned int part_in_flight(struct request_queue *q,
return inflight;
}
-static void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
- unsigned int inflight[2])
+static void part_in_flight_rw(struct hd_struct *part, unsigned int inflight[2])
{
int cpu;
@@ -676,11 +673,23 @@ static int exact_lock(dev_t devt, void *data)
return 0;
}
+static void disk_scan_partitions(struct gendisk *disk)
+{
+ struct block_device *bdev;
+
+ if (!get_capacity(disk) || !disk_part_scan_enabled(disk))
+ return;
+
+ set_bit(GD_NEED_PART_SCAN, &disk->state);
+ bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL);
+ if (!IS_ERR(bdev))
+ blkdev_put(bdev, FMODE_READ);
+}
+
static void register_disk(struct device *parent, struct gendisk *disk,
const struct attribute_group **groups)
{
struct device *ddev = disk_to_dev(disk);
- struct block_device *bdev;
struct disk_part_iter piter;
struct hd_struct *part;
int err;
@@ -722,25 +731,8 @@ static void register_disk(struct device *parent, struct gendisk *disk,
return;
}
- /* No minors to use for partitions */
- if (!disk_part_scan_enabled(disk))
- goto exit;
+ disk_scan_partitions(disk);
- /* No such device (e.g., media were just removed) */
- if (!get_capacity(disk))
- goto exit;
-
- bdev = bdget_disk(disk, 0);
- if (!bdev)
- goto exit;
-
- bdev->bd_invalidated = 1;
- err = blkdev_get(bdev, FMODE_READ, NULL);
- if (err < 0)
- goto exit;
- blkdev_put(bdev, FMODE_READ);
-
-exit:
/* announce disk after possible partitions are created */
dev_set_uevent_suppress(ddev, 0);
kobject_uevent(&ddev->kobj, KOBJ_ADD);
@@ -913,7 +905,7 @@ void del_gendisk(struct gendisk *disk)
DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
while ((part = disk_part_iter_next(&piter))) {
invalidate_partition(disk, part->partno);
- delete_partition(disk, part);
+ delete_partition(part);
}
disk_part_iter_exit(&piter);
@@ -1301,7 +1293,7 @@ ssize_t part_stat_show(struct device *dev,
if (queue_is_mq(q))
inflight = blk_mq_in_flight(q, p);
else
- inflight = part_in_flight(q, p);
+ inflight = part_in_flight(p);
return sprintf(buf,
"%8lu %8lu %8llu %8u "
@@ -1343,7 +1335,7 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
if (queue_is_mq(q))
blk_mq_in_flight_rw(q, p, inflight);
else
- part_in_flight_rw(q, p, inflight);
+ part_in_flight_rw(p, inflight);
return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]);
}
@@ -1623,7 +1615,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
if (queue_is_mq(gp->queue))
inflight = blk_mq_in_flight(gp->queue, hd);
else
- inflight = part_in_flight(gp->queue, hd);
+ inflight = part_in_flight(hd);
seq_printf(seqf, "%4d %7d %s "
"%lu %lu %lu %u "
@@ -1729,45 +1721,48 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
}
disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
- if (disk) {
- disk->part0.dkstats = alloc_percpu(struct disk_stats);
- if (!disk->part0.dkstats) {
- kfree(disk);
- return NULL;
- }
- init_rwsem(&disk->lookup_sem);
- disk->node_id = node_id;
- if (disk_expand_part_tbl(disk, 0)) {
- free_percpu(disk->part0.dkstats);
- kfree(disk);
- return NULL;
- }
- ptbl = rcu_dereference_protected(disk->part_tbl, 1);
- rcu_assign_pointer(ptbl->part[0], &disk->part0);
+ if (!disk)
+ return NULL;
- /*
- * set_capacity() and get_capacity() currently don't use
- * seqcounter to read/update the part0->nr_sects. Still init
- * the counter as we can read the sectors in IO submission
- * patch using seqence counters.
- *
- * TODO: Ideally set_capacity() and get_capacity() should be
- * converted to make use of bd_mutex and sequence counters.
- */
- hd_sects_seq_init(&disk->part0);
- if (hd_ref_init(&disk->part0)) {
- hd_free_part(&disk->part0);
- kfree(disk);
- return NULL;
- }
+ disk->part0.dkstats = alloc_percpu(struct disk_stats);
+ if (!disk->part0.dkstats)
+ goto out_free_disk;
- disk->minors = minors;
- rand_initialize_disk(disk);
- disk_to_dev(disk)->class = &block_class;
- disk_to_dev(disk)->type = &disk_type;
- device_initialize(disk_to_dev(disk));
+ init_rwsem(&disk->lookup_sem);
+ disk->node_id = node_id;
+ if (disk_expand_part_tbl(disk, 0)) {
+ free_percpu(disk->part0.dkstats);
+ goto out_free_disk;
}
+
+ ptbl = rcu_dereference_protected(disk->part_tbl, 1);
+ rcu_assign_pointer(ptbl->part[0], &disk->part0);
+
+ /*
+ * set_capacity() and get_capacity() currently don't use
+ * seqcounter to read/update the part0->nr_sects. Still init
+ * the counter as we can read the sectors in IO submission
+ * patch using seqence counters.
+ *
+ * TODO: Ideally set_capacity() and get_capacity() should be
+ * converted to make use of bd_mutex and sequence counters.
+ */
+ hd_sects_seq_init(&disk->part0);
+ if (hd_ref_init(&disk->part0))
+ goto out_free_part0;
+
+ disk->minors = minors;
+ rand_initialize_disk(disk);
+ disk_to_dev(disk)->class = &block_class;
+ disk_to_dev(disk)->type = &disk_type;
+ device_initialize(disk_to_dev(disk));
return disk;
+
+out_free_part0:
+ hd_free_part(&disk->part0);
+out_free_disk:
+ kfree(disk);
+ return NULL;
}
EXPORT_SYMBOL(__alloc_disk_node);
@@ -2052,7 +2047,7 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask)
* CONTEXT:
* Might sleep.
*/
-unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
+static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
{
struct disk_events *ev = disk->ev;
unsigned int pending;
@@ -2090,6 +2085,33 @@ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
return pending;
}
+/**
+ * bdev_check_media_change - check if a removable media has been changed
+ * @bdev: block device to check
+ *
+ * Check whether a removable media has been changed, and attempt to free all
+ * dentries and inodes and invalidates all block device page cache entries in
+ * that case.
+ *
+ * Returns %true if the block device changed, or %false if not.
+ */
+bool bdev_check_media_change(struct block_device *bdev)
+{
+ unsigned int events;
+
+ events = disk_clear_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE |
+ DISK_EVENT_EJECT_REQUEST);
+ if (!(events & DISK_EVENT_MEDIA_CHANGE))
+ return false;
+
+ if (__invalidate_device(bdev, true))
+ pr_warn("VFS: busy inodes on changed media %s\n",
+ bdev->bd_disk->disk_name);
+ set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
+ return true;
+}
+EXPORT_SYMBOL(bdev_check_media_change);
+
/*
* Separate this part out so that a different pointer for clearing_ptr can be
* passed in for disk_clear_events.
diff --git a/block/ioctl.c b/block/ioctl.c
index bdb3bbb253d9..06262c28f0c6 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -112,8 +112,7 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
uint64_t range[2];
uint64_t start, len;
struct request_queue *q = bdev_get_queue(bdev);
- struct address_space *mapping = bdev->bd_inode->i_mapping;
-
+ int err;
if (!(mode & FMODE_WRITE))
return -EBADF;
@@ -134,7 +133,11 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
if (start + len > i_size_read(bdev->bd_inode))
return -EINVAL;
- truncate_inode_pages_range(mapping, start, start + len - 1);
+
+ err = truncate_bdev_range(bdev, mode, start, start + len - 1);
+ if (err)
+ return err;
+
return blkdev_issue_discard(bdev, start >> 9, len >> 9,
GFP_KERNEL, flags);
}
@@ -143,8 +146,8 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
unsigned long arg)
{
uint64_t range[2];
- struct address_space *mapping;
uint64_t start, end, len;
+ int err;
if (!(mode & FMODE_WRITE))
return -EBADF;
@@ -166,8 +169,9 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
return -EINVAL;
/* Invalidate the page cache, including dirty pages */
- mapping = bdev->bd_inode->i_mapping;
- truncate_inode_pages_range(mapping, start, end);
+ err = truncate_bdev_range(bdev, mode, start, end);
+ if (err)
+ return err;
return blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL,
BLKDEV_ZERO_NOUNMAP);
@@ -474,15 +478,14 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t mode,
if (get_user(n, argp))
return -EFAULT;
- if (!(mode & FMODE_EXCL)) {
- bdgrab(bdev);
- if (blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0)
- return -EBUSY;
- }
+ if (mode & FMODE_EXCL)
+ return set_blocksize(bdev, n);
+ if (IS_ERR(blkdev_get_by_dev(bdev->bd_dev, mode | FMODE_EXCL, &bdev)))
+ return -EBUSY;
ret = set_blocksize(bdev, n);
- if (!(mode & FMODE_EXCL))
- blkdev_put(bdev, mode | FMODE_EXCL);
+ blkdev_put(bdev, mode | FMODE_EXCL);
+
return ret;
}
diff --git a/block/ioprio.c b/block/ioprio.c
index 04ebd37966f1..364d2294ba90 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -69,7 +69,7 @@ int ioprio_check_cap(int ioprio)
switch (class) {
case IOPRIO_CLASS_RT:
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_SYS_NICE) && !capable(CAP_SYS_ADMIN))
return -EPERM;
fallthrough;
/* rt has prio field too */
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index a38c5ab103d1..dc89199bc8c6 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -359,7 +359,7 @@ static unsigned int kyber_sched_tags_shift(struct request_queue *q)
* All of the hardware queues have the same depth, so we can just grab
* the shift of the first one.
*/
- return q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
+ return q->queue_hw_ctx[0]->sched_tags->bitmap_tags->sb.shift;
}
static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
@@ -502,7 +502,7 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
khd->batching = 0;
hctx->sched_data = khd;
- sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags,
+ sbitmap_queue_min_shallow_depth(hctx->sched_tags->bitmap_tags,
kqd->async_depth);
return 0;
@@ -573,7 +573,7 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
bool merged;
spin_lock(&kcq->lock);
- merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio, nr_segs);
+ merged = blk_bio_list_merge(hctx->queue, rq_list, bio, nr_segs);
spin_unlock(&kcq->lock);
return merged;
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index b57470e154c8..800ac902809b 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -386,6 +386,8 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
spin_lock(&dd->lock);
rq = __dd_dispatch_request(dd);
spin_unlock(&dd->lock);
+ if (rq)
+ atomic_dec(&rq->mq_hctx->elevator_queued);
return rq;
}
@@ -533,6 +535,7 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
rq = list_first_entry(list, struct request, queuelist);
list_del_init(&rq->queuelist);
dd_insert_request(hctx, rq, at_head);
+ atomic_inc(&hctx->elevator_queued);
}
spin_unlock(&dd->lock);
}
@@ -579,6 +582,9 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
{
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+ if (!atomic_read(&hctx->elevator_queued))
+ return false;
+
return !list_empty_careful(&dd->dispatch) ||
!list_empty_careful(&dd->fifo_list[0]) ||
!list_empty_careful(&dd->fifo_list[1]);
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 722406b841df..5cacbac30107 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -199,14 +199,20 @@ static ssize_t part_alignment_offset_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
- return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
+
+ return sprintf(buf, "%u\n",
+ queue_limit_alignment_offset(&part_to_disk(p)->queue->limits,
+ p->start_sect));
}
static ssize_t part_discard_alignment_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
- return sprintf(buf, "%u\n", p->discard_alignment);
+
+ return sprintf(buf, "%u\n",
+ queue_limit_discard_alignment(&part_to_disk(p)->queue->limits,
+ p->start_sect));
}
static DEVICE_ATTR(partition, 0444, part_partition_show, NULL);
@@ -318,8 +324,9 @@ int hd_ref_init(struct hd_struct *part)
* Must be called either with bd_mutex held, before a disk can be opened or
* after all disk users are gone.
*/
-void delete_partition(struct gendisk *disk, struct hd_struct *part)
+void delete_partition(struct hd_struct *part)
{
+ struct gendisk *disk = part_to_disk(part);
struct disk_part_tbl *ptbl =
rcu_dereference_protected(disk->part_tbl, 1);
@@ -327,7 +334,7 @@ void delete_partition(struct gendisk *disk, struct hd_struct *part)
* ->part_tbl is referenced in this part's release handler, so
* we have to hold the disk device
*/
- get_device(disk_to_dev(part_to_disk(part)));
+ get_device(disk_to_dev(disk));
rcu_assign_pointer(ptbl->part[part->partno], NULL);
kobject_put(part->holder_dir);
device_del(part_to_dev(part));
@@ -405,10 +412,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno,
pdev = part_to_dev(p);
p->start_sect = start;
- p->alignment_offset =
- queue_limit_alignment_offset(&disk->queue->limits, start);
- p->discard_alignment =
- queue_limit_discard_alignment(&disk->queue->limits, start);
p->nr_sects = len;
p->partno = partno;
p->policy = get_disk_ro(disk);
@@ -554,7 +557,7 @@ int bdev_del_partition(struct block_device *bdev, int partno)
sync_blockdev(bdevp);
invalidate_bdev(bdevp);
- delete_partition(bdev->bd_disk, part);
+ delete_partition(part);
ret = 0;
out_unlock:
mutex_unlock(&bdev->bd_mutex);
@@ -592,8 +595,8 @@ int bdev_resize_partition(struct block_device *bdev, int partno,
if (partition_overlaps(bdev->bd_disk, start, length, partno))
goto out_unlock;
- part_nr_sects_write(part, (sector_t)length);
- i_size_write(bdevp->bd_inode, length << SECTOR_SHIFT);
+ part_nr_sects_write(part, length);
+ bd_set_nr_sectors(bdevp, length);
ret = 0;
out_unlock:
@@ -634,7 +637,7 @@ int blk_drop_partitions(struct block_device *bdev)
disk_part_iter_init(&piter, bdev->bd_disk, DISK_PITER_INCL_EMPTY);
while ((part = disk_part_iter_next(&piter)))
- delete_partition(bdev->bd_disk, part);
+ delete_partition(part);
disk_part_iter_exit(&piter);
return 0;
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index ef722f04f88a..4421e61c1af1 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -37,8 +37,6 @@ const unsigned char scsi_command_size_tbl[8] =
};
EXPORT_SYMBOL(scsi_command_size_tbl);
-#include <scsi/sg.h>
-
static int sg_get_version(int __user *p)
{
static const int sg_version_num = 30527;
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 226219da3da6..71c2b1564558 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1670,7 +1670,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
}
if (mode & (FMODE_READ|FMODE_WRITE)) {
- check_disk_change(bdev);
+ bdev_check_media_change(bdev);
if (mode & FMODE_WRITE) {
int wrprot;
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 5ca7216e9e01..c34e71b0c4a9 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -347,7 +347,6 @@ aoeblk_gdalloc(void *vp)
mempool_t *mp;
struct request_queue *q;
struct blk_mq_tag_set *set;
- enum { KB = 1024, MB = KB * KB, READ_AHEAD = 2 * MB, };
ulong flags;
int late = 0;
int err;
@@ -407,7 +406,7 @@ aoeblk_gdalloc(void *vp)
WARN_ON(d->gd);
WARN_ON(d->flags & DEVFL_UP);
blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS);
- q->backing_dev_info->ra_pages = READ_AHEAD / PAGE_SIZE;
+ blk_queue_io_opt(q, SZ_2M);
d->bufpool = mp;
d->blkq = gd->queue = q;
q->queuedata = d;
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 6dba41395155..313f0b946fe2 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -900,9 +900,7 @@ aoecmd_sleepwork(struct work_struct *work)
ssize = get_capacity(d->gd);
bd = bdget_disk(d->gd, 0);
if (bd) {
- inode_lock(bd->bd_inode);
- i_size_write(bd->bd_inode, (loff_t)ssize<<9);
- inode_unlock(bd->bd_inode);
+ bd_set_nr_sectors(bd, ssize);
bdput(bd);
}
spin_lock_irq(&d->lock);
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index a50e13af0305..3e881fdb06e0 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -1732,7 +1732,8 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode,
/* invalidate the buffer track to force a reread */
BufferDrive = -1;
set_bit(drive, &fake_change);
- check_disk_change(bdev);
+ if (bdev_check_media_change(bdev))
+ floppy_revalidate(bdev->bd_disk);
return 0;
default:
return -EINVAL;
@@ -1909,7 +1910,8 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
return 0;
if (mode & (FMODE_READ|FMODE_WRITE)) {
- check_disk_change(bdev);
+ if (bdev_check_media_change(bdev))
+ floppy_revalidate(bdev->bd_disk);
if (mode & FMODE_WRITE) {
if (p->wpstat) {
if (p->ref < 0)
@@ -1953,7 +1955,6 @@ static const struct block_device_operations floppy_fops = {
.release = floppy_release,
.ioctl = fd_ioctl,
.check_events = floppy_check_events,
- .revalidate_disk= floppy_revalidate,
};
static const struct blk_mq_ops ataflop_mq_ops = {
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 2723a70eb855..cc49a921339f 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -403,7 +403,6 @@ static struct brd_device *brd_alloc(int i)
disk->flags = GENHD_FL_EXT_DEVT;
sprintf(disk->disk_name, "ram%d", i);
set_capacity(disk, rd_size * 2);
- brd->brd_queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO;
/* Tell the block layer that this is not a rotational device */
blk_queue_flag_set(QUEUE_FLAG_NONROT, brd->brd_queue);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 43c8ae4d9fca..54a4930c04fe 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1362,15 +1362,7 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
if (b) {
blk_stack_limits(&q->limits, &b->limits, 0);
-
- if (q->backing_dev_info->ra_pages !=
- b->backing_dev_info->ra_pages) {
- drbd_info(device, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
- q->backing_dev_info->ra_pages,
- b->backing_dev_info->ra_pages);
- q->backing_dev_info->ra_pages =
- b->backing_dev_info->ra_pages;
- }
+ blk_queue_update_readahead(q);
}
fixup_discard_if_not_supported(q);
fixup_write_zeroes(device, q);
@@ -3370,7 +3362,6 @@ static void device_to_statistics(struct device_statistics *s,
if (get_ldev(device)) {
struct drbd_md *md = &device->ldev->md;
u64 *history_uuids = (u64 *)s->history_uuids;
- struct request_queue *q;
int n;
spin_lock_irq(&md->uuid_lock);
@@ -3384,11 +3375,6 @@ static void device_to_statistics(struct device_statistics *s,
spin_unlock_irq(&md->uuid_lock);
s->dev_disk_flags = md->flags;
- q = bdev_get_queue(device->ldev->backing_bdev);
- s->dev_lower_blocked =
- bdi_congested(q->backing_dev_info,
- (1 << WB_async_congested) |
- (1 << WB_sync_congested));
put_ldev(device);
}
s->dev_size = drbd_get_capacity(device->this_bdev);
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index a563b023458a..7df79ae6b0a1 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -561,6 +561,7 @@ static void floppy_release_irq_and_dma(void);
* output_byte is automatically disabled when reset is set.
*/
static void reset_fdc(void);
+static int floppy_revalidate(struct gendisk *disk);
/*
* These are global variables, as that's the easiest way to give
@@ -3275,7 +3276,8 @@ static int invalidate_drive(struct block_device *bdev)
/* invalidate the buffer track to force a reread */
set_bit((long)bdev->bd_disk->private_data, &fake_change);
process_fd_request();
- check_disk_change(bdev);
+ if (bdev_check_media_change(bdev))
+ floppy_revalidate(bdev->bd_disk);
return 0;
}
@@ -4123,7 +4125,8 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
drive_state[drive].last_checked = 0;
clear_bit(FD_OPEN_SHOULD_FAIL_BIT,
&drive_state[drive].flags);
- check_disk_change(bdev);
+ if (bdev_check_media_change(bdev))
+ floppy_revalidate(bdev->bd_disk);
if (test_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags))
goto out;
if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags))
@@ -4291,7 +4294,6 @@ static const struct block_device_operations floppy_fops = {
.ioctl = fd_ioctl,
.getgeo = fd_getgeo,
.check_events = floppy_check_events,
- .revalidate_disk = floppy_revalidate,
#ifdef CONFIG_COMPAT
.compat_ioctl = fd_compat_ioctl,
#endif
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index d3394191e168..cb1191d6e945 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -253,7 +253,7 @@ static void loop_set_size(struct loop_device *lo, loff_t size)
{
struct block_device *bdev = lo->lo_device;
- bd_set_size(bdev, size << SECTOR_SHIFT);
+ bd_set_nr_sectors(bdev, size);
set_capacity_revalidate_and_notify(lo->lo_disk, size, false);
}
@@ -1251,7 +1251,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
set_capacity(lo->lo_disk, 0);
loop_sysfs_exit(lo);
if (bdev) {
- bd_set_size(bdev, 0);
+ bd_set_nr_sectors(bdev, 0);
/* let user-space know about this change */
kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
}
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index edf8b632e3d2..2dca0aab0a9a 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -300,6 +300,7 @@ static void nbd_size_update(struct nbd_device *nbd)
{
struct nbd_config *config = nbd->config;
struct block_device *bdev = bdget_disk(nbd->disk, 0);
+ sector_t nr_sectors = config->bytesize >> 9;
if (config->flags & NBD_FLAG_SEND_TRIM) {
nbd->disk->queue->limits.discard_granularity = config->blksize;
@@ -308,13 +309,13 @@ static void nbd_size_update(struct nbd_device *nbd)
}
blk_queue_logical_block_size(nbd->disk->queue, config->blksize);
blk_queue_physical_block_size(nbd->disk->queue, config->blksize);
- set_capacity(nbd->disk, config->bytesize >> 9);
+ set_capacity(nbd->disk, nr_sectors);
if (bdev) {
if (bdev->bd_disk) {
- bd_set_size(bdev, config->bytesize);
+ bd_set_nr_sectors(bdev, nr_sectors);
set_blocksize(bdev, config->blksize);
} else
- bdev->bd_invalidated = 1;
+ set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
bdput(bdev);
}
kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
@@ -1138,7 +1139,7 @@ static void nbd_bdev_reset(struct block_device *bdev)
{
if (bdev->bd_openers > 1)
return;
- bd_set_size(bdev, 0);
+ bd_set_nr_sectors(bdev, 0);
}
static void nbd_parse_flags(struct nbd_device *nbd)
@@ -1321,7 +1322,7 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b
return ret;
if (max_part)
- bdev->bd_invalidated = 1;
+ set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
mutex_unlock(&nbd->config_lock);
ret = wait_event_interruptible(config->recv_wq,
atomic_read(&config->recv_threads) == 0);
@@ -1499,9 +1500,9 @@ static int nbd_open(struct block_device *bdev, fmode_t mode)
refcount_set(&nbd->config_refs, 1);
refcount_inc(&nbd->refs);
mutex_unlock(&nbd->config_lock);
- bdev->bd_invalidated = 1;
+ set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
} else if (nbd_disconnected(nbd->config)) {
- bdev->bd_invalidated = 1;
+ set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
}
out:
mutex_unlock(&nbd_index_mutex);
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 5124eca90e83..70da8b86ce58 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -233,7 +233,7 @@ static int pcd_block_open(struct block_device *bdev, fmode_t mode)
struct pcd_unit *cd = bdev->bd_disk->private_data;
int ret;
- check_disk_change(bdev);
+ bdev_check_media_change(bdev);
mutex_lock(&pcd_mutex);
ret = cdrom_open(&cd->info, bdev, mode);
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 1034e445680c..467dbd06b7cd 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -1082,65 +1082,6 @@ static void pkt_put_packet_data(struct pktcdvd_device *pd, struct packet_data *p
}
}
-/*
- * recover a failed write, query for relocation if possible
- *
- * returns 1 if recovery is possible, or 0 if not
- *
- */
-static int pkt_start_recovery(struct packet_data *pkt)
-{
- /*
- * FIXME. We need help from the file system to implement
- * recovery handling.
- */
- return 0;
-#if 0
- struct request *rq = pkt->rq;
- struct pktcdvd_device *pd = rq->rq_disk->private_data;
- struct block_device *pkt_bdev;
- struct super_block *sb = NULL;
- unsigned long old_block, new_block;
- sector_t new_sector;
-
- pkt_bdev = bdget(kdev_t_to_nr(pd->pkt_dev));
- if (pkt_bdev) {
- sb = get_super(pkt_bdev);
- bdput(pkt_bdev);
- }
-
- if (!sb)
- return 0;
-
- if (!sb->s_op->relocate_blocks)
- goto out;
-
- old_block = pkt->sector / (CD_FRAMESIZE >> 9);
- if (sb->s_op->relocate_blocks(sb, old_block, &new_block))
- goto out;
-
- new_sector = new_block * (CD_FRAMESIZE >> 9);
- pkt->sector = new_sector;
-
- bio_reset(pkt->bio);
- bio_set_dev(pkt->bio, pd->bdev);
- bio_set_op_attrs(pkt->bio, REQ_OP_WRITE, 0);
- pkt->bio->bi_iter.bi_sector = new_sector;
- pkt->bio->bi_iter.bi_size = pkt->frames * CD_FRAMESIZE;
- pkt->bio->bi_vcnt = pkt->frames;
-
- pkt->bio->bi_end_io = pkt_end_io_packet_write;
- pkt->bio->bi_private = pkt;
-
- drop_super(sb);
- return 1;
-
-out:
- drop_super(sb);
- return 0;
-#endif
-}
-
static inline void pkt_set_state(struct packet_data *pkt, enum packet_data_state state)
{
#if PACKET_DEBUG > 1
@@ -1357,12 +1298,8 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data
break;
case PACKET_RECOVERY_STATE:
- if (pkt_start_recovery(pkt)) {
- pkt_start_write(pd, pkt);
- } else {
- pkt_dbg(2, pd, "No recovery possible\n");
- pkt_set_state(pkt, PACKET_FINISHED_STATE);
- }
+ pkt_dbg(2, pd, "No recovery possible\n");
+ pkt_set_state(pkt, PACKET_FINISHED_STATE);
break;
case PACKET_FINISHED_STATE:
@@ -2173,16 +2110,18 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
int ret;
long lba;
struct request_queue *q;
+ struct block_device *bdev;
/*
* We need to re-open the cdrom device without O_NONBLOCK to be able
* to read/write from/to it. It is already opened in O_NONBLOCK mode
- * so bdget() can't fail.
+ * so open should not fail.
*/
- bdget(pd->bdev->bd_dev);
- ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd);
- if (ret)
+ bdev = blkdev_get_by_dev(pd->bdev->bd_dev, FMODE_READ | FMODE_EXCL, pd);
+ if (IS_ERR(bdev)) {
+ ret = PTR_ERR(bdev);
goto out;
+ }
ret = pkt_get_last_written(pd, &lba);
if (ret) {
@@ -2192,7 +2131,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
set_capacity(pd->disk, lba << 2);
set_capacity(pd->bdev->bd_disk, lba << 2);
- bd_set_size(pd->bdev, (loff_t)lba << 11);
+ bd_set_nr_sectors(pd->bdev, lba << 2);
q = bdev_get_queue(pd->bdev);
if (write) {
@@ -2226,7 +2165,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
return 0;
out_putdev:
- blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL);
+ blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
return ret;
}
@@ -2563,7 +2502,6 @@ static int pkt_seq_show(struct seq_file *m, void *p)
static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
{
int i;
- int ret = 0;
char b[BDEVNAME_SIZE];
struct block_device *bdev;
@@ -2586,12 +2524,9 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
}
}
- bdev = bdget(dev);
- if (!bdev)
- return -ENOMEM;
- ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
- if (ret)
- return ret;
+ bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_NDELAY, NULL);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) {
blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
return -EINVAL;
@@ -2609,7 +2544,6 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->name);
if (IS_ERR(pd->cdrw.thread)) {
pkt_err(pd, "can't start kernel thread\n");
- ret = -ENOMEM;
goto out_mem;
}
@@ -2621,7 +2555,7 @@ out_mem:
blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
/* This is safe: open() is still holding a reference. */
module_put(THIS_MODULE);
- return ret;
+ return -ENOMEM;
}
static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index e77eaab5cf23..3e89b5d48ee6 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -4921,7 +4921,7 @@ static void rbd_dev_update_size(struct rbd_device *rbd_dev)
size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
dout("setting size to %llu sectors", (unsigned long long)size);
set_capacity(rbd_dev->disk, size);
- revalidate_disk(rbd_dev->disk);
+ revalidate_disk_size(rbd_dev->disk, true);
}
}
@@ -5022,7 +5022,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
}
if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
- q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
+ blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
/*
* disk_release() expects a queue ref from add_disk() and will
diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c
index cc6a4e2587ae..d7a69741c0f6 100644
--- a/drivers/block/rnbd/rnbd-clt.c
+++ b/drivers/block/rnbd/rnbd-clt.c
@@ -102,18 +102,12 @@ static int rnbd_clt_set_dev_attr(struct rnbd_clt_dev *dev,
static int rnbd_clt_change_capacity(struct rnbd_clt_dev *dev,
size_t new_nsectors)
{
- int err = 0;
-
rnbd_clt_info(dev, "Device size changed from %zu to %zu sectors\n",
dev->nsectors, new_nsectors);
dev->nsectors = new_nsectors;
set_capacity(dev->gd, dev->nsectors);
- err = revalidate_disk(dev->gd);
- if (err)
- rnbd_clt_err(dev,
- "Failed to change device size from %zu to %zu, err: %d\n",
- dev->nsectors, new_nsectors, err);
- return err;
+ revalidate_disk_size(dev->gd, true);
+ return 0;
}
static int process_msg_open_rsp(struct rnbd_clt_dev *dev,
@@ -1180,7 +1174,7 @@ static int setup_mq_tags(struct rnbd_clt_session *sess)
tag_set->queue_depth = sess->queue_depth;
tag_set->numa_node = NUMA_NO_NODE;
tag_set->flags = BLK_MQ_F_SHOULD_MERGE |
- BLK_MQ_F_TAG_SHARED;
+ BLK_MQ_F_TAG_QUEUE_SHARED;
tag_set->cmd_size = sizeof(struct rnbd_iu);
tag_set->nr_hw_queues = num_online_cpus();
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index dd34504382e5..52dd1efa00f9 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -638,7 +638,8 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
return 0;
if (mode & (FMODE_READ|FMODE_WRITE)) {
- check_disk_change(bdev);
+ if (bdev_check_media_change(bdev) && fs->disk_in)
+ fs->ejected = 0;
if ((mode & FMODE_WRITE) && fs->write_protected) {
err = -EROFS;
goto out;
@@ -735,24 +736,6 @@ static unsigned int floppy_check_events(struct gendisk *disk,
return fs->ejected ? DISK_EVENT_MEDIA_CHANGE : 0;
}
-static int floppy_revalidate(struct gendisk *disk)
-{
- struct floppy_state *fs = disk->private_data;
- struct swim __iomem *base = fs->swd->base;
-
- swim_drive(base, fs->location);
-
- if (fs->ejected)
- setup_medium(fs);
-
- if (!fs->disk_in)
- swim_motor(base, OFF);
- else
- fs->ejected = 0;
-
- return !fs->disk_in;
-}
-
static const struct block_device_operations floppy_fops = {
.owner = THIS_MODULE,
.open = floppy_unlocked_open,
@@ -760,7 +743,6 @@ static const struct block_device_operations floppy_fops = {
.ioctl = floppy_ioctl,
.getgeo = floppy_getgeo,
.check_events = floppy_check_events,
- .revalidate_disk = floppy_revalidate,
};
static struct kobject *floppy_find(dev_t dev, int *part, void *data)
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index aa77eb5fb7de..c2d922d125e2 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -945,7 +945,8 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
if (err == 0 && (mode & FMODE_NDELAY) == 0
&& (mode & (FMODE_READ|FMODE_WRITE))) {
- check_disk_change(bdev);
+ if (bdev_check_media_change(bdev))
+ floppy_revalidate(bdev->bd_disk);
if (fs->ejected)
err = -ENXIO;
}
@@ -1055,7 +1056,6 @@ static const struct block_device_operations floppy_fops = {
.release = floppy_release,
.ioctl = floppy_ioctl,
.check_events = floppy_check_events,
- .revalidate_disk= floppy_revalidate,
};
static const struct blk_mq_ops swim3_mq_ops = {
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index b2e48dac1ebd..a314b9382442 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -598,7 +598,7 @@ static void virtblk_update_cache_mode(struct virtio_device *vdev)
struct virtio_blk *vblk = vdev->priv;
blk_queue_write_cache(vblk->disk->queue, writeback, false);
- revalidate_disk(vblk->disk);
+ revalidate_disk_size(vblk->disk, true);
}
static const char *const virtblk_cache_types[] = {
@@ -646,7 +646,7 @@ static struct attribute *virtblk_attrs[] = {
static umode_t virtblk_attrs_are_visible(struct kobject *kobj,
struct attribute *a, int n)
{
- struct device *dev = container_of(kobj, struct device, kobj);
+ struct device *dev = kobj_to_dev(kobj);
struct gendisk *disk = dev_to_disk(dev);
struct virtio_blk *vblk = disk->private_data;
struct virtio_device *vdev = vblk->vdev;
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 5d8e0ab3f054..8d581c7536fb 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -888,26 +888,20 @@ static unsigned int ace_check_events(struct gendisk *gd, unsigned int clearing)
return ace->media_change ? DISK_EVENT_MEDIA_CHANGE : 0;
}
-static int ace_revalidate_disk(struct gendisk *gd)
+static void ace_media_changed(struct ace_device *ace)
{
- struct ace_device *ace = gd->private_data;
unsigned long flags;
- dev_dbg(ace->dev, "ace_revalidate_disk()\n");
-
- if (ace->media_change) {
- dev_dbg(ace->dev, "requesting cf id and scheduling tasklet\n");
+ dev_dbg(ace->dev, "requesting cf id and scheduling tasklet\n");
- spin_lock_irqsave(&ace->lock, flags);
- ace->id_req_count++;
- spin_unlock_irqrestore(&ace->lock, flags);
+ spin_lock_irqsave(&ace->lock, flags);
+ ace->id_req_count++;
+ spin_unlock_irqrestore(&ace->lock, flags);
- tasklet_schedule(&ace->fsm_tasklet);
- wait_for_completion(&ace->id_completion);
- }
+ tasklet_schedule(&ace->fsm_tasklet);
+ wait_for_completion(&ace->id_completion);
dev_dbg(ace->dev, "revalidate complete\n");
- return ace->id_result;
}
static int ace_open(struct block_device *bdev, fmode_t mode)
@@ -922,7 +916,8 @@ static int ace_open(struct block_device *bdev, fmode_t mode)
ace->users++;
spin_unlock_irqrestore(&ace->lock, flags);
- check_disk_change(bdev);
+ if (bdev_check_media_change(bdev) && ace->media_change)
+ ace_media_changed(ace);
mutex_unlock(&xsysace_mutex);
return 0;
@@ -966,7 +961,6 @@ static const struct block_device_operations ace_fops = {
.open = ace_open,
.release = ace_release,
.check_events = ace_check_events,
- .revalidate_disk = ace_revalidate_disk,
.getgeo = ace_getgeo,
};
@@ -1080,7 +1074,7 @@ static int ace_setup(struct ace_device *ace)
(unsigned long long) ace->physaddr, ace->baseaddr, ace->irq);
ace->media_change = 1;
- ace_revalidate_disk(ace->gd);
+ ace_media_changed(ace);
/* Make the sysace device 'live' */
add_disk(ace->gd);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 9100ac36670a..bff3d4021c18 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -52,6 +52,9 @@ static unsigned int num_devices = 1;
*/
static size_t huge_class_size;
+static const struct block_device_operations zram_devops;
+static const struct block_device_operations zram_wb_devops;
+
static void zram_free_page(struct zram *zram, size_t index);
static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
u32 index, int offset, struct bio *bio);
@@ -408,8 +411,7 @@ static void reset_bdev(struct zram *zram)
zram->backing_dev = NULL;
zram->old_block_size = 0;
zram->bdev = NULL;
- zram->disk->queue->backing_dev_info->capabilities |=
- BDI_CAP_SYNCHRONOUS_IO;
+ zram->disk->fops = &zram_devops;
kvfree(zram->bitmap);
zram->bitmap = NULL;
}
@@ -491,9 +493,10 @@ static ssize_t backing_dev_store(struct device *dev,
goto out;
}
- bdev = bdgrab(I_BDEV(inode));
- err = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram);
- if (err < 0) {
+ bdev = blkdev_get_by_dev(inode->i_rdev,
+ FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram);
+ if (IS_ERR(bdev)) {
+ err = PTR_ERR(bdev);
bdev = NULL;
goto out;
}
@@ -528,8 +531,7 @@ static ssize_t backing_dev_store(struct device *dev,
* freely but in fact, IO is going on so finally could cause
* use-after-free when the IO is really done.
*/
- zram->disk->queue->backing_dev_info->capabilities &=
- ~BDI_CAP_SYNCHRONOUS_IO;
+ zram->disk->fops = &zram_wb_devops;
up_write(&zram->init_lock);
pr_info("setup backing device %s\n", file_name);
@@ -1739,7 +1741,7 @@ static ssize_t disksize_store(struct device *dev,
zram->disksize = disksize;
set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
- revalidate_disk(zram->disk);
+ revalidate_disk_size(zram->disk, true);
up_write(&zram->init_lock);
return len;
@@ -1786,7 +1788,7 @@ static ssize_t reset_store(struct device *dev,
/* Make sure all the pending I/O are finished */
fsync_bdev(bdev);
zram_reset_device(zram);
- revalidate_disk(zram->disk);
+ revalidate_disk_size(zram->disk, true);
bdput(bdev);
mutex_lock(&bdev->bd_mutex);
@@ -1819,6 +1821,13 @@ static const struct block_device_operations zram_devops = {
.owner = THIS_MODULE
};
+static const struct block_device_operations zram_wb_devops = {
+ .open = zram_open,
+ .submit_bio = zram_submit_bio,
+ .swap_slot_free_notify = zram_slot_free_notify,
+ .owner = THIS_MODULE
+};
+
static DEVICE_ATTR_WO(compact);
static DEVICE_ATTR_RW(disksize);
static DEVICE_ATTR_RO(initstate);
@@ -1946,8 +1955,7 @@ static int zram_add(void)
if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
- zram->disk->queue->backing_dev_info->capabilities |=
- (BDI_CAP_STABLE_WRITES | BDI_CAP_SYNCHRONOUS_IO);
+ blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue);
device_add_disk(NULL, zram->disk, zram_disk_attr_groups);
strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 09b0cd292720..9874fc1c815b 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -479,7 +479,7 @@ static int gdrom_bdops_open(struct block_device *bdev, fmode_t mode)
{
int ret;
- check_disk_change(bdev);
+ bdev_check_media_change(bdev);
mutex_lock(&gdrom_mutex);
ret = cdrom_open(gd.cd_info, bdev, mode);
diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index 380bf518338e..5d52a1f4738c 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -28,7 +28,8 @@
#include <linux/uaccess.h>
struct raw_device_data {
- struct block_device *binding;
+ dev_t binding;
+ struct block_device *bdev;
int inuse;
};
@@ -63,19 +64,25 @@ static int raw_open(struct inode *inode, struct file *filp)
return 0;
}
+ pr_warn_ratelimited(
+ "process %s (pid %d) is using the deprecated raw device\n"
+ "support will be removed in Linux 5.14.\n",
+ current->comm, current->pid);
+
mutex_lock(&raw_mutex);
/*
* All we need to do on open is check that the device is bound.
*/
- bdev = raw_devices[minor].binding;
err = -ENODEV;
- if (!bdev)
+ if (!raw_devices[minor].binding)
goto out;
- bdgrab(bdev);
- err = blkdev_get(bdev, filp->f_mode | FMODE_EXCL, raw_open);
- if (err)
+ bdev = blkdev_get_by_dev(raw_devices[minor].binding,
+ filp->f_mode | FMODE_EXCL, raw_open);
+ if (IS_ERR(bdev)) {
+ err = PTR_ERR(bdev);
goto out;
+ }
err = set_blocksize(bdev, bdev_logical_block_size(bdev));
if (err)
goto out1;
@@ -85,6 +92,7 @@ static int raw_open(struct inode *inode, struct file *filp)
file_inode(filp)->i_mapping =
bdev->bd_inode->i_mapping;
filp->private_data = bdev;
+ raw_devices[minor].bdev = bdev;
mutex_unlock(&raw_mutex);
return 0;
@@ -105,7 +113,7 @@ static int raw_release(struct inode *inode, struct file *filp)
struct block_device *bdev;
mutex_lock(&raw_mutex);
- bdev = raw_devices[minor].binding;
+ bdev = raw_devices[minor].bdev;
if (--raw_devices[minor].inuse == 0)
/* Here inode->i_mapping == bdev->bd_inode->i_mapping */
inode->i_mapping = &inode->i_data;
@@ -128,6 +136,7 @@ raw_ioctl(struct file *filp, unsigned int command, unsigned long arg)
static int bind_set(int number, u64 major, u64 minor)
{
dev_t dev = MKDEV(major, minor);
+ dev_t raw = MKDEV(RAW_MAJOR, number);
struct raw_device_data *rawdev;
int err = 0;
@@ -161,25 +170,17 @@ static int bind_set(int number, u64 major, u64 minor)
mutex_unlock(&raw_mutex);
return -EBUSY;
}
- if (rawdev->binding) {
- bdput(rawdev->binding);
+ if (rawdev->binding)
module_put(THIS_MODULE);
- }
+
+ rawdev->binding = dev;
if (!dev) {
/* unbind */
- rawdev->binding = NULL;
- device_destroy(raw_class, MKDEV(RAW_MAJOR, number));
+ device_destroy(raw_class, raw);
} else {
- rawdev->binding = bdget(dev);
- if (rawdev->binding == NULL) {
- err = -ENOMEM;
- } else {
- dev_t raw = MKDEV(RAW_MAJOR, number);
- __module_get(THIS_MODULE);
- device_destroy(raw_class, raw);
- device_create(raw_class, NULL, raw, NULL,
- "raw%d", number);
- }
+ __module_get(THIS_MODULE);
+ device_destroy(raw_class, raw);
+ device_create(raw_class, NULL, raw, NULL, "raw%d", number);
}
mutex_unlock(&raw_mutex);
return err;
@@ -187,18 +188,9 @@ static int bind_set(int number, u64 major, u64 minor)
static int bind_get(int number, dev_t *dev)
{
- struct raw_device_data *rawdev;
- struct block_device *bdev;
-
if (number <= 0 || number >= max_raw_minors)
return -EINVAL;
-
- rawdev = &raw_devices[number];
-
- mutex_lock(&raw_mutex);
- bdev = rawdev->binding;
- *dev = bdev ? bdev->bd_dev : 0;
- mutex_unlock(&raw_mutex);
+ *dev = raw_devices[number].binding;
return 0;
}
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 212bb2d8bf34..25d2d88e82ad 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -1611,7 +1611,11 @@ static int idecd_open(struct block_device *bdev, fmode_t mode)
struct cdrom_info *info;
int rc = -ENXIO;
- check_disk_change(bdev);
+ if (bdev_check_media_change(bdev)) {
+ info = ide_drv_g(bdev->bd_disk, cdrom_info);
+
+ ide_cd_read_toc(info->drive);
+ }
mutex_lock(&ide_cd_mutex);
info = ide_cd_get(bdev->bd_disk);
@@ -1753,15 +1757,6 @@ static unsigned int idecd_check_events(struct gendisk *disk,
return cdrom_check_events(&info->devinfo, clearing);
}
-static int idecd_revalidate_disk(struct gendisk *disk)
-{
- struct cdrom_info *info = ide_drv_g(disk, cdrom_info);
-
- ide_cd_read_toc(info->drive);
-
- return 0;
-}
-
static const struct block_device_operations idecd_ops = {
.owner = THIS_MODULE,
.open = idecd_open,
@@ -1770,7 +1765,6 @@ static const struct block_device_operations idecd_ops = {
.compat_ioctl = IS_ENABLED(CONFIG_COMPAT) ?
idecd_compat_ioctl : NULL,
.check_events = idecd_check_events,
- .revalidate_disk = idecd_revalidate_disk
};
/* module options */
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 1d3407d7e095..34b9441084f8 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -739,12 +739,9 @@ static void ide_disk_setup(ide_drive_t *drive)
set_wcache(drive, 1);
if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 &&
- (drive->head == 0 || drive->head > 16)) {
+ (drive->head == 0 || drive->head > 16))
printk(KERN_ERR "%s: invalid geometry: %d physical heads?\n",
drive->name, drive->head);
- drive->dev_flags &= ~IDE_DFLAG_ATTACH;
- } else
- drive->dev_flags |= IDE_DFLAG_ATTACH;
}
static void ide_disk_flush(ide_drive_t *drive)
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index af7503b47dbe..f5a2870aaf54 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -516,8 +516,6 @@ static void ide_floppy_setup(ide_drive_t *drive)
(void) ide_floppy_get_capacity(drive);
ide_proc_register_driver(drive, floppy->driver);
-
- drive->dev_flags |= IDE_DFLAG_ATTACH;
}
static void ide_floppy_flush(ide_drive_t *drive)
diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c
index 05c26986637b..e2b6c82586ce 100644
--- a/drivers/ide/ide-gd.c
+++ b/drivers/ide/ide-gd.c
@@ -225,8 +225,12 @@ static int ide_gd_open(struct block_device *bdev, fmode_t mode)
* and the door_lock is irrelevant at this point.
*/
drive->disk_ops->set_doorlock(drive, disk, 1);
- drive->dev_flags |= IDE_DFLAG_MEDIA_CHANGED;
- check_disk_change(bdev);
+ if (__invalidate_device(bdev, true))
+ pr_warn("VFS: busy inodes on changed media %s\n",
+ bdev->bd_disk->disk_name);
+ drive->disk_ops->get_capacity(drive);
+ set_capacity(disk, ide_gd_capacity(drive));
+ set_bit(GD_NEED_PART_SCAN, &disk->state);
} else if (drive->dev_flags & IDE_DFLAG_FORMAT_IN_PROGRESS) {
ret = -EBUSY;
goto out_put_idkp;
@@ -284,32 +288,6 @@ static int ide_gd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return 0;
}
-static unsigned int ide_gd_check_events(struct gendisk *disk,
- unsigned int clearing)
-{
- struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj);
- ide_drive_t *drive = idkp->drive;
- bool ret;
-
- /* do not scan partitions twice if this is a removable device */
- if (drive->dev_flags & IDE_DFLAG_ATTACH) {
- drive->dev_flags &= ~IDE_DFLAG_ATTACH;
- return 0;
- }
-
- /*
- * The following is used to force revalidation on the first open on
- * removeable devices, and never gets reported to userland as
- * DISK_EVENT_FLAG_UEVENT isn't set in genhd->event_flags.
- * This is intended as removable ide disk can't really detect
- * MEDIA_CHANGE events.
- */
- ret = drive->dev_flags & IDE_DFLAG_MEDIA_CHANGED;
- drive->dev_flags &= ~IDE_DFLAG_MEDIA_CHANGED;
-
- return ret ? DISK_EVENT_MEDIA_CHANGE : 0;
-}
-
static void ide_gd_unlock_native_capacity(struct gendisk *disk)
{
struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj);
@@ -320,18 +298,6 @@ static void ide_gd_unlock_native_capacity(struct gendisk *disk)
disk_ops->unlock_native_capacity(drive);
}
-static int ide_gd_revalidate_disk(struct gendisk *disk)
-{
- struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj);
- ide_drive_t *drive = idkp->drive;
-
- if (ide_gd_check_events(disk, 0))
- drive->disk_ops->get_capacity(drive);
-
- set_capacity(disk, ide_gd_capacity(drive));
- return 0;
-}
-
static int ide_gd_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
@@ -364,9 +330,7 @@ static const struct block_device_operations ide_gd_ops = {
.compat_ioctl = ide_gd_compat_ioctl,
#endif
.getgeo = ide_gd_getgeo,
- .check_events = ide_gd_check_events,
.unlock_native_capacity = ide_gd_unlock_native_capacity,
- .revalidate_disk = ide_gd_revalidate_disk
};
static int ide_gd_probe(ide_drive_t *drive)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index c7cadaafa947..7f54ae223644 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -475,6 +475,7 @@ struct search {
unsigned int read_dirty_data:1;
unsigned int cache_missed:1;
+ struct hd_struct *part;
unsigned long start_time;
struct btree_op op;
@@ -669,7 +670,7 @@ static void bio_complete(struct search *s)
{
if (s->orig_bio) {
/* Count on bcache device */
- disk_end_io_acct(s->d->disk, bio_op(s->orig_bio), s->start_time);
+ part_end_io_acct(s->part, s->orig_bio, s->start_time);
trace_bcache_request_end(s->d, s->orig_bio);
s->orig_bio->bi_status = s->iop.status;
@@ -731,7 +732,7 @@ static inline struct search *search_alloc(struct bio *bio,
s->write = op_is_write(bio_op(bio));
s->read_dirty_data = 0;
/* Count on the bcache device */
- s->start_time = disk_start_io_acct(d->disk, bio_sectors(bio), bio_op(bio));
+ s->start_time = part_start_io_acct(d->disk, &s->part, bio);
s->iop.c = d->c;
s->iop.bio = NULL;
s->iop.inode = d->id;
@@ -1072,6 +1073,7 @@ struct detached_dev_io_private {
unsigned long start_time;
bio_end_io_t *bi_end_io;
void *bi_private;
+ struct hd_struct *part;
};
static void detached_dev_end_io(struct bio *bio)
@@ -1083,7 +1085,7 @@ static void detached_dev_end_io(struct bio *bio)
bio->bi_private = ddip->bi_private;
/* Count on the bcache device */
- disk_end_io_acct(ddip->d->disk, bio_op(bio), ddip->start_time);
+ part_end_io_acct(ddip->part, bio, ddip->start_time);
if (bio->bi_status) {
struct cached_dev *dc = container_of(ddip->d,
@@ -1109,7 +1111,7 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO);
ddip->d = d;
/* Count on the bcache device */
- ddip->start_time = disk_start_io_acct(d->disk, bio_sectors(bio), bio_op(bio));
+ ddip->start_time = part_start_io_acct(d->disk, &ddip->part, bio);
ddip->bi_end_io = bio->bi_end_io;
ddip->bi_private = bio->bi_private;
bio->bi_end_io = detached_dev_end_io;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 1bbdc410ee3c..6bfa77167362 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1427,9 +1427,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
if (ret)
return ret;
- dc->disk.disk->queue->backing_dev_info->ra_pages =
- max(dc->disk.disk->queue->backing_dev_info->ra_pages,
- q->backing_dev_info->ra_pages);
+ blk_queue_io_opt(dc->disk.disk->queue,
+ max(queue_io_opt(dc->disk.disk->queue), queue_io_opt(q)));
atomic_set(&dc->io_errors, 0);
dc->io_disable = false;
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 8d2b835d7a10..56b723d012ac 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -701,7 +701,7 @@ static void rs_set_capacity(struct raid_set *rs)
struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table));
set_capacity(gendisk, rs->md.array_sectors);
- revalidate_disk(gendisk);
+ revalidate_disk_size(gendisk, true);
}
/*
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 229f461e7def..f3444508827d 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1819,7 +1819,7 @@ static int device_requires_stable_pages(struct dm_target *ti,
{
struct request_queue *q = bdev_get_queue(dev->bdev);
- return q && bdi_cap_stable_pages_required(q->backing_dev_info);
+ return q && blk_queue_stable_writes(q);
}
/*
@@ -1904,9 +1904,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
* because they do their own checksumming.
*/
if (dm_table_requires_stable_pages(t))
- q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
+ blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
else
- q->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES;
+ blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
/*
* Determine whether or not this queue's I/O timings contribute
@@ -1929,8 +1929,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
}
#endif
- /* Allow reads to exceed readahead limits */
- q->backing_dev_info->io_pages = limits->max_sectors >> (PAGE_SHIFT - 9);
+ blk_queue_update_readahead(q);
}
unsigned int dm_table_get_num_targets(struct dm_table *t)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4a40df8af7d3..6253fc82bd6f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2099,18 +2099,6 @@ static void event_callback(void *context)
}
/*
- * Protected by md->suspend_lock obtained by dm_swap_table().
- */
-static void __set_size(struct mapped_device *md, sector_t size)
-{
- lockdep_assert_held(&md->suspend_lock);
-
- set_capacity(md->disk, size);
-
- i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
-}
-
-/*
* Returns old map, which caller must destroy.
*/
static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
@@ -2132,7 +2120,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
if (size != dm_get_size(md))
memset(&md->geometry, 0, sizeof(md->geometry));
- __set_size(md, size);
+ set_capacity(md->disk, size);
+ bd_set_nr_sectors(md->bdev, size);
dm_table_event_callback(t, event_callback, md);
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index d50737ec4039..0580b51a156a 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -582,7 +582,7 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
break;
case CHANGE_CAPACITY:
set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
+ revalidate_disk_size(mddev->gendisk, true);
break;
case RESYNCING:
set_bit(MD_RESYNCING_REMOTE, &mddev->recovery);
@@ -1296,12 +1296,12 @@ static void update_size(struct mddev *mddev, sector_t old_dev_sectors)
pr_err("%s:%d: failed to send CHANGE_CAPACITY msg\n",
__func__, __LINE__);
set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
+ revalidate_disk_size(mddev->gendisk, true);
} else {
/* revert to previous sectors */
ret = mddev->pers->resize(mddev, old_dev_sectors);
if (!ret)
- revalidate_disk(mddev->gendisk);
+ revalidate_disk_size(mddev->gendisk, true);
ret = __sendmsg(cinfo, &cmsg);
if (ret)
pr_err("%s:%d: failed to send METADATA_UPDATED msg\n",
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index c2ae9125c4c3..5ab22069b5be 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -202,7 +202,7 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
set_capacity(mddev->gendisk, mddev->array_sectors);
mddev_resume(mddev);
- revalidate_disk(mddev->gendisk);
+ revalidate_disk_size(mddev->gendisk, true);
kfree_rcu(oldconf, rcu);
return 0;
}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 607278207023..64bc22d2b606 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -464,6 +464,7 @@ struct md_io {
bio_end_io_t *orig_bi_end_io;
void *orig_bi_private;
unsigned long start_time;
+ struct hd_struct *part;
};
static void md_end_io(struct bio *bio)
@@ -471,7 +472,7 @@ static void md_end_io(struct bio *bio)
struct md_io *md_io = bio->bi_private;
struct mddev *mddev = md_io->mddev;
- disk_end_io_acct(mddev->gendisk, bio_op(bio), md_io->start_time);
+ part_end_io_acct(md_io->part, bio, md_io->start_time);
bio->bi_end_io = md_io->orig_bi_end_io;
bio->bi_private = md_io->orig_bi_private;
@@ -517,9 +518,8 @@ static blk_qc_t md_submit_bio(struct bio *bio)
bio->bi_end_io = md_end_io;
bio->bi_private = md_io;
- md_io->start_time = disk_start_io_acct(mddev->gendisk,
- bio_sectors(bio),
- bio_op(bio));
+ md_io->start_time = part_start_io_acct(mddev->gendisk,
+ &md_io->part, bio);
}
/* bio could be mergeable after passing to underlayer */
@@ -5358,7 +5358,7 @@ array_size_store(struct mddev *mddev, const char *buf, size_t len)
mddev->array_sectors = sectors;
if (mddev->pers) {
set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
+ revalidate_disk_size(mddev->gendisk, true);
}
}
mddev_unlock(mddev);
@@ -6109,7 +6109,7 @@ int do_md_run(struct mddev *mddev)
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
+ revalidate_disk_size(mddev->gendisk, true);
clear_bit(MD_NOT_READY, &mddev->flags);
mddev->changed = 1;
kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
@@ -6427,7 +6427,7 @@ static int do_md_stop(struct mddev *mddev, int mode,
set_capacity(disk, 0);
mutex_unlock(&mddev->open_mutex);
mddev->changed = 1;
- revalidate_disk(disk);
+ revalidate_disk_size(disk, true);
if (mddev->ro)
mddev->ro = 0;
@@ -7259,7 +7259,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
md_cluster_ops->update_size(mddev, old_dev_sectors);
else if (mddev->queue) {
set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
+ revalidate_disk_size(mddev->gendisk, true);
}
}
return rv;
@@ -7848,7 +7848,7 @@ static int md_open(struct block_device *bdev, fmode_t mode)
atomic_inc(&mddev->openers);
mutex_unlock(&mddev->open_mutex);
- check_disk_change(bdev);
+ bdev_check_media_change(bdev);
out:
if (err)
mddev_put(mddev);
@@ -9018,7 +9018,7 @@ void md_do_sync(struct md_thread *thread)
mddev_unlock(mddev);
if (!mddev_is_clustered(mddev)) {
set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
+ revalidate_disk_size(mddev->gendisk, true);
}
}
diff --git a/drivers/md/md.h b/drivers/md/md.h
index d9c4e6b7e939..f9e2ccdd22c4 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -397,7 +397,7 @@ struct mddev {
* These locks are separate due to conflicting interactions
* with bdev->bd_mutex.
* Lock ordering is:
- * reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk
+ * reconfig_mutex -> bd_mutex
* bd_mutex -> open_mutex: e.g. __blkdev_get -> md_open
*/
struct mutex open_mutex;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index f54a449f97aa..aa2d72791768 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -410,22 +410,6 @@ static int raid0_run(struct mddev *mddev)
mdname(mddev),
(unsigned long long)mddev->array_sectors);
- if (mddev->queue) {
- /* calculate the max read-ahead size.
- * For read-ahead of large files to be effective, we need to
- * readahead at least twice a whole stripe. i.e. number of devices
- * multiplied by chunk size times 2.
- * If an individual device has an ra_pages greater than the
- * chunk size, then we will not drive that device as hard as it
- * wants. We consider this a configuration error: a larger
- * chunksize should be used in that case.
- */
- int stripe = mddev->raid_disks *
- (mddev->chunk_sectors << 9) / PAGE_SIZE;
- if (mddev->queue->backing_dev_info->ra_pages < 2* stripe)
- mddev->queue->backing_dev_info->ra_pages = 2* stripe;
- }
-
dump_zones(mddev);
ret = md_integrity_register(mddev);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e8fa32733917..5d1bdee313ec 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3703,10 +3703,20 @@ static struct r10conf *setup_conf(struct mddev *mddev)
return ERR_PTR(err);
}
+static void raid10_set_io_opt(struct r10conf *conf)
+{
+ int raid_disks = conf->geo.raid_disks;
+
+ if (!(conf->geo.raid_disks % conf->geo.near_copies))
+ raid_disks /= conf->geo.near_copies;
+ blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) *
+ raid_disks);
+}
+
static int raid10_run(struct mddev *mddev)
{
struct r10conf *conf;
- int i, disk_idx, chunk_size;
+ int i, disk_idx;
struct raid10_info *disk;
struct md_rdev *rdev;
sector_t size;
@@ -3742,18 +3752,13 @@ static int raid10_run(struct mddev *mddev)
mddev->thread = conf->thread;
conf->thread = NULL;
- chunk_size = mddev->chunk_sectors << 9;
if (mddev->queue) {
blk_queue_max_discard_sectors(mddev->queue,
mddev->chunk_sectors);
blk_queue_max_write_same_sectors(mddev->queue, 0);
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
- blk_queue_io_min(mddev->queue, chunk_size);
- if (conf->geo.raid_disks % conf->geo.near_copies)
- blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
- else
- blk_queue_io_opt(mddev->queue, chunk_size *
- (conf->geo.raid_disks / conf->geo.near_copies));
+ blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
+ raid10_set_io_opt(conf);
}
rdev_for_each(rdev, mddev) {
@@ -3868,19 +3873,6 @@ static int raid10_run(struct mddev *mddev)
mddev->resync_max_sectors = size;
set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
- if (mddev->queue) {
- int stripe = conf->geo.raid_disks *
- ((mddev->chunk_sectors << 9) / PAGE_SIZE);
-
- /* Calculate max read-ahead size.
- * We need to readahead at least twice a whole stripe....
- * maybe...
- */
- stripe /= conf->geo.near_copies;
- if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
- mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
- }
-
if (md_integrity_register(mddev))
goto out_free_conf;
@@ -4718,16 +4710,8 @@ static void end_reshape(struct r10conf *conf)
conf->reshape_safe = MaxSector;
spin_unlock_irq(&conf->device_lock);
- /* read-ahead size must cover two whole stripes, which is
- * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
- */
- if (conf->mddev->queue) {
- int stripe = conf->geo.raid_disks *
- ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
- stripe /= conf->geo.near_copies;
- if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
- conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
- }
+ if (conf->mddev->queue)
+ raid10_set_io_opt(conf);
conf->fullsync = 0;
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 225380efd1e2..d589d26c86ea 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6638,14 +6638,14 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
if (!conf)
err = -ENODEV;
else if (new != conf->skip_copy) {
+ struct request_queue *q = mddev->queue;
+
mddev_suspend(mddev);
conf->skip_copy = new;
if (new)
- mddev->queue->backing_dev_info->capabilities |=
- BDI_CAP_STABLE_WRITES;
+ blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
else
- mddev->queue->backing_dev_info->capabilities &=
- ~BDI_CAP_STABLE_WRITES;
+ blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
mddev_resume(mddev);
}
mddev_unlock(mddev);
@@ -7232,6 +7232,12 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded
return 0;
}
+static void raid5_set_io_opt(struct r5conf *conf)
+{
+ blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) *
+ (conf->raid_disks - conf->max_degraded));
+}
+
static int raid5_run(struct mddev *mddev)
{
struct r5conf *conf;
@@ -7516,13 +7522,10 @@ static int raid5_run(struct mddev *mddev)
int data_disks = conf->previous_raid_disks - conf->max_degraded;
int stripe = data_disks *
((mddev->chunk_sectors << 9) / PAGE_SIZE);
- if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
- mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
chunk_size = mddev->chunk_sectors << 9;
blk_queue_io_min(mddev->queue, chunk_size);
- blk_queue_io_opt(mddev->queue, chunk_size *
- (conf->raid_disks - conf->max_degraded));
+ raid5_set_io_opt(conf);
mddev->queue->limits.raid_partial_stripes_expensive = 1;
/*
* We can only discard a whole stripe. It doesn't make sense to
@@ -8106,16 +8109,8 @@ static void end_reshape(struct r5conf *conf)
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
- /* read-ahead size must cover two whole stripes, which is
- * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
- */
- if (conf->mddev->queue) {
- int data_disks = conf->raid_disks - conf->max_degraded;
- int stripe = data_disks * ((conf->chunk_sectors << 9)
- / PAGE_SIZE);
- if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
- conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
- }
+ if (conf->mddev->queue)
+ raid5_set_io_opt(conf);
}
}
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index 6c022ef0f84d..80fe3852ce0f 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -472,8 +472,7 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card)
}
if (mmc_host_is_spi(host) && host->use_spi_crc)
- mq->queue->backing_dev_info->capabilities |=
- BDI_CAP_STABLE_WRITES;
+ blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, mq->queue);
mq->queue->queuedata = mq;
blk_queue_rq_timeout(mq->queue, 60 * HZ);
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 7d930569a7df..b5e5d3140f57 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -2196,6 +2196,8 @@ static struct backing_dev_info * __init mtd_bdi_init(char *name)
bdi = bdi_alloc(NUMA_NO_NODE);
if (!bdi)
return ERR_PTR(-ENOMEM);
+ bdi->ra_pages = 0;
+ bdi->io_pages = 0;
/*
* We put '-0' suffix to the name to get the same name format as we
diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index 1f718381a045..22e5617b2cea 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -226,7 +226,6 @@ static int nsblk_rw_bytes(struct nd_namespace_common *ndns,
static const struct block_device_operations nd_blk_fops = {
.owner = THIS_MODULE,
.submit_bio = nd_blk_submit_bio,
- .revalidate_disk = nvdimm_revalidate_disk,
};
static void nd_blk_release_queue(void *q)
@@ -284,7 +283,7 @@ static int nsblk_attach_disk(struct nd_namespace_blk *nsblk)
set_capacity(disk, available_disk_size >> SECTOR_SHIFT);
device_add_disk(dev, disk, NULL);
- revalidate_disk(disk);
+ nvdimm_check_and_set_ro(disk);
return 0;
}
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 0ff610e728ff..12ff6f8784ac 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1513,7 +1513,6 @@ static const struct block_device_operations btt_fops = {
.submit_bio = btt_submit_bio,
.rw_page = btt_rw_page,
.getgeo = btt_getgeo,
- .revalidate_disk = nvdimm_revalidate_disk,
};
static int btt_blk_init(struct btt *btt)
@@ -1538,8 +1537,6 @@ static int btt_blk_init(struct btt *btt)
btt->btt_disk->private_data = btt;
btt->btt_disk->queue = btt->btt_queue;
btt->btt_disk->flags = GENHD_FL_EXT_DEVT;
- btt->btt_disk->queue->backing_dev_info->capabilities |=
- BDI_CAP_SYNCHRONOUS_IO;
blk_queue_logical_block_size(btt->btt_queue, btt->sector_size);
blk_queue_max_hw_sectors(btt->btt_queue, UINT_MAX);
@@ -1558,7 +1555,7 @@ static int btt_blk_init(struct btt *btt)
set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9);
device_add_disk(&btt->nd_btt->dev, btt->btt_disk, NULL);
btt->nd_btt->size = btt->nlba * (u64)btt->sector_size;
- revalidate_disk(btt->btt_disk);
+ nvdimm_check_and_set_ro(btt->btt_disk);
return 0;
}
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 955265656b96..2304c6183822 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -628,7 +628,7 @@ int __nd_driver_register(struct nd_device_driver *nd_drv, struct module *owner,
}
EXPORT_SYMBOL(__nd_driver_register);
-int nvdimm_revalidate_disk(struct gendisk *disk)
+void nvdimm_check_and_set_ro(struct gendisk *disk)
{
struct device *dev = disk_to_dev(disk)->parent;
struct nd_region *nd_region = to_nd_region(dev->parent);
@@ -639,16 +639,13 @@ int nvdimm_revalidate_disk(struct gendisk *disk)
* read-only if the disk is already read-only.
*/
if (disk_ro || nd_region->ro == disk_ro)
- return 0;
+ return;
dev_info(dev, "%s read-only, marking %s read-only\n",
dev_name(&nd_region->dev), disk->disk_name);
set_disk_ro(disk, 1);
-
- return 0;
-
}
-EXPORT_SYMBOL(nvdimm_revalidate_disk);
+EXPORT_SYMBOL(nvdimm_check_and_set_ro);
static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
char *buf)
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 85c1ae813ea3..72740108ba42 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -361,7 +361,7 @@ u64 nd_region_interleave_set_altcookie(struct nd_region *nd_region);
void nvdimm_bus_lock(struct device *dev);
void nvdimm_bus_unlock(struct device *dev);
bool is_nvdimm_bus_locked(struct device *dev);
-int nvdimm_revalidate_disk(struct gendisk *disk);
+void nvdimm_check_and_set_ro(struct gendisk *disk);
void nvdimm_drvdata_release(struct kref *kref);
void put_ndd(struct nvdimm_drvdata *ndd);
int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index fab29b514372..1711fdfd8d28 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -281,7 +281,6 @@ static const struct block_device_operations pmem_fops = {
.owner = THIS_MODULE,
.submit_bio = pmem_submit_bio,
.rw_page = pmem_rw_page,
- .revalidate_disk = nvdimm_revalidate_disk,
};
static int pmem_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
@@ -476,7 +475,6 @@ static int pmem_attach_disk(struct device *dev,
disk->queue = q;
disk->flags = GENHD_FL_EXT_DEVT;
disk->private_data = pmem;
- disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO;
nvdimm_namespace_disk_name(ndns, disk->disk_name);
set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
/ 512);
@@ -501,7 +499,7 @@ static int pmem_attach_disk(struct device *dev,
if (devm_add_action_or_reset(dev, pmem_release_disk, pmem))
return -ENOMEM;
- revalidate_disk(disk);
+ nvdimm_check_and_set_ro(disk);
pmem->bb_state = sysfs_get_dirent(disk_to_dev(disk)->kobj.sd,
"badblocks");
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2f249052c95d..64f4bf85c3d1 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -94,21 +94,34 @@ static void nvme_put_subsystem(struct nvme_subsystem *subsys);
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
unsigned nsid);
+static void nvme_update_bdev_size(struct gendisk *disk)
+{
+ struct block_device *bdev = bdget_disk(disk, 0);
+
+ if (bdev) {
+ bd_set_nr_sectors(bdev, get_capacity(disk));
+ bdput(bdev);
+ }
+}
+
+/*
+ * Prepare a queue for teardown.
+ *
+ * This must forcibly unquiesce queues to avoid blocking dispatch, and only set
+ * the capacity to 0 after that to avoid blocking dispatchers that may be
+ * holding bd_butex. This will end buffered writers dirtying pages that can't
+ * be synced.
+ */
static void nvme_set_queue_dying(struct nvme_ns *ns)
{
- /*
- * Revalidating a dead namespace sets capacity to 0. This will end
- * buffered writers dirtying pages that can't be synced.
- */
if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
return;
+
blk_set_queue_dying(ns->queue);
- /* Forcibly unquiesce queues to avoid blocking dispatch */
blk_mq_unquiesce_queue(ns->queue);
- /*
- * Revalidate after unblocking dispatchers that may be holding bd_butex
- */
- revalidate_disk(ns->disk);
+
+ set_capacity(ns->disk, 0);
+ nvme_update_bdev_size(ns->disk);
}
static void nvme_queue_scan(struct nvme_ctrl *ctrl)
@@ -2134,7 +2147,8 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
nvme_update_disk_info(ns->head->disk, ns, id);
blk_stack_limits(&ns->head->disk->queue->limits,
&ns->queue->limits, 0);
- nvme_mpath_update_disk_size(ns->head->disk);
+ blk_queue_update_readahead(ns->head->disk->queue);
+ nvme_update_bdev_size(ns->head->disk);
}
#endif
return 0;
@@ -2339,7 +2353,6 @@ static const struct block_device_operations nvme_fops = {
.open = nvme_open,
.release = nvme_release,
.getgeo = nvme_getgeo,
- .revalidate_disk= nvme_revalidate_disk,
.report_zones = nvme_report_zones,
.pr_ops = &nvme_pr_ops,
};
@@ -3924,8 +3937,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
goto out_free_ns;
if (ctrl->opts && ctrl->opts->data_digest)
- ns->queue->backing_dev_info->capabilities
- |= BDI_CAP_STABLE_WRITES;
+ blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue);
blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
@@ -4051,14 +4063,19 @@ static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
struct nvme_ns *ns;
+ int ret;
ns = nvme_find_get_ns(ctrl, nsid);
- if (ns) {
- if (revalidate_disk(ns->disk))
- nvme_ns_remove(ns);
- nvme_put_ns(ns);
- } else
+ if (!ns) {
nvme_alloc_ns(ctrl, nsid);
+ return;
+ }
+
+ ret = nvme_revalidate_disk(ns->disk);
+ revalidate_disk_size(ns->disk, ret == 0);
+ if (ret)
+ nvme_ns_remove(ns);
+ nvme_put_ns(ns);
}
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index d4ba736c6c89..74896be40c17 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -673,13 +673,9 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
nvme_mpath_set_live(ns);
}
- if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) {
- struct gendisk *disk = ns->head->disk;
-
- if (disk)
- disk->queue->backing_dev_info->capabilities |=
- BDI_CAP_STABLE_WRITES;
- }
+ if (blk_queue_stable_writes(ns->queue) && ns->head->disk)
+ blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES,
+ ns->head->disk->queue);
}
void nvme_mpath_remove_disk(struct nvme_ns_head *head)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9fd45ff656da..398394bcbee4 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -682,16 +682,6 @@ static inline void nvme_trace_bio_complete(struct request *req,
trace_block_bio_complete(ns->head->disk->queue, req->bio);
}
-static inline void nvme_mpath_update_disk_size(struct gendisk *disk)
-{
- struct block_device *bdev = bdget_disk(disk, 0);
-
- if (bdev) {
- bd_set_size(bdev, get_capacity(disk) << SECTOR_SHIFT);
- bdput(bdev);
- }
-}
-
extern struct device_attribute dev_attr_ana_grpid;
extern struct device_attribute dev_attr_ana_state;
extern struct device_attribute subsys_attr_iopolicy;
@@ -766,9 +756,6 @@ static inline void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
{
}
-static inline void nvme_mpath_update_disk_size(struct gendisk *disk)
-{
-}
#endif /* CONFIG_NVME_MULTIPATH */
#ifdef CONFIG_BLK_DEV_ZONED
diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c
index af5b0ecb8f89..a9698fba9b76 100644
--- a/drivers/s390/block/dasd_genhd.c
+++ b/drivers/s390/block/dasd_genhd.c
@@ -101,18 +101,11 @@ int dasd_scan_partitions(struct dasd_block *block)
struct block_device *bdev;
int rc;
- bdev = bdget_disk(block->gdp, 0);
- if (!bdev) {
- DBF_DEV_EVENT(DBF_ERR, block->base, "%s",
- "scan partitions error, bdget returned NULL");
- return -ENODEV;
- }
-
- rc = blkdev_get(bdev, FMODE_READ, NULL);
- if (rc < 0) {
+ bdev = blkdev_get_by_dev(disk_devt(block->gdp), FMODE_READ, NULL);
+ if (IS_ERR(bdev)) {
DBF_DEV_EVENT(DBF_ERR, block->base,
- "scan partitions error, blkdev_get returned %d",
- rc);
+ "scan partitions error, blkdev_get returned %ld",
+ PTR_ERR(bdev));
return -ENODEV;
}
diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c
index 777734d1b4e5..faaf5596e31c 100644
--- a/drivers/s390/block/dasd_ioctl.c
+++ b/drivers/s390/block/dasd_ioctl.c
@@ -55,10 +55,7 @@ dasd_ioctl_enable(struct block_device *bdev)
dasd_enable_device(base);
/* Formatting the dasd device can change the capacity. */
- mutex_lock(&bdev->bd_mutex);
- i_size_write(bdev->bd_inode,
- (loff_t)get_capacity(base->block->gdp) << 9);
- mutex_unlock(&bdev->bd_mutex);
+ bd_set_nr_sectors(bdev, get_capacity(base->block->gdp));
dasd_put_device(base);
return 0;
}
@@ -91,9 +88,7 @@ dasd_ioctl_disable(struct block_device *bdev)
* Set i_size to zero, since read, write, etc. check against this
* value.
*/
- mutex_lock(&bdev->bd_mutex);
- i_size_write(bdev->bd_inode, 0);
- mutex_unlock(&bdev->bd_mutex);
+ bd_set_nr_sectors(bdev, 0);
dasd_put_device(base);
return 0;
}
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index b5dd1caae5e9..a622f334c933 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -962,8 +962,8 @@ static int iscsi_sw_tcp_slave_configure(struct scsi_device *sdev)
struct iscsi_conn *conn = session->leadconn;
if (conn->datadgst_en)
- sdev->request_queue->backing_dev_info->capabilities
- |= BDI_CAP_STABLE_WRITES;
+ blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES,
+ sdev->request_queue);
blk_queue_dma_alignment(sdev->request_queue, 0);
return 0;
}
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 95018e650f2d..d020639c28c6 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -217,7 +217,7 @@ cache_type_store(struct device *dev, struct device_attribute *attr,
sd_print_sense_hdr(sdkp, &sshdr);
return -EINVAL;
}
- revalidate_disk(sdkp->disk);
+ sd_revalidate_disk(sdkp->disk);
return count;
}
@@ -1381,8 +1381,10 @@ static int sd_open(struct block_device *bdev, fmode_t mode)
if (!scsi_block_when_processing_errors(sdev))
goto error_out;
- if (sdev->removable || sdkp->write_prot)
- check_disk_change(bdev);
+ if (sdev->removable || sdkp->write_prot) {
+ if (bdev_check_media_change(bdev))
+ sd_revalidate_disk(bdev->bd_disk);
+ }
/*
* If the drive is empty, just let the open fail.
@@ -1706,8 +1708,10 @@ static int sd_sync_cache(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr)
static void sd_rescan(struct device *dev)
{
struct scsi_disk *sdkp = dev_get_drvdata(dev);
+ int ret;
- revalidate_disk(sdkp->disk);
+ ret = sd_revalidate_disk(sdkp->disk);
+ revalidate_disk_size(sdkp->disk, ret == 0);
}
static int sd_ioctl(struct block_device *bdev, fmode_t mode,
@@ -1841,7 +1845,6 @@ static const struct block_device_operations sd_fops = {
.compat_ioctl = sd_compat_ioctl,
#endif
.check_events = sd_check_events,
- .revalidate_disk = sd_revalidate_disk,
.unlock_native_capacity = sd_unlock_native_capacity,
.report_zones = sd_zbc_report_zones,
.pr_ops = &sd_pr_ops,
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 3b3a53c6a0de..2b43c0f97442 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -517,6 +517,17 @@ static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt)
return ret;
}
+static void sr_revalidate_disk(struct scsi_cd *cd)
+{
+ struct scsi_sense_hdr sshdr;
+
+ /* if the unit is not ready, nothing more to do */
+ if (scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr))
+ return;
+ sr_cd_check(&cd->cdi);
+ get_sectorsize(cd);
+}
+
static int sr_block_open(struct block_device *bdev, fmode_t mode)
{
struct scsi_cd *cd;
@@ -529,7 +540,8 @@ static int sr_block_open(struct block_device *bdev, fmode_t mode)
sdev = cd->device;
scsi_autopm_get_device(sdev);
- check_disk_change(bdev);
+ if (bdev_check_media_change(bdev))
+ sr_revalidate_disk(cd);
mutex_lock(&cd->lock);
ret = cdrom_open(&cd->cdi, bdev, mode);
@@ -658,26 +670,6 @@ static unsigned int sr_block_check_events(struct gendisk *disk,
return ret;
}
-static int sr_block_revalidate_disk(struct gendisk *disk)
-{
- struct scsi_sense_hdr sshdr;
- struct scsi_cd *cd;
-
- cd = scsi_cd_get(disk);
- if (!cd)
- return -ENXIO;
-
- /* if the unit is not ready, nothing more to do */
- if (scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr))
- goto out;
-
- sr_cd_check(&cd->cdi);
- get_sectorsize(cd);
-out:
- scsi_cd_put(cd);
- return 0;
-}
-
static const struct block_device_operations sr_bdops =
{
.owner = THIS_MODULE,
@@ -688,7 +680,6 @@ static const struct block_device_operations sr_bdops =
.compat_ioctl = sr_block_compat_ioctl,
#endif
.check_events = sr_block_check_events,
- .revalidate_disk = sr_block_revalidate_disk,
};
static int sr_open(struct cdrom_device_info *cdi, int purpose)
@@ -802,6 +793,7 @@ static int sr_probe(struct device *dev)
dev_set_drvdata(dev, cd);
disk->flags |= GENHD_FL_REMOVABLE;
+ sr_revalidate_disk(cd);
device_add_disk(&sdev->sdev_gendev, disk, NULL);
sdev_printk(KERN_DEBUG, sdev,
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3576123d8299..6ecf863bfa2f 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -625,7 +625,7 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
inode = file_inode(vma->vm_file);
- if (!mapping_cap_writeback_dirty(inode->i_mapping))
+ if (!mapping_can_writeback(inode->i_mapping))
wbc.nr_to_write = 0;
might_sleep();
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 74df32be4c6a..e34fa20acf61 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -80,8 +80,10 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
if (ret)
return ret;
- if (v9ses->cache)
- sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
+ if (!v9ses->cache) {
+ sb->s_bdi->ra_pages = 0;
+ sb->s_bdi->io_pages = 0;
+ }
sb->s_flags |= SB_ACTIVE | SB_DIRSYNC;
if (!v9ses->cache)
diff --git a/fs/afs/super.c b/fs/afs/super.c
index b552357b1d13..3a40ee752c1e 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -456,7 +456,6 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
ret = super_setup_bdi(sb);
if (ret)
return ret;
- sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
/* allocate the root inode and dentry */
if (as->dyn_root) {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8ae833e00443..6b9d19ffa5af 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -103,6 +103,35 @@ void invalidate_bdev(struct block_device *bdev)
}
EXPORT_SYMBOL(invalidate_bdev);
+/*
+ * Drop all buffers & page cache for given bdev range. This function bails
+ * with error if bdev has other exclusive owner (such as filesystem).
+ */
+int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
+ loff_t lstart, loff_t lend)
+{
+ struct block_device *claimed_bdev = NULL;
+ int err;
+
+ /*
+ * If we don't hold exclusive handle for the device, upgrade to it
+ * while we discard the buffer cache to avoid discarding buffers
+ * under live filesystem.
+ */
+ if (!(mode & FMODE_EXCL)) {
+ claimed_bdev = bdev->bd_contains;
+ err = bd_prepare_to_claim(bdev, claimed_bdev,
+ truncate_bdev_range);
+ if (err)
+ return err;
+ }
+ truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
+ if (claimed_bdev)
+ bd_abort_claiming(bdev, claimed_bdev, truncate_bdev_range);
+ return 0;
+}
+EXPORT_SYMBOL(truncate_bdev_range);
+
static void set_init_blocksize(struct block_device *bdev)
{
bdev->bd_inode->i_blkbits = blksize_bits(bdev_logical_block_size(bdev));
@@ -876,11 +905,11 @@ struct block_device *bdget(dev_t dev)
bdev = &BDEV_I(inode)->bdev;
if (inode->i_state & I_NEW) {
+ spin_lock_init(&bdev->bd_size_lock);
bdev->bd_contains = NULL;
bdev->bd_super = NULL;
bdev->bd_inode = inode;
bdev->bd_part_count = 0;
- bdev->bd_invalidated = 0;
inode->i_mode = S_IFBLK;
inode->i_rdev = dev;
inode->i_bdev = bdev;
@@ -1290,6 +1319,7 @@ static void check_disk_size_change(struct gendisk *disk,
{
loff_t disk_size, bdev_size;
+ spin_lock(&bdev->bd_size_lock);
disk_size = (loff_t)get_capacity(disk) << 9;
bdev_size = i_size_read(bdev->bd_inode);
if (disk_size != bdev_size) {
@@ -1299,85 +1329,51 @@ static void check_disk_size_change(struct gendisk *disk,
disk->disk_name, bdev_size, disk_size);
}
i_size_write(bdev->bd_inode, disk_size);
- if (bdev_size > disk_size && __invalidate_device(bdev, false))
+ }
+ spin_unlock(&bdev->bd_size_lock);
+
+ if (bdev_size > disk_size) {
+ if (__invalidate_device(bdev, false))
pr_warn("VFS: busy inodes on resized disk %s\n",
disk->disk_name);
}
- bdev->bd_invalidated = 0;
}
/**
- * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
- * @disk: struct gendisk to be revalidated
+ * revalidate_disk_size - checks for disk size change and adjusts bdev size.
+ * @disk: struct gendisk to check
+ * @verbose: if %true log a message about a size change if there is any
*
- * This routine is a wrapper for lower-level driver's revalidate_disk
- * call-backs. It is used to do common pre and post operations needed
- * for all revalidate_disk operations.
+ * This routine checks to see if the bdev size does not match the disk size
+ * and adjusts it if it differs. When shrinking the bdev size, its all caches
+ * are freed.
*/
-int revalidate_disk(struct gendisk *disk)
+void revalidate_disk_size(struct gendisk *disk, bool verbose)
{
- int ret = 0;
-
- if (disk->fops->revalidate_disk)
- ret = disk->fops->revalidate_disk(disk);
+ struct block_device *bdev;
/*
* Hidden disks don't have associated bdev so there's no point in
- * revalidating it.
+ * revalidating them.
*/
- if (!(disk->flags & GENHD_FL_HIDDEN)) {
- struct block_device *bdev = bdget_disk(disk, 0);
-
- if (!bdev)
- return ret;
+ if (disk->flags & GENHD_FL_HIDDEN)
+ return;
- mutex_lock(&bdev->bd_mutex);
- check_disk_size_change(disk, bdev, ret == 0);
- mutex_unlock(&bdev->bd_mutex);
+ bdev = bdget_disk(disk, 0);
+ if (bdev) {
+ check_disk_size_change(disk, bdev, verbose);
bdput(bdev);
}
- return ret;
}
-EXPORT_SYMBOL(revalidate_disk);
+EXPORT_SYMBOL(revalidate_disk_size);
-/*
- * This routine checks whether a removable media has been changed,
- * and invalidates all buffer-cache-entries in that case. This
- * is a relatively slow routine, so we have to try to minimize using
- * it. Thus it is called only upon a 'mount' or 'open'. This
- * is the best way of combining speed and utility, I think.
- * People changing diskettes in the middle of an operation deserve
- * to lose :-)
- */
-int check_disk_change(struct block_device *bdev)
+void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors)
{
- struct gendisk *disk = bdev->bd_disk;
- const struct block_device_operations *bdops = disk->fops;
- unsigned int events;
-
- events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
- DISK_EVENT_EJECT_REQUEST);
- if (!(events & DISK_EVENT_MEDIA_CHANGE))
- return 0;
-
- if (__invalidate_device(bdev, true))
- pr_warn("VFS: busy inodes on changed media %s\n",
- disk->disk_name);
- bdev->bd_invalidated = 1;
- if (bdops->revalidate_disk)
- bdops->revalidate_disk(bdev->bd_disk);
- return 1;
-}
-
-EXPORT_SYMBOL(check_disk_change);
-
-void bd_set_size(struct block_device *bdev, loff_t size)
-{
- inode_lock(bdev->bd_inode);
- i_size_write(bdev->bd_inode, size);
- inode_unlock(bdev->bd_inode);
+ spin_lock(&bdev->bd_size_lock);
+ i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
+ spin_unlock(&bdev->bd_size_lock);
}
-EXPORT_SYMBOL(bd_set_size);
+EXPORT_SYMBOL(bd_set_nr_sectors);
static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
@@ -1388,6 +1384,8 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate)
lockdep_assert_held(&bdev->bd_mutex);
+ clear_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
+
rescan:
ret = blk_drop_partitions(bdev);
if (ret)
@@ -1446,22 +1444,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
struct gendisk *disk;
int ret;
int partno;
- int perm = 0;
bool first_open = false, unblock_events = true, need_restart;
- if (mode & FMODE_READ)
- perm |= MAY_READ;
- if (mode & FMODE_WRITE)
- perm |= MAY_WRITE;
- /*
- * hooks: /n/, see "layering violations".
- */
- if (!for_part) {
- ret = devcgroup_inode_permission(bdev->bd_inode, perm);
- if (ret != 0)
- return ret;
- }
-
restart:
need_restart = false;
ret = -ENXIO;
@@ -1514,7 +1498,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
}
if (!ret) {
- bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
+ bd_set_nr_sectors(bdev, get_capacity(disk));
set_init_blocksize(bdev);
}
@@ -1524,7 +1508,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
* The latter is necessary to prevent ghost
* partitions on a removed medium.
*/
- if (bdev->bd_invalidated &&
+ if (test_bit(GD_NEED_PART_SCAN, &disk->state) &&
(!ret || ret == -ENOMEDIUM))
bdev_disk_changed(bdev, ret == -ENOMEDIUM);
@@ -1542,7 +1526,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
ret = -ENXIO;
goto out_clear;
}
- bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
+ bd_set_nr_sectors(bdev, bdev->bd_part->nr_sects);
set_init_blocksize(bdev);
}
@@ -1554,7 +1538,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
if (bdev->bd_disk->fops->open)
ret = bdev->bd_disk->fops->open(bdev, mode);
/* the same as first opener case, read comment there */
- if (bdev->bd_invalidated &&
+ if (test_bit(GD_NEED_PART_SCAN, &disk->state) &&
(!ret || ret == -ENOMEDIUM))
bdev_disk_changed(bdev, ret == -ENOMEDIUM);
if (ret)
@@ -1632,16 +1616,27 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
* RETURNS:
* 0 on success, -errno on failure.
*/
-int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
+static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
{
- int res;
+ int ret, perm = 0;
- res =__blkdev_get(bdev, mode, holder, 0);
- if (res)
- bdput(bdev);
- return res;
+ if (mode & FMODE_READ)
+ perm |= MAY_READ;
+ if (mode & FMODE_WRITE)
+ perm |= MAY_WRITE;
+ ret = devcgroup_inode_permission(bdev->bd_inode, perm);
+ if (ret)
+ goto bdput;
+
+ ret =__blkdev_get(bdev, mode, holder, 0);
+ if (ret)
+ goto bdput;
+ return 0;
+
+bdput:
+ bdput(bdev);
+ return ret;
}
-EXPORT_SYMBOL(blkdev_get);
/**
* blkdev_get_by_path - open a block device by name
@@ -1889,7 +1884,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (bdev_read_only(I_BDEV(bd_inode)))
return -EPERM;
- if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode))
+ if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
return -ETXTBSY;
if (!iov_iter_count(from))
@@ -1969,7 +1964,6 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
loff_t len)
{
struct block_device *bdev = I_BDEV(bdev_file_inode(file));
- struct address_space *mapping;
loff_t end = start + len - 1;
loff_t isize;
int error;
@@ -1997,8 +1991,9 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
return -EINVAL;
/* Invalidate the page cache, including dirty pages. */
- mapping = bdev->bd_inode->i_mapping;
- truncate_inode_pages_range(mapping, start, end);
+ error = truncate_bdev_range(bdev, file->f_mode, start, end);
+ if (error)
+ return error;
switch (mode) {
case FALLOC_FL_ZERO_RANGE:
@@ -2025,7 +2020,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
* the caller will be given -EBUSY. The third argument is
* inclusive, so the rounding here is safe.
*/
- return invalidate_inode_pages2_range(mapping,
+ return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
start >> PAGE_SHIFT,
end >> PAGE_SHIFT);
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index abf86b202b43..fb82d54c8adf 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3091,8 +3091,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_sb_buffer;
}
- sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
- sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
diff --git a/fs/buffer.c b/fs/buffer.c
index 50bbc99e3d96..5a28a6aa7f16 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2771,16 +2771,6 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
/* Is the page fully outside i_size? (truncate in progress) */
offset = i_size & (PAGE_SIZE-1);
if (page->index >= end_index+1 || !offset) {
- /*
- * The page may have dirty, unmapped buffers. For example,
- * they may have been added in ext3_writepage(). Make them
- * freeable here, so the page does not leak.
- */
-#if 0
- /* Not really sure about this - do we need this ? */
- if (page->mapping->a_ops->invalidatepage)
- page->mapping->a_ops->invalidatepage(page, offset);
-#endif
unlock_page(page);
return 0; /* don't care */
}
@@ -2975,12 +2965,6 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
/* Is the page fully outside i_size? (truncate in progress) */
offset = i_size & (PAGE_SIZE-1);
if (page->index >= end_index+1 || !offset) {
- /*
- * The page may have dirty, unmapped buffers. For example,
- * they may have been added in ext3_writepage(). Make them
- * freeable here, so the page does not leak.
- */
- do_invalidatepage(page, 0, PAGE_SIZE);
unlock_page(page);
return 0; /* don't care */
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 58b27e4070a3..e6005c78bfa9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -2321,7 +2321,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
wb = locked_inode_to_wb_and_lock_list(inode);
- WARN(bdi_cap_writeback_dirty(wb->bdi) &&
+ WARN((wb->bdi->capabilities & BDI_CAP_WRITEBACK) &&
!test_bit(WB_registered, &wb->state),
"bdi-%s not registered\n", bdi_dev_name(wb->bdi));
@@ -2346,7 +2346,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
* to make sure background write-back happens
* later.
*/
- if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi)
+ if (wakeup_bdi &&
+ (wb->bdi->capabilities & BDI_CAP_WRITEBACK))
wb_wakeup_delayed(wb);
return;
}
@@ -2581,7 +2582,7 @@ int write_inode_now(struct inode *inode, int sync)
.range_end = LLONG_MAX,
};
- if (!mapping_cap_writeback_dirty(inode->i_mapping))
+ if (!mapping_can_writeback(inode->i_mapping))
wbc.nr_to_write = 0;
might_sleep();
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index bba747520e9b..581329203d68 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1049,9 +1049,9 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
if (err)
return err;
- sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
/* fuse does it's own writeback accounting */
- sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
+ sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT;
+ sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT;
/*
* For a single fuse filesystem use max 1% of dirty +
diff --git a/fs/namei.c b/fs/namei.c
index e99e2a9da0f7..f1eb8ccd2be9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -568,8 +568,8 @@ static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
{
struct super_block *sb = mnt->mnt_sb;
- /* Bind mounts and multi-root filesystems can have disconnected paths */
- if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root))
+ /* Bind mounts can have disconnected paths */
+ if (mnt->mnt_root == sb->s_root)
return true;
return is_subdir(dentry, mnt->mnt_root);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 7a70287f21a2..f943e37853fa 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1200,13 +1200,6 @@ static void nfs_get_cache_cookie(struct super_block *sb,
}
#endif
-static void nfs_set_readahead(struct backing_dev_info *bdi,
- unsigned long iomax_pages)
-{
- bdi->ra_pages = VM_READAHEAD_PAGES;
- bdi->io_pages = iomax_pages;
-}
-
int nfs_get_tree_common(struct fs_context *fc)
{
struct nfs_fs_context *ctx = nfs_fc2context(fc);
@@ -1251,7 +1244,7 @@ int nfs_get_tree_common(struct fs_context *fc)
MINOR(server->s_dev));
if (error)
goto error_splat_super;
- nfs_set_readahead(s->s_bdi, server->rpages);
+ s->s_bdi->io_pages = server->rpages;
server->super = s;
}
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 89d13e0705fe..0179a73a3fa2 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1766,7 +1766,6 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
int sectsize;
char *p = (char *)page;
struct fd f;
- struct inode *inode;
ssize_t ret = -EINVAL;
int live_threshold;
@@ -1793,20 +1792,16 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
reg->hr_block_bytes == 0)
goto out2;
- inode = igrab(f.file->f_mapping->host);
- if (inode == NULL)
+ if (!S_ISBLK(f.file->f_mapping->host->i_mode))
goto out2;
- if (!S_ISBLK(inode->i_mode))
- goto out3;
-
- reg->hr_bdev = I_BDEV(f.file->f_mapping->host);
- ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL);
- if (ret) {
+ reg->hr_bdev = blkdev_get_by_dev(f.file->f_mapping->host->i_rdev,
+ FMODE_WRITE | FMODE_READ, NULL);
+ if (IS_ERR(reg->hr_bdev)) {
+ ret = PTR_ERR(reg->hr_bdev);
reg->hr_bdev = NULL;
- goto out3;
+ goto out2;
}
- inode = NULL;
bdevname(reg->hr_bdev, reg->hr_dev_name);
@@ -1909,16 +1904,13 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
config_item_name(&reg->hr_item), reg->hr_dev_name);
out3:
- iput(inode);
+ if (ret < 0) {
+ blkdev_put(reg->hr_bdev, FMODE_READ | FMODE_WRITE);
+ reg->hr_bdev = NULL;
+ }
out2:
fdput(f);
out:
- if (ret < 0) {
- if (reg->hr_bdev) {
- blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
- reg->hr_bdev = NULL;
- }
- }
return ret;
}
diff --git a/fs/super.c b/fs/super.c
index 904459b35119..a51c2083cd6b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1256,6 +1256,8 @@ static int set_bdev_super(struct super_block *s, void *data)
s->s_dev = s->s_bdev->bd_dev;
s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
+ if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue))
+ s->s_iflags |= SB_I_STABLE_WRITES;
return 0;
}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index a2420c900275..fbddb2a1c03f 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2177,6 +2177,8 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
c->vi.vol_id);
if (err)
goto out_close;
+ sb->s_bdi->ra_pages = 0;
+ sb->s_bdi->io_pages = 0;
sb->s_fs_info = c;
sb->s_magic = UBIFS_SUPER_MAGIC;
diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c
index 25aade344192..d7816c01a4f6 100644
--- a/fs/vboxsf/super.c
+++ b/fs/vboxsf/super.c
@@ -167,6 +167,8 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc)
err = super_setup_bdi_name(sb, "vboxsf-%d", sbi->bdi_id);
if (err)
goto fail_free;
+ sb->s_bdi->ra_pages = 0;
+ sb->s_bdi->io_pages = 0;
/* Turn source into a shfl_string and map the folder */
size = strlen(fc->source) + 1;
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 0b06b2d26c9a..44df4fcef65c 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -110,33 +110,14 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
/*
* Flags in backing_dev_info::capability
*
- * The first three flags control whether dirty pages will contribute to the
- * VM's accounting and whether writepages() should be called for dirty pages
- * (something that would not, for example, be appropriate for ramfs)
- *
- * WARNING: these flags are closely related and should not normally be
- * used separately. The BDI_CAP_NO_ACCT_AND_WRITEBACK combines these
- * three flags into a single convenience macro.
- *
- * BDI_CAP_NO_ACCT_DIRTY: Dirty pages shouldn't contribute to accounting
- * BDI_CAP_NO_WRITEBACK: Don't write pages back
- * BDI_CAP_NO_ACCT_WB: Don't automatically account writeback pages
- * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold.
- *
- * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback.
- * BDI_CAP_SYNCHRONOUS_IO: Device is so fast that asynchronous IO would be
- * inefficient.
+ * BDI_CAP_WRITEBACK: Supports dirty page writeback, and dirty pages
+ * should contribute to accounting
+ * BDI_CAP_WRITEBACK_ACCT: Automatically account writeback pages
+ * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold
*/
-#define BDI_CAP_NO_ACCT_DIRTY 0x00000001
-#define BDI_CAP_NO_WRITEBACK 0x00000002
-#define BDI_CAP_NO_ACCT_WB 0x00000004
-#define BDI_CAP_STABLE_WRITES 0x00000008
-#define BDI_CAP_STRICTLIMIT 0x00000010
-#define BDI_CAP_CGROUP_WRITEBACK 0x00000020
-#define BDI_CAP_SYNCHRONOUS_IO 0x00000040
-
-#define BDI_CAP_NO_ACCT_AND_WRITEBACK \
- (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB)
+#define BDI_CAP_WRITEBACK (1 << 0)
+#define BDI_CAP_WRITEBACK_ACCT (1 << 1)
+#define BDI_CAP_STRICTLIMIT (1 << 2)
extern struct backing_dev_info noop_backing_dev_info;
@@ -175,41 +156,9 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
long congestion_wait(int sync, long timeout);
long wait_iff_congested(int sync, long timeout);
-static inline bool bdi_cap_synchronous_io(struct backing_dev_info *bdi)
-{
- return bdi->capabilities & BDI_CAP_SYNCHRONOUS_IO;
-}
-
-static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi)
-{
- return bdi->capabilities & BDI_CAP_STABLE_WRITES;
-}
-
-static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi)
-{
- return !(bdi->capabilities & BDI_CAP_NO_WRITEBACK);
-}
-
-static inline bool bdi_cap_account_dirty(struct backing_dev_info *bdi)
-{
- return !(bdi->capabilities & BDI_CAP_NO_ACCT_DIRTY);
-}
-
-static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi)
-{
- /* Paranoia: BDI_CAP_NO_WRITEBACK implies BDI_CAP_NO_ACCT_WB */
- return !(bdi->capabilities & (BDI_CAP_NO_ACCT_WB |
- BDI_CAP_NO_WRITEBACK));
-}
-
-static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
-{
- return bdi_cap_writeback_dirty(inode_to_bdi(mapping->host));
-}
-
-static inline bool mapping_cap_account_dirty(struct address_space *mapping)
+static inline bool mapping_can_writeback(struct address_space *mapping)
{
- return bdi_cap_account_dirty(inode_to_bdi(mapping->host));
+ return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK;
}
static inline int bdi_sched_wait(void *word)
@@ -233,9 +182,9 @@ int inode_congested(struct inode *inode, int cong_bits);
* inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode
* @inode: inode of interest
*
- * cgroup writeback requires support from both the bdi and filesystem.
- * Also, both memcg and iocg have to be on the default hierarchy. Test
- * whether all conditions are met.
+ * Cgroup writeback requires support from the filesystem. Also, both memcg and
+ * iocg have to be on the default hierarchy. Test whether all conditions are
+ * met.
*
* Note that the test result may change dynamically on the same inode
* depending on how memcg and iocg are configured.
@@ -246,8 +195,7 @@ static inline bool inode_cgwb_enabled(struct inode *inode)
return cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
cgroup_subsys_on_dfl(io_cgrp_subsys) &&
- bdi_cap_account_dirty(bdi) &&
- (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) &&
+ (bdi->capabilities & BDI_CAP_WRITEBACK) &&
(inode->i_sb->s_iflags & SB_I_CGROUPWB);
}
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9d2d5ad367a4..b23eeca4d677 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -139,6 +139,10 @@ struct blk_mq_hw_ctx {
* shared across request queues.
*/
atomic_t nr_active;
+ /**
+ * @elevator_queued: Number of queued requests on hctx.
+ */
+ atomic_t elevator_queued;
/** @cpuhp_online: List to store request if CPU is going to die */
struct hlist_node cpuhp_online;
@@ -231,6 +235,9 @@ enum hctx_type {
* @flags: Zero or more BLK_MQ_F_* flags.
* @driver_data: Pointer to data owned by the block driver that created this
* tag set.
+ * @__bitmap_tags: A shared tags sbitmap, used over all hctx's
+ * @__breserved_tags:
+ * A shared reserved tags sbitmap, used over all hctx's
* @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues
* elements.
* @tag_list_lock: Serializes tag_list accesses.
@@ -249,7 +256,10 @@ struct blk_mq_tag_set {
unsigned int timeout;
unsigned int flags;
void *driver_data;
+ atomic_t active_queues_shared_sbitmap;
+ struct sbitmap_queue __bitmap_tags;
+ struct sbitmap_queue __breserved_tags;
struct blk_mq_tags **tags;
struct mutex tag_list_lock;
@@ -378,12 +388,13 @@ struct blk_mq_ops {
enum {
BLK_MQ_F_SHOULD_MERGE = 1 << 0,
- BLK_MQ_F_TAG_SHARED = 1 << 1,
+ BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1,
/*
* Set when this device requires underlying blk-mq device for
* completing IO:
*/
BLK_MQ_F_STACKING = 1 << 2,
+ BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
BLK_MQ_F_BLOCKING = 1 << 5,
BLK_MQ_F_NO_SCHED = 1 << 6,
BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
@@ -489,8 +500,6 @@ void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
void blk_mq_complete_request(struct request *rq);
bool blk_mq_complete_request_remote(struct request *rq);
-bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
- struct bio *bio, unsigned int nr_segs);
bool blk_mq_queue_stopped(struct request_queue *q);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 4ecf4fed171f..eb20e28184ab 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -20,7 +20,7 @@ typedef void (bio_end_io_t) (struct bio *);
struct bio_crypt_ctx;
struct block_device {
- dev_t bd_dev; /* not a kdev_t - it's a search key */
+ dev_t bd_dev;
int bd_openers;
struct inode * bd_inode; /* will die */
struct super_block * bd_super;
@@ -37,7 +37,8 @@ struct block_device {
struct hd_struct * bd_part;
/* number of times partitions within this device have been opened. */
unsigned bd_part_count;
- int bd_invalidated;
+
+ spinlock_t bd_size_lock; /* for bd_inode->i_size updates */
struct gendisk * bd_disk;
struct backing_dev_info *bd_bdi;
@@ -255,8 +256,6 @@ enum {
BIO_NO_PAGE_REF, /* don't put release vec pages */
BIO_CLONED, /* doesn't own data */
BIO_BOUNCED, /* bio is a bounce bio */
- BIO_USER_MAPPED, /* contains user pages */
- BIO_NULL_MAPPED, /* contains invalid user pages */
BIO_WORKINGSET, /* contains userspace workingset pages */
BIO_QUIET, /* Make BIO Quiet */
BIO_CHAIN, /* chained bio, ->bi_remaining in effect */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bb5636cc17b9..8e77f12de522 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -24,6 +24,7 @@
#include <linux/percpu-refcount.h>
#include <linux/scatterlist.h>
#include <linux/blkzoned.h>
+#include <linux/pm.h>
struct module;
struct scsi_ioctl_command;
@@ -458,7 +459,7 @@ struct request_queue {
#ifdef CONFIG_PM
struct device *dev;
- int rpm_status;
+ enum rpm_status rpm_status;
unsigned int nr_pending;
#endif
@@ -484,6 +485,8 @@ struct request_queue {
struct timer_list timeout;
struct work_struct timeout_work;
+ atomic_t nr_active_requests_shared_sbitmap;
+
struct list_head icq_list;
#ifdef CONFIG_BLK_CGROUP
DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS);
@@ -603,6 +606,7 @@ struct request_queue {
#define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */
#define QUEUE_FLAG_DEAD 13 /* queue tear-down finished */
#define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */
+#define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */
#define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */
#define QUEUE_FLAG_WC 17 /* Write back caching */
#define QUEUE_FLAG_FUA 18 /* device supports FUA writes */
@@ -615,6 +619,7 @@ struct request_queue {
#define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */
#define QUEUE_FLAG_ZONE_RESETALL 26 /* supports Zone Reset All */
#define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */
+#define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */
#define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
(1 << QUEUE_FLAG_SAME_COMP))
@@ -631,6 +636,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
#define blk_queue_noxmerges(q) \
test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)
#define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags)
+#define blk_queue_stable_writes(q) \
+ test_bit(QUEUE_FLAG_STABLE_WRITES, &(q)->queue_flags)
#define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
#define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
#define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
@@ -1059,11 +1066,17 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
static inline unsigned int blk_max_size_offset(struct request_queue *q,
sector_t offset)
{
- if (!q->limits.chunk_sectors)
+ unsigned int chunk_sectors = q->limits.chunk_sectors;
+
+ if (!chunk_sectors)
return q->limits.max_sectors;
- return min(q->limits.max_sectors, (unsigned int)(q->limits.chunk_sectors -
- (offset & (q->limits.chunk_sectors - 1))));
+ if (likely(is_power_of_2(chunk_sectors)))
+ chunk_sectors -= offset & (chunk_sectors - 1);
+ else
+ chunk_sectors -= sector_div(offset, chunk_sectors);
+
+ return min(q->limits.max_sectors, chunk_sectors);
}
static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
@@ -1130,6 +1143,7 @@ extern void blk_queue_max_zone_append_sectors(struct request_queue *q,
extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
extern void blk_queue_alignment_offset(struct request_queue *q,
unsigned int alignment);
+void blk_queue_update_readahead(struct request_queue *q);
extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
@@ -1455,10 +1469,9 @@ static inline int bdev_alignment_offset(struct block_device *bdev)
if (q->limits.misaligned)
return -1;
-
if (bdev != bdev->bd_contains)
- return bdev->bd_part->alignment_offset;
-
+ return queue_limit_alignment_offset(&q->limits,
+ bdev->bd_part->start_sect);
return q->limits.alignment_offset;
}
@@ -1498,8 +1511,8 @@ static inline int bdev_discard_alignment(struct block_device *bdev)
struct request_queue *q = bdev_get_queue(bdev);
if (bdev != bdev->bd_contains)
- return bdev->bd_part->discard_alignment;
-
+ return queue_limit_discard_alignment(&q->limits,
+ bdev->bd_part->start_sect);
return q->limits.discard_alignment;
}
@@ -1930,6 +1943,11 @@ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
void disk_end_io_acct(struct gendisk *disk, unsigned int op,
unsigned long start_time);
+unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part,
+ struct bio *bio);
+void part_end_io_acct(struct hd_struct *part, struct bio *bio,
+ unsigned long start_time);
+
/**
* bio_start_io_acct - start I/O accounting for bio based drivers
* @bio: bio to start account for
@@ -1967,7 +1985,6 @@ void blkdev_show(struct seq_file *seqf, off_t offset);
#define BLKDEV_MAJOR_MAX 0
#endif
-int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder);
struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
void *holder);
struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder);
@@ -1984,11 +2001,18 @@ void bdput(struct block_device *);
#ifdef CONFIG_BLOCK
void invalidate_bdev(struct block_device *bdev);
+int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart,
+ loff_t lend);
int sync_blockdev(struct block_device *bdev);
#else
static inline void invalidate_bdev(struct block_device *bdev)
{
}
+static inline int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
+ loff_t lstart, loff_t lend)
+{
+ return 0;
+}
static inline int sync_blockdev(struct block_device *bdev)
{
return 0;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7519ae003a08..222465b7cf41 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1385,7 +1385,7 @@ extern int send_sigurg(struct fown_struct *fown);
#define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */
#define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */
#define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */
-#define SB_I_MULTIROOT 0x00000008 /* Multiple roots to the dentry tree */
+#define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */
/* sb->s_iflags to limit user namespace mounts */
#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 4ab853461dff..38f23d757013 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -65,8 +65,6 @@ struct hd_struct {
struct disk_stats __percpu *dkstats;
struct percpu_ref ref;
- sector_t alignment_offset;
- unsigned int discard_alignment;
struct device __dev;
struct kobject *holder_dir;
int policy, partno;
@@ -193,6 +191,8 @@ struct gendisk {
void *private_data;
int flags;
+ unsigned long state;
+#define GD_NEED_PART_SCAN 0
struct rw_semaphore lookup_sem;
struct kobject *slave_dir;
@@ -315,9 +315,8 @@ static inline int get_disk_ro(struct gendisk *disk)
extern void disk_block_events(struct gendisk *disk);
extern void disk_unblock_events(struct gendisk *disk);
extern void disk_flush_events(struct gendisk *disk, unsigned int mask);
-extern void set_capacity_revalidate_and_notify(struct gendisk *disk,
- sector_t size, bool revalidate);
-extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask);
+void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size,
+ bool update_bdev);
/* drivers/char/random.c */
extern void add_disk_randomness(struct gendisk *disk) __latent_entropy;
@@ -372,10 +371,10 @@ extern void blk_unregister_region(dev_t devt, unsigned long range);
int register_blkdev(unsigned int major, const char *name);
void unregister_blkdev(unsigned int major, const char *name);
-int revalidate_disk(struct gendisk *disk);
-int check_disk_change(struct block_device *bdev);
+void revalidate_disk_size(struct gendisk *disk, bool verbose);
+bool bdev_check_media_change(struct block_device *bdev);
int __invalidate_device(struct block_device *bdev, bool kill_dirty);
-void bd_set_size(struct block_device *bdev, loff_t size);
+void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors);
/* for drivers/char/raw.c: */
int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long);
diff --git a/include/linux/ide.h b/include/linux/ide.h
index a254841bd315..62653769509f 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -490,8 +490,6 @@ enum {
IDE_DFLAG_NOPROBE = BIT(9),
/* need to do check_media_change() */
IDE_DFLAG_REMOVABLE = BIT(10),
- /* needed for removable devices */
- IDE_DFLAG_ATTACH = BIT(11),
IDE_DFLAG_FORCED_GEOM = BIT(12),
/* disallow setting unmask bit */
IDE_DFLAG_NO_UNMASK = BIT(13),
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index cb9afad82a90..8af13ba60c7e 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -473,9 +473,9 @@ static inline int hibernate_quiet_exec(int (*func)(void *data), void *data) {
#endif /* CONFIG_HIBERNATION */
#ifdef CONFIG_HIBERNATION_SNAPSHOT_DEV
-int is_hibernate_resume_dev(const struct inode *);
+int is_hibernate_resume_dev(dev_t dev);
#else
-static inline int is_hibernate_resume_dev(const struct inode *i) { return 0; }
+static inline int is_hibernate_resume_dev(dev_t dev) { return 0; }
#endif
/* Hibernation and suspend events */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 661046994db4..4340a7b6e7a1 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -467,7 +467,8 @@ extern int swapcache_prepare(swp_entry_t);
extern void swap_free(swp_entry_t);
extern void swapcache_free_entries(swp_entry_t *entries, int n);
extern int free_swap_and_cache(swp_entry_t);
-extern int swap_type_of(dev_t, sector_t, struct block_device **);
+int swap_type_of(dev_t device, sector_t offset);
+int find_first_swap(dev_t *device);
extern unsigned int count_swap_pages(int, int);
extern sector_t map_swap_page(struct page *, struct block_device **);
extern sector_t swapdev_block(int, pgoff_t);
diff --git a/include/trace/events/iocost.h b/include/trace/events/iocost.h
index c2f580fd371b..b350860d2e71 100644
--- a/include/trace/events/iocost.h
+++ b/include/trace/events/iocost.h
@@ -26,7 +26,6 @@ TRACE_EVENT(iocost_iocg_activate,
__field(u64, vrate)
__field(u64, last_period)
__field(u64, cur_period)
- __field(u64, last_vtime)
__field(u64, vtime)
__field(u32, weight)
__field(u32, inuse)
@@ -42,7 +41,6 @@ TRACE_EVENT(iocost_iocg_activate,
__entry->vrate = now->vrate;
__entry->last_period = last_period;
__entry->cur_period = cur_period;
- __entry->last_vtime = iocg->last_vtime;
__entry->vtime = vtime;
__entry->weight = iocg->weight;
__entry->inuse = iocg->inuse;
@@ -51,13 +49,12 @@ TRACE_EVENT(iocost_iocg_activate,
),
TP_printk("[%s:%s] now=%llu:%llu vrate=%llu "
- "period=%llu->%llu vtime=%llu->%llu "
+ "period=%llu->%llu vtime=%llu "
"weight=%u/%u hweight=%llu/%llu",
__get_str(devname), __get_str(cgroup),
__entry->now, __entry->vnow, __entry->vrate,
__entry->last_period, __entry->cur_period,
- __entry->last_vtime, __entry->vtime,
- __entry->inuse, __entry->weight,
+ __entry->vtime, __entry->inuse, __entry->weight,
__entry->hweight_inuse, __entry->hweight_active
)
);
@@ -98,7 +95,7 @@ DECLARE_EVENT_CLASS(iocg_inuse_update,
)
);
-DEFINE_EVENT(iocg_inuse_update, iocost_inuse_takeback,
+DEFINE_EVENT(iocg_inuse_update, iocost_inuse_shortage,
TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now,
u32 old_inuse, u32 new_inuse,
@@ -108,7 +105,7 @@ DEFINE_EVENT(iocg_inuse_update, iocost_inuse_takeback,
old_hw_inuse, new_hw_inuse)
);
-DEFINE_EVENT(iocg_inuse_update, iocost_inuse_giveaway,
+DEFINE_EVENT(iocg_inuse_update, iocost_inuse_transfer,
TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now,
u32 old_inuse, u32 new_inuse,
@@ -118,7 +115,7 @@ DEFINE_EVENT(iocg_inuse_update, iocost_inuse_giveaway,
old_hw_inuse, new_hw_inuse)
);
-DEFINE_EVENT(iocg_inuse_update, iocost_inuse_reset,
+DEFINE_EVENT(iocg_inuse_update, iocost_inuse_adjust,
TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now,
u32 old_inuse, u32 new_inuse,
@@ -131,11 +128,9 @@ DEFINE_EVENT(iocg_inuse_update, iocost_inuse_reset,
TRACE_EVENT(iocost_ioc_vrate_adj,
TP_PROTO(struct ioc *ioc, u64 new_vrate, u32 *missed_ppm,
- u32 rq_wait_pct, int nr_lagging, int nr_shortages,
- int nr_surpluses),
+ u32 rq_wait_pct, int nr_lagging, int nr_shortages),
- TP_ARGS(ioc, new_vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages,
- nr_surpluses),
+ TP_ARGS(ioc, new_vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages),
TP_STRUCT__entry (
__string(devname, ioc_name(ioc))
@@ -147,7 +142,6 @@ TRACE_EVENT(iocost_ioc_vrate_adj,
__field(u32, rq_wait_pct)
__field(int, nr_lagging)
__field(int, nr_shortages)
- __field(int, nr_surpluses)
),
TP_fast_assign(
@@ -160,15 +154,13 @@ TRACE_EVENT(iocost_ioc_vrate_adj,
__entry->rq_wait_pct = rq_wait_pct;
__entry->nr_lagging = nr_lagging;
__entry->nr_shortages = nr_shortages;
- __entry->nr_surpluses = nr_surpluses;
),
- TP_printk("[%s] vrate=%llu->%llu busy=%d missed_ppm=%u:%u rq_wait_pct=%u lagging=%d shortages=%d surpluses=%d",
+ TP_printk("[%s] vrate=%llu->%llu busy=%d missed_ppm=%u:%u rq_wait_pct=%u lagging=%d shortages=%d",
__get_str(devname), __entry->old_vrate, __entry->new_vrate,
__entry->busy_level,
__entry->read_missed_ppm, __entry->write_missed_ppm,
- __entry->rq_wait_pct, __entry->nr_lagging, __entry->nr_shortages,
- __entry->nr_surpluses
+ __entry->rq_wait_pct, __entry->nr_lagging, __entry->nr_shortages
)
);
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 395dd0df8d08..c6ca33034147 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -288,6 +288,8 @@ struct vfs_ns_cap_data {
processes and setting the scheduling algorithm used by another
process. */
/* Allow setting cpu affinity on other processes */
+/* Allow setting realtime ioprio class */
+/* Allow setting ioprio class on other processes */
#define CAP_SYS_NICE 23
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 01e2858b5fe3..71385bedcc3a 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -335,26 +335,23 @@ static int swsusp_swap_check(void)
{
int res;
- res = swap_type_of(swsusp_resume_device, swsusp_resume_block,
- &hib_resume_bdev);
+ if (swsusp_resume_device)
+ res = swap_type_of(swsusp_resume_device, swsusp_resume_block);
+ else
+ res = find_first_swap(&swsusp_resume_device);
if (res < 0)
return res;
-
root_swap = res;
- res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL);
- if (res)
- return res;
+
+ hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, FMODE_WRITE,
+ NULL);
+ if (IS_ERR(hib_resume_bdev))
+ return PTR_ERR(hib_resume_bdev);
res = set_blocksize(hib_resume_bdev, PAGE_SIZE);
if (res < 0)
blkdev_put(hib_resume_bdev, FMODE_WRITE);
- /*
- * Update the resume device to the one actually used,
- * so the test_resume mode can use it in case it is
- * invoked from hibernate() to test the snapshot.
- */
- swsusp_resume_device = hib_resume_bdev->bd_dev;
return res;
}
diff --git a/kernel/power/user.c b/kernel/power/user.c
index d5eedc2baa2a..6510a8f7687f 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -35,12 +35,12 @@ static struct snapshot_data {
bool ready;
bool platform_support;
bool free_bitmaps;
- struct inode *bd_inode;
+ dev_t dev;
} snapshot_state;
-int is_hibernate_resume_dev(const struct inode *bd_inode)
+int is_hibernate_resume_dev(dev_t dev)
{
- return hibernation_available() && snapshot_state.bd_inode == bd_inode;
+ return hibernation_available() && snapshot_state.dev == dev;
}
static int snapshot_open(struct inode *inode, struct file *filp)
@@ -69,8 +69,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
memset(&data->handle, 0, sizeof(struct snapshot_handle));
if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
/* Hibernating. The image device should be accessible. */
- data->swap = swsusp_resume_device ?
- swap_type_of(swsusp_resume_device, 0, NULL) : -1;
+ data->swap = swap_type_of(swsusp_resume_device, 0);
data->mode = O_RDONLY;
data->free_bitmaps = false;
error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
@@ -101,7 +100,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
data->frozen = false;
data->ready = false;
data->platform_support = false;
- data->bd_inode = NULL;
+ data->dev = 0;
Unlock:
unlock_system_sleep();
@@ -117,7 +116,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
swsusp_free();
data = filp->private_data;
- data->bd_inode = NULL;
+ data->dev = 0;
free_all_swap_pages(data->swap);
if (data->frozen) {
pm_restore_gfp_mask();
@@ -210,7 +209,6 @@ struct compat_resume_swap_area {
static int snapshot_set_swap_area(struct snapshot_data *data,
void __user *argp)
{
- struct block_device *bdev;
sector_t offset;
dev_t swdev;
@@ -237,16 +235,10 @@ static int snapshot_set_swap_area(struct snapshot_data *data,
* User space encodes device types as two-byte values,
* so we need to recode them
*/
- if (!swdev) {
- data->swap = -1;
- return -EINVAL;
- }
- data->swap = swap_type_of(swdev, offset, &bdev);
+ data->swap = swap_type_of(swdev, offset);
if (data->swap < 0)
- return -ENODEV;
-
- data->bd_inode = bdev->bd_inode;
- bdput(bdev);
+ return swdev ? -ENODEV : -EINVAL;
+ data->dev = swdev;
return 0;
}
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 4b3a42fc3b24..c0887f32f647 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -793,7 +793,7 @@ static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
return cgroup_id(bio_blkcg(bio)->css.cgroup);
}
#else
-u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
+static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
{
return 0;
}
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8e8b00627bb2..408d5051d05b 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -14,9 +14,7 @@
#include <linux/device.h>
#include <trace/events/writeback.h>
-struct backing_dev_info noop_backing_dev_info = {
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
+struct backing_dev_info noop_backing_dev_info;
EXPORT_SYMBOL_GPL(noop_backing_dev_info);
static struct class *bdi_class;
@@ -204,10 +202,9 @@ static ssize_t stable_pages_required_show(struct device *dev,
struct device_attribute *attr,
char *page)
{
- struct backing_dev_info *bdi = dev_get_drvdata(dev);
-
- return snprintf(page, PAGE_SIZE-1, "%d\n",
- bdi_cap_stable_pages_required(bdi) ? 1 : 0);
+ dev_warn_once(dev,
+ "the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n");
+ return snprintf(page, PAGE_SIZE-1, "%d\n", 0);
}
static DEVICE_ATTR_RO(stable_pages_required);
@@ -746,6 +743,9 @@ struct backing_dev_info *bdi_alloc(int node_id)
kfree(bdi);
return NULL;
}
+ bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT;
+ bdi->ra_pages = VM_READAHEAD_PAGES;
+ bdi->io_pages = VM_READAHEAD_PAGES;
return bdi;
}
EXPORT_SYMBOL(bdi_alloc);
diff --git a/mm/filemap.c b/mm/filemap.c
index 5202e38ab79e..72aebff32039 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -414,7 +414,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
.range_end = end,
};
- if (!mapping_cap_writeback_dirty(mapping) ||
+ if (!mapping_can_writeback(mapping) ||
!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
@@ -1800,7 +1800,7 @@ repeat:
no_page:
if (!page && (fgp_flags & FGP_CREAT)) {
int err;
- if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
+ if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
gfp_mask |= __GFP_WRITE;
if (fgp_flags & FGP_NOFS)
gfp_mask &= ~__GFP_FS;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cfa6cbad21d5..17e9af5d635d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5643,7 +5643,7 @@ static int mem_cgroup_move_account(struct page *page,
if (PageDirty(page)) {
struct address_space *mapping = page_mapping(page);
- if (mapping_cap_account_dirty(mapping)) {
+ if (mapping_can_writeback(mapping)) {
__mod_lruvec_state(from_vec, NR_FILE_DIRTY,
-nr_pages);
__mod_lruvec_state(to_vec, NR_FILE_DIRTY,
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index f1aa6433f404..a1e73943445e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1006,7 +1006,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
*/
mapping = page_mapping(hpage);
if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
- mapping_cap_writeback_dirty(mapping)) {
+ mapping_can_writeback(mapping)) {
if (page_mkclean(hpage)) {
SetPageDirty(hpage);
} else {
diff --git a/mm/migrate.c b/mm/migrate.c
index aecb1433cf3c..d9f3e072a045 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -503,7 +503,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
__dec_lruvec_state(old_lruvec, NR_SHMEM);
__inc_lruvec_state(new_lruvec, NR_SHMEM);
}
- if (dirty && mapping_cap_account_dirty(mapping)) {
+ if (dirty && mapping_can_writeback(mapping)) {
__dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
__dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING);
__inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY);
diff --git a/mm/mmap.c b/mm/mmap.c
index 40248d84ad5f..1fc0e92be4ba 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1666,7 +1666,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
/* Can the mapping track the dirty pages? */
return vma->vm_file && vma->vm_file->f_mapping &&
- mapping_cap_account_dirty(vma->vm_file->f_mapping);
+ mapping_can_writeback(vma->vm_file->f_mapping);
}
/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 4e4ddd67b71e..358d6f28c627 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1882,7 +1882,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
int ratelimit;
int *p;
- if (!bdi_cap_account_dirty(bdi))
+ if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
return;
if (inode_cgwb_enabled(inode))
@@ -2423,7 +2423,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
trace_writeback_dirty_page(page, mapping);
- if (mapping_cap_account_dirty(mapping)) {
+ if (mapping_can_writeback(mapping)) {
struct bdi_writeback *wb;
inode_attach_wb(inode, page);
@@ -2450,7 +2450,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
void account_page_cleaned(struct page *page, struct address_space *mapping,
struct bdi_writeback *wb)
{
- if (mapping_cap_account_dirty(mapping)) {
+ if (mapping_can_writeback(mapping)) {
dec_lruvec_page_state(page, NR_FILE_DIRTY);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
dec_wb_stat(wb, WB_RECLAIMABLE);
@@ -2513,7 +2513,7 @@ void account_page_redirty(struct page *page)
{
struct address_space *mapping = page->mapping;
- if (mapping && mapping_cap_account_dirty(mapping)) {
+ if (mapping && mapping_can_writeback(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
struct wb_lock_cookie cookie = {};
@@ -2625,7 +2625,7 @@ void __cancel_dirty_page(struct page *page)
{
struct address_space *mapping = page_mapping(page);
- if (mapping_cap_account_dirty(mapping)) {
+ if (mapping_can_writeback(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
struct wb_lock_cookie cookie = {};
@@ -2665,7 +2665,7 @@ int clear_page_dirty_for_io(struct page *page)
VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (mapping && mapping_cap_account_dirty(mapping)) {
+ if (mapping && mapping_can_writeback(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
struct wb_lock_cookie cookie = {};
@@ -2738,7 +2738,7 @@ int test_clear_page_writeback(struct page *page)
if (ret) {
__xa_clear_mark(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_WRITEBACK);
- if (bdi_cap_account_writeback(bdi)) {
+ if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
struct bdi_writeback *wb = inode_to_wb(inode);
dec_wb_stat(wb, WB_WRITEBACK);
@@ -2791,7 +2791,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
PAGECACHE_TAG_WRITEBACK);
xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
- if (bdi_cap_account_writeback(bdi))
+ if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT)
inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
/*
@@ -2849,7 +2849,7 @@ EXPORT_SYMBOL_GPL(wait_on_page_writeback);
*/
void wait_for_stable_page(struct page *page)
{
- if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host)))
+ if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES)
wait_on_page_writeback(page);
}
EXPORT_SYMBOL_GPL(wait_for_stable_page);
diff --git a/mm/page_io.c b/mm/page_io.c
index e485a6e8a6cd..b199b87e0aa9 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -403,15 +403,17 @@ int swap_readpage(struct page *page, bool synchronous)
goto out;
}
- ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
- if (!ret) {
- if (trylock_page(page)) {
- swap_slot_free_notify(page);
- unlock_page(page);
- }
+ if (sis->flags & SWP_SYNCHRONOUS_IO) {
+ ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
+ if (!ret) {
+ if (trylock_page(page)) {
+ swap_slot_free_notify(page);
+ unlock_page(page);
+ }
- count_vm_event(PSWPIN);
- goto out;
+ count_vm_event(PSWPIN);
+ goto out;
+ }
}
ret = 0;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 12f59e641b5e..65ef407512b5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1801,13 +1801,12 @@ int free_swap_and_cache(swp_entry_t entry)
*
* This is needed for the suspend to disk (aka swsusp).
*/
-int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
+int swap_type_of(dev_t device, sector_t offset)
{
- struct block_device *bdev = NULL;
int type;
- if (device)
- bdev = bdget(device);
+ if (!device)
+ return -1;
spin_lock(&swap_lock);
for (type = 0; type < nr_swapfiles; type++) {
@@ -1816,30 +1815,34 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
if (!(sis->flags & SWP_WRITEOK))
continue;
- if (!bdev) {
- if (bdev_p)
- *bdev_p = bdgrab(sis->bdev);
-
- spin_unlock(&swap_lock);
- return type;
- }
- if (bdev == sis->bdev) {
+ if (device == sis->bdev->bd_dev) {
struct swap_extent *se = first_se(sis);
if (se->start_block == offset) {
- if (bdev_p)
- *bdev_p = bdgrab(sis->bdev);
-
spin_unlock(&swap_lock);
- bdput(bdev);
return type;
}
}
}
spin_unlock(&swap_lock);
- if (bdev)
- bdput(bdev);
+ return -ENODEV;
+}
+int find_first_swap(dev_t *device)
+{
+ int type;
+
+ spin_lock(&swap_lock);
+ for (type = 0; type < nr_swapfiles; type++) {
+ struct swap_info_struct *sis = swap_info[type];
+
+ if (!(sis->flags & SWP_WRITEOK))
+ continue;
+ *device = sis->bdev->bd_dev;
+ spin_unlock(&swap_lock);
+ return type;
+ }
+ spin_unlock(&swap_lock);
return -ENODEV;
}
@@ -2920,10 +2923,10 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
int error;
if (S_ISBLK(inode->i_mode)) {
- p->bdev = bdgrab(I_BDEV(inode));
- error = blkdev_get(p->bdev,
+ p->bdev = blkdev_get_by_dev(inode->i_rdev,
FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
- if (error < 0) {
+ if (IS_ERR(p->bdev)) {
+ error = PTR_ERR(p->bdev);
p->bdev = NULL;
return error;
}
@@ -3234,10 +3237,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto bad_swap_unlock_inode;
}
- if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
+ if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue))
p->flags |= SWP_STABLE_WRITES;
- if (bdi_cap_synchronous_io(inode_to_bdi(inode)))
+ if (p->bdev && p->bdev->bd_disk->fops->rw_page)
p->flags |= SWP_SYNCHRONOUS_IO;
if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
diff --git a/tools/cgroup/iocost_monitor.py b/tools/cgroup/iocost_monitor.py
index f4699f9b46ba..c4ff907c078b 100644
--- a/tools/cgroup/iocost_monitor.py
+++ b/tools/cgroup/iocost_monitor.py
@@ -45,8 +45,7 @@ except:
err('The kernel does not have iocost enabled')
IOC_RUNNING = prog['IOC_RUNNING'].value_()
-NR_USAGE_SLOTS = prog['NR_USAGE_SLOTS'].value_()
-HWEIGHT_WHOLE = prog['HWEIGHT_WHOLE'].value_()
+WEIGHT_ONE = prog['WEIGHT_ONE'].value_()
VTIME_PER_SEC = prog['VTIME_PER_SEC'].value_()
VTIME_PER_USEC = prog['VTIME_PER_USEC'].value_()
AUTOP_SSD_FAST = prog['AUTOP_SSD_FAST'].value_()
@@ -100,7 +99,7 @@ class IocStat:
self.period_ms = ioc.period_us.value_() / 1_000
self.period_at = ioc.period_at.value_() / 1_000_000
self.vperiod_at = ioc.period_at_vtime.value_() / VTIME_PER_SEC
- self.vrate_pct = ioc.vtime_rate.counter.value_() * 100 / VTIME_PER_USEC
+ self.vrate_pct = ioc.vtime_base_rate.value_() * 100 / VTIME_PER_USEC
self.busy_level = ioc.busy_level.value_()
self.autop_idx = ioc.autop_idx.value_()
self.user_cost_model = ioc.user_cost_model.value_()
@@ -136,7 +135,7 @@ class IocStat:
def table_header_str(self):
return f'{"":25} active {"weight":>9} {"hweight%":>13} {"inflt%":>6} ' \
- f'{"dbt":>3} {"delay":>6} {"usages%"}'
+ f'{"debt":>7} {"delay":>7} {"usage%"}'
class IocgStat:
def __init__(self, iocg):
@@ -144,11 +143,11 @@ class IocgStat:
blkg = iocg.pd.blkg
self.is_active = not list_empty(iocg.active_list.address_of_())
- self.weight = iocg.weight.value_()
- self.active = iocg.active.value_()
- self.inuse = iocg.inuse.value_()
- self.hwa_pct = iocg.hweight_active.value_() * 100 / HWEIGHT_WHOLE
- self.hwi_pct = iocg.hweight_inuse.value_() * 100 / HWEIGHT_WHOLE
+ self.weight = iocg.weight.value_() / WEIGHT_ONE
+ self.active = iocg.active.value_() / WEIGHT_ONE
+ self.inuse = iocg.inuse.value_() / WEIGHT_ONE
+ self.hwa_pct = iocg.hweight_active.value_() * 100 / WEIGHT_ONE
+ self.hwi_pct = iocg.hweight_inuse.value_() * 100 / WEIGHT_ONE
self.address = iocg.value_()
vdone = iocg.done_vtime.counter.value_()
@@ -160,23 +159,13 @@ class IocgStat:
else:
self.inflight_pct = 0
- # vdebt used to be an atomic64_t and is now u64, support both
- try:
- self.debt_ms = iocg.abs_vdebt.counter.value_() / VTIME_PER_USEC / 1000
- except:
- self.debt_ms = iocg.abs_vdebt.value_() / VTIME_PER_USEC / 1000
-
- self.use_delay = blkg.use_delay.counter.value_()
- self.delay_ms = blkg.delay_nsec.counter.value_() / 1_000_000
-
- usage_idx = iocg.usage_idx.value_()
- self.usages = []
- self.usage = 0
- for i in range(NR_USAGE_SLOTS):
- usage = iocg.usages[(usage_idx + 1 + i) % NR_USAGE_SLOTS].value_()
- upct = usage * 100 / HWEIGHT_WHOLE
- self.usages.append(upct)
- self.usage = max(self.usage, upct)
+ self.usage = (100 * iocg.usage_delta_us.value_() /
+ ioc.period_us.value_()) if self.active else 0
+ self.debt_ms = iocg.abs_vdebt.value_() / VTIME_PER_USEC / 1000
+ if blkg.use_delay.counter.value_() != 0:
+ self.delay_ms = blkg.delay_nsec.counter.value_() / 1_000_000
+ else:
+ self.delay_ms = 0
def dict(self, now, path):
out = { 'cgroup' : path,
@@ -189,25 +178,20 @@ class IocgStat:
'hweight_inuse_pct' : self.hwi_pct,
'inflight_pct' : self.inflight_pct,
'debt_ms' : self.debt_ms,
- 'use_delay' : self.use_delay,
'delay_ms' : self.delay_ms,
'usage_pct' : self.usage,
'address' : self.address }
- for i in range(len(self.usages)):
- out[f'usage_pct_{i}'] = str(self.usages[i])
return out
def table_row_str(self, path):
out = f'{path[-28:]:28} ' \
f'{"*" if self.is_active else " "} ' \
- f'{self.inuse:5}/{self.active:5} ' \
+ f'{round(self.inuse):5}/{round(self.active):5} ' \
f'{self.hwi_pct:6.2f}/{self.hwa_pct:6.2f} ' \
f'{self.inflight_pct:6.2f} ' \
- f'{min(math.ceil(self.debt_ms), 999):3} ' \
- f'{min(self.use_delay, 99):2}*'\
- f'{min(math.ceil(self.delay_ms), 999):03} '
- for u in self.usages:
- out += f'{min(round(u), 999):03d}:'
+ f'{self.debt_ms:7.2f} ' \
+ f'{self.delay_ms:7.2f} '\
+ f'{min(self.usage, 999):6.2f}'
out = out.rstrip(':')
return out