summaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to 'block')
-rw-r--r--block/Kconfig18
-rw-r--r--block/Makefile2
-rw-r--r--block/bfq-iosched.c2
-rw-r--r--block/bio-integrity.c3
-rw-r--r--block/bio.c184
-rw-r--r--block/blk-cgroup.c6
-rw-r--r--block/blk-core.c325
-rw-r--r--block/blk-crypto-fallback.c657
-rw-r--r--block/blk-crypto-internal.h201
-rw-r--r--block/blk-crypto.c404
-rw-r--r--block/blk-exec.c2
-rw-r--r--block/blk-flush.c26
-rw-r--r--block/blk-integrity.c7
-rw-r--r--block/blk-iocost.c86
-rw-r--r--block/blk-map.c15
-rw-r--r--block/blk-merge.c76
-rw-r--r--block/blk-mq-debugfs.c3
-rw-r--r--block/blk-mq-sched.c82
-rw-r--r--block/blk-mq-tag.c70
-rw-r--r--block/blk-mq-tag.h6
-rw-r--r--block/blk-mq.c407
-rw-r--r--block/blk-mq.h4
-rw-r--r--block/blk-settings.c68
-rw-r--r--block/blk-sysfs.c13
-rw-r--r--block/blk-throttle.c63
-rw-r--r--block/blk-wbt.c16
-rw-r--r--block/blk-wbt.h4
-rw-r--r--block/blk-zoned.c23
-rw-r--r--block/blk.h88
-rw-r--r--block/bounce.c2
-rw-r--r--block/genhd.c133
-rw-r--r--block/ioctl.c154
-rw-r--r--block/keyslot-manager.c397
-rw-r--r--block/kyber-iosched.c2
-rw-r--r--block/mq-deadline.c2
-rw-r--r--block/partitions/core.c187
36 files changed, 2879 insertions, 859 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 3bc76bb113a0..9382a4acefc3 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -146,6 +146,7 @@ config BLK_CGROUP_IOLATENCY
config BLK_CGROUP_IOCOST
bool "Enable support for cost model based cgroup IO controller"
depends on BLK_CGROUP=y
+ select BLK_RQ_IO_DATA_LEN
select BLK_RQ_ALLOC_TIME
---help---
Enabling this option enables the .weight interface for cost
@@ -185,6 +186,23 @@ config BLK_SED_OPAL
Enabling this option enables users to setup/unlock/lock
Locking ranges for SED devices using the Opal protocol.
+config BLK_INLINE_ENCRYPTION
+ bool "Enable inline encryption support in block layer"
+ help
+ Build the blk-crypto subsystem. Enabling this lets the
+ block layer handle encryption, so users can take
+ advantage of inline encryption hardware if present.
+
+config BLK_INLINE_ENCRYPTION_FALLBACK
+ bool "Enable crypto API fallback for blk-crypto"
+ depends on BLK_INLINE_ENCRYPTION
+ select CRYPTO
+ select CRYPTO_SKCIPHER
+ help
+ Enabling this lets the block layer handle inline encryption
+ by falling back to the kernel crypto API when inline
+ encryption hardware is not present.
+
menu "Partition Types"
source "block/partitions/Kconfig"
diff --git a/block/Makefile b/block/Makefile
index 206b96e9387f..78719169fb2a 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -36,3 +36,5 @@ obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
obj-$(CONFIG_BLK_PM) += blk-pm.o
+obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o blk-crypto.o
+obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 3d411716d7ee..50c8f034c01c 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -6073,7 +6073,7 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
* comments on bfq_init_rq for the reason behind this delayed
* preparation.
*/
-static void bfq_prepare_request(struct request *rq, struct bio *bio)
+static void bfq_prepare_request(struct request *rq)
{
/*
* Regardless of whether we have an icq attached, we have to
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index bf62c25cde8f..3579ac0f6ec1 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -42,6 +42,9 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
struct bio_set *bs = bio->bi_pool;
unsigned inline_vecs;
+ if (WARN_ON_ONCE(bio_has_crypt_ctx(bio)))
+ return ERR_PTR(-EOPNOTSUPP);
+
if (!bs || !mempool_initialized(&bs->bio_integrity_pool)) {
bip = kmalloc(struct_size(bip, bip_inline_vecs, nr_vecs), gfp_mask);
inline_vecs = nr_vecs;
diff --git a/block/bio.c b/block/bio.c
index 21cbaa6a1c20..5235da6434aa 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -18,6 +18,7 @@
#include <linux/blk-cgroup.h>
#include <linux/highmem.h>
#include <linux/sched/sysctl.h>
+#include <linux/blk-crypto.h>
#include <trace/events/block.h>
#include "blk.h"
@@ -237,6 +238,8 @@ void bio_uninit(struct bio *bio)
if (bio_integrity(bio))
bio_integrity_free(bio);
+
+ bio_crypt_free_ctx(bio);
}
EXPORT_SYMBOL(bio_uninit);
@@ -708,6 +711,8 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
__bio_clone_fast(b, bio);
+ bio_crypt_clone(b, bio, gfp_mask);
+
if (bio_integrity(bio)) {
int ret;
@@ -748,9 +753,14 @@ static inline bool page_is_mergeable(const struct bio_vec *bv,
return true;
}
-static bool bio_try_merge_pc_page(struct request_queue *q, struct bio *bio,
- struct page *page, unsigned len, unsigned offset,
- bool *same_page)
+/*
+ * Try to merge a page into a segment, while obeying the hardware segment
+ * size limit. This is not for normal read/write bios, but for passthrough
+ * or Zone Append operations that we can't split.
+ */
+static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio,
+ struct page *page, unsigned len,
+ unsigned offset, bool *same_page)
{
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
unsigned long mask = queue_segment_boundary(q);
@@ -765,38 +775,32 @@ static bool bio_try_merge_pc_page(struct request_queue *q, struct bio *bio,
}
/**
- * __bio_add_pc_page - attempt to add page to passthrough bio
- * @q: the target queue
- * @bio: destination bio
- * @page: page to add
- * @len: vec entry length
- * @offset: vec entry offset
- * @same_page: return if the merge happen inside the same page
- *
- * Attempt to add a page to the bio_vec maplist. This can fail for a
- * number of reasons, such as the bio being full or target block device
- * limitations. The target block device must allow bio's up to PAGE_SIZE,
- * so it is always possible to add a single page to an empty bio.
+ * bio_add_hw_page - attempt to add a page to a bio with hw constraints
+ * @q: the target queue
+ * @bio: destination bio
+ * @page: page to add
+ * @len: vec entry length
+ * @offset: vec entry offset
+ * @max_sectors: maximum number of sectors that can be added
+ * @same_page: return if the segment has been merged inside the same page
*
- * This should only be used by passthrough bios.
+ * Add a page to a bio while respecting the hardware max_sectors, max_segment
+ * and gap limitations.
*/
-int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
+int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
- bool *same_page)
+ unsigned int max_sectors, bool *same_page)
{
struct bio_vec *bvec;
- /*
- * cloned bio must not modify vec list
- */
- if (unlikely(bio_flagged(bio, BIO_CLONED)))
+ if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return 0;
- if (((bio->bi_iter.bi_size + len) >> 9) > queue_max_hw_sectors(q))
+ if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
return 0;
if (bio->bi_vcnt > 0) {
- if (bio_try_merge_pc_page(q, bio, page, len, offset, same_page))
+ if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page))
return len;
/*
@@ -823,11 +827,27 @@ int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
return len;
}
+/**
+ * bio_add_pc_page - attempt to add page to passthrough bio
+ * @q: the target queue
+ * @bio: destination bio
+ * @page: page to add
+ * @len: vec entry length
+ * @offset: vec entry offset
+ *
+ * Attempt to add a page to the bio_vec maplist. This can fail for a
+ * number of reasons, such as the bio being full or target block device
+ * limitations. The target block device must allow bio's up to PAGE_SIZE,
+ * so it is always possible to add a single page to an empty bio.
+ *
+ * This should only be used by passthrough bios.
+ */
int bio_add_pc_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset)
{
bool same_page = false;
- return __bio_add_pc_page(q, bio, page, len, offset, &same_page);
+ return bio_add_hw_page(q, bio, page, len, offset,
+ queue_max_hw_sectors(q), &same_page);
}
EXPORT_SYMBOL(bio_add_pc_page);
@@ -936,6 +956,7 @@ void bio_release_pages(struct bio *bio, bool mark_dirty)
put_page(bvec->bv_page);
}
}
+EXPORT_SYMBOL_GPL(bio_release_pages);
static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
{
@@ -1010,6 +1031,50 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
return 0;
}
+static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
+{
+ unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
+ unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
+ struct request_queue *q = bio->bi_disk->queue;
+ unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
+ struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
+ struct page **pages = (struct page **)bv;
+ ssize_t size, left;
+ unsigned len, i;
+ size_t offset;
+
+ if (WARN_ON_ONCE(!max_append_sectors))
+ return 0;
+
+ /*
+ * Move page array up in the allocated memory for the bio vecs as far as
+ * possible so that we can start filling biovecs from the beginning
+ * without overwriting the temporary page array.
+ */
+ BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
+ pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
+
+ size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
+ if (unlikely(size <= 0))
+ return size ? size : -EFAULT;
+
+ for (left = size, i = 0; left > 0; left -= len, i++) {
+ struct page *page = pages[i];
+ bool same_page = false;
+
+ len = min_t(size_t, PAGE_SIZE - offset, left);
+ if (bio_add_hw_page(q, bio, page, len, offset,
+ max_append_sectors, &same_page) != len)
+ return -EINVAL;
+ if (same_page)
+ put_page(page);
+ offset = 0;
+ }
+
+ iov_iter_advance(iter, size);
+ return 0;
+}
+
/**
* bio_iov_iter_get_pages - add user or kernel pages to a bio
* @bio: bio to add pages to
@@ -1039,16 +1104,23 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
return -EINVAL;
do {
- if (is_bvec)
- ret = __bio_iov_bvec_add_pages(bio, iter);
- else
- ret = __bio_iov_iter_get_pages(bio, iter);
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ if (WARN_ON_ONCE(is_bvec))
+ return -EINVAL;
+ ret = __bio_iov_append_get_pages(bio, iter);
+ } else {
+ if (is_bvec)
+ ret = __bio_iov_bvec_add_pages(bio, iter);
+ else
+ ret = __bio_iov_iter_get_pages(bio, iter);
+ }
} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
if (is_bvec)
bio_set_flag(bio, BIO_NO_PAGE_REF);
return bio->bi_vcnt ? 0 : ret;
}
+EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
static void submit_bio_wait_endio(struct bio *bio)
{
@@ -1105,6 +1177,7 @@ void bio_advance(struct bio *bio, unsigned bytes)
if (bio_integrity(bio))
bio_integrity_advance(bio, bytes);
+ bio_crypt_advance(bio, bytes);
bio_advance_iter(bio, &bio->bi_iter, bytes);
}
EXPORT_SYMBOL(bio_advance);
@@ -1303,55 +1376,6 @@ defer:
schedule_work(&bio_dirty_work);
}
-void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
-{
- unsigned long stamp;
-again:
- stamp = READ_ONCE(part->stamp);
- if (unlikely(stamp != now)) {
- if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) {
- __part_stat_add(part, io_ticks, end ? now - stamp : 1);
- }
- }
- if (part->partno) {
- part = &part_to_disk(part)->part0;
- goto again;
- }
-}
-
-void generic_start_io_acct(struct request_queue *q, int op,
- unsigned long sectors, struct hd_struct *part)
-{
- const int sgrp = op_stat_group(op);
-
- part_stat_lock();
-
- update_io_ticks(part, jiffies, false);
- part_stat_inc(part, ios[sgrp]);
- part_stat_add(part, sectors[sgrp], sectors);
- part_inc_in_flight(q, part, op_is_write(op));
-
- part_stat_unlock();
-}
-EXPORT_SYMBOL(generic_start_io_acct);
-
-void generic_end_io_acct(struct request_queue *q, int req_op,
- struct hd_struct *part, unsigned long start_time)
-{
- unsigned long now = jiffies;
- unsigned long duration = now - start_time;
- const int sgrp = op_stat_group(req_op);
-
- part_stat_lock();
-
- update_io_ticks(part, now, true);
- part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
- part_dec_in_flight(q, part, op_is_write(req_op));
-
- part_stat_unlock();
-}
-EXPORT_SYMBOL(generic_end_io_acct);
-
static inline bool bio_remaining_done(struct bio *bio)
{
/*
@@ -1445,6 +1469,10 @@ struct bio *bio_split(struct bio *bio, int sectors,
BUG_ON(sectors <= 0);
BUG_ON(sectors >= bio_sectors(bio));
+ /* Zone append commands cannot be split */
+ if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
+ return NULL;
+
split = bio_clone_fast(bio, gfp, bs);
if (!split)
return NULL;
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 930212c1a512..0ecc897b225c 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1530,6 +1530,10 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
{
u64 old = atomic64_read(&blkg->delay_start);
+ /* negative use_delay means no scaling, see blkcg_set_delay() */
+ if (atomic_read(&blkg->use_delay) < 0)
+ return;
+
/*
* We only want to scale down every second. The idea here is that we
* want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
@@ -1717,6 +1721,8 @@ void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
*/
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
{
+ if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
+ return;
blkcg_scale_delay(blkg, now);
atomic64_add(delta, &blkg->delay_nsec);
}
diff --git a/block/blk-core.c b/block/blk-core.c
index 38d7b1f16067..03252af8c82c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -39,6 +39,8 @@
#include <linux/debugfs.h>
#include <linux/bpf.h>
#include <linux/psi.h>
+#include <linux/sched/sysctl.h>
+#include <linux/blk-crypto.h>
#define CREATE_TRACE_POINTS
#include <trace/events/block.h>
@@ -121,6 +123,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->start_time_ns = ktime_get_ns();
rq->part = NULL;
refcount_set(&rq->ref, 1);
+ blk_crypto_rq_set_defaults(rq);
}
EXPORT_SYMBOL(blk_rq_init);
@@ -136,6 +139,7 @@ static const char *const blk_op_name[] = {
REQ_OP_NAME(ZONE_OPEN),
REQ_OP_NAME(ZONE_CLOSE),
REQ_OP_NAME(ZONE_FINISH),
+ REQ_OP_NAME(ZONE_APPEND),
REQ_OP_NAME(WRITE_SAME),
REQ_OP_NAME(WRITE_ZEROES),
REQ_OP_NAME(SCSI_IN),
@@ -241,6 +245,17 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
bio_advance(bio, nbytes);
+ if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) {
+ /*
+ * Partial zone append completions cannot be supported as the
+ * BIO fragments may end up not being written sequentially.
+ */
+ if (bio->bi_iter.bi_size)
+ bio->bi_status = BLK_STS_IOERR;
+ else
+ bio->bi_iter.bi_sector = rq->__sector;
+ }
+
/* don't actually finish bio if it's part of flush sequence */
if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
bio_endio(bio);
@@ -441,6 +456,23 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
}
}
+static inline int bio_queue_enter(struct bio *bio)
+{
+ struct request_queue *q = bio->bi_disk->queue;
+ bool nowait = bio->bi_opf & REQ_NOWAIT;
+ int ret;
+
+ ret = blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0);
+ if (unlikely(ret)) {
+ if (nowait && !blk_queue_dying(q))
+ bio_wouldblock_error(bio);
+ else
+ bio_io_error(bio);
+ }
+
+ return ret;
+}
+
void blk_queue_exit(struct request_queue *q)
{
percpu_ref_put(&q->q_usage_counter);
@@ -485,7 +517,7 @@ struct request_queue *__blk_alloc_queue(int node_id)
if (ret)
goto fail_id;
- q->backing_dev_info = bdi_alloc_node(GFP_KERNEL, node_id);
+ q->backing_dev_info = bdi_alloc(node_id);
if (!q->backing_dev_info)
goto fail_split;
@@ -495,7 +527,6 @@ struct request_queue *__blk_alloc_queue(int node_id)
q->backing_dev_info->ra_pages = VM_READAHEAD_PAGES;
q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
- q->backing_dev_info->name = "block";
q->node = node_id;
timer_setup(&q->backing_dev_info->laptop_mode_wb_timer,
@@ -606,6 +637,16 @@ void blk_put_request(struct request *req)
}
EXPORT_SYMBOL(blk_put_request);
+static void blk_account_io_merge_bio(struct request *req)
+{
+ if (!blk_do_io_stat(req))
+ return;
+
+ part_stat_lock();
+ part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
+ part_stat_unlock();
+}
+
bool bio_attempt_back_merge(struct request *req, struct bio *bio,
unsigned int nr_segs)
{
@@ -624,7 +665,9 @@ bool bio_attempt_back_merge(struct request *req, struct bio *bio,
req->biotail = bio;
req->__data_len += bio->bi_iter.bi_size;
- blk_account_io_start(req, false);
+ bio_crypt_free_ctx(bio);
+
+ blk_account_io_merge_bio(req);
return true;
}
@@ -648,7 +691,9 @@ bool bio_attempt_front_merge(struct request *req, struct bio *bio,
req->__sector = bio->bi_iter.bi_sector;
req->__data_len += bio->bi_iter.bi_size;
- blk_account_io_start(req, false);
+ bio_crypt_do_front_merge(req, bio);
+
+ blk_account_io_merge_bio(req);
return true;
}
@@ -670,7 +715,7 @@ bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
req->__data_len += bio->bi_iter.bi_size;
req->nr_phys_segments = segments + 1;
- blk_account_io_start(req, false);
+ blk_account_io_merge_bio(req);
return true;
no_merge:
req_set_nomerge(q, req);
@@ -872,6 +917,41 @@ out:
return ret;
}
+/*
+ * Check write append to a zoned block device.
+ */
+static inline blk_status_t blk_check_zone_append(struct request_queue *q,
+ struct bio *bio)
+{
+ sector_t pos = bio->bi_iter.bi_sector;
+ int nr_sectors = bio_sectors(bio);
+
+ /* Only applicable to zoned block devices */
+ if (!blk_queue_is_zoned(q))
+ return BLK_STS_NOTSUPP;
+
+ /* The bio sector must point to the start of a sequential zone */
+ if (pos & (blk_queue_zone_sectors(q) - 1) ||
+ !blk_queue_zone_is_seq(q, pos))
+ return BLK_STS_IOERR;
+
+ /*
+ * Not allowed to cross zone boundaries. Otherwise, the BIO will be
+ * split and could result in non-contiguous sectors being written in
+ * different zones.
+ */
+ if (nr_sectors > q->limits.chunk_sectors)
+ return BLK_STS_IOERR;
+
+ /* Make sure the BIO is small enough and will not get split */
+ if (nr_sectors > q->limits.max_zone_append_sectors)
+ return BLK_STS_IOERR;
+
+ bio->bi_opf |= REQ_NOMERGE;
+
+ return BLK_STS_OK;
+}
+
static noinline_for_stack bool
generic_make_request_checks(struct bio *bio)
{
@@ -941,6 +1021,11 @@ generic_make_request_checks(struct bio *bio)
if (!q->limits.max_write_same_sectors)
goto not_supported;
break;
+ case REQ_OP_ZONE_APPEND:
+ status = blk_check_zone_append(q, bio);
+ if (status != BLK_STS_OK)
+ goto end_io;
+ break;
case REQ_OP_ZONE_RESET:
case REQ_OP_ZONE_OPEN:
case REQ_OP_ZONE_CLOSE:
@@ -961,12 +1046,13 @@ generic_make_request_checks(struct bio *bio)
}
/*
- * Various block parts want %current->io_context and lazy ioc
- * allocation ends up trading a lot of pain for a small amount of
- * memory. Just allocate it upfront. This may fail and block
- * layer knows how to live with it.
+ * Various block parts want %current->io_context, so allocate it up
+ * front rather than dealing with lots of pain to allocate it only
+ * where needed. This may fail and the block layer knows how to live
+ * with it.
*/
- create_io_context(GFP_ATOMIC, q->node);
+ if (unlikely(!current->io_context))
+ create_task_io_context(current, GFP_ATOMIC, q->node);
if (!blkcg_bio_issue_check(q, bio))
return false;
@@ -988,29 +1074,28 @@ end_io:
return false;
}
+static blk_qc_t do_make_request(struct bio *bio)
+{
+ struct request_queue *q = bio->bi_disk->queue;
+ blk_qc_t ret = BLK_QC_T_NONE;
+
+ if (blk_crypto_bio_prep(&bio)) {
+ if (!q->make_request_fn)
+ return blk_mq_make_request(q, bio);
+ ret = q->make_request_fn(q, bio);
+ }
+ blk_queue_exit(q);
+ return ret;
+}
+
/**
- * generic_make_request - hand a buffer to its device driver for I/O
+ * generic_make_request - re-submit a bio to the block device layer for I/O
* @bio: The bio describing the location in memory and on the device.
*
- * generic_make_request() is used to make I/O requests of block
- * devices. It is passed a &struct bio, which describes the I/O that needs
- * to be done.
- *
- * generic_make_request() does not return any status. The
- * success/failure status of the request, along with notification of
- * completion, is delivered asynchronously through the bio->bi_end_io
- * function described (one day) else where.
- *
- * The caller of generic_make_request must make sure that bi_io_vec
- * are set to describe the memory buffer, and that bi_dev and bi_sector are
- * set to describe the device address, and the
- * bi_end_io and optionally bi_private are set to describe how
- * completion notification should be signaled.
- *
- * generic_make_request and the drivers it calls may use bi_next if this
- * bio happens to be merged with someone else, and may resubmit the bio to
- * a lower device by calling into generic_make_request recursively, which
- * means the bio should NOT be touched after the call to ->make_request_fn.
+ * This is a version of submit_bio() that shall only be used for I/O that is
+ * resubmitted to lower level drivers by stacking block drivers. All file
+ * systems and other upper level users of the block layer should use
+ * submit_bio() instead.
*/
blk_qc_t generic_make_request(struct bio *bio)
{
@@ -1061,18 +1146,14 @@ blk_qc_t generic_make_request(struct bio *bio)
current->bio_list = bio_list_on_stack;
do {
struct request_queue *q = bio->bi_disk->queue;
- blk_mq_req_flags_t flags = bio->bi_opf & REQ_NOWAIT ?
- BLK_MQ_REQ_NOWAIT : 0;
- if (likely(blk_queue_enter(q, flags) == 0)) {
+ if (likely(bio_queue_enter(bio) == 0)) {
struct bio_list lower, same;
/* Create a fresh bio_list for all subordinate requests */
bio_list_on_stack[1] = bio_list_on_stack[0];
bio_list_init(&bio_list_on_stack[0]);
- ret = q->make_request_fn(q, bio);
-
- blk_queue_exit(q);
+ ret = do_make_request(bio);
/* sort new bios into those for a lower level
* and those for the same level
@@ -1088,12 +1169,6 @@ blk_qc_t generic_make_request(struct bio *bio)
bio_list_merge(&bio_list_on_stack[0], &lower);
bio_list_merge(&bio_list_on_stack[0], &same);
bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
- } else {
- if (unlikely(!blk_queue_dying(q) &&
- (bio->bi_opf & REQ_NOWAIT)))
- bio_wouldblock_error(bio);
- else
- bio_io_error(bio);
}
bio = bio_list_pop(&bio_list_on_stack[0]);
} while (bio);
@@ -1110,30 +1185,25 @@ EXPORT_SYMBOL(generic_make_request);
*
* This function behaves like generic_make_request(), but does not protect
* against recursion. Must only be used if the called driver is known
- * to not call generic_make_request (or direct_make_request) again from
- * its make_request function. (Calling direct_make_request again from
- * a workqueue is perfectly fine as that doesn't recurse).
+ * to be blk-mq based.
*/
blk_qc_t direct_make_request(struct bio *bio)
{
struct request_queue *q = bio->bi_disk->queue;
- bool nowait = bio->bi_opf & REQ_NOWAIT;
- blk_qc_t ret;
+ if (WARN_ON_ONCE(q->make_request_fn)) {
+ bio_io_error(bio);
+ return BLK_QC_T_NONE;
+ }
if (!generic_make_request_checks(bio))
return BLK_QC_T_NONE;
-
- if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) {
- if (nowait && !blk_queue_dying(q))
- bio_wouldblock_error(bio);
- else
- bio_io_error(bio);
+ if (unlikely(bio_queue_enter(bio)))
+ return BLK_QC_T_NONE;
+ if (!blk_crypto_bio_prep(&bio)) {
+ blk_queue_exit(q);
return BLK_QC_T_NONE;
}
-
- ret = q->make_request_fn(q, bio);
- blk_queue_exit(q);
- return ret;
+ return blk_mq_make_request(q, bio);
}
EXPORT_SYMBOL_GPL(direct_make_request);
@@ -1141,17 +1211,17 @@ EXPORT_SYMBOL_GPL(direct_make_request);
* submit_bio - submit a bio to the block device layer for I/O
* @bio: The &struct bio which describes the I/O
*
- * submit_bio() is very similar in purpose to generic_make_request(), and
- * uses that function to do most of the work. Both are fairly rough
- * interfaces; @bio must be presetup and ready for I/O.
+ * submit_bio() is used to submit I/O requests to block devices. It is passed a
+ * fully set up &struct bio that describes the I/O that needs to be done. The
+ * bio will be send to the device described by the bi_disk and bi_partno fields.
*
+ * The success/failure status of the request, along with notification of
+ * completion, is delivered asynchronously through the ->bi_end_io() callback
+ * in @bio. The bio must NOT be touched by thecaller until ->bi_end_io() has
+ * been called.
*/
blk_qc_t submit_bio(struct bio *bio)
{
- bool workingset_read = false;
- unsigned long pflags;
- blk_qc_t ret;
-
if (blkcg_punt_bio_submit(bio))
return BLK_QC_T_NONE;
@@ -1170,8 +1240,6 @@ blk_qc_t submit_bio(struct bio *bio)
if (op_is_write(bio_op(bio))) {
count_vm_events(PGPGOUT, count);
} else {
- if (bio_flagged(bio, BIO_WORKINGSET))
- workingset_read = true;
task_io_account_read(bio->bi_iter.bi_size);
count_vm_events(PGPGIN, count);
}
@@ -1187,20 +1255,24 @@ blk_qc_t submit_bio(struct bio *bio)
}
/*
- * If we're reading data that is part of the userspace
- * workingset, count submission time as memory stall. When the
- * device is congested, or the submitting cgroup IO-throttled,
- * submission can be a significant part of overall IO time.
+ * If we're reading data that is part of the userspace workingset, count
+ * submission time as memory stall. When the device is congested, or
+ * the submitting cgroup IO-throttled, submission can be a significant
+ * part of overall IO time.
*/
- if (workingset_read)
- psi_memstall_enter(&pflags);
-
- ret = generic_make_request(bio);
+ if (unlikely(bio_op(bio) == REQ_OP_READ &&
+ bio_flagged(bio, BIO_WORKINGSET))) {
+ unsigned long pflags;
+ blk_qc_t ret;
- if (workingset_read)
+ psi_memstall_enter(&pflags);
+ ret = generic_make_request(bio);
psi_memstall_leave(&pflags);
- return ret;
+ return ret;
+ }
+
+ return generic_make_request(bio);
}
EXPORT_SYMBOL(submit_bio);
@@ -1261,8 +1333,11 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
return BLK_STS_IOERR;
+ if (blk_crypto_insert_cloned_request(rq))
+ return BLK_STS_IOERR;
+
if (blk_queue_io_stat(q))
- blk_account_io_start(rq, true);
+ blk_account_io_start(rq);
/*
* Since we have a scheduler attached on the top device,
@@ -1314,7 +1389,22 @@ unsigned int blk_rq_err_bytes(const struct request *rq)
}
EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
-void blk_account_io_completion(struct request *req, unsigned int bytes)
+static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
+{
+ unsigned long stamp;
+again:
+ stamp = READ_ONCE(part->stamp);
+ if (unlikely(stamp != now)) {
+ if (likely(cmpxchg(&part->stamp, stamp, now) == stamp))
+ __part_stat_add(part, io_ticks, end ? now - stamp : 1);
+ }
+ if (part->partno) {
+ part = &part_to_disk(part)->part0;
+ goto again;
+ }
+}
+
+static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
if (req->part && blk_do_io_stat(req)) {
const int sgrp = op_stat_group(req_op(req));
@@ -1345,48 +1435,57 @@ void blk_account_io_done(struct request *req, u64 now)
update_io_ticks(part, jiffies, true);
part_stat_inc(part, ios[sgrp]);
part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
- part_dec_in_flight(req->q, part, rq_data_dir(req));
+ part_stat_unlock();
hd_struct_put(part);
- part_stat_unlock();
}
}
-void blk_account_io_start(struct request *rq, bool new_io)
+void blk_account_io_start(struct request *rq)
{
- struct hd_struct *part;
- int rw = rq_data_dir(rq);
-
if (!blk_do_io_stat(rq))
return;
+ rq->part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
+
part_stat_lock();
+ update_io_ticks(rq->part, jiffies, false);
+ part_stat_unlock();
+}
- if (!new_io) {
- part = rq->part;
- part_stat_inc(part, merges[rw]);
- } else {
- part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
- if (!hd_struct_try_get(part)) {
- /*
- * The partition is already being removed,
- * the request will be accounted on the disk only
- *
- * We take a reference on disk->part0 although that
- * partition will never be deleted, so we can treat
- * it as any other partition.
- */
- part = &rq->rq_disk->part0;
- hd_struct_get(part);
- }
- part_inc_in_flight(rq->q, part, rw);
- rq->part = part;
- }
+unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
+ unsigned int op)
+{
+ struct hd_struct *part = &disk->part0;
+ const int sgrp = op_stat_group(op);
+ unsigned long now = READ_ONCE(jiffies);
+
+ part_stat_lock();
+ update_io_ticks(part, now, false);
+ part_stat_inc(part, ios[sgrp]);
+ part_stat_add(part, sectors[sgrp], sectors);
+ part_stat_local_inc(part, in_flight[op_is_write(op)]);
+ part_stat_unlock();
- update_io_ticks(part, jiffies, false);
+ return now;
+}
+EXPORT_SYMBOL(disk_start_io_acct);
+
+void disk_end_io_acct(struct gendisk *disk, unsigned int op,
+ unsigned long start_time)
+{
+ struct hd_struct *part = &disk->part0;
+ const int sgrp = op_stat_group(op);
+ unsigned long now = READ_ONCE(jiffies);
+ unsigned long duration = now - start_time;
+ part_stat_lock();
+ update_io_ticks(part, now, true);
+ part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
+ part_stat_local_dec(part, in_flight[op_is_write(op)]);
part_stat_unlock();
}
+EXPORT_SYMBOL(disk_end_io_acct);
/*
* Steal bios from a request and add them to a bio list.
@@ -1636,7 +1735,9 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
}
rq->nr_phys_segments = rq_src->nr_phys_segments;
rq->ioprio = rq_src->ioprio;
- rq->extra_len = rq_src->extra_len;
+
+ if (rq->bio)
+ blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask);
return 0;
@@ -1778,6 +1879,18 @@ void blk_finish_plug(struct blk_plug *plug)
}
EXPORT_SYMBOL(blk_finish_plug);
+void blk_io_schedule(void)
+{
+ /* Prevent hang_check timer from firing at us during very long I/O */
+ unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;
+
+ if (timeout)
+ io_schedule_timeout(timeout);
+ else
+ io_schedule();
+}
+EXPORT_SYMBOL_GPL(blk_io_schedule);
+
int __init blk_dev_init(void)
{
BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS));
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
new file mode 100644
index 000000000000..6e49688a2d80
--- /dev/null
+++ b/block/blk-crypto-fallback.c
@@ -0,0 +1,657 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 Google LLC
+ */
+
+/*
+ * Refer to Documentation/block/inline-encryption.rst for detailed explanation.
+ */
+
+#define pr_fmt(fmt) "blk-crypto-fallback: " fmt
+
+#include <crypto/skcipher.h>
+#include <linux/blk-cgroup.h>
+#include <linux/blk-crypto.h>
+#include <linux/blkdev.h>
+#include <linux/crypto.h>
+#include <linux/keyslot-manager.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/random.h>
+
+#include "blk-crypto-internal.h"
+
+static unsigned int num_prealloc_bounce_pg = 32;
+module_param(num_prealloc_bounce_pg, uint, 0);
+MODULE_PARM_DESC(num_prealloc_bounce_pg,
+ "Number of preallocated bounce pages for the blk-crypto crypto API fallback");
+
+static unsigned int blk_crypto_num_keyslots = 100;
+module_param_named(num_keyslots, blk_crypto_num_keyslots, uint, 0);
+MODULE_PARM_DESC(num_keyslots,
+ "Number of keyslots for the blk-crypto crypto API fallback");
+
+static unsigned int num_prealloc_fallback_crypt_ctxs = 128;
+module_param(num_prealloc_fallback_crypt_ctxs, uint, 0);
+MODULE_PARM_DESC(num_prealloc_crypt_fallback_ctxs,
+ "Number of preallocated bio fallback crypto contexts for blk-crypto to use during crypto API fallback");
+
+struct bio_fallback_crypt_ctx {
+ struct bio_crypt_ctx crypt_ctx;
+ /*
+ * Copy of the bvec_iter when this bio was submitted.
+ * We only want to en/decrypt the part of the bio as described by the
+ * bvec_iter upon submission because bio might be split before being
+ * resubmitted
+ */
+ struct bvec_iter crypt_iter;
+ union {
+ struct {
+ struct work_struct work;
+ struct bio *bio;
+ };
+ struct {
+ void *bi_private_orig;
+ bio_end_io_t *bi_end_io_orig;
+ };
+ };
+};
+
+static struct kmem_cache *bio_fallback_crypt_ctx_cache;
+static mempool_t *bio_fallback_crypt_ctx_pool;
+
+/*
+ * Allocating a crypto tfm during I/O can deadlock, so we have to preallocate
+ * all of a mode's tfms when that mode starts being used. Since each mode may
+ * need all the keyslots at some point, each mode needs its own tfm for each
+ * keyslot; thus, a keyslot may contain tfms for multiple modes. However, to
+ * match the behavior of real inline encryption hardware (which only supports a
+ * single encryption context per keyslot), we only allow one tfm per keyslot to
+ * be used at a time - the rest of the unused tfms have their keys cleared.
+ */
+static DEFINE_MUTEX(tfms_init_lock);
+static bool tfms_inited[BLK_ENCRYPTION_MODE_MAX];
+
+static struct blk_crypto_keyslot {
+ enum blk_crypto_mode_num crypto_mode;
+ struct crypto_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX];
+} *blk_crypto_keyslots;
+
+static struct blk_keyslot_manager blk_crypto_ksm;
+static struct workqueue_struct *blk_crypto_wq;
+static mempool_t *blk_crypto_bounce_page_pool;
+
+/*
+ * This is the key we set when evicting a keyslot. This *should* be the all 0's
+ * key, but AES-XTS rejects that key, so we use some random bytes instead.
+ */
+static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE];
+
+static void blk_crypto_evict_keyslot(unsigned int slot)
+{
+ struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot];
+ enum blk_crypto_mode_num crypto_mode = slotp->crypto_mode;
+ int err;
+
+ WARN_ON(slotp->crypto_mode == BLK_ENCRYPTION_MODE_INVALID);
+
+ /* Clear the key in the skcipher */
+ err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], blank_key,
+ blk_crypto_modes[crypto_mode].keysize);
+ WARN_ON(err);
+ slotp->crypto_mode = BLK_ENCRYPTION_MODE_INVALID;
+}
+
+static int blk_crypto_keyslot_program(struct blk_keyslot_manager *ksm,
+ const struct blk_crypto_key *key,
+ unsigned int slot)
+{
+ struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot];
+ const enum blk_crypto_mode_num crypto_mode =
+ key->crypto_cfg.crypto_mode;
+ int err;
+
+ if (crypto_mode != slotp->crypto_mode &&
+ slotp->crypto_mode != BLK_ENCRYPTION_MODE_INVALID)
+ blk_crypto_evict_keyslot(slot);
+
+ slotp->crypto_mode = crypto_mode;
+ err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw,
+ key->size);
+ if (err) {
+ blk_crypto_evict_keyslot(slot);
+ return err;
+ }
+ return 0;
+}
+
+static int blk_crypto_keyslot_evict(struct blk_keyslot_manager *ksm,
+ const struct blk_crypto_key *key,
+ unsigned int slot)
+{
+ blk_crypto_evict_keyslot(slot);
+ return 0;
+}
+
+/*
+ * The crypto API fallback KSM ops - only used for a bio when it specifies a
+ * blk_crypto_key that was not supported by the device's inline encryption
+ * hardware.
+ */
+static const struct blk_ksm_ll_ops blk_crypto_ksm_ll_ops = {
+ .keyslot_program = blk_crypto_keyslot_program,
+ .keyslot_evict = blk_crypto_keyslot_evict,
+};
+
+static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio)
+{
+ struct bio *src_bio = enc_bio->bi_private;
+ int i;
+
+ for (i = 0; i < enc_bio->bi_vcnt; i++)
+ mempool_free(enc_bio->bi_io_vec[i].bv_page,
+ blk_crypto_bounce_page_pool);
+
+ src_bio->bi_status = enc_bio->bi_status;
+
+ bio_put(enc_bio);
+ bio_endio(src_bio);
+}
+
+static struct bio *blk_crypto_clone_bio(struct bio *bio_src)
+{
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ struct bio *bio;
+
+ bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src), NULL);
+ if (!bio)
+ return NULL;
+ bio->bi_disk = bio_src->bi_disk;
+ bio->bi_opf = bio_src->bi_opf;
+ bio->bi_ioprio = bio_src->bi_ioprio;
+ bio->bi_write_hint = bio_src->bi_write_hint;
+ bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
+ bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
+
+ bio_for_each_segment(bv, bio_src, iter)
+ bio->bi_io_vec[bio->bi_vcnt++] = bv;
+
+ bio_clone_blkg_association(bio, bio_src);
+ blkcg_bio_issue_init(bio);
+
+ return bio;
+}
+
+static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot,
+ struct skcipher_request **ciph_req_ret,
+ struct crypto_wait *wait)
+{
+ struct skcipher_request *ciph_req;
+ const struct blk_crypto_keyslot *slotp;
+ int keyslot_idx = blk_ksm_get_slot_idx(slot);
+
+ slotp = &blk_crypto_keyslots[keyslot_idx];
+ ciph_req = skcipher_request_alloc(slotp->tfms[slotp->crypto_mode],
+ GFP_NOIO);
+ if (!ciph_req)
+ return false;
+
+ skcipher_request_set_callback(ciph_req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG |
+ CRYPTO_TFM_REQ_MAY_SLEEP,
+ crypto_req_done, wait);
+ *ciph_req_ret = ciph_req;
+
+ return true;
+}
+
+static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr)
+{
+ struct bio *bio = *bio_ptr;
+ unsigned int i = 0;
+ unsigned int num_sectors = 0;
+ struct bio_vec bv;
+ struct bvec_iter iter;
+
+ bio_for_each_segment(bv, bio, iter) {
+ num_sectors += bv.bv_len >> SECTOR_SHIFT;
+ if (++i == BIO_MAX_PAGES)
+ break;
+ }
+ if (num_sectors < bio_sectors(bio)) {
+ struct bio *split_bio;
+
+ split_bio = bio_split(bio, num_sectors, GFP_NOIO, NULL);
+ if (!split_bio) {
+ bio->bi_status = BLK_STS_RESOURCE;
+ return false;
+ }
+ bio_chain(split_bio, bio);
+ generic_make_request(bio);
+ *bio_ptr = split_bio;
+ }
+
+ return true;
+}
+
+union blk_crypto_iv {
+ __le64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
+ u8 bytes[BLK_CRYPTO_MAX_IV_SIZE];
+};
+
+static void blk_crypto_dun_to_iv(const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
+ union blk_crypto_iv *iv)
+{
+ int i;
+
+ for (i = 0; i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++)
+ iv->dun[i] = cpu_to_le64(dun[i]);
+}
+
+/*
+ * The crypto API fallback's encryption routine.
+ * Allocate a bounce bio for encryption, encrypt the input bio using crypto API,
+ * and replace *bio_ptr with the bounce bio. May split input bio if it's too
+ * large. Returns true on success. Returns false and sets bio->bi_status on
+ * error.
+ */
+static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
+{
+ struct bio *src_bio, *enc_bio;
+ struct bio_crypt_ctx *bc;
+ struct blk_ksm_keyslot *slot;
+ int data_unit_size;
+ struct skcipher_request *ciph_req = NULL;
+ DECLARE_CRYPTO_WAIT(wait);
+ u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
+ struct scatterlist src, dst;
+ union blk_crypto_iv iv;
+ unsigned int i, j;
+ bool ret = false;
+ blk_status_t blk_st;
+
+ /* Split the bio if it's too big for single page bvec */
+ if (!blk_crypto_split_bio_if_needed(bio_ptr))
+ return false;
+
+ src_bio = *bio_ptr;
+ bc = src_bio->bi_crypt_context;
+ data_unit_size = bc->bc_key->crypto_cfg.data_unit_size;
+
+ /* Allocate bounce bio for encryption */
+ enc_bio = blk_crypto_clone_bio(src_bio);
+ if (!enc_bio) {
+ src_bio->bi_status = BLK_STS_RESOURCE;
+ return false;
+ }
+
+ /*
+ * Use the crypto API fallback keyslot manager to get a crypto_skcipher
+ * for the algorithm and key specified for this bio.
+ */
+ blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot);
+ if (blk_st != BLK_STS_OK) {
+ src_bio->bi_status = blk_st;
+ goto out_put_enc_bio;
+ }
+
+ /* and then allocate an skcipher_request for it */
+ if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) {
+ src_bio->bi_status = BLK_STS_RESOURCE;
+ goto out_release_keyslot;
+ }
+
+ memcpy(curr_dun, bc->bc_dun, sizeof(curr_dun));
+ sg_init_table(&src, 1);
+ sg_init_table(&dst, 1);
+
+ skcipher_request_set_crypt(ciph_req, &src, &dst, data_unit_size,
+ iv.bytes);
+
+ /* Encrypt each page in the bounce bio */
+ for (i = 0; i < enc_bio->bi_vcnt; i++) {
+ struct bio_vec *enc_bvec = &enc_bio->bi_io_vec[i];
+ struct page *plaintext_page = enc_bvec->bv_page;
+ struct page *ciphertext_page =
+ mempool_alloc(blk_crypto_bounce_page_pool, GFP_NOIO);
+
+ enc_bvec->bv_page = ciphertext_page;
+
+ if (!ciphertext_page) {
+ src_bio->bi_status = BLK_STS_RESOURCE;
+ goto out_free_bounce_pages;
+ }
+
+ sg_set_page(&src, plaintext_page, data_unit_size,
+ enc_bvec->bv_offset);
+ sg_set_page(&dst, ciphertext_page, data_unit_size,
+ enc_bvec->bv_offset);
+
+ /* Encrypt each data unit in this page */
+ for (j = 0; j < enc_bvec->bv_len; j += data_unit_size) {
+ blk_crypto_dun_to_iv(curr_dun, &iv);
+ if (crypto_wait_req(crypto_skcipher_encrypt(ciph_req),
+ &wait)) {
+ i++;
+ src_bio->bi_status = BLK_STS_IOERR;
+ goto out_free_bounce_pages;
+ }
+ bio_crypt_dun_increment(curr_dun, 1);
+ src.offset += data_unit_size;
+ dst.offset += data_unit_size;
+ }
+ }
+
+ enc_bio->bi_private = src_bio;
+ enc_bio->bi_end_io = blk_crypto_fallback_encrypt_endio;
+ *bio_ptr = enc_bio;
+ ret = true;
+
+ enc_bio = NULL;
+ goto out_free_ciph_req;
+
+out_free_bounce_pages:
+ while (i > 0)
+ mempool_free(enc_bio->bi_io_vec[--i].bv_page,
+ blk_crypto_bounce_page_pool);
+out_free_ciph_req:
+ skcipher_request_free(ciph_req);
+out_release_keyslot:
+ blk_ksm_put_slot(slot);
+out_put_enc_bio:
+ if (enc_bio)
+ bio_put(enc_bio);
+
+ return ret;
+}
+
+/*
+ * The crypto API fallback's main decryption routine.
+ * Decrypts input bio in place, and calls bio_endio on the bio.
+ */
+static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
+{
+ struct bio_fallback_crypt_ctx *f_ctx =
+ container_of(work, struct bio_fallback_crypt_ctx, work);
+ struct bio *bio = f_ctx->bio;
+ struct bio_crypt_ctx *bc = &f_ctx->crypt_ctx;
+ struct blk_ksm_keyslot *slot;
+ struct skcipher_request *ciph_req = NULL;
+ DECLARE_CRYPTO_WAIT(wait);
+ u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
+ union blk_crypto_iv iv;
+ struct scatterlist sg;
+ struct bio_vec bv;
+ struct bvec_iter iter;
+ const int data_unit_size = bc->bc_key->crypto_cfg.data_unit_size;
+ unsigned int i;
+ blk_status_t blk_st;
+
+ /*
+ * Use the crypto API fallback keyslot manager to get a crypto_skcipher
+ * for the algorithm and key specified for this bio.
+ */
+ blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot);
+ if (blk_st != BLK_STS_OK) {
+ bio->bi_status = blk_st;
+ goto out_no_keyslot;
+ }
+
+ /* and then allocate an skcipher_request for it */
+ if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) {
+ bio->bi_status = BLK_STS_RESOURCE;
+ goto out;
+ }
+
+ memcpy(curr_dun, bc->bc_dun, sizeof(curr_dun));
+ sg_init_table(&sg, 1);
+ skcipher_request_set_crypt(ciph_req, &sg, &sg, data_unit_size,
+ iv.bytes);
+
+ /* Decrypt each segment in the bio */
+ __bio_for_each_segment(bv, bio, iter, f_ctx->crypt_iter) {
+ struct page *page = bv.bv_page;
+
+ sg_set_page(&sg, page, data_unit_size, bv.bv_offset);
+
+ /* Decrypt each data unit in the segment */
+ for (i = 0; i < bv.bv_len; i += data_unit_size) {
+ blk_crypto_dun_to_iv(curr_dun, &iv);
+ if (crypto_wait_req(crypto_skcipher_decrypt(ciph_req),
+ &wait)) {
+ bio->bi_status = BLK_STS_IOERR;
+ goto out;
+ }
+ bio_crypt_dun_increment(curr_dun, 1);
+ sg.offset += data_unit_size;
+ }
+ }
+
+out:
+ skcipher_request_free(ciph_req);
+ blk_ksm_put_slot(slot);
+out_no_keyslot:
+ mempool_free(f_ctx, bio_fallback_crypt_ctx_pool);
+ bio_endio(bio);
+}
+
+/**
+ * blk_crypto_fallback_decrypt_endio - queue bio for fallback decryption
+ *
+ * @bio: the bio to queue
+ *
+ * Restore bi_private and bi_end_io, and queue the bio for decryption into a
+ * workqueue, since this function will be called from an atomic context.
+ */
+static void blk_crypto_fallback_decrypt_endio(struct bio *bio)
+{
+ struct bio_fallback_crypt_ctx *f_ctx = bio->bi_private;
+
+ bio->bi_private = f_ctx->bi_private_orig;
+ bio->bi_end_io = f_ctx->bi_end_io_orig;
+
+ /* If there was an IO error, don't queue for decrypt. */
+ if (bio->bi_status) {
+ mempool_free(f_ctx, bio_fallback_crypt_ctx_pool);
+ bio_endio(bio);
+ return;
+ }
+
+ INIT_WORK(&f_ctx->work, blk_crypto_fallback_decrypt_bio);
+ f_ctx->bio = bio;
+ queue_work(blk_crypto_wq, &f_ctx->work);
+}
+
+/**
+ * blk_crypto_fallback_bio_prep - Prepare a bio to use fallback en/decryption
+ *
+ * @bio_ptr: pointer to the bio to prepare
+ *
+ * If bio is doing a WRITE operation, this splits the bio into two parts if it's
+ * too big (see blk_crypto_split_bio_if_needed). It then allocates a bounce bio
+ * for the first part, encrypts it, and update bio_ptr to point to the bounce
+ * bio.
+ *
+ * For a READ operation, we mark the bio for decryption by using bi_private and
+ * bi_end_io.
+ *
+ * In either case, this function will make the bio look like a regular bio (i.e.
+ * as if no encryption context was ever specified) for the purposes of the rest
+ * of the stack except for blk-integrity (blk-integrity and blk-crypto are not
+ * currently supported together).
+ *
+ * Return: true on success. Sets bio->bi_status and returns false on error.
+ */
+bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr)
+{
+ struct bio *bio = *bio_ptr;
+ struct bio_crypt_ctx *bc = bio->bi_crypt_context;
+ struct bio_fallback_crypt_ctx *f_ctx;
+
+ if (WARN_ON_ONCE(!tfms_inited[bc->bc_key->crypto_cfg.crypto_mode])) {
+ /* User didn't call blk_crypto_start_using_key() first */
+ bio->bi_status = BLK_STS_IOERR;
+ return false;
+ }
+
+ if (!blk_ksm_crypto_cfg_supported(&blk_crypto_ksm,
+ &bc->bc_key->crypto_cfg)) {
+ bio->bi_status = BLK_STS_NOTSUPP;
+ return false;
+ }
+
+ if (bio_data_dir(bio) == WRITE)
+ return blk_crypto_fallback_encrypt_bio(bio_ptr);
+
+ /*
+ * bio READ case: Set up a f_ctx in the bio's bi_private and set the
+ * bi_end_io appropriately to trigger decryption when the bio is ended.
+ */
+ f_ctx = mempool_alloc(bio_fallback_crypt_ctx_pool, GFP_NOIO);
+ f_ctx->crypt_ctx = *bc;
+ f_ctx->crypt_iter = bio->bi_iter;
+ f_ctx->bi_private_orig = bio->bi_private;
+ f_ctx->bi_end_io_orig = bio->bi_end_io;
+ bio->bi_private = (void *)f_ctx;
+ bio->bi_end_io = blk_crypto_fallback_decrypt_endio;
+ bio_crypt_free_ctx(bio);
+
+ return true;
+}
+
+int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key)
+{
+ return blk_ksm_evict_key(&blk_crypto_ksm, key);
+}
+
+static bool blk_crypto_fallback_inited;
+static int blk_crypto_fallback_init(void)
+{
+ int i;
+ int err;
+
+ if (blk_crypto_fallback_inited)
+ return 0;
+
+ prandom_bytes(blank_key, BLK_CRYPTO_MAX_KEY_SIZE);
+
+ err = blk_ksm_init(&blk_crypto_ksm, blk_crypto_num_keyslots);
+ if (err)
+ goto out;
+ err = -ENOMEM;
+
+ blk_crypto_ksm.ksm_ll_ops = blk_crypto_ksm_ll_ops;
+ blk_crypto_ksm.max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE;
+
+ /* All blk-crypto modes have a crypto API fallback. */
+ for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++)
+ blk_crypto_ksm.crypto_modes_supported[i] = 0xFFFFFFFF;
+ blk_crypto_ksm.crypto_modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0;
+
+ blk_crypto_wq = alloc_workqueue("blk_crypto_wq",
+ WQ_UNBOUND | WQ_HIGHPRI |
+ WQ_MEM_RECLAIM, num_online_cpus());
+ if (!blk_crypto_wq)
+ goto fail_free_ksm;
+
+ blk_crypto_keyslots = kcalloc(blk_crypto_num_keyslots,
+ sizeof(blk_crypto_keyslots[0]),
+ GFP_KERNEL);
+ if (!blk_crypto_keyslots)
+ goto fail_free_wq;
+
+ blk_crypto_bounce_page_pool =
+ mempool_create_page_pool(num_prealloc_bounce_pg, 0);
+ if (!blk_crypto_bounce_page_pool)
+ goto fail_free_keyslots;
+
+ bio_fallback_crypt_ctx_cache = KMEM_CACHE(bio_fallback_crypt_ctx, 0);
+ if (!bio_fallback_crypt_ctx_cache)
+ goto fail_free_bounce_page_pool;
+
+ bio_fallback_crypt_ctx_pool =
+ mempool_create_slab_pool(num_prealloc_fallback_crypt_ctxs,
+ bio_fallback_crypt_ctx_cache);
+ if (!bio_fallback_crypt_ctx_pool)
+ goto fail_free_crypt_ctx_cache;
+
+ blk_crypto_fallback_inited = true;
+
+ return 0;
+fail_free_crypt_ctx_cache:
+ kmem_cache_destroy(bio_fallback_crypt_ctx_cache);
+fail_free_bounce_page_pool:
+ mempool_destroy(blk_crypto_bounce_page_pool);
+fail_free_keyslots:
+ kfree(blk_crypto_keyslots);
+fail_free_wq:
+ destroy_workqueue(blk_crypto_wq);
+fail_free_ksm:
+ blk_ksm_destroy(&blk_crypto_ksm);
+out:
+ return err;
+}
+
+/*
+ * Prepare blk-crypto-fallback for the specified crypto mode.
+ * Returns -ENOPKG if the needed crypto API support is missing.
+ */
+int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num)
+{
+ const char *cipher_str = blk_crypto_modes[mode_num].cipher_str;
+ struct blk_crypto_keyslot *slotp;
+ unsigned int i;
+ int err = 0;
+
+ /*
+ * Fast path
+ * Ensure that updates to blk_crypto_keyslots[i].tfms[mode_num]
+ * for each i are visible before we try to access them.
+ */
+ if (likely(smp_load_acquire(&tfms_inited[mode_num])))
+ return 0;
+
+ mutex_lock(&tfms_init_lock);
+ if (tfms_inited[mode_num])
+ goto out;
+
+ err = blk_crypto_fallback_init();
+ if (err)
+ goto out;
+
+ for (i = 0; i < blk_crypto_num_keyslots; i++) {
+ slotp = &blk_crypto_keyslots[i];
+ slotp->tfms[mode_num] = crypto_alloc_skcipher(cipher_str, 0, 0);
+ if (IS_ERR(slotp->tfms[mode_num])) {
+ err = PTR_ERR(slotp->tfms[mode_num]);
+ if (err == -ENOENT) {
+ pr_warn_once("Missing crypto API support for \"%s\"\n",
+ cipher_str);
+ err = -ENOPKG;
+ }
+ slotp->tfms[mode_num] = NULL;
+ goto out_free_tfms;
+ }
+
+ crypto_skcipher_set_flags(slotp->tfms[mode_num],
+ CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
+ }
+
+ /*
+ * Ensure that updates to blk_crypto_keyslots[i].tfms[mode_num]
+ * for each i are visible before we set tfms_inited[mode_num].
+ */
+ smp_store_release(&tfms_inited[mode_num], true);
+ goto out;
+
+out_free_tfms:
+ for (i = 0; i < blk_crypto_num_keyslots; i++) {
+ slotp = &blk_crypto_keyslots[i];
+ crypto_free_skcipher(slotp->tfms[mode_num]);
+ slotp->tfms[mode_num] = NULL;
+ }
+out:
+ mutex_unlock(&tfms_init_lock);
+ return err;
+}
diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h
new file mode 100644
index 000000000000..d2b0f565d83c
--- /dev/null
+++ b/block/blk-crypto-internal.h
@@ -0,0 +1,201 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2019 Google LLC
+ */
+
+#ifndef __LINUX_BLK_CRYPTO_INTERNAL_H
+#define __LINUX_BLK_CRYPTO_INTERNAL_H
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+
+/* Represents a crypto mode supported by blk-crypto */
+struct blk_crypto_mode {
+ const char *cipher_str; /* crypto API name (for fallback case) */
+ unsigned int keysize; /* key size in bytes */
+ unsigned int ivsize; /* iv size in bytes */
+};
+
+extern const struct blk_crypto_mode blk_crypto_modes[];
+
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+
+void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
+ unsigned int inc);
+
+bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio);
+
+bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes,
+ struct bio_crypt_ctx *bc2);
+
+static inline bool bio_crypt_ctx_back_mergeable(struct request *req,
+ struct bio *bio)
+{
+ return bio_crypt_ctx_mergeable(req->crypt_ctx, blk_rq_bytes(req),
+ bio->bi_crypt_context);
+}
+
+static inline bool bio_crypt_ctx_front_mergeable(struct request *req,
+ struct bio *bio)
+{
+ return bio_crypt_ctx_mergeable(bio->bi_crypt_context,
+ bio->bi_iter.bi_size, req->crypt_ctx);
+}
+
+static inline bool bio_crypt_ctx_merge_rq(struct request *req,
+ struct request *next)
+{
+ return bio_crypt_ctx_mergeable(req->crypt_ctx, blk_rq_bytes(req),
+ next->crypt_ctx);
+}
+
+static inline void blk_crypto_rq_set_defaults(struct request *rq)
+{
+ rq->crypt_ctx = NULL;
+ rq->crypt_keyslot = NULL;
+}
+
+static inline bool blk_crypto_rq_is_encrypted(struct request *rq)
+{
+ return rq->crypt_ctx;
+}
+
+#else /* CONFIG_BLK_INLINE_ENCRYPTION */
+
+static inline bool bio_crypt_rq_ctx_compatible(struct request *rq,
+ struct bio *bio)
+{
+ return true;
+}
+
+static inline bool bio_crypt_ctx_front_mergeable(struct request *req,
+ struct bio *bio)
+{
+ return true;
+}
+
+static inline bool bio_crypt_ctx_back_mergeable(struct request *req,
+ struct bio *bio)
+{
+ return true;
+}
+
+static inline bool bio_crypt_ctx_merge_rq(struct request *req,
+ struct request *next)
+{
+ return true;
+}
+
+static inline void blk_crypto_rq_set_defaults(struct request *rq) { }
+
+static inline bool blk_crypto_rq_is_encrypted(struct request *rq)
+{
+ return false;
+}
+
+#endif /* CONFIG_BLK_INLINE_ENCRYPTION */
+
+void __bio_crypt_advance(struct bio *bio, unsigned int bytes);
+static inline void bio_crypt_advance(struct bio *bio, unsigned int bytes)
+{
+ if (bio_has_crypt_ctx(bio))
+ __bio_crypt_advance(bio, bytes);
+}
+
+void __bio_crypt_free_ctx(struct bio *bio);
+static inline void bio_crypt_free_ctx(struct bio *bio)
+{
+ if (bio_has_crypt_ctx(bio))
+ __bio_crypt_free_ctx(bio);
+}
+
+static inline void bio_crypt_do_front_merge(struct request *rq,
+ struct bio *bio)
+{
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+ if (bio_has_crypt_ctx(bio))
+ memcpy(rq->crypt_ctx->bc_dun, bio->bi_crypt_context->bc_dun,
+ sizeof(rq->crypt_ctx->bc_dun));
+#endif
+}
+
+bool __blk_crypto_bio_prep(struct bio **bio_ptr);
+static inline bool blk_crypto_bio_prep(struct bio **bio_ptr)
+{
+ if (bio_has_crypt_ctx(*bio_ptr))
+ return __blk_crypto_bio_prep(bio_ptr);
+ return true;
+}
+
+blk_status_t __blk_crypto_init_request(struct request *rq);
+static inline blk_status_t blk_crypto_init_request(struct request *rq)
+{
+ if (blk_crypto_rq_is_encrypted(rq))
+ return __blk_crypto_init_request(rq);
+ return BLK_STS_OK;
+}
+
+void __blk_crypto_free_request(struct request *rq);
+static inline void blk_crypto_free_request(struct request *rq)
+{
+ if (blk_crypto_rq_is_encrypted(rq))
+ __blk_crypto_free_request(rq);
+}
+
+void __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio,
+ gfp_t gfp_mask);
+static inline void blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio,
+ gfp_t gfp_mask)
+{
+ if (bio_has_crypt_ctx(bio))
+ __blk_crypto_rq_bio_prep(rq, bio, gfp_mask);
+}
+
+/**
+ * blk_crypto_insert_cloned_request - Prepare a cloned request to be inserted
+ * into a request queue.
+ * @rq: the request being queued
+ *
+ * Return: BLK_STS_OK on success, nonzero on error.
+ */
+static inline blk_status_t blk_crypto_insert_cloned_request(struct request *rq)
+{
+
+ if (blk_crypto_rq_is_encrypted(rq))
+ return blk_crypto_init_request(rq);
+ return BLK_STS_OK;
+}
+
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK
+
+int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num);
+
+bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr);
+
+int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key);
+
+#else /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */
+
+static inline int
+blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num)
+{
+ pr_warn_once("crypto API fallback is disabled\n");
+ return -ENOPKG;
+}
+
+static inline bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr)
+{
+ pr_warn_once("crypto API fallback disabled; failing request.\n");
+ (*bio_ptr)->bi_status = BLK_STS_NOTSUPP;
+ return false;
+}
+
+static inline int
+blk_crypto_fallback_evict_key(const struct blk_crypto_key *key)
+{
+ return 0;
+}
+
+#endif /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */
+
+#endif /* __LINUX_BLK_CRYPTO_INTERNAL_H */
diff --git a/block/blk-crypto.c b/block/blk-crypto.c
new file mode 100644
index 000000000000..6533c9b36ab8
--- /dev/null
+++ b/block/blk-crypto.c
@@ -0,0 +1,404 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 Google LLC
+ */
+
+/*
+ * Refer to Documentation/block/inline-encryption.rst for detailed explanation.
+ */
+
+#define pr_fmt(fmt) "blk-crypto: " fmt
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/keyslot-manager.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include "blk-crypto-internal.h"
+
+const struct blk_crypto_mode blk_crypto_modes[] = {
+ [BLK_ENCRYPTION_MODE_AES_256_XTS] = {
+ .cipher_str = "xts(aes)",
+ .keysize = 64,
+ .ivsize = 16,
+ },
+ [BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = {
+ .cipher_str = "essiv(cbc(aes),sha256)",
+ .keysize = 16,
+ .ivsize = 16,
+ },
+ [BLK_ENCRYPTION_MODE_ADIANTUM] = {
+ .cipher_str = "adiantum(xchacha12,aes)",
+ .keysize = 32,
+ .ivsize = 32,
+ },
+};
+
+/*
+ * This number needs to be at least (the number of threads doing IO
+ * concurrently) * (maximum recursive depth of a bio), so that we don't
+ * deadlock on crypt_ctx allocations. The default is chosen to be the same
+ * as the default number of post read contexts in both EXT4 and F2FS.
+ */
+static int num_prealloc_crypt_ctxs = 128;
+
+module_param(num_prealloc_crypt_ctxs, int, 0444);
+MODULE_PARM_DESC(num_prealloc_crypt_ctxs,
+ "Number of bio crypto contexts to preallocate");
+
+static struct kmem_cache *bio_crypt_ctx_cache;
+static mempool_t *bio_crypt_ctx_pool;
+
+static int __init bio_crypt_ctx_init(void)
+{
+ size_t i;
+
+ bio_crypt_ctx_cache = KMEM_CACHE(bio_crypt_ctx, 0);
+ if (!bio_crypt_ctx_cache)
+ goto out_no_mem;
+
+ bio_crypt_ctx_pool = mempool_create_slab_pool(num_prealloc_crypt_ctxs,
+ bio_crypt_ctx_cache);
+ if (!bio_crypt_ctx_pool)
+ goto out_no_mem;
+
+ /* This is assumed in various places. */
+ BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0);
+
+ /* Sanity check that no algorithm exceeds the defined limits. */
+ for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) {
+ BUG_ON(blk_crypto_modes[i].keysize > BLK_CRYPTO_MAX_KEY_SIZE);
+ BUG_ON(blk_crypto_modes[i].ivsize > BLK_CRYPTO_MAX_IV_SIZE);
+ }
+
+ return 0;
+out_no_mem:
+ panic("Failed to allocate mem for bio crypt ctxs\n");
+}
+subsys_initcall(bio_crypt_ctx_init);
+
+void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key,
+ const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], gfp_t gfp_mask)
+{
+ struct bio_crypt_ctx *bc = mempool_alloc(bio_crypt_ctx_pool, gfp_mask);
+
+ bc->bc_key = key;
+ memcpy(bc->bc_dun, dun, sizeof(bc->bc_dun));
+
+ bio->bi_crypt_context = bc;
+}
+
+void __bio_crypt_free_ctx(struct bio *bio)
+{
+ mempool_free(bio->bi_crypt_context, bio_crypt_ctx_pool);
+ bio->bi_crypt_context = NULL;
+}
+
+void __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask)
+{
+ dst->bi_crypt_context = mempool_alloc(bio_crypt_ctx_pool, gfp_mask);
+ *dst->bi_crypt_context = *src->bi_crypt_context;
+}
+EXPORT_SYMBOL_GPL(__bio_crypt_clone);
+
+/* Increments @dun by @inc, treating @dun as a multi-limb integer. */
+void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
+ unsigned int inc)
+{
+ int i;
+
+ for (i = 0; inc && i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++) {
+ dun[i] += inc;
+ /*
+ * If the addition in this limb overflowed, then we need to
+ * carry 1 into the next limb. Else the carry is 0.
+ */
+ if (dun[i] < inc)
+ inc = 1;
+ else
+ inc = 0;
+ }
+}
+
+void __bio_crypt_advance(struct bio *bio, unsigned int bytes)
+{
+ struct bio_crypt_ctx *bc = bio->bi_crypt_context;
+
+ bio_crypt_dun_increment(bc->bc_dun,
+ bytes >> bc->bc_key->data_unit_size_bits);
+}
+
+/*
+ * Returns true if @bc->bc_dun plus @bytes converted to data units is equal to
+ * @next_dun, treating the DUNs as multi-limb integers.
+ */
+bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc,
+ unsigned int bytes,
+ const u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE])
+{
+ int i;
+ unsigned int carry = bytes >> bc->bc_key->data_unit_size_bits;
+
+ for (i = 0; i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++) {
+ if (bc->bc_dun[i] + carry != next_dun[i])
+ return false;
+ /*
+ * If the addition in this limb overflowed, then we need to
+ * carry 1 into the next limb. Else the carry is 0.
+ */
+ if ((bc->bc_dun[i] + carry) < carry)
+ carry = 1;
+ else
+ carry = 0;
+ }
+
+ /* If the DUN wrapped through 0, don't treat it as contiguous. */
+ return carry == 0;
+}
+
+/*
+ * Checks that two bio crypt contexts are compatible - i.e. that
+ * they are mergeable except for data_unit_num continuity.
+ */
+static bool bio_crypt_ctx_compatible(struct bio_crypt_ctx *bc1,
+ struct bio_crypt_ctx *bc2)
+{
+ if (!bc1)
+ return !bc2;
+
+ return bc2 && bc1->bc_key == bc2->bc_key;
+}
+
+bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio)
+{
+ return bio_crypt_ctx_compatible(rq->crypt_ctx, bio->bi_crypt_context);
+}
+
+/*
+ * Checks that two bio crypt contexts are compatible, and also
+ * that their data_unit_nums are continuous (and can hence be merged)
+ * in the order @bc1 followed by @bc2.
+ */
+bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes,
+ struct bio_crypt_ctx *bc2)
+{
+ if (!bio_crypt_ctx_compatible(bc1, bc2))
+ return false;
+
+ return !bc1 || bio_crypt_dun_is_contiguous(bc1, bc1_bytes, bc2->bc_dun);
+}
+
+/* Check that all I/O segments are data unit aligned. */
+static bool bio_crypt_check_alignment(struct bio *bio)
+{
+ const unsigned int data_unit_size =
+ bio->bi_crypt_context->bc_key->crypto_cfg.data_unit_size;
+ struct bvec_iter iter;
+ struct bio_vec bv;
+
+ bio_for_each_segment(bv, bio, iter) {
+ if (!IS_ALIGNED(bv.bv_len | bv.bv_offset, data_unit_size))
+ return false;
+ }
+
+ return true;
+}
+
+blk_status_t __blk_crypto_init_request(struct request *rq)
+{
+ return blk_ksm_get_slot_for_key(rq->q->ksm, rq->crypt_ctx->bc_key,
+ &rq->crypt_keyslot);
+}
+
+/**
+ * __blk_crypto_free_request - Uninitialize the crypto fields of a request.
+ *
+ * @rq: The request whose crypto fields to uninitialize.
+ *
+ * Completely uninitializes the crypto fields of a request. If a keyslot has
+ * been programmed into some inline encryption hardware, that keyslot is
+ * released. The rq->crypt_ctx is also freed.
+ */
+void __blk_crypto_free_request(struct request *rq)
+{
+ blk_ksm_put_slot(rq->crypt_keyslot);
+ mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool);
+ blk_crypto_rq_set_defaults(rq);
+}
+
+/**
+ * __blk_crypto_bio_prep - Prepare bio for inline encryption
+ *
+ * @bio_ptr: pointer to original bio pointer
+ *
+ * If the bio crypt context provided for the bio is supported by the underlying
+ * device's inline encryption hardware, do nothing.
+ *
+ * Otherwise, try to perform en/decryption for this bio by falling back to the
+ * kernel crypto API. When the crypto API fallback is used for encryption,
+ * blk-crypto may choose to split the bio into 2 - the first one that will
+ * continue to be processed and the second one that will be resubmitted via
+ * generic_make_request. A bounce bio will be allocated to encrypt the contents
+ * of the aforementioned "first one", and *bio_ptr will be updated to this
+ * bounce bio.
+ *
+ * Caller must ensure bio has bio_crypt_ctx.
+ *
+ * Return: true on success; false on error (and bio->bi_status will be set
+ * appropriately, and bio_endio() will have been called so bio
+ * submission should abort).
+ */
+bool __blk_crypto_bio_prep(struct bio **bio_ptr)
+{
+ struct bio *bio = *bio_ptr;
+ const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key;
+
+ /* Error if bio has no data. */
+ if (WARN_ON_ONCE(!bio_has_data(bio))) {
+ bio->bi_status = BLK_STS_IOERR;
+ goto fail;
+ }
+
+ if (!bio_crypt_check_alignment(bio)) {
+ bio->bi_status = BLK_STS_IOERR;
+ goto fail;
+ }
+
+ /*
+ * Success if device supports the encryption context, or if we succeeded
+ * in falling back to the crypto API.
+ */
+ if (blk_ksm_crypto_cfg_supported(bio->bi_disk->queue->ksm,
+ &bc_key->crypto_cfg))
+ return true;
+
+ if (blk_crypto_fallback_bio_prep(bio_ptr))
+ return true;
+fail:
+ bio_endio(*bio_ptr);
+ return false;
+}
+
+/**
+ * __blk_crypto_rq_bio_prep - Prepare a request's crypt_ctx when its first bio
+ * is inserted
+ *
+ * @rq: The request to prepare
+ * @bio: The first bio being inserted into the request
+ * @gfp_mask: gfp mask
+ */
+void __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio,
+ gfp_t gfp_mask)
+{
+ if (!rq->crypt_ctx)
+ rq->crypt_ctx = mempool_alloc(bio_crypt_ctx_pool, gfp_mask);
+ *rq->crypt_ctx = *bio->bi_crypt_context;
+}
+
+/**
+ * blk_crypto_init_key() - Prepare a key for use with blk-crypto
+ * @blk_key: Pointer to the blk_crypto_key to initialize.
+ * @raw_key: Pointer to the raw key. Must be the correct length for the chosen
+ * @crypto_mode; see blk_crypto_modes[].
+ * @crypto_mode: identifier for the encryption algorithm to use
+ * @dun_bytes: number of bytes that will be used to specify the DUN when this
+ * key is used
+ * @data_unit_size: the data unit size to use for en/decryption
+ *
+ * Return: 0 on success, -errno on failure. The caller is responsible for
+ * zeroizing both blk_key and raw_key when done with them.
+ */
+int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
+ enum blk_crypto_mode_num crypto_mode,
+ unsigned int dun_bytes,
+ unsigned int data_unit_size)
+{
+ const struct blk_crypto_mode *mode;
+
+ memset(blk_key, 0, sizeof(*blk_key));
+
+ if (crypto_mode >= ARRAY_SIZE(blk_crypto_modes))
+ return -EINVAL;
+
+ mode = &blk_crypto_modes[crypto_mode];
+ if (mode->keysize == 0)
+ return -EINVAL;
+
+ if (dun_bytes == 0 || dun_bytes > BLK_CRYPTO_MAX_IV_SIZE)
+ return -EINVAL;
+
+ if (!is_power_of_2(data_unit_size))
+ return -EINVAL;
+
+ blk_key->crypto_cfg.crypto_mode = crypto_mode;
+ blk_key->crypto_cfg.dun_bytes = dun_bytes;
+ blk_key->crypto_cfg.data_unit_size = data_unit_size;
+ blk_key->data_unit_size_bits = ilog2(data_unit_size);
+ blk_key->size = mode->keysize;
+ memcpy(blk_key->raw, raw_key, mode->keysize);
+
+ return 0;
+}
+
+/*
+ * Check if bios with @cfg can be en/decrypted by blk-crypto (i.e. either the
+ * request queue it's submitted to supports inline crypto, or the
+ * blk-crypto-fallback is enabled and supports the cfg).
+ */
+bool blk_crypto_config_supported(struct request_queue *q,
+ const struct blk_crypto_config *cfg)
+{
+ return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) ||
+ blk_ksm_crypto_cfg_supported(q->ksm, cfg);
+}
+
+/**
+ * blk_crypto_start_using_key() - Start using a blk_crypto_key on a device
+ * @key: A key to use on the device
+ * @q: the request queue for the device
+ *
+ * Upper layers must call this function to ensure that either the hardware
+ * supports the key's crypto settings, or the crypto API fallback has transforms
+ * for the needed mode allocated and ready to go. This function may allocate
+ * an skcipher, and *should not* be called from the data path, since that might
+ * cause a deadlock
+ *
+ * Return: 0 on success; -ENOPKG if the hardware doesn't support the key and
+ * blk-crypto-fallback is either disabled or the needed algorithm
+ * is disabled in the crypto API; or another -errno code.
+ */
+int blk_crypto_start_using_key(const struct blk_crypto_key *key,
+ struct request_queue *q)
+{
+ if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg))
+ return 0;
+ return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode);
+}
+
+/**
+ * blk_crypto_evict_key() - Evict a key from any inline encryption hardware
+ * it may have been programmed into
+ * @q: The request queue who's associated inline encryption hardware this key
+ * might have been programmed into
+ * @key: The key to evict
+ *
+ * Upper layers (filesystems) must call this function to ensure that a key is
+ * evicted from any hardware that it might have been programmed into. The key
+ * must not be in use by any in-flight IO when this function is called.
+ *
+ * Return: 0 on success or if key is not present in the q's ksm, -err on error.
+ */
+int blk_crypto_evict_key(struct request_queue *q,
+ const struct blk_crypto_key *key)
+{
+ if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg))
+ return blk_ksm_evict_key(q->ksm, key);
+
+ /*
+ * If the request queue's associated inline encryption hardware didn't
+ * have support for the key, then the key might have been programmed
+ * into the fallback keyslot manager, so try to evict from there.
+ */
+ return blk_crypto_fallback_evict_key(key);
+}
diff --git a/block/blk-exec.c b/block/blk-exec.c
index e20a852ae432..85324d53d072 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -55,7 +55,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
rq->rq_disk = bd_disk;
rq->end_io = done;
- blk_account_io_start(rq, true);
+ blk_account_io_start(rq);
/*
* don't check dying flag for MQ because the request won't
diff --git a/block/blk-flush.c b/block/blk-flush.c
index c7f396e3d5e2..15ae0155ec07 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -258,7 +258,6 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
blk_flush_complete_seq(rq, fq, seq, error);
}
- fq->flush_queue_delayed = 0;
spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
}
@@ -433,41 +432,20 @@ void blk_insert_flush(struct request *rq)
* blkdev_issue_flush - queue a flush
* @bdev: blockdev to issue flush for
* @gfp_mask: memory allocation flags (for bio_alloc)
- * @error_sector: error sector
*
* Description:
- * Issue a flush for the block device in question. Caller can supply
- * room for storing the error offset in case of a flush error, if they
- * wish to.
+ * Issue a flush for the block device in question.
*/
-int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
- sector_t *error_sector)
+int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask)
{
- struct request_queue *q;
struct bio *bio;
int ret = 0;
- if (bdev->bd_disk == NULL)
- return -ENXIO;
-
- q = bdev_get_queue(bdev);
- if (!q)
- return -ENXIO;
-
bio = bio_alloc(gfp_mask, 0);
bio_set_dev(bio, bdev);
bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
ret = submit_bio_wait(bio);
-
- /*
- * The driver must store the error location in ->bi_sector, if
- * it supports it. For non-stacked drivers, this should be
- * copied from blk_rq_pos(rq).
- */
- if (error_sector)
- *error_sector = bio->bi_iter.bi_sector;
-
bio_put(bio);
return ret;
}
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index ff1070edbb40..c03705cbb9c9 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -409,6 +409,13 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
bi->tag_size = template->tag_size;
disk->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
+
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+ if (disk->queue->ksm) {
+ pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n");
+ blk_ksm_unregister(disk->queue);
+ }
+#endif
}
EXPORT_SYMBOL(blk_integrity_register);
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 7c1fe605d0d6..8ac4aad66ebc 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -260,6 +260,7 @@ enum {
VTIME_PER_SEC_SHIFT = 37,
VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT,
VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC,
+ VTIME_PER_NSEC = VTIME_PER_SEC / NSEC_PER_SEC,
/* bound vrate adjustments within two orders of magnitude */
VRATE_MIN_PPM = 10000, /* 1% */
@@ -1206,14 +1207,14 @@ static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost)
+static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
{
struct ioc *ioc = iocg->ioc;
struct blkcg_gq *blkg = iocg_to_blkg(iocg);
u64 vtime = atomic64_read(&iocg->vtime);
u64 vmargin = ioc->margin_us * now->vrate;
u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
- u64 expires, oexpires;
+ u64 delta_ns, expires, oexpires;
u32 hw_inuse;
lockdep_assert_held(&iocg->waitq.lock);
@@ -1236,15 +1237,10 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost)
return false;
/* use delay */
- if (cost) {
- u64 cost_ns = DIV64_U64_ROUND_UP(cost * NSEC_PER_USEC,
- now->vrate);
- blkcg_add_delay(blkg, now->now_ns, cost_ns);
- }
- blkcg_use_delay(blkg);
-
- expires = now->now_ns + DIV64_U64_ROUND_UP(vtime - now->vnow,
- now->vrate) * NSEC_PER_USEC;
+ delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow,
+ now->vrate) * NSEC_PER_USEC;
+ blkcg_set_delay(blkg, delta_ns);
+ expires = now->now_ns + delta_ns;
/* if already active and close enough, don't bother */
oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
@@ -1265,7 +1261,7 @@ static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
spin_lock_irqsave(&iocg->waitq.lock, flags);
ioc_now(iocg->ioc, &now);
- iocg_kick_delay(iocg, &now, 0);
+ iocg_kick_delay(iocg, &now);
spin_unlock_irqrestore(&iocg->waitq.lock, flags);
return HRTIMER_NORESTART;
@@ -1383,7 +1379,7 @@ static void ioc_timer_fn(struct timer_list *timer)
if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
/* might be oversleeping vtime / hweight changes, kick */
iocg_kick_waitq(iocg, &now);
- iocg_kick_delay(iocg, &now, 0);
+ iocg_kick_delay(iocg, &now);
} else if (iocg_is_idle(iocg)) {
/* no waiter and idle, deactivate */
iocg->last_inuse = iocg->inuse;
@@ -1543,19 +1539,39 @@ skip_surplus_transfers:
if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
missed_ppm[READ] > ppm_rthr ||
missed_ppm[WRITE] > ppm_wthr) {
+ /* clearly missing QoS targets, slow down vrate */
ioc->busy_level = max(ioc->busy_level, 0);
ioc->busy_level++;
} else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
- /* take action iff there is contention */
- if (nr_shortages && !nr_lagging) {
+ /* QoS targets are being met with >25% margin */
+ if (nr_shortages) {
+ /*
+ * We're throttling while the device has spare
+ * capacity. If vrate was being slowed down, stop.
+ */
ioc->busy_level = min(ioc->busy_level, 0);
- /* redistribute surpluses first */
- if (!nr_surpluses)
+
+ /*
+ * If there are IOs spanning multiple periods, wait
+ * them out before pushing the device harder. If
+ * there are surpluses, let redistribution work it
+ * out first.
+ */
+ if (!nr_lagging && !nr_surpluses)
ioc->busy_level--;
+ } else {
+ /*
+ * Nobody is being throttled and the users aren't
+ * issuing enough IOs to saturate the device. We
+ * simply don't know how close the device is to
+ * saturation. Coast.
+ */
+ ioc->busy_level = 0;
}
} else {
+ /* inside the hysterisis margin, we're good */
ioc->busy_level = 0;
}
@@ -1678,6 +1694,31 @@ static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
return cost;
}
+static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
+ u64 *costp)
+{
+ unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
+
+ switch (req_op(rq)) {
+ case REQ_OP_READ:
+ *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
+ break;
+ case REQ_OP_WRITE:
+ *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
+ break;
+ default:
+ *costp = 0;
+ }
+}
+
+static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
+{
+ u64 cost;
+
+ calc_size_vtime_cost_builtin(rq, ioc, &cost);
+ return cost;
+}
+
static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
{
struct blkcg_gq *blkg = bio->bi_blkg;
@@ -1762,7 +1803,7 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
*/
if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
iocg->abs_vdebt += abs_cost;
- if (iocg_kick_delay(iocg, &now, cost))
+ if (iocg_kick_delay(iocg, &now))
blkcg_schedule_throttle(rqos->q,
(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
spin_unlock_irq(&iocg->waitq.lock);
@@ -1850,7 +1891,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
spin_lock_irqsave(&iocg->waitq.lock, flags);
if (likely(!list_empty(&iocg->active_list))) {
iocg->abs_vdebt += abs_cost;
- iocg_kick_delay(iocg, &now, cost);
+ iocg_kick_delay(iocg, &now);
} else {
iocg_commit_bio(iocg, bio, cost);
}
@@ -1868,7 +1909,7 @@ static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
{
struct ioc *ioc = rqos_to_ioc(rqos);
- u64 on_q_ns, rq_wait_ns;
+ u64 on_q_ns, rq_wait_ns, size_nsec;
int pidx, rw;
if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
@@ -1889,8 +1930,10 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
+ size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
- if (on_q_ns <= ioc->params.qos[pidx] * NSEC_PER_USEC)
+ if (on_q_ns <= size_nsec ||
+ on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met);
else
this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed);
@@ -2297,6 +2340,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
spin_lock_irq(&ioc->lock);
if (enable) {
+ blk_stat_enable_accounting(ioc->rqos.q);
blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
ioc->enabled = true;
} else {
diff --git a/block/blk-map.c b/block/blk-map.c
index b72c361911a4..6e804892d5ec 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -257,6 +257,7 @@ out_bmd:
static struct bio *bio_map_user_iov(struct request_queue *q,
struct iov_iter *iter, gfp_t gfp_mask)
{
+ unsigned int max_sectors = queue_max_hw_sectors(q);
int j;
struct bio *bio;
int ret;
@@ -294,8 +295,8 @@ static struct bio *bio_map_user_iov(struct request_queue *q,
if (n > bytes)
n = bytes;
- if (!__bio_add_pc_page(q, bio, page, n, offs,
- &same_page)) {
+ if (!bio_add_hw_page(q, bio, page, n, offs,
+ max_sectors, &same_page)) {
if (same_page)
put_page(page);
break;
@@ -549,6 +550,7 @@ int blk_rq_append_bio(struct request *rq, struct bio **bio)
rq->biotail->bi_next = *bio;
rq->biotail = *bio;
rq->__data_len += (*bio)->bi_iter.bi_size;
+ bio_crypt_free_ctx(*bio);
}
return 0;
@@ -654,8 +656,6 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
bio = rq->bio;
} while (iov_iter_count(&i));
- if (!bio_flagged(bio, BIO_USER_MAPPED))
- rq->rq_flags |= RQF_COPY_USER;
return 0;
unmap_rq:
@@ -731,7 +731,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
{
int reading = rq_data_dir(rq) == READ;
unsigned long addr = (unsigned long) kbuf;
- int do_copy = 0;
struct bio *bio, *orig_bio;
int ret;
@@ -740,8 +739,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
if (!len || !kbuf)
return -EINVAL;
- do_copy = !blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf);
- if (do_copy)
+ if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf))
bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
else
bio = bio_map_kern(q, kbuf, len, gfp_mask);
@@ -752,9 +750,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
bio->bi_opf &= ~REQ_OP_MASK;
bio->bi_opf |= req_op(rq);
- if (do_copy)
- rq->rq_flags |= RQF_COPY_USER;
-
orig_bio = bio;
ret = blk_rq_append_bio(rq, &bio);
if (unlikely(ret)) {
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 1534ed736363..f0b0bae075a0 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -336,16 +336,6 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio,
/* there isn't chance to merge the splitted bio */
split->bi_opf |= REQ_NOMERGE;
- /*
- * Since we're recursing into make_request here, ensure
- * that we mark this bio as already having entered the queue.
- * If not, and the queue is going away, we can get stuck
- * forever on waiting for the queue reference to drop. But
- * that will never happen, as we're already holding a
- * reference to it.
- */
- bio_set_flag(*bio, BIO_QUEUE_ENTERED);
-
bio_chain(split, *bio);
trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
generic_make_request(*bio);
@@ -519,44 +509,20 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
* map a request to scatterlist, return number of sg entries setup. Caller
* must make sure sg can hold rq->nr_phys_segments entries
*/
-int blk_rq_map_sg(struct request_queue *q, struct request *rq,
- struct scatterlist *sglist)
+int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
+ struct scatterlist *sglist, struct scatterlist **last_sg)
{
- struct scatterlist *sg = NULL;
int nsegs = 0;
if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
- nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, &sg);
+ nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, last_sg);
else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME)
- nsegs = __blk_bvec_map_sg(bio_iovec(rq->bio), sglist, &sg);
+ nsegs = __blk_bvec_map_sg(bio_iovec(rq->bio), sglist, last_sg);
else if (rq->bio)
- nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg);
-
- if (unlikely(rq->rq_flags & RQF_COPY_USER) &&
- (blk_rq_bytes(rq) & q->dma_pad_mask)) {
- unsigned int pad_len =
- (q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1;
-
- sg->length += pad_len;
- rq->extra_len += pad_len;
- }
+ nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg);
- if (q->dma_drain_size && q->dma_drain_needed(rq)) {
- if (op_is_write(req_op(rq)))
- memset(q->dma_drain_buffer, 0, q->dma_drain_size);
-
- sg_unmark_end(sg);
- sg = sg_next(sg);
- sg_set_page(sg, virt_to_page(q->dma_drain_buffer),
- q->dma_drain_size,
- ((unsigned long)q->dma_drain_buffer) &
- (PAGE_SIZE - 1));
- nsegs++;
- rq->extra_len += q->dma_drain_size;
- }
-
- if (sg)
- sg_mark_end(sg);
+ if (*last_sg)
+ sg_mark_end(*last_sg);
/*
* Something must have been wrong if the figured number of
@@ -566,7 +532,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
return nsegs;
}
-EXPORT_SYMBOL(blk_rq_map_sg);
+EXPORT_SYMBOL(__blk_rq_map_sg);
static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
unsigned int nr_phys_segs)
@@ -596,6 +562,8 @@ int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs)
if (blk_integrity_rq(req) &&
integrity_req_gap_back_merge(req, bio))
return 0;
+ if (!bio_crypt_ctx_back_mergeable(req, bio))
+ return 0;
if (blk_rq_sectors(req) + bio_sectors(bio) >
blk_rq_get_max_sectors(req, blk_rq_pos(req))) {
req_set_nomerge(req->q, req);
@@ -612,6 +580,8 @@ int ll_front_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs
if (blk_integrity_rq(req) &&
integrity_req_gap_front_merge(req, bio))
return 0;
+ if (!bio_crypt_ctx_front_mergeable(req, bio))
+ return 0;
if (blk_rq_sectors(req) + bio_sectors(bio) >
blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) {
req_set_nomerge(req->q, req);
@@ -661,6 +631,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
if (blk_integrity_merge_rq(q, req, next) == false)
return 0;
+ if (!bio_crypt_ctx_merge_rq(req, next))
+ return 0;
+
/* Merge is OK... */
req->nr_phys_segments = total_phys_segments;
return 1;
@@ -696,20 +669,17 @@ void blk_rq_set_mixed_merge(struct request *rq)
rq->rq_flags |= RQF_MIXED_MERGE;
}
-static void blk_account_io_merge(struct request *req)
+static void blk_account_io_merge_request(struct request *req)
{
if (blk_do_io_stat(req)) {
- struct hd_struct *part;
-
part_stat_lock();
- part = req->part;
-
- part_dec_in_flight(req->q, part, rq_data_dir(req));
-
- hd_struct_put(part);
+ part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
part_stat_unlock();
+
+ hd_struct_put(req->part);
}
}
+
/*
* Two cases of handling DISCARD merge:
* If max_discard_segments > 1, the driver takes every bio
@@ -821,7 +791,7 @@ static struct request *attempt_merge(struct request_queue *q,
/*
* 'next' is going away, so update stats accordingly
*/
- blk_account_io_merge(next);
+ blk_account_io_merge_request(next);
/*
* ownership of bio passed from next to req, return 'next' for
@@ -885,6 +855,10 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
return false;
+ /* Only merge if the crypt contexts are compatible */
+ if (!bio_crypt_rq_ctx_compatible(rq, bio))
+ return false;
+
/* must be using the same buffer */
if (req_op(rq) == REQ_OP_WRITE_SAME &&
!blk_write_same_mergeable(rq->bio, bio))
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index b3f2ba483992..15df3a36e9fa 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -213,6 +213,7 @@ static const char *const hctx_state_name[] = {
HCTX_STATE_NAME(STOPPED),
HCTX_STATE_NAME(TAG_ACTIVE),
HCTX_STATE_NAME(SCHED_RESTART),
+ HCTX_STATE_NAME(INACTIVE),
};
#undef HCTX_STATE_NAME
@@ -239,6 +240,7 @@ static const char *const hctx_flag_name[] = {
HCTX_FLAG_NAME(TAG_SHARED),
HCTX_FLAG_NAME(BLOCKING),
HCTX_FLAG_NAME(NO_SCHED),
+ HCTX_FLAG_NAME(STACKING),
};
#undef HCTX_FLAG_NAME
@@ -292,7 +294,6 @@ static const char *const rqf_name[] = {
RQF_NAME(MQ_INFLIGHT),
RQF_NAME(DONTPREP),
RQF_NAME(PREEMPT),
- RQF_NAME(COPY_USER),
RQF_NAME(FAILED),
RQF_NAME(QUIET),
RQF_NAME(ELVPRIV),
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 74cedea56034..fdcc2c1dd178 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -80,16 +80,22 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
blk_mq_run_hw_queue(hctx, true);
}
+#define BLK_MQ_BUDGET_DELAY 3 /* ms units */
+
/*
* Only SCSI implements .get_budget and .put_budget, and SCSI restarts
* its queue by itself in its completion handler, so we don't need to
* restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ *
+ * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
+ * be run again. This is necessary to avoid starving flushes.
*/
-static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
LIST_HEAD(rq_list);
+ int ret = 0;
do {
struct request *rq;
@@ -97,12 +103,25 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
break;
+ if (!list_empty_careful(&hctx->dispatch)) {
+ ret = -EAGAIN;
+ break;
+ }
+
if (!blk_mq_get_dispatch_budget(hctx))
break;
rq = e->type->ops.dispatch_request(hctx);
if (!rq) {
blk_mq_put_dispatch_budget(hctx);
+ /*
+ * We're releasing without dispatching. Holding the
+ * budget could have blocked any "hctx"s with the
+ * same queue and if we didn't dispatch then there's
+ * no guarantee anyone will kick the queue. Kick it
+ * ourselves.
+ */
+ blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
break;
}
@@ -113,6 +132,8 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
*/
list_add(&rq->queuelist, &rq_list);
} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+
+ return ret;
}
static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
@@ -130,16 +151,25 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
* Only SCSI implements .get_budget and .put_budget, and SCSI restarts
* its queue by itself in its completion handler, so we don't need to
* restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ *
+ * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
+ * to be run again. This is necessary to avoid starving flushes.
*/
-static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
+static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
LIST_HEAD(rq_list);
struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+ int ret = 0;
do {
struct request *rq;
+ if (!list_empty_careful(&hctx->dispatch)) {
+ ret = -EAGAIN;
+ break;
+ }
+
if (!sbitmap_any_bit_set(&hctx->ctx_map))
break;
@@ -149,6 +179,14 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
rq = blk_mq_dequeue_from_ctx(hctx, ctx);
if (!rq) {
blk_mq_put_dispatch_budget(hctx);
+ /*
+ * We're releasing without dispatching. Holding the
+ * budget could have blocked any "hctx"s with the
+ * same queue and if we didn't dispatch then there's
+ * no guarantee anyone will kick the queue. Kick it
+ * ourselves.
+ */
+ blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
break;
}
@@ -165,21 +203,17 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
WRITE_ONCE(hctx->dispatch_from, ctx);
+ return ret;
}
-void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
+ int ret = 0;
LIST_HEAD(rq_list);
- /* RCU or SRCU read lock is needed before checking quiesced flag */
- if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
- return;
-
- hctx->run++;
-
/*
* If we have previous entries on our dispatch list, grab them first for
* more fair dispatch.
@@ -208,19 +242,41 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
blk_mq_sched_mark_restart_hctx(hctx);
if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
if (has_sched_dispatch)
- blk_mq_do_dispatch_sched(hctx);
+ ret = blk_mq_do_dispatch_sched(hctx);
else
- blk_mq_do_dispatch_ctx(hctx);
+ ret = blk_mq_do_dispatch_ctx(hctx);
}
} else if (has_sched_dispatch) {
- blk_mq_do_dispatch_sched(hctx);
+ ret = blk_mq_do_dispatch_sched(hctx);
} else if (hctx->dispatch_busy) {
/* dequeue request one by one from sw queue if queue is busy */
- blk_mq_do_dispatch_ctx(hctx);
+ ret = blk_mq_do_dispatch_ctx(hctx);
} else {
blk_mq_flush_busy_ctxs(hctx, &rq_list);
blk_mq_dispatch_rq_list(q, &rq_list, false);
}
+
+ return ret;
+}
+
+void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+{
+ struct request_queue *q = hctx->queue;
+
+ /* RCU or SRCU read lock is needed before checking quiesced flag */
+ if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
+ return;
+
+ hctx->run++;
+
+ /*
+ * A return of -EAGAIN is an indication that hctx->dispatch is not
+ * empty and we must run again in order to avoid starving flushes.
+ */
+ if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) {
+ if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN)
+ blk_mq_run_hw_queue(hctx, true);
+ }
}
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 586c9d6e904a..96a39d0724a2 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -92,7 +92,7 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
{
if (!(data->flags & BLK_MQ_REQ_INTERNAL) &&
!hctx_may_queue(data->hctx, bt))
- return -1;
+ return BLK_MQ_NO_TAG;
if (data->shallow_depth)
return __sbitmap_queue_get_shallow(bt, data->shallow_depth);
else
@@ -111,7 +111,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
if (data->flags & BLK_MQ_REQ_RESERVED) {
if (unlikely(!tags->nr_reserved_tags)) {
WARN_ON_ONCE(1);
- return BLK_MQ_TAG_FAIL;
+ return BLK_MQ_NO_TAG;
}
bt = &tags->breserved_tags;
tag_offset = 0;
@@ -121,11 +121,11 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
}
tag = __blk_mq_get_tag(data, bt);
- if (tag != -1)
+ if (tag != BLK_MQ_NO_TAG)
goto found_tag;
if (data->flags & BLK_MQ_REQ_NOWAIT)
- return BLK_MQ_TAG_FAIL;
+ return BLK_MQ_NO_TAG;
ws = bt_wait_ptr(bt, data->hctx);
do {
@@ -143,13 +143,13 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
* as running the queue may also have found completions.
*/
tag = __blk_mq_get_tag(data, bt);
- if (tag != -1)
+ if (tag != BLK_MQ_NO_TAG)
break;
sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);
tag = __blk_mq_get_tag(data, bt);
- if (tag != -1)
+ if (tag != BLK_MQ_NO_TAG)
break;
bt_prev = bt;
@@ -180,6 +180,14 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
sbitmap_finish_wait(bt, ws, &wait);
found_tag:
+ /*
+ * Give up this allocation if the hctx is inactive. The caller will
+ * retry on an active hctx.
+ */
+ if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) {
+ blk_mq_put_tag(tags, data->ctx, tag + tag_offset);
+ return BLK_MQ_NO_TAG;
+ }
return tag + tag_offset;
}
@@ -256,14 +264,17 @@ struct bt_tags_iter_data {
struct blk_mq_tags *tags;
busy_tag_iter_fn *fn;
void *data;
- bool reserved;
+ unsigned int flags;
};
+#define BT_TAG_ITER_RESERVED (1 << 0)
+#define BT_TAG_ITER_STARTED (1 << 1)
+
static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
struct bt_tags_iter_data *iter_data = data;
struct blk_mq_tags *tags = iter_data->tags;
- bool reserved = iter_data->reserved;
+ bool reserved = iter_data->flags & BT_TAG_ITER_RESERVED;
struct request *rq;
if (!reserved)
@@ -274,10 +285,12 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
* test and set the bit before assining ->rqs[].
*/
rq = tags->rqs[bitnr];
- if (rq && blk_mq_request_started(rq))
- return iter_data->fn(rq, iter_data->data, reserved);
-
- return true;
+ if (!rq)
+ return true;
+ if ((iter_data->flags & BT_TAG_ITER_STARTED) &&
+ !blk_mq_request_started(rq))
+ return true;
+ return iter_data->fn(rq, iter_data->data, reserved);
}
/**
@@ -290,39 +303,47 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
* @reserved) where rq is a pointer to a request. Return true
* to continue iterating tags, false to stop.
* @data: Will be passed as second argument to @fn.
- * @reserved: Indicates whether @bt is the breserved_tags member or the
- * bitmap_tags member of struct blk_mq_tags.
+ * @flags: BT_TAG_ITER_*
*/
static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
- busy_tag_iter_fn *fn, void *data, bool reserved)
+ busy_tag_iter_fn *fn, void *data, unsigned int flags)
{
struct bt_tags_iter_data iter_data = {
.tags = tags,
.fn = fn,
.data = data,
- .reserved = reserved,
+ .flags = flags,
};
if (tags->rqs)
sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data);
}
+static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
+ busy_tag_iter_fn *fn, void *priv, unsigned int flags)
+{
+ WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED);
+
+ if (tags->nr_reserved_tags)
+ bt_tags_for_each(tags, &tags->breserved_tags, fn, priv,
+ flags | BT_TAG_ITER_RESERVED);
+ bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, flags);
+}
+
/**
- * blk_mq_all_tag_busy_iter - iterate over all started requests in a tag map
+ * blk_mq_all_tag_iter - iterate over all requests in a tag map
* @tags: Tag map to iterate over.
- * @fn: Pointer to the function that will be called for each started
+ * @fn: Pointer to the function that will be called for each
* request. @fn will be called as follows: @fn(rq, @priv,
* reserved) where rq is a pointer to a request. 'reserved'
* indicates whether or not @rq is a reserved request. Return
* true to continue iterating tags, false to stop.
* @priv: Will be passed as second argument to @fn.
*/
-static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
- busy_tag_iter_fn *fn, void *priv)
+void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
+ void *priv)
{
- if (tags->nr_reserved_tags)
- bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, true);
- bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, false);
+ return __blk_mq_all_tag_iter(tags, fn, priv, 0);
}
/**
@@ -342,7 +363,8 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
for (i = 0; i < tagset->nr_hw_queues; i++) {
if (tagset->tags && tagset->tags[i])
- blk_mq_all_tag_busy_iter(tagset->tags[i], fn, priv);
+ __blk_mq_all_tag_iter(tagset->tags[i], fn, priv,
+ BT_TAG_ITER_STARTED);
}
}
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 2b8321efb682..d38e48f2a0a4 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -34,6 +34,8 @@ extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
void *priv);
+void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
+ void *priv);
static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
struct blk_mq_hw_ctx *hctx)
@@ -44,9 +46,9 @@ static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
}
enum {
- BLK_MQ_TAG_FAIL = -1U,
+ BLK_MQ_NO_TAG = -1U,
BLK_MQ_TAG_MIN = 1,
- BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1,
+ BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1,
};
extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a7785df2c944..9a36ac1c1fa1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -26,6 +26,7 @@
#include <linux/delay.h>
#include <linux/crash_dump.h>
#include <linux/prefetch.h>
+#include <linux/blk-crypto.h>
#include <trace/events/block.h>
@@ -270,14 +271,14 @@ static inline bool blk_mq_need_time_stamp(struct request *rq)
}
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
- unsigned int tag, unsigned int op, u64 alloc_time_ns)
+ unsigned int tag, u64 alloc_time_ns)
{
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
struct request *rq = tags->static_rqs[tag];
req_flags_t rq_flags = 0;
if (data->flags & BLK_MQ_REQ_INTERNAL) {
- rq->tag = -1;
+ rq->tag = BLK_MQ_NO_TAG;
rq->internal_tag = tag;
} else {
if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
@@ -285,7 +286,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
atomic_inc(&data->hctx->nr_active);
}
rq->tag = tag;
- rq->internal_tag = -1;
+ rq->internal_tag = BLK_MQ_NO_TAG;
data->hctx->tags->rqs[rq->tag] = rq;
}
@@ -294,7 +295,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
rq->mq_ctx = data->ctx;
rq->mq_hctx = data->hctx;
rq->rq_flags = rq_flags;
- rq->cmd_flags = op;
+ rq->cmd_flags = data->cmd_flags;
if (data->flags & BLK_MQ_REQ_PREEMPT)
rq->rq_flags |= RQF_PREEMPT;
if (blk_queue_io_stat(data->q))
@@ -317,8 +318,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
#if defined(CONFIG_BLK_DEV_INTEGRITY)
rq->nr_integrity_segments = 0;
#endif
+ blk_crypto_rq_set_defaults(rq);
/* tag was already set */
- rq->extra_len = 0;
WRITE_ONCE(rq->deadline, 0);
rq->timeout = 0;
@@ -326,35 +327,37 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
rq->end_io = NULL;
rq->end_io_data = NULL;
- data->ctx->rq_dispatched[op_is_sync(op)]++;
+ data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++;
refcount_set(&rq->ref, 1);
+
+ if (!op_is_flush(data->cmd_flags)) {
+ struct elevator_queue *e = data->q->elevator;
+
+ rq->elv.icq = NULL;
+ if (e && e->type->ops.prepare_request) {
+ if (e->type->icq_cache)
+ blk_mq_sched_assign_ioc(rq);
+
+ e->type->ops.prepare_request(rq);
+ rq->rq_flags |= RQF_ELVPRIV;
+ }
+ }
+
+ data->hctx->queued++;
return rq;
}
-static struct request *blk_mq_get_request(struct request_queue *q,
- struct bio *bio,
- struct blk_mq_alloc_data *data)
+static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
{
+ struct request_queue *q = data->q;
struct elevator_queue *e = q->elevator;
- struct request *rq;
- unsigned int tag;
- bool clear_ctx_on_error = false;
u64 alloc_time_ns = 0;
-
- blk_queue_enter_live(q);
+ unsigned int tag;
/* alloc_time includes depth and tag waits */
if (blk_queue_rq_alloc_time(q))
alloc_time_ns = ktime_get_ns();
- data->q = q;
- if (likely(!data->ctx)) {
- data->ctx = blk_mq_get_ctx(q);
- clear_ctx_on_error = true;
- }
- if (likely(!data->hctx))
- data->hctx = blk_mq_map_queue(q, data->cmd_flags,
- data->ctx);
if (data->cmd_flags & REQ_NOWAIT)
data->flags |= BLK_MQ_REQ_NOWAIT;
@@ -370,37 +373,43 @@ static struct request *blk_mq_get_request(struct request_queue *q,
e->type->ops.limit_depth &&
!(data->flags & BLK_MQ_REQ_RESERVED))
e->type->ops.limit_depth(data->cmd_flags, data);
- } else {
- blk_mq_tag_busy(data->hctx);
}
- tag = blk_mq_get_tag(data);
- if (tag == BLK_MQ_TAG_FAIL) {
- if (clear_ctx_on_error)
- data->ctx = NULL;
- blk_queue_exit(q);
- return NULL;
- }
+retry:
+ data->ctx = blk_mq_get_ctx(q);
+ data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
+ if (!(data->flags & BLK_MQ_REQ_INTERNAL))
+ blk_mq_tag_busy(data->hctx);
- rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags, alloc_time_ns);
- if (!op_is_flush(data->cmd_flags)) {
- rq->elv.icq = NULL;
- if (e && e->type->ops.prepare_request) {
- if (e->type->icq_cache)
- blk_mq_sched_assign_ioc(rq);
+ /*
+ * Waiting allocations only fail because of an inactive hctx. In that
+ * case just retry the hctx assignment and tag allocation as CPU hotplug
+ * should have migrated us to an online CPU by now.
+ */
+ tag = blk_mq_get_tag(data);
+ if (tag == BLK_MQ_NO_TAG) {
+ if (data->flags & BLK_MQ_REQ_NOWAIT)
+ return NULL;
- e->type->ops.prepare_request(rq, bio);
- rq->rq_flags |= RQF_ELVPRIV;
- }
+ /*
+ * Give up the CPU and sleep for a random short time to ensure
+ * that thread using a realtime scheduling class are migrated
+ * off the the CPU, and thus off the hctx that is going away.
+ */
+ msleep(3);
+ goto retry;
}
- data->hctx->queued++;
- return rq;
+ return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
}
struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
blk_mq_req_flags_t flags)
{
- struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
+ struct blk_mq_alloc_data data = {
+ .q = q,
+ .flags = flags,
+ .cmd_flags = op,
+ };
struct request *rq;
int ret;
@@ -408,34 +417,43 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
if (ret)
return ERR_PTR(ret);
- rq = blk_mq_get_request(q, NULL, &alloc_data);
- blk_queue_exit(q);
-
+ rq = __blk_mq_alloc_request(&data);
if (!rq)
- return ERR_PTR(-EWOULDBLOCK);
-
+ goto out_queue_exit;
rq->__data_len = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL;
return rq;
+out_queue_exit:
+ blk_queue_exit(q);
+ return ERR_PTR(-EWOULDBLOCK);
}
EXPORT_SYMBOL(blk_mq_alloc_request);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
{
- struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
- struct request *rq;
+ struct blk_mq_alloc_data data = {
+ .q = q,
+ .flags = flags,
+ .cmd_flags = op,
+ };
+ u64 alloc_time_ns = 0;
unsigned int cpu;
+ unsigned int tag;
int ret;
+ /* alloc_time includes depth and tag waits */
+ if (blk_queue_rq_alloc_time(q))
+ alloc_time_ns = ktime_get_ns();
+
/*
* If the tag allocator sleeps we could get an allocation for a
* different hardware context. No need to complicate the low level
* allocator for this for the rare use case of a command tied to
* a specific queue.
*/
- if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
+ if (WARN_ON_ONCE(!(flags & (BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED))))
return ERR_PTR(-EINVAL);
if (hctx_idx >= q->nr_hw_queues)
@@ -449,21 +467,27 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
* Check if the hardware context is actually mapped to anything.
* If not tell the caller that it should skip this queue.
*/
- alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
- if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
- blk_queue_exit(q);
- return ERR_PTR(-EXDEV);
- }
- cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
- alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
-
- rq = blk_mq_get_request(q, NULL, &alloc_data);
- blk_queue_exit(q);
+ ret = -EXDEV;
+ data.hctx = q->queue_hw_ctx[hctx_idx];
+ if (!blk_mq_hw_queue_mapped(data.hctx))
+ goto out_queue_exit;
+ cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
+ data.ctx = __blk_mq_get_ctx(q, cpu);
+
+ if (q->elevator)
+ data.flags |= BLK_MQ_REQ_INTERNAL;
+ else
+ blk_mq_tag_busy(data.hctx);
- if (!rq)
- return ERR_PTR(-EWOULDBLOCK);
+ ret = -EWOULDBLOCK;
+ tag = blk_mq_get_tag(&data);
+ if (tag == BLK_MQ_NO_TAG)
+ goto out_queue_exit;
+ return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns);
- return rq;
+out_queue_exit:
+ blk_queue_exit(q);
+ return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
@@ -474,11 +498,12 @@ static void __blk_mq_free_request(struct request *rq)
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
const int sched_tag = rq->internal_tag;
+ blk_crypto_free_request(rq);
blk_pm_mark_last_busy(rq);
rq->mq_hctx = NULL;
- if (rq->tag != -1)
+ if (rq->tag != BLK_MQ_NO_TAG)
blk_mq_put_tag(hctx->tags, ctx, rq->tag);
- if (sched_tag != -1)
+ if (sched_tag != BLK_MQ_NO_TAG)
blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
blk_mq_sched_restart(hctx);
blk_queue_exit(q);
@@ -527,7 +552,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
blk_stat_add(rq, now);
}
- if (rq->internal_tag != -1)
+ if (rq->internal_tag != BLK_MQ_NO_TAG)
blk_mq_sched_completed_request(rq, now);
blk_account_io_done(rq, now);
@@ -557,7 +582,17 @@ static void __blk_mq_complete_request_remote(void *data)
q->mq_ops->complete(rq);
}
-static void __blk_mq_complete_request(struct request *rq)
+/**
+ * blk_mq_force_complete_rq() - Force complete the request, bypassing any error
+ * injection that could drop the completion.
+ * @rq: Request to be force completed
+ *
+ * Drivers should use blk_mq_complete_request() to complete requests in their
+ * normal IO path. For timeout error recovery, drivers may call this forced
+ * completion routine after they've reclaimed timed out requests to bypass
+ * potentially subsequent fake timeouts.
+ */
+void blk_mq_force_complete_rq(struct request *rq)
{
struct blk_mq_ctx *ctx = rq->mq_ctx;
struct request_queue *q = rq->q;
@@ -603,6 +638,7 @@ static void __blk_mq_complete_request(struct request *rq)
}
put_cpu();
}
+EXPORT_SYMBOL_GPL(blk_mq_force_complete_rq);
static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
__releases(hctx->srcu)
@@ -636,7 +672,7 @@ bool blk_mq_complete_request(struct request *rq)
{
if (unlikely(blk_should_fake_timeout(rq->q)))
return false;
- __blk_mq_complete_request(rq);
+ blk_mq_force_complete_rq(rq);
return true;
}
EXPORT_SYMBOL(blk_mq_complete_request);
@@ -667,15 +703,6 @@ void blk_mq_start_request(struct request *rq)
blk_add_timer(rq);
WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
- if (q->dma_drain_size && blk_rq_bytes(rq)) {
- /*
- * Make sure space for the drain appears. We know we can do
- * this because max_hw_segments has been adjusted to be one
- * fewer than the device can handle.
- */
- rq->nr_phys_segments++;
- }
-
#ifdef CONFIG_BLK_DEV_INTEGRITY
if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
q->integrity.profile->prepare_fn(rq);
@@ -695,8 +722,6 @@ static void __blk_mq_requeue_request(struct request *rq)
if (blk_mq_request_started(rq)) {
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
rq->rq_flags &= ~RQF_TIMED_OUT;
- if (q->dma_drain_size && blk_rq_bytes(rq))
- rq->nr_phys_segments--;
}
}
@@ -1037,7 +1062,7 @@ bool blk_mq_get_driver_tag(struct request *rq)
};
bool shared;
- if (rq->tag != -1)
+ if (rq->tag != BLK_MQ_NO_TAG)
return true;
if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
@@ -1053,7 +1078,7 @@ bool blk_mq_get_driver_tag(struct request *rq)
data.hctx->tags->rqs[rq->tag] = rq;
}
- return rq->tag != -1;
+ return rq->tag != BLK_MQ_NO_TAG;
}
static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
@@ -1195,6 +1220,19 @@ static void blk_mq_handle_dev_resource(struct request *rq,
__blk_mq_requeue_request(rq);
}
+static void blk_mq_handle_zone_resource(struct request *rq,
+ struct list_head *zone_list)
+{
+ /*
+ * If we end up here it is because we cannot dispatch a request to a
+ * specific zone due to LLD level zone-write locking or other zone
+ * related resource not being available. In this case, set the request
+ * aside in zone_list for retrying it later.
+ */
+ list_add(&rq->queuelist, zone_list);
+ __blk_mq_requeue_request(rq);
+}
+
/*
* Returns true if we did some work AND can potentially do more.
*/
@@ -1206,6 +1244,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
bool no_tag = false;
int errors, queued;
blk_status_t ret = BLK_STS_OK;
+ bool no_budget_avail = false;
+ LIST_HEAD(zone_list);
if (list_empty(list))
return false;
@@ -1224,6 +1264,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
hctx = rq->mq_hctx;
if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
blk_mq_put_driver_tag(rq);
+ no_budget_avail = true;
break;
}
@@ -1266,6 +1307,16 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
blk_mq_handle_dev_resource(rq, list);
break;
+ } else if (ret == BLK_STS_ZONE_RESOURCE) {
+ /*
+ * Move the request to zone_list and keep going through
+ * the dispatch list to find more requests the drive can
+ * accept.
+ */
+ blk_mq_handle_zone_resource(rq, &zone_list);
+ if (list_empty(list))
+ break;
+ continue;
}
if (unlikely(ret != BLK_STS_OK)) {
@@ -1277,6 +1328,9 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
queued++;
} while (!list_empty(list));
+ if (!list_empty(&zone_list))
+ list_splice_tail_init(&zone_list, list);
+
hctx->dispatched[queued_to_index(queued)]++;
/*
@@ -1320,13 +1374,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
*
* If driver returns BLK_STS_RESOURCE and SCHED_RESTART
* bit is set, run queue after a delay to avoid IO stalls
- * that could otherwise occur if the queue is idle.
+ * that could otherwise occur if the queue is idle. We'll do
+ * similar if we couldn't get budget and SCHED_RESTART is set.
*/
needs_restart = blk_mq_sched_needs_restart(hctx);
if (!needs_restart ||
(no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
blk_mq_run_hw_queue(hctx, true);
- else if (needs_restart && (ret == BLK_STS_RESOURCE))
+ else if (needs_restart && (ret == BLK_STS_RESOURCE ||
+ no_budget_avail))
blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
blk_mq_update_dispatch_busy(hctx, true);
@@ -1542,6 +1598,25 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
EXPORT_SYMBOL(blk_mq_run_hw_queues);
/**
+ * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
+ * @q: Pointer to the request queue to run.
+ * @msecs: Microseconds of delay to wait before running the queues.
+ */
+void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
+{
+ struct blk_mq_hw_ctx *hctx;
+ int i;
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (blk_mq_hctx_stopped(hctx))
+ continue;
+
+ blk_mq_delay_run_hw_queue(hctx, msecs);
+ }
+}
+EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
+
+/**
* blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
* @q: request queue.
*
@@ -1782,8 +1857,9 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
rq->__sector = bio->bi_iter.bi_sector;
rq->write_hint = bio->bi_write_hint;
blk_rq_bio_prep(rq, bio, nr_segs);
+ blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
- blk_account_io_start(rq, true);
+ blk_account_io_start(rq);
}
static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
@@ -1973,39 +2049,42 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
*
* Returns: Request queue cookie.
*/
-static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
+blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
{
const int is_sync = op_is_sync(bio->bi_opf);
const int is_flush_fua = op_is_flush(bio->bi_opf);
- struct blk_mq_alloc_data data = { .flags = 0};
+ struct blk_mq_alloc_data data = {
+ .q = q,
+ };
struct request *rq;
struct blk_plug *plug;
struct request *same_queue_rq = NULL;
unsigned int nr_segs;
blk_qc_t cookie;
+ blk_status_t ret;
blk_queue_bounce(q, &bio);
__blk_queue_split(q, &bio, &nr_segs);
if (!bio_integrity_prep(bio))
- return BLK_QC_T_NONE;
+ goto queue_exit;
if (!is_flush_fua && !blk_queue_nomerges(q) &&
blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
- return BLK_QC_T_NONE;
+ goto queue_exit;
if (blk_mq_sched_bio_merge(q, bio, nr_segs))
- return BLK_QC_T_NONE;
+ goto queue_exit;
rq_qos_throttle(q, bio);
data.cmd_flags = bio->bi_opf;
- rq = blk_mq_get_request(q, bio, &data);
+ rq = __blk_mq_alloc_request(&data);
if (unlikely(!rq)) {
rq_qos_cleanup(q, bio);
if (bio->bi_opf & REQ_NOWAIT)
bio_wouldblock_error(bio);
- return BLK_QC_T_NONE;
+ goto queue_exit;
}
trace_block_getrq(q, bio, bio->bi_opf);
@@ -2016,6 +2095,14 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_mq_bio_to_request(rq, bio, nr_segs);
+ ret = blk_crypto_init_request(rq);
+ if (ret != BLK_STS_OK) {
+ bio->bi_status = ret;
+ bio_endio(bio);
+ blk_mq_free_request(rq);
+ return BLK_QC_T_NONE;
+ }
+
plug = blk_mq_plug(q, bio);
if (unlikely(is_flush_fua)) {
/* Bypass scheduler for flush requests */
@@ -2084,7 +2171,11 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
}
return cookie;
+queue_exit:
+ blk_queue_exit(q);
+ return BLK_QC_T_NONE;
}
+EXPORT_SYMBOL_GPL(blk_mq_make_request); /* only for request based dm */
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx)
@@ -2260,6 +2351,86 @@ fail:
return -ENOMEM;
}
+struct rq_iter_data {
+ struct blk_mq_hw_ctx *hctx;
+ bool has_rq;
+};
+
+static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
+{
+ struct rq_iter_data *iter_data = data;
+
+ if (rq->mq_hctx != iter_data->hctx)
+ return true;
+ iter_data->has_rq = true;
+ return false;
+}
+
+static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
+{
+ struct blk_mq_tags *tags = hctx->sched_tags ?
+ hctx->sched_tags : hctx->tags;
+ struct rq_iter_data data = {
+ .hctx = hctx,
+ };
+
+ blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
+ return data.has_rq;
+}
+
+static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
+ struct blk_mq_hw_ctx *hctx)
+{
+ if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
+ return false;
+ if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
+ return false;
+ return true;
+}
+
+static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
+{
+ struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+ struct blk_mq_hw_ctx, cpuhp_online);
+
+ if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
+ !blk_mq_last_cpu_in_hctx(cpu, hctx))
+ return 0;
+
+ /*
+ * Prevent new request from being allocated on the current hctx.
+ *
+ * The smp_mb__after_atomic() Pairs with the implied barrier in
+ * test_and_set_bit_lock in sbitmap_get(). Ensures the inactive flag is
+ * seen once we return from the tag allocator.
+ */
+ set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+ smp_mb__after_atomic();
+
+ /*
+ * Try to grab a reference to the queue and wait for any outstanding
+ * requests. If we could not grab a reference the queue has been
+ * frozen and there are no requests.
+ */
+ if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
+ while (blk_mq_hctx_has_requests(hctx))
+ msleep(5);
+ percpu_ref_put(&hctx->queue->q_usage_counter);
+ }
+
+ return 0;
+}
+
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
+{
+ struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+ struct blk_mq_hw_ctx, cpuhp_online);
+
+ if (cpumask_test_cpu(cpu, hctx->cpumask))
+ clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+ return 0;
+}
+
/*
* 'cpu' is going away. splice any existing rq_list entries from this
* software queue to the hw queue dispatch list, and ensure that it
@@ -2273,6 +2444,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
enum hctx_type type;
hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
+ if (!cpumask_test_cpu(cpu, hctx->cpumask))
+ return 0;
+
ctx = __blk_mq_get_ctx(hctx->queue, cpu);
type = hctx->type;
@@ -2296,6 +2470,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
{
+ if (!(hctx->flags & BLK_MQ_F_STACKING))
+ cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
+ &hctx->cpuhp_online);
cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
&hctx->cpuhp_dead);
}
@@ -2355,6 +2532,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
{
hctx->queue_num = hctx_idx;
+ if (!(hctx->flags & BLK_MQ_F_STACKING))
+ cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
+ &hctx->cpuhp_online);
cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
hctx->tags = set->tags[hctx_idx];
@@ -2473,7 +2653,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
}
}
-static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
+static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
+ int hctx_idx)
{
int ret = 0;
@@ -2521,18 +2702,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
* If the cpu isn't present, the cpu is mapped to first hctx.
*/
for_each_possible_cpu(i) {
- hctx_idx = set->map[HCTX_TYPE_DEFAULT].mq_map[i];
- /* unmapped hw queue can be remapped after CPU topo changed */
- if (!set->tags[hctx_idx] &&
- !__blk_mq_alloc_rq_map(set, hctx_idx)) {
- /*
- * If tags initialization fail for some hctx,
- * that hctx won't be brought online. In this
- * case, remap the current ctx to hctx[0] which
- * is guaranteed to always have tags allocated
- */
- set->map[HCTX_TYPE_DEFAULT].mq_map[i] = 0;
- }
ctx = per_cpu_ptr(q->queue_ctx, i);
for (j = 0; j < set->nr_maps; j++) {
@@ -2541,6 +2710,18 @@ static void blk_mq_map_swqueue(struct request_queue *q)
HCTX_TYPE_DEFAULT, i);
continue;
}
+ hctx_idx = set->map[j].mq_map[i];
+ /* unmapped hw queue can be remapped after CPU topo changed */
+ if (!set->tags[hctx_idx] &&
+ !__blk_mq_alloc_map_and_request(set, hctx_idx)) {
+ /*
+ * If tags initialization fail for some hctx,
+ * that hctx won't be brought online. In this
+ * case, remap the current ctx to hctx[0] which
+ * is guaranteed to always have tags allocated
+ */
+ set->map[j].mq_map[i] = 0;
+ }
hctx = blk_mq_map_queue_type(q, j, i);
ctx->hctxs[j] = hctx;
@@ -2944,7 +3125,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
INIT_LIST_HEAD(&q->requeue_list);
spin_lock_init(&q->requeue_lock);
- q->make_request_fn = blk_mq_make_request;
q->nr_requests = set->queue_depth;
/*
@@ -2988,14 +3168,14 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
int i;
for (i = 0; i < set->nr_hw_queues; i++)
- if (!__blk_mq_alloc_rq_map(set, i))
+ if (!__blk_mq_alloc_map_and_request(set, i))
goto out_unwind;
return 0;
out_unwind:
while (--i >= 0)
- blk_mq_free_rq_map(set->tags[i]);
+ blk_mq_free_map_and_requests(set, i);
return -ENOMEM;
}
@@ -3005,7 +3185,7 @@ out_unwind:
* may reduce the depth asked for, if memory is tight. set->queue_depth
* will be updated to reflect the allocated depth.
*/
-static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
+static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set)
{
unsigned int depth;
int err;
@@ -3165,7 +3345,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (ret)
goto out_free_mq_map;
- ret = blk_mq_alloc_rq_maps(set);
+ ret = blk_mq_alloc_map_and_requests(set);
if (ret)
goto out_free_mq_map;
@@ -3347,14 +3527,14 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
blk_mq_sysfs_unregister(q);
}
+ prev_nr_hw_queues = set->nr_hw_queues;
if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
0)
goto reregister;
- prev_nr_hw_queues = set->nr_hw_queues;
set->nr_hw_queues = nr_hw_queues;
- blk_mq_update_queue_map(set);
fallback:
+ blk_mq_update_queue_map(set);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_realloc_hw_ctxs(set, q);
if (q->nr_hw_queues != set->nr_hw_queues) {
@@ -3609,6 +3789,9 @@ static int __init blk_mq_init(void)
{
cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
blk_mq_hctx_notify_dead);
+ cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
+ blk_mq_hctx_notify_online,
+ blk_mq_hctx_notify_offline);
return 0;
}
subsys_initcall(blk_mq_init);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 10bfdfb494fa..a139b0631817 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -201,7 +201,7 @@ static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag);
- rq->tag = -1;
+ rq->tag = BLK_MQ_NO_TAG;
if (rq->rq_flags & RQF_MQ_INFLIGHT) {
rq->rq_flags &= ~RQF_MQ_INFLIGHT;
@@ -211,7 +211,7 @@ static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
static inline void blk_mq_put_driver_tag(struct request *rq)
{
- if (rq->tag == -1 || rq->internal_tag == -1)
+ if (rq->tag == BLK_MQ_NO_TAG || rq->internal_tag == BLK_MQ_NO_TAG)
return;
__blk_mq_put_driver_tag(rq->mq_hctx, rq);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 14397b4c4b53..9a2c23cd9700 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -48,6 +48,7 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->chunk_sectors = 0;
lim->max_write_same_sectors = 0;
lim->max_write_zeroes_sectors = 0;
+ lim->max_zone_append_sectors = 0;
lim->max_discard_sectors = 0;
lim->max_hw_discard_sectors = 0;
lim->discard_granularity = 0;
@@ -83,6 +84,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_dev_sectors = UINT_MAX;
lim->max_write_same_sectors = UINT_MAX;
lim->max_write_zeroes_sectors = UINT_MAX;
+ lim->max_zone_append_sectors = UINT_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
@@ -222,6 +224,33 @@ void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
/**
+ * blk_queue_max_zone_append_sectors - set max sectors for a single zone append
+ * @q: the request queue for the device
+ * @max_zone_append_sectors: maximum number of sectors to write per command
+ **/
+void blk_queue_max_zone_append_sectors(struct request_queue *q,
+ unsigned int max_zone_append_sectors)
+{
+ unsigned int max_sectors;
+
+ if (WARN_ON(!blk_queue_is_zoned(q)))
+ return;
+
+ max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors);
+ max_sectors = min(q->limits.chunk_sectors, max_sectors);
+
+ /*
+ * Signal eventual driver bugs resulting in the max_zone_append sectors limit
+ * being 0 due to a 0 argument, the chunk_sectors limit (zone size) not set,
+ * or the max_hw_sectors limit not set.
+ */
+ WARN_ON(!max_sectors);
+
+ q->limits.max_zone_append_sectors = max_sectors;
+}
+EXPORT_SYMBOL_GPL(blk_queue_max_zone_append_sectors);
+
+/**
* blk_queue_max_segments - set max hw segments for a request for this queue
* @q: the request queue for the device
* @max_segments: max number of segments
@@ -470,6 +499,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
b->max_write_same_sectors);
t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
b->max_write_zeroes_sectors);
+ t->max_zone_append_sectors = min(t->max_zone_append_sectors,
+ b->max_zone_append_sectors);
t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
@@ -652,43 +683,6 @@ void blk_queue_update_dma_pad(struct request_queue *q, unsigned int mask)
EXPORT_SYMBOL(blk_queue_update_dma_pad);
/**
- * blk_queue_dma_drain - Set up a drain buffer for excess dma.
- * @q: the request queue for the device
- * @dma_drain_needed: fn which returns non-zero if drain is necessary
- * @buf: physically contiguous buffer
- * @size: size of the buffer in bytes
- *
- * Some devices have excess DMA problems and can't simply discard (or
- * zero fill) the unwanted piece of the transfer. They have to have a
- * real area of memory to transfer it into. The use case for this is
- * ATAPI devices in DMA mode. If the packet command causes a transfer
- * bigger than the transfer size some HBAs will lock up if there
- * aren't DMA elements to contain the excess transfer. What this API
- * does is adjust the queue so that the buf is always appended
- * silently to the scatterlist.
- *
- * Note: This routine adjusts max_hw_segments to make room for appending
- * the drain buffer. If you call blk_queue_max_segments() after calling
- * this routine, you must set the limit to one fewer than your device
- * can support otherwise there won't be room for the drain buffer.
- */
-int blk_queue_dma_drain(struct request_queue *q,
- dma_drain_needed_fn *dma_drain_needed,
- void *buf, unsigned int size)
-{
- if (queue_max_segments(q) < 2)
- return -EINVAL;
- /* make room for appending the drain */
- blk_queue_max_segments(q, queue_max_segments(q) - 1);
- q->dma_drain_needed = dma_drain_needed;
- q->dma_drain_buffer = buf;
- q->dma_drain_size = size;
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(blk_queue_dma_drain);
-
-/**
* blk_queue_segment_boundary - set boundary rules for segment merging
* @q: the request queue for the device
* @mask: the memory boundary mask
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index fca9b158f4a0..02643e149d5e 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -218,6 +218,13 @@ static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page)
(unsigned long long)q->limits.max_write_zeroes_sectors << 9);
}
+static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
+{
+ unsigned long long max_sectors = q->limits.max_zone_append_sectors;
+
+ return sprintf(page, "%llu\n", max_sectors << SECTOR_SHIFT);
+}
+
static ssize_t
queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
{
@@ -639,6 +646,11 @@ static struct queue_sysfs_entry queue_write_zeroes_max_entry = {
.show = queue_write_zeroes_max_show,
};
+static struct queue_sysfs_entry queue_zone_append_max_entry = {
+ .attr = {.name = "zone_append_max_bytes", .mode = 0444 },
+ .show = queue_zone_append_max_show,
+};
+
static struct queue_sysfs_entry queue_nonrot_entry = {
.attr = {.name = "rotational", .mode = 0644 },
.show = queue_show_nonrot,
@@ -749,6 +761,7 @@ static struct attribute *queue_attrs[] = {
&queue_discard_zeroes_data_entry.attr,
&queue_write_same_max_entry.attr,
&queue_write_zeroes_max_entry.attr,
+ &queue_zone_append_max_entry.attr,
&queue_nonrot_entry.attr,
&queue_zoned_entry.attr,
&queue_nr_zones_entry.attr,
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 98233c9c65a8..209fdd8939fb 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2358,69 +2358,6 @@ void blk_throtl_bio_endio(struct bio *bio)
}
#endif
-/*
- * Dispatch all bios from all children tg's queued on @parent_sq. On
- * return, @parent_sq is guaranteed to not have any active children tg's
- * and all bios from previously active tg's are on @parent_sq->bio_lists[].
- */
-static void tg_drain_bios(struct throtl_service_queue *parent_sq)
-{
- struct throtl_grp *tg;
-
- while ((tg = throtl_rb_first(parent_sq))) {
- struct throtl_service_queue *sq = &tg->service_queue;
- struct bio *bio;
-
- throtl_dequeue_tg(tg);
-
- while ((bio = throtl_peek_queued(&sq->queued[READ])))
- tg_dispatch_one_bio(tg, bio_data_dir(bio));
- while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
- tg_dispatch_one_bio(tg, bio_data_dir(bio));
- }
-}
-
-/**
- * blk_throtl_drain - drain throttled bios
- * @q: request_queue to drain throttled bios for
- *
- * Dispatch all currently throttled bios on @q through ->make_request_fn().
- */
-void blk_throtl_drain(struct request_queue *q)
- __releases(&q->queue_lock) __acquires(&q->queue_lock)
-{
- struct throtl_data *td = q->td;
- struct blkcg_gq *blkg;
- struct cgroup_subsys_state *pos_css;
- struct bio *bio;
- int rw;
-
- rcu_read_lock();
-
- /*
- * Drain each tg while doing post-order walk on the blkg tree, so
- * that all bios are propagated to td->service_queue. It'd be
- * better to walk service_queue tree directly but blkg walk is
- * easier.
- */
- blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg)
- tg_drain_bios(&blkg_to_tg(blkg)->service_queue);
-
- /* finally, transfer bios from top-level tg's into the td */
- tg_drain_bios(&td->service_queue);
-
- rcu_read_unlock();
- spin_unlock_irq(&q->queue_lock);
-
- /* all bios now should be in td->service_queue, issue them */
- for (rw = READ; rw <= WRITE; rw++)
- while ((bio = throtl_pop_queued(&td->service_queue.queued[rw],
- NULL)))
- generic_make_request(bio);
-
- spin_lock_irq(&q->queue_lock);
-}
-
int blk_throtl_init(struct request_queue *q)
{
struct throtl_data *td;
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 9cb082f38b93..0fa615eefd52 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -405,7 +405,7 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
rwb_arm_timer(rwb);
}
-static void __wbt_update_limits(struct rq_wb *rwb)
+static void wbt_update_limits(struct rq_wb *rwb)
{
struct rq_depth *rqd = &rwb->rq_depth;
@@ -418,14 +418,6 @@ static void __wbt_update_limits(struct rq_wb *rwb)
rwb_wake_all(rwb);
}
-void wbt_update_limits(struct request_queue *q)
-{
- struct rq_qos *rqos = wbt_rq_qos(q);
- if (!rqos)
- return;
- __wbt_update_limits(RQWB(rqos));
-}
-
u64 wbt_get_min_lat(struct request_queue *q)
{
struct rq_qos *rqos = wbt_rq_qos(q);
@@ -441,7 +433,7 @@ void wbt_set_min_lat(struct request_queue *q, u64 val)
return;
RQWB(rqos)->min_lat_nsec = val;
RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
- __wbt_update_limits(RQWB(rqos));
+ wbt_update_limits(RQWB(rqos));
}
@@ -685,7 +677,7 @@ static int wbt_data_dir(const struct request *rq)
static void wbt_queue_depth_changed(struct rq_qos *rqos)
{
RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->q);
- __wbt_update_limits(RQWB(rqos));
+ wbt_update_limits(RQWB(rqos));
}
static void wbt_exit(struct rq_qos *rqos)
@@ -843,7 +835,7 @@ int wbt_init(struct request_queue *q)
rwb->enable_state = WBT_STATE_ON_DEFAULT;
rwb->wc = 1;
rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
- __wbt_update_limits(rwb);
+ wbt_update_limits(rwb);
/*
* Assign rwb and add the stats callback.
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 8e4e37660971..16bdc85b8df9 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -88,7 +88,6 @@ static inline unsigned int wbt_inflight(struct rq_wb *rwb)
#ifdef CONFIG_BLK_WBT
int wbt_init(struct request_queue *);
-void wbt_update_limits(struct request_queue *);
void wbt_disable_default(struct request_queue *);
void wbt_enable_default(struct request_queue *);
@@ -108,9 +107,6 @@ static inline int wbt_init(struct request_queue *q)
{
return -EINVAL;
}
-static inline void wbt_update_limits(struct request_queue *q)
-{
-}
static inline void wbt_disable_default(struct request_queue *q)
{
}
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index f87956e0dcaf..23831fa8701d 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -82,6 +82,20 @@ bool blk_req_needs_zone_write_lock(struct request *rq)
}
EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
+bool blk_req_zone_write_trylock(struct request *rq)
+{
+ unsigned int zno = blk_rq_zone_no(rq);
+
+ if (test_and_set_bit(zno, rq->q->seq_zones_wlock))
+ return false;
+
+ WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
+ rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock);
+
void __blk_req_zone_write_lock(struct request *rq)
{
if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
@@ -457,14 +471,19 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
/**
* blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps
* @disk: Target disk
+ * @update_driver_data: Callback to update driver data on the frozen disk
*
* Helper function for low-level device drivers to (re) allocate and initialize
* a disk request queue zone bitmaps. This functions should normally be called
* within the disk ->revalidate method for blk-mq based drivers. For BIO based
* drivers only q->nr_zones needs to be updated so that the sysfs exposed value
* is correct.
+ * If the @update_driver_data callback function is not NULL, the callback is
+ * executed with the device request queue frozen after all zones have been
+ * checked.
*/
-int blk_revalidate_disk_zones(struct gendisk *disk)
+int blk_revalidate_disk_zones(struct gendisk *disk,
+ void (*update_driver_data)(struct gendisk *disk))
{
struct request_queue *q = disk->queue;
struct blk_revalidate_zone_args args = {
@@ -498,6 +517,8 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
q->nr_zones = args.nr_zones;
swap(q->seq_zones_wlock, args.seq_zones_wlock);
swap(q->conv_zones_bitmap, args.conv_zones_bitmap);
+ if (update_driver_data)
+ update_driver_data(disk);
ret = 0;
} else {
pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
diff --git a/block/blk.h b/block/blk.h
index 0a94ec68af32..aa16e524dc35 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -5,7 +5,9 @@
#include <linux/idr.h>
#include <linux/blk-mq.h>
#include <linux/part_stat.h>
+#include <linux/blk-crypto.h>
#include <xen/xen.h>
+#include "blk-crypto-internal.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
@@ -17,7 +19,6 @@ extern struct dentry *blk_debugfs_root;
#endif
struct blk_flush_queue {
- unsigned int flush_queue_delayed:1;
unsigned int flush_pending_idx:1;
unsigned int flush_running_idx:1;
blk_status_t rq_status;
@@ -62,17 +63,6 @@ void blk_free_flush_queue(struct blk_flush_queue *q);
void blk_freeze_queue(struct request_queue *q);
-static inline void blk_queue_enter_live(struct request_queue *q)
-{
- /*
- * Given that running in generic_make_request() context
- * guarantees that a live reference against q_usage_counter has
- * been established, further references under that same context
- * need not check that the queue has been frozen (marked dead).
- */
- percpu_ref_get(&q->q_usage_counter);
-}
-
static inline bool biovec_phys_mergeable(struct request_queue *q,
struct bio_vec *vec1, struct bio_vec *vec2)
{
@@ -195,8 +185,7 @@ bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **same_queue_rq);
-void blk_account_io_start(struct request *req, bool new_io);
-void blk_account_io_completion(struct request *req, unsigned int bytes);
+void blk_account_io_start(struct request *req);
void blk_account_io_done(struct request *req, u64 now);
/*
@@ -303,36 +292,14 @@ void ioc_clear_queue(struct request_queue *q);
int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
-/**
- * create_io_context - try to create task->io_context
- * @gfp_mask: allocation mask
- * @node: allocation node
- *
- * If %current->io_context is %NULL, allocate a new io_context and install
- * it. Returns the current %current->io_context which may be %NULL if
- * allocation failed.
- *
- * Note that this function can't be called with IRQ disabled because
- * task_lock which protects %current->io_context is IRQ-unsafe.
- */
-static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
-{
- WARN_ON_ONCE(irqs_disabled());
- if (unlikely(!current->io_context))
- create_task_io_context(current, gfp_mask, node);
- return current->io_context;
-}
-
/*
* Internal throttling interface
*/
#ifdef CONFIG_BLK_DEV_THROTTLING
-extern void blk_throtl_drain(struct request_queue *q);
extern int blk_throtl_init(struct request_queue *q);
extern void blk_throtl_exit(struct request_queue *q);
extern void blk_throtl_register_queue(struct request_queue *q);
#else /* CONFIG_BLK_DEV_THROTTLING */
-static inline void blk_throtl_drain(struct request_queue *q) { }
static inline int blk_throtl_init(struct request_queue *q) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { }
static inline void blk_throtl_register_queue(struct request_queue *q) { }
@@ -375,11 +342,6 @@ void blk_queue_free_zone_bitmaps(struct request_queue *q);
static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {}
#endif
-void part_dec_in_flight(struct request_queue *q, struct hd_struct *part,
- int rw);
-void part_inc_in_flight(struct request_queue *q, struct hd_struct *part,
- int rw);
-void update_io_ticks(struct hd_struct *part, unsigned long now, bool end);
struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector);
int blk_alloc_devt(struct hd_struct *part, dev_t *devt);
@@ -389,44 +351,32 @@ char *disk_name(struct gendisk *hd, int partno, char *buf);
#define ADDPART_FLAG_NONE 0
#define ADDPART_FLAG_RAID 1
#define ADDPART_FLAG_WHOLEDISK 2
-struct hd_struct *__must_check add_partition(struct gendisk *disk, int partno,
- sector_t start, sector_t len, int flags,
- struct partition_meta_info *info);
-void __delete_partition(struct percpu_ref *ref);
-void delete_partition(struct gendisk *disk, int partno);
+void delete_partition(struct gendisk *disk, struct hd_struct *part);
+int bdev_add_partition(struct block_device *bdev, int partno,
+ sector_t start, sector_t length);
+int bdev_del_partition(struct block_device *bdev, int partno);
+int bdev_resize_partition(struct block_device *bdev, int partno,
+ sector_t start, sector_t length);
int disk_expand_part_tbl(struct gendisk *disk, int target);
+int hd_ref_init(struct hd_struct *part);
-static inline int hd_ref_init(struct hd_struct *part)
-{
- if (percpu_ref_init(&part->ref, __delete_partition, 0,
- GFP_KERNEL))
- return -ENOMEM;
- return 0;
-}
-
-static inline void hd_struct_get(struct hd_struct *part)
-{
- percpu_ref_get(&part->ref);
-}
-
+/* no need to get/put refcount of part0 */
static inline int hd_struct_try_get(struct hd_struct *part)
{
- return percpu_ref_tryget_live(&part->ref);
+ if (part->partno)
+ return percpu_ref_tryget_live(&part->ref);
+ return 1;
}
static inline void hd_struct_put(struct hd_struct *part)
{
- percpu_ref_put(&part->ref);
-}
-
-static inline void hd_struct_kill(struct hd_struct *part)
-{
- percpu_ref_kill(&part->ref);
+ if (part->partno)
+ percpu_ref_put(&part->ref);
}
static inline void hd_free_part(struct hd_struct *part)
{
- free_part_stats(part);
+ free_percpu(part->dkstats);
kfree(part->info);
percpu_ref_exit(&part->ref);
}
@@ -484,8 +434,8 @@ static inline void part_nr_sects_write(struct hd_struct *part, sector_t size)
struct request_queue *__blk_alloc_queue(int node_id);
-int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
+int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
- bool *same_page);
+ unsigned int max_sectors, bool *same_page);
#endif /* BLK_INTERNAL_H */
diff --git a/block/bounce.c b/block/bounce.c
index f8ed677a1bf7..c3aaed070124 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -267,6 +267,8 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
break;
}
+ bio_crypt_clone(bio, bio_src, gfp_mask);
+
if (bio_integrity(bio_src)) {
int ret;
diff --git a/block/genhd.c b/block/genhd.c
index 06b642b23a07..1a7659327664 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -92,7 +92,6 @@ const char *bdevname(struct block_device *bdev, char *buf)
}
EXPORT_SYMBOL(bdevname);
-#ifdef CONFIG_SMP
static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat)
{
int cpu;
@@ -112,44 +111,13 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat)
stat->io_ticks += ptr->io_ticks;
}
}
-#else /* CONFIG_SMP */
-static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat)
-{
- memcpy(stat, &part->dkstats, sizeof(struct disk_stats));
-}
-#endif /* CONFIG_SMP */
-
-void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
-{
- if (queue_is_mq(q))
- return;
-
- part_stat_local_inc(part, in_flight[rw]);
- if (part->partno)
- part_stat_local_inc(&part_to_disk(part)->part0, in_flight[rw]);
-}
-
-void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
-{
- if (queue_is_mq(q))
- return;
-
- part_stat_local_dec(part, in_flight[rw]);
- if (part->partno)
- part_stat_local_dec(&part_to_disk(part)->part0, in_flight[rw]);
-}
static unsigned int part_in_flight(struct request_queue *q,
struct hd_struct *part)
{
+ unsigned int inflight = 0;
int cpu;
- unsigned int inflight;
- if (queue_is_mq(q)) {
- return blk_mq_in_flight(q, part);
- }
-
- inflight = 0;
for_each_possible_cpu(cpu) {
inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) +
part_stat_local_read_cpu(part, in_flight[1], cpu);
@@ -165,11 +133,6 @@ static void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
{
int cpu;
- if (queue_is_mq(q)) {
- blk_mq_in_flight_rw(q, part, inflight);
- return;
- }
-
inflight[0] = 0;
inflight[1] = 0;
for_each_possible_cpu(cpu) {
@@ -344,11 +307,13 @@ static inline int sector_in_part(struct hd_struct *part, sector_t sector)
* primarily used for stats accounting.
*
* CONTEXT:
- * RCU read locked. The returned partition pointer is valid only
- * while preemption is disabled.
+ * RCU read locked. The returned partition pointer is always valid
+ * because its refcount is grabbed except for part0, which lifetime
+ * is same with the disk.
*
* RETURNS:
* Found partition on success, part0 is returned if no partition matches
+ * or the matched partition is being deleted.
*/
struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
{
@@ -356,21 +321,33 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
struct hd_struct *part;
int i;
+ rcu_read_lock();
ptbl = rcu_dereference(disk->part_tbl);
part = rcu_dereference(ptbl->last_lookup);
- if (part && sector_in_part(part, sector))
- return part;
+ if (part && sector_in_part(part, sector) && hd_struct_try_get(part))
+ goto out_unlock;
for (i = 1; i < ptbl->len; i++) {
part = rcu_dereference(ptbl->part[i]);
if (part && sector_in_part(part, sector)) {
+ /*
+ * only live partition can be cached for lookup,
+ * so use-after-free on cached & deleting partition
+ * can be avoided
+ */
+ if (!hd_struct_try_get(part))
+ break;
rcu_assign_pointer(ptbl->last_lookup, part);
- return part;
+ goto out_unlock;
}
}
- return &disk->part0;
+
+ part = &disk->part0;
+out_unlock:
+ rcu_read_unlock();
+ return part;
}
/**
@@ -840,13 +817,15 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
disk->flags |= GENHD_FL_NO_PART_SCAN;
} else {
+ struct backing_dev_info *bdi = disk->queue->backing_dev_info;
+ struct device *dev = disk_to_dev(disk);
int ret;
/* Register BDI before referencing it from bdev */
- disk_to_dev(disk)->devt = devt;
- ret = bdi_register_owner(disk->queue->backing_dev_info,
- disk_to_dev(disk));
+ dev->devt = devt;
+ ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt));
WARN_ON(ret);
+ bdi_set_owner(bdi, dev);
blk_register_region(disk_devt(disk), disk->minors, NULL,
exact_match, exact_lock, disk);
}
@@ -878,6 +857,25 @@ void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk)
}
EXPORT_SYMBOL(device_add_disk_no_queue_reg);
+static void invalidate_partition(struct gendisk *disk, int partno)
+{
+ struct block_device *bdev;
+
+ bdev = bdget_disk(disk, partno);
+ if (!bdev)
+ return;
+
+ fsync_bdev(bdev);
+ __invalidate_device(bdev, true);
+
+ /*
+ * Unhash the bdev inode for this device so that it gets evicted as soon
+ * as last inode reference is dropped.
+ */
+ remove_inode_hash(bdev->bd_inode);
+ bdput(bdev);
+}
+
void del_gendisk(struct gendisk *disk)
{
struct disk_part_iter piter;
@@ -896,13 +894,11 @@ void del_gendisk(struct gendisk *disk)
DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
while ((part = disk_part_iter_next(&piter))) {
invalidate_partition(disk, part->partno);
- bdev_unhash_inode(part_devt(part));
- delete_partition(disk, part->partno);
+ delete_partition(disk, part);
}
disk_part_iter_exit(&piter);
invalidate_partition(disk, 0);
- bdev_unhash_inode(disk_devt(disk));
set_capacity(disk, 0);
disk->flags &= ~GENHD_FL_UP;
up_write(&disk->lookup_sem);
@@ -1279,7 +1275,10 @@ ssize_t part_stat_show(struct device *dev,
unsigned int inflight;
part_stat_read_all(p, &stat);
- inflight = part_in_flight(q, p);
+ if (queue_is_mq(q))
+ inflight = blk_mq_in_flight(q, p);
+ else
+ inflight = part_in_flight(q, p);
return sprintf(buf,
"%8lu %8lu %8llu %8u "
@@ -1318,7 +1317,11 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
struct request_queue *q = part_to_disk(p)->queue;
unsigned int inflight[2];
- part_in_flight_rw(q, p, inflight);
+ if (queue_is_mq(q))
+ blk_mq_in_flight_rw(q, p, inflight);
+ else
+ part_in_flight_rw(q, p, inflight);
+
return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]);
}
@@ -1573,7 +1576,10 @@ static int diskstats_show(struct seq_file *seqf, void *v)
disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
while ((hd = disk_part_iter_next(&piter))) {
part_stat_read_all(hd, &stat);
- inflight = part_in_flight(gp->queue, hd);
+ if (queue_is_mq(gp->queue))
+ inflight = blk_mq_in_flight(gp->queue, hd);
+ else
+ inflight = part_in_flight(gp->queue, hd);
seq_printf(seqf, "%4d %7d %s "
"%lu %lu %lu %u "
@@ -1680,14 +1686,15 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
if (disk) {
- if (!init_part_stats(&disk->part0)) {
+ disk->part0.dkstats = alloc_percpu(struct disk_stats);
+ if (!disk->part0.dkstats) {
kfree(disk);
return NULL;
}
init_rwsem(&disk->lookup_sem);
disk->node_id = node_id;
if (disk_expand_part_tbl(disk, 0)) {
- free_part_stats(&disk->part0);
+ free_percpu(disk->part0.dkstats);
kfree(disk);
return NULL;
}
@@ -1703,7 +1710,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
* TODO: Ideally set_capacity() and get_capacity() should be
* converted to make use of bd_mutex and sequence counters.
*/
- seqcount_init(&disk->part0.nr_sects_seq);
+ hd_sects_seq_init(&disk->part0);
if (hd_ref_init(&disk->part0)) {
hd_free_part(&disk->part0);
kfree(disk);
@@ -1806,20 +1813,6 @@ int bdev_read_only(struct block_device *bdev)
EXPORT_SYMBOL(bdev_read_only);
-int invalidate_partition(struct gendisk *disk, int partno)
-{
- int res = 0;
- struct block_device *bdev = bdget_disk(disk, partno);
- if (bdev) {
- fsync_bdev(bdev);
- res = __invalidate_device(bdev, true);
- bdput(bdev);
- }
- return res;
-}
-
-EXPORT_SYMBOL(invalidate_partition);
-
/*
* Disk events - monitor disk events like media change and eject request.
*/
diff --git a/block/ioctl.c b/block/ioctl.c
index 6e827de1a4c4..bdb3bbb253d9 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -16,142 +16,44 @@
static int blkpg_do_ioctl(struct block_device *bdev,
struct blkpg_partition __user *upart, int op)
{
- struct block_device *bdevp;
- struct gendisk *disk;
- struct hd_struct *part, *lpart;
struct blkpg_partition p;
- struct disk_part_iter piter;
long long start, length;
- int partno;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (copy_from_user(&p, upart, sizeof(struct blkpg_partition)))
return -EFAULT;
- disk = bdev->bd_disk;
if (bdev != bdev->bd_contains)
return -EINVAL;
- partno = p.pno;
- if (partno <= 0)
+
+ if (p.pno <= 0)
return -EINVAL;
+
+ if (op == BLKPG_DEL_PARTITION)
+ return bdev_del_partition(bdev, p.pno);
+
+ start = p.start >> SECTOR_SHIFT;
+ length = p.length >> SECTOR_SHIFT;
+
+ /* check for fit in a hd_struct */
+ if (sizeof(sector_t) < sizeof(long long)) {
+ long pstart = start, plength = length;
+
+ if (pstart != start || plength != length || pstart < 0 ||
+ plength < 0 || p.pno > 65535)
+ return -EINVAL;
+ }
+
switch (op) {
- case BLKPG_ADD_PARTITION:
- start = p.start >> 9;
- length = p.length >> 9;
- /* check for fit in a hd_struct */
- if (sizeof(sector_t) == sizeof(long) &&
- sizeof(long long) > sizeof(long)) {
- long pstart = start, plength = length;
- if (pstart != start || plength != length
- || pstart < 0 || plength < 0 || partno > 65535)
- return -EINVAL;
- }
- /* check if partition is aligned to blocksize */
- if (p.start & (bdev_logical_block_size(bdev) - 1))
- return -EINVAL;
-
- mutex_lock(&bdev->bd_mutex);
-
- /* overlap? */
- disk_part_iter_init(&piter, disk,
- DISK_PITER_INCL_EMPTY);
- while ((part = disk_part_iter_next(&piter))) {
- if (!(start + length <= part->start_sect ||
- start >= part->start_sect + part->nr_sects)) {
- disk_part_iter_exit(&piter);
- mutex_unlock(&bdev->bd_mutex);
- return -EBUSY;
- }
- }
- disk_part_iter_exit(&piter);
-
- /* all seems OK */
- part = add_partition(disk, partno, start, length,
- ADDPART_FLAG_NONE, NULL);
- mutex_unlock(&bdev->bd_mutex);
- return PTR_ERR_OR_ZERO(part);
- case BLKPG_DEL_PARTITION:
- part = disk_get_part(disk, partno);
- if (!part)
- return -ENXIO;
-
- bdevp = bdget(part_devt(part));
- disk_put_part(part);
- if (!bdevp)
- return -ENOMEM;
-
- mutex_lock(&bdevp->bd_mutex);
- if (bdevp->bd_openers) {
- mutex_unlock(&bdevp->bd_mutex);
- bdput(bdevp);
- return -EBUSY;
- }
- /* all seems OK */
- fsync_bdev(bdevp);
- invalidate_bdev(bdevp);
-
- mutex_lock_nested(&bdev->bd_mutex, 1);
- delete_partition(disk, partno);
- mutex_unlock(&bdev->bd_mutex);
- mutex_unlock(&bdevp->bd_mutex);
- bdput(bdevp);
-
- return 0;
- case BLKPG_RESIZE_PARTITION:
- start = p.start >> 9;
- /* new length of partition in bytes */
- length = p.length >> 9;
- /* check for fit in a hd_struct */
- if (sizeof(sector_t) == sizeof(long) &&
- sizeof(long long) > sizeof(long)) {
- long pstart = start, plength = length;
- if (pstart != start || plength != length
- || pstart < 0 || plength < 0)
- return -EINVAL;
- }
- part = disk_get_part(disk, partno);
- if (!part)
- return -ENXIO;
- bdevp = bdget(part_devt(part));
- if (!bdevp) {
- disk_put_part(part);
- return -ENOMEM;
- }
- mutex_lock(&bdevp->bd_mutex);
- mutex_lock_nested(&bdev->bd_mutex, 1);
- if (start != part->start_sect) {
- mutex_unlock(&bdevp->bd_mutex);
- mutex_unlock(&bdev->bd_mutex);
- bdput(bdevp);
- disk_put_part(part);
- return -EINVAL;
- }
- /* overlap? */
- disk_part_iter_init(&piter, disk,
- DISK_PITER_INCL_EMPTY);
- while ((lpart = disk_part_iter_next(&piter))) {
- if (lpart->partno != partno &&
- !(start + length <= lpart->start_sect ||
- start >= lpart->start_sect + lpart->nr_sects)
- ) {
- disk_part_iter_exit(&piter);
- mutex_unlock(&bdevp->bd_mutex);
- mutex_unlock(&bdev->bd_mutex);
- bdput(bdevp);
- disk_put_part(part);
- return -EBUSY;
- }
- }
- disk_part_iter_exit(&piter);
- part_nr_sects_write(part, (sector_t)length);
- i_size_write(bdevp->bd_inode, p.length);
- mutex_unlock(&bdevp->bd_mutex);
- mutex_unlock(&bdev->bd_mutex);
- bdput(bdevp);
- disk_put_part(part);
- return 0;
- default:
+ case BLKPG_ADD_PARTITION:
+ /* check if partition is aligned to blocksize */
+ if (p.start & (bdev_logical_block_size(bdev) - 1))
return -EINVAL;
+ return bdev_add_partition(bdev, p.pno, start, length);
+ case BLKPG_RESIZE_PARTITION:
+ return bdev_resize_partition(bdev, p.pno, start, length);
+ default:
+ return -EINVAL;
}
}
@@ -302,12 +204,12 @@ static int put_u64(u64 __user *argp, u64 val)
}
#ifdef CONFIG_COMPAT
-static int compat_put_long(compat_long_t *argp, long val)
+static int compat_put_long(compat_long_t __user *argp, long val)
{
return put_user(val, argp);
}
-static int compat_put_ulong(compat_ulong_t *argp, compat_ulong_t val)
+static int compat_put_ulong(compat_ulong_t __user *argp, compat_ulong_t val)
{
return put_user(val, argp);
}
diff --git a/block/keyslot-manager.c b/block/keyslot-manager.c
new file mode 100644
index 000000000000..c2ef41b3147b
--- /dev/null
+++ b/block/keyslot-manager.c
@@ -0,0 +1,397 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 Google LLC
+ */
+
+/**
+ * DOC: The Keyslot Manager
+ *
+ * Many devices with inline encryption support have a limited number of "slots"
+ * into which encryption contexts may be programmed, and requests can be tagged
+ * with a slot number to specify the key to use for en/decryption.
+ *
+ * As the number of slots is limited, and programming keys is expensive on
+ * many inline encryption hardware, we don't want to program the same key into
+ * multiple slots - if multiple requests are using the same key, we want to
+ * program just one slot with that key and use that slot for all requests.
+ *
+ * The keyslot manager manages these keyslots appropriately, and also acts as
+ * an abstraction between the inline encryption hardware and the upper layers.
+ *
+ * Lower layer devices will set up a keyslot manager in their request queue
+ * and tell it how to perform device specific operations like programming/
+ * evicting keys from keyslots.
+ *
+ * Upper layers will call blk_ksm_get_slot_for_key() to program a
+ * key into some slot in the inline encryption hardware.
+ */
+
+#define pr_fmt(fmt) "blk-crypto: " fmt
+
+#include <linux/keyslot-manager.h>
+#include <linux/atomic.h>
+#include <linux/mutex.h>
+#include <linux/pm_runtime.h>
+#include <linux/wait.h>
+#include <linux/blkdev.h>
+
+struct blk_ksm_keyslot {
+ atomic_t slot_refs;
+ struct list_head idle_slot_node;
+ struct hlist_node hash_node;
+ const struct blk_crypto_key *key;
+ struct blk_keyslot_manager *ksm;
+};
+
+static inline void blk_ksm_hw_enter(struct blk_keyslot_manager *ksm)
+{
+ /*
+ * Calling into the driver requires ksm->lock held and the device
+ * resumed. But we must resume the device first, since that can acquire
+ * and release ksm->lock via blk_ksm_reprogram_all_keys().
+ */
+ if (ksm->dev)
+ pm_runtime_get_sync(ksm->dev);
+ down_write(&ksm->lock);
+}
+
+static inline void blk_ksm_hw_exit(struct blk_keyslot_manager *ksm)
+{
+ up_write(&ksm->lock);
+ if (ksm->dev)
+ pm_runtime_put_sync(ksm->dev);
+}
+
+/**
+ * blk_ksm_init() - Initialize a keyslot manager
+ * @ksm: The keyslot_manager to initialize.
+ * @num_slots: The number of key slots to manage.
+ *
+ * Allocate memory for keyslots and initialize a keyslot manager. Called by
+ * e.g. storage drivers to set up a keyslot manager in their request_queue.
+ *
+ * Return: 0 on success, or else a negative error code.
+ */
+int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots)
+{
+ unsigned int slot;
+ unsigned int i;
+ unsigned int slot_hashtable_size;
+
+ memset(ksm, 0, sizeof(*ksm));
+
+ if (num_slots == 0)
+ return -EINVAL;
+
+ ksm->slots = kvcalloc(num_slots, sizeof(ksm->slots[0]), GFP_KERNEL);
+ if (!ksm->slots)
+ return -ENOMEM;
+
+ ksm->num_slots = num_slots;
+
+ init_rwsem(&ksm->lock);
+
+ init_waitqueue_head(&ksm->idle_slots_wait_queue);
+ INIT_LIST_HEAD(&ksm->idle_slots);
+
+ for (slot = 0; slot < num_slots; slot++) {
+ ksm->slots[slot].ksm = ksm;
+ list_add_tail(&ksm->slots[slot].idle_slot_node,
+ &ksm->idle_slots);
+ }
+
+ spin_lock_init(&ksm->idle_slots_lock);
+
+ slot_hashtable_size = roundup_pow_of_two(num_slots);
+ ksm->log_slot_ht_size = ilog2(slot_hashtable_size);
+ ksm->slot_hashtable = kvmalloc_array(slot_hashtable_size,
+ sizeof(ksm->slot_hashtable[0]),
+ GFP_KERNEL);
+ if (!ksm->slot_hashtable)
+ goto err_destroy_ksm;
+ for (i = 0; i < slot_hashtable_size; i++)
+ INIT_HLIST_HEAD(&ksm->slot_hashtable[i]);
+
+ return 0;
+
+err_destroy_ksm:
+ blk_ksm_destroy(ksm);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(blk_ksm_init);
+
+static inline struct hlist_head *
+blk_ksm_hash_bucket_for_key(struct blk_keyslot_manager *ksm,
+ const struct blk_crypto_key *key)
+{
+ return &ksm->slot_hashtable[hash_ptr(key, ksm->log_slot_ht_size)];
+}
+
+static void blk_ksm_remove_slot_from_lru_list(struct blk_ksm_keyslot *slot)
+{
+ struct blk_keyslot_manager *ksm = slot->ksm;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ksm->idle_slots_lock, flags);
+ list_del(&slot->idle_slot_node);
+ spin_unlock_irqrestore(&ksm->idle_slots_lock, flags);
+}
+
+static struct blk_ksm_keyslot *blk_ksm_find_keyslot(
+ struct blk_keyslot_manager *ksm,
+ const struct blk_crypto_key *key)
+{
+ const struct hlist_head *head = blk_ksm_hash_bucket_for_key(ksm, key);
+ struct blk_ksm_keyslot *slotp;
+
+ hlist_for_each_entry(slotp, head, hash_node) {
+ if (slotp->key == key)
+ return slotp;
+ }
+ return NULL;
+}
+
+static struct blk_ksm_keyslot *blk_ksm_find_and_grab_keyslot(
+ struct blk_keyslot_manager *ksm,
+ const struct blk_crypto_key *key)
+{
+ struct blk_ksm_keyslot *slot;
+
+ slot = blk_ksm_find_keyslot(ksm, key);
+ if (!slot)
+ return NULL;
+ if (atomic_inc_return(&slot->slot_refs) == 1) {
+ /* Took first reference to this slot; remove it from LRU list */
+ blk_ksm_remove_slot_from_lru_list(slot);
+ }
+ return slot;
+}
+
+unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot)
+{
+ return slot - slot->ksm->slots;
+}
+EXPORT_SYMBOL_GPL(blk_ksm_get_slot_idx);
+
+/**
+ * blk_ksm_get_slot_for_key() - Program a key into a keyslot.
+ * @ksm: The keyslot manager to program the key into.
+ * @key: Pointer to the key object to program, including the raw key, crypto
+ * mode, and data unit size.
+ * @slot_ptr: A pointer to return the pointer of the allocated keyslot.
+ *
+ * Get a keyslot that's been programmed with the specified key. If one already
+ * exists, return it with incremented refcount. Otherwise, wait for a keyslot
+ * to become idle and program it.
+ *
+ * Context: Process context. Takes and releases ksm->lock.
+ * Return: BLK_STS_OK on success (and keyslot is set to the pointer of the
+ * allocated keyslot), or some other blk_status_t otherwise (and
+ * keyslot is set to NULL).
+ */
+blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm,
+ const struct blk_crypto_key *key,
+ struct blk_ksm_keyslot **slot_ptr)
+{
+ struct blk_ksm_keyslot *slot;
+ int slot_idx;
+ int err;
+
+ *slot_ptr = NULL;
+ down_read(&ksm->lock);
+ slot = blk_ksm_find_and_grab_keyslot(ksm, key);
+ up_read(&ksm->lock);
+ if (slot)
+ goto success;
+
+ for (;;) {
+ blk_ksm_hw_enter(ksm);
+ slot = blk_ksm_find_and_grab_keyslot(ksm, key);
+ if (slot) {
+ blk_ksm_hw_exit(ksm);
+ goto success;
+ }
+
+ /*
+ * If we're here, that means there wasn't a slot that was
+ * already programmed with the key. So try to program it.
+ */
+ if (!list_empty(&ksm->idle_slots))
+ break;
+
+ blk_ksm_hw_exit(ksm);
+ wait_event(ksm->idle_slots_wait_queue,
+ !list_empty(&ksm->idle_slots));
+ }
+
+ slot = list_first_entry(&ksm->idle_slots, struct blk_ksm_keyslot,
+ idle_slot_node);
+ slot_idx = blk_ksm_get_slot_idx(slot);
+
+ err = ksm->ksm_ll_ops.keyslot_program(ksm, key, slot_idx);
+ if (err) {
+ wake_up(&ksm->idle_slots_wait_queue);
+ blk_ksm_hw_exit(ksm);
+ return errno_to_blk_status(err);
+ }
+
+ /* Move this slot to the hash list for the new key. */
+ if (slot->key)
+ hlist_del(&slot->hash_node);
+ slot->key = key;
+ hlist_add_head(&slot->hash_node, blk_ksm_hash_bucket_for_key(ksm, key));
+
+ atomic_set(&slot->slot_refs, 1);
+
+ blk_ksm_remove_slot_from_lru_list(slot);
+
+ blk_ksm_hw_exit(ksm);
+success:
+ *slot_ptr = slot;
+ return BLK_STS_OK;
+}
+
+/**
+ * blk_ksm_put_slot() - Release a reference to a slot
+ * @slot: The keyslot to release the reference of.
+ *
+ * Context: Any context.
+ */
+void blk_ksm_put_slot(struct blk_ksm_keyslot *slot)
+{
+ struct blk_keyslot_manager *ksm;
+ unsigned long flags;
+
+ if (!slot)
+ return;
+
+ ksm = slot->ksm;
+
+ if (atomic_dec_and_lock_irqsave(&slot->slot_refs,
+ &ksm->idle_slots_lock, flags)) {
+ list_add_tail(&slot->idle_slot_node, &ksm->idle_slots);
+ spin_unlock_irqrestore(&ksm->idle_slots_lock, flags);
+ wake_up(&ksm->idle_slots_wait_queue);
+ }
+}
+
+/**
+ * blk_ksm_crypto_cfg_supported() - Find out if a crypto configuration is
+ * supported by a ksm.
+ * @ksm: The keyslot manager to check
+ * @cfg: The crypto configuration to check for.
+ *
+ * Checks for crypto_mode/data unit size/dun bytes support.
+ *
+ * Return: Whether or not this ksm supports the specified crypto config.
+ */
+bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm,
+ const struct blk_crypto_config *cfg)
+{
+ if (!ksm)
+ return false;
+ if (!(ksm->crypto_modes_supported[cfg->crypto_mode] &
+ cfg->data_unit_size))
+ return false;
+ if (ksm->max_dun_bytes_supported < cfg->dun_bytes)
+ return false;
+ return true;
+}
+
+/**
+ * blk_ksm_evict_key() - Evict a key from the lower layer device.
+ * @ksm: The keyslot manager to evict from
+ * @key: The key to evict
+ *
+ * Find the keyslot that the specified key was programmed into, and evict that
+ * slot from the lower layer device. The slot must not be in use by any
+ * in-flight IO when this function is called.
+ *
+ * Context: Process context. Takes and releases ksm->lock.
+ * Return: 0 on success or if there's no keyslot with the specified key, -EBUSY
+ * if the keyslot is still in use, or another -errno value on other
+ * error.
+ */
+int blk_ksm_evict_key(struct blk_keyslot_manager *ksm,
+ const struct blk_crypto_key *key)
+{
+ struct blk_ksm_keyslot *slot;
+ int err = 0;
+
+ blk_ksm_hw_enter(ksm);
+ slot = blk_ksm_find_keyslot(ksm, key);
+ if (!slot)
+ goto out_unlock;
+
+ if (WARN_ON_ONCE(atomic_read(&slot->slot_refs) != 0)) {
+ err = -EBUSY;
+ goto out_unlock;
+ }
+ err = ksm->ksm_ll_ops.keyslot_evict(ksm, key,
+ blk_ksm_get_slot_idx(slot));
+ if (err)
+ goto out_unlock;
+
+ hlist_del(&slot->hash_node);
+ slot->key = NULL;
+ err = 0;
+out_unlock:
+ blk_ksm_hw_exit(ksm);
+ return err;
+}
+
+/**
+ * blk_ksm_reprogram_all_keys() - Re-program all keyslots.
+ * @ksm: The keyslot manager
+ *
+ * Re-program all keyslots that are supposed to have a key programmed. This is
+ * intended only for use by drivers for hardware that loses its keys on reset.
+ *
+ * Context: Process context. Takes and releases ksm->lock.
+ */
+void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm)
+{
+ unsigned int slot;
+
+ /* This is for device initialization, so don't resume the device */
+ down_write(&ksm->lock);
+ for (slot = 0; slot < ksm->num_slots; slot++) {
+ const struct blk_crypto_key *key = ksm->slots[slot].key;
+ int err;
+
+ if (!key)
+ continue;
+
+ err = ksm->ksm_ll_ops.keyslot_program(ksm, key, slot);
+ WARN_ON(err);
+ }
+ up_write(&ksm->lock);
+}
+EXPORT_SYMBOL_GPL(blk_ksm_reprogram_all_keys);
+
+void blk_ksm_destroy(struct blk_keyslot_manager *ksm)
+{
+ if (!ksm)
+ return;
+ kvfree(ksm->slot_hashtable);
+ memzero_explicit(ksm->slots, sizeof(ksm->slots[0]) * ksm->num_slots);
+ kvfree(ksm->slots);
+ memzero_explicit(ksm, sizeof(*ksm));
+}
+EXPORT_SYMBOL_GPL(blk_ksm_destroy);
+
+bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q)
+{
+ if (blk_integrity_queue_supports_integrity(q)) {
+ pr_warn("Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n");
+ return false;
+ }
+ q->ksm = ksm;
+ return true;
+}
+EXPORT_SYMBOL_GPL(blk_ksm_register);
+
+void blk_ksm_unregister(struct request_queue *q)
+{
+ q->ksm = NULL;
+}
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 34dcea0ef637..a38c5ab103d1 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -579,7 +579,7 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
return merged;
}
-static void kyber_prepare_request(struct request *rq, struct bio *bio)
+static void kyber_prepare_request(struct request *rq)
{
rq_set_domain_token(rq, -1);
}
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index b490f47fd553..b57470e154c8 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -541,7 +541,7 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
* Nothing to do here. This is defined only to ensure that .finish_request
* method is called upon request completion.
*/
-static void dd_prepare_request(struct request *rq, struct bio *bio)
+static void dd_prepare_request(struct request *rq)
{
}
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 9ef48a8cff86..78951e33b2d7 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -274,10 +274,10 @@ struct device_type part_type = {
.uevent = part_uevent,
};
-static void delete_partition_work_fn(struct work_struct *work)
+static void hd_struct_free_work(struct work_struct *work)
{
- struct hd_struct *part = container_of(to_rcu_work(work), struct hd_struct,
- rcu_work);
+ struct hd_struct *part =
+ container_of(to_rcu_work(work), struct hd_struct, rcu_work);
part->start_sect = 0;
part->nr_sects = 0;
@@ -285,32 +285,42 @@ static void delete_partition_work_fn(struct work_struct *work)
put_device(part_to_dev(part));
}
-void __delete_partition(struct percpu_ref *ref)
+static void hd_struct_free(struct percpu_ref *ref)
{
struct hd_struct *part = container_of(ref, struct hd_struct, ref);
- INIT_RCU_WORK(&part->rcu_work, delete_partition_work_fn);
+ struct gendisk *disk = part_to_disk(part);
+ struct disk_part_tbl *ptbl =
+ rcu_dereference_protected(disk->part_tbl, 1);
+
+ rcu_assign_pointer(ptbl->last_lookup, NULL);
+ put_device(disk_to_dev(disk));
+
+ INIT_RCU_WORK(&part->rcu_work, hd_struct_free_work);
queue_rcu_work(system_wq, &part->rcu_work);
}
+int hd_ref_init(struct hd_struct *part)
+{
+ if (percpu_ref_init(&part->ref, hd_struct_free, 0, GFP_KERNEL))
+ return -ENOMEM;
+ return 0;
+}
+
/*
* Must be called either with bd_mutex held, before a disk can be opened or
* after all disk users are gone.
*/
-void delete_partition(struct gendisk *disk, int partno)
+void delete_partition(struct gendisk *disk, struct hd_struct *part)
{
struct disk_part_tbl *ptbl =
rcu_dereference_protected(disk->part_tbl, 1);
- struct hd_struct *part;
-
- if (partno >= ptbl->len)
- return;
-
- part = rcu_dereference_protected(ptbl->part[partno], 1);
- if (!part)
- return;
- rcu_assign_pointer(ptbl->part[partno], NULL);
- rcu_assign_pointer(ptbl->last_lookup, NULL);
+ /*
+ * ->part_tbl is referenced in this part's release handler, so
+ * we have to hold the disk device
+ */
+ get_device(disk_to_dev(part_to_disk(part)));
+ rcu_assign_pointer(ptbl->part[part->partno], NULL);
kobject_put(part->holder_dir);
device_del(part_to_dev(part));
@@ -321,7 +331,7 @@ void delete_partition(struct gendisk *disk, int partno)
* "in-use" until we really free the gendisk.
*/
blk_invalidate_devt(part_devt(part));
- hd_struct_kill(part);
+ percpu_ref_kill(&part->ref);
}
static ssize_t whole_disk_show(struct device *dev,
@@ -335,7 +345,7 @@ static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL);
* Must be called either with bd_mutex held, before a disk can be opened or
* after all disk users are gone.
*/
-struct hd_struct *add_partition(struct gendisk *disk, int partno,
+static struct hd_struct *add_partition(struct gendisk *disk, int partno,
sector_t start, sector_t len, int flags,
struct partition_meta_info *info)
{
@@ -377,12 +387,13 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
if (!p)
return ERR_PTR(-EBUSY);
- if (!init_part_stats(p)) {
+ p->dkstats = alloc_percpu(struct disk_stats);
+ if (!p->dkstats) {
err = -ENOMEM;
goto out_free;
}
- seqcount_init(&p->nr_sects_seq);
+ hd_sects_seq_init(p);
pdev = part_to_dev(p);
p->start_sect = start;
@@ -458,7 +469,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
out_free_info:
kfree(p->info);
out_free_stats:
- free_part_stats(p);
+ free_percpu(p->dkstats);
out_free:
kfree(p);
return ERR_PTR(err);
@@ -472,6 +483,121 @@ out_put:
return ERR_PTR(err);
}
+static bool partition_overlaps(struct gendisk *disk, sector_t start,
+ sector_t length, int skip_partno)
+{
+ struct disk_part_iter piter;
+ struct hd_struct *part;
+ bool overlap = false;
+
+ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
+ while ((part = disk_part_iter_next(&piter))) {
+ if (part->partno == skip_partno ||
+ start >= part->start_sect + part->nr_sects ||
+ start + length <= part->start_sect)
+ continue;
+ overlap = true;
+ break;
+ }
+
+ disk_part_iter_exit(&piter);
+ return overlap;
+}
+
+int bdev_add_partition(struct block_device *bdev, int partno,
+ sector_t start, sector_t length)
+{
+ struct hd_struct *part;
+
+ mutex_lock(&bdev->bd_mutex);
+ if (partition_overlaps(bdev->bd_disk, start, length, -1)) {
+ mutex_unlock(&bdev->bd_mutex);
+ return -EBUSY;
+ }
+
+ part = add_partition(bdev->bd_disk, partno, start, length,
+ ADDPART_FLAG_NONE, NULL);
+ mutex_unlock(&bdev->bd_mutex);
+ return PTR_ERR_OR_ZERO(part);
+}
+
+int bdev_del_partition(struct block_device *bdev, int partno)
+{
+ struct block_device *bdevp;
+ struct hd_struct *part;
+ int ret = 0;
+
+ part = disk_get_part(bdev->bd_disk, partno);
+ if (!part)
+ return -ENXIO;
+
+ ret = -ENOMEM;
+ bdevp = bdget(part_devt(part));
+ if (!bdevp)
+ goto out_put_part;
+
+ mutex_lock(&bdevp->bd_mutex);
+
+ ret = -EBUSY;
+ if (bdevp->bd_openers)
+ goto out_unlock;
+
+ sync_blockdev(bdevp);
+ invalidate_bdev(bdevp);
+
+ mutex_lock_nested(&bdev->bd_mutex, 1);
+ delete_partition(bdev->bd_disk, part);
+ mutex_unlock(&bdev->bd_mutex);
+
+ ret = 0;
+out_unlock:
+ mutex_unlock(&bdevp->bd_mutex);
+ bdput(bdevp);
+out_put_part:
+ disk_put_part(part);
+ return ret;
+}
+
+int bdev_resize_partition(struct block_device *bdev, int partno,
+ sector_t start, sector_t length)
+{
+ struct block_device *bdevp;
+ struct hd_struct *part;
+ int ret = 0;
+
+ part = disk_get_part(bdev->bd_disk, partno);
+ if (!part)
+ return -ENXIO;
+
+ ret = -ENOMEM;
+ bdevp = bdget(part_devt(part));
+ if (!bdevp)
+ goto out_put_part;
+
+ mutex_lock(&bdevp->bd_mutex);
+ mutex_lock_nested(&bdev->bd_mutex, 1);
+
+ ret = -EINVAL;
+ if (start != part->start_sect)
+ goto out_unlock;
+
+ ret = -EBUSY;
+ if (partition_overlaps(bdev->bd_disk, start, length, partno))
+ goto out_unlock;
+
+ part_nr_sects_write(part, (sector_t)length);
+ i_size_write(bdevp->bd_inode, length << SECTOR_SHIFT);
+
+ ret = 0;
+out_unlock:
+ mutex_unlock(&bdevp->bd_mutex);
+ mutex_unlock(&bdev->bd_mutex);
+ bdput(bdevp);
+out_put_part:
+ disk_put_part(part);
+ return ret;
+}
+
static bool disk_unlock_native_capacity(struct gendisk *disk)
{
const struct block_device_operations *bdops = disk->fops;
@@ -488,27 +614,30 @@ static bool disk_unlock_native_capacity(struct gendisk *disk)
}
}
-int blk_drop_partitions(struct gendisk *disk, struct block_device *bdev)
+int blk_drop_partitions(struct block_device *bdev)
{
struct disk_part_iter piter;
struct hd_struct *part;
- int res;
- if (!disk_part_scan_enabled(disk))
+ if (!disk_part_scan_enabled(bdev->bd_disk))
return 0;
if (bdev->bd_part_count)
return -EBUSY;
- res = invalidate_partition(disk, 0);
- if (res)
- return res;
- disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
+ sync_blockdev(bdev);
+ invalidate_bdev(bdev);
+
+ disk_part_iter_init(&piter, bdev->bd_disk, DISK_PITER_INCL_EMPTY);
while ((part = disk_part_iter_next(&piter)))
- delete_partition(disk, part->partno);
+ delete_partition(bdev->bd_disk, part);
disk_part_iter_exit(&piter);
return 0;
}
+#ifdef CONFIG_S390
+/* for historic reasons in the DASD driver */
+EXPORT_SYMBOL_GPL(blk_drop_partitions);
+#endif
static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev,
struct parsed_partitions *state, int p)