From c4e21bcd0f9d01f9c5d6c52007f5541871a5b1de Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 7 Jul 2023 11:42:38 +0200 Subject: block: cleanup queue_wc_store Get rid of the local queue_wc_store variable and handling setting and clearing the QUEUE_FLAG_WC flag diretly instead the if / else if. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230707094239.107968-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index afc797fb0dfc..0cde6598fb2f 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -449,21 +449,13 @@ static ssize_t queue_wc_show(struct request_queue *q, char *page) static ssize_t queue_wc_store(struct request_queue *q, const char *page, size_t count) { - int set = -1; - if (!strncmp(page, "write back", 10)) - set = 1; + blk_queue_flag_set(QUEUE_FLAG_WC, q); else if (!strncmp(page, "write through", 13) || !strncmp(page, "none", 4)) - set = 0; - - if (set == -1) - return -EINVAL; - - if (set) - blk_queue_flag_set(QUEUE_FLAG_WC, q); - else blk_queue_flag_clear(QUEUE_FLAG_WC, q); + else + return -EINVAL; return count; } -- cgit v1.2.3 From 43c9835b144c7ce29efe142d662529662a9eb376 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 7 Jul 2023 11:42:39 +0200 Subject: block: don't allow enabling a cache on devices that don't support it Currently the write_cache attribute allows enabling the QUEUE_FLAG_WC flag on devices that never claimed the capability. Fix that by adding a QUEUE_FLAG_HW_WC flag that is set by blk_queue_write_cache and guards re-enabling the cache through sysfs. Note that any rescan that calls blk_queue_write_cache will still re-enable the write cache as in the current code. Fixes: 93e9d8e836cb ("block: add ability to flag write back caching on a device") Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230707094239.107968-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 7 +++++-- block/blk-sysfs.c | 11 +++++++---- include/linux/blkdev.h | 1 + 3 files changed, 13 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 4dd59059b788..0046b447268f 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -830,10 +830,13 @@ EXPORT_SYMBOL(blk_set_queue_depth); */ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) { - if (wc) + if (wc) { + blk_queue_flag_set(QUEUE_FLAG_HW_WC, q); blk_queue_flag_set(QUEUE_FLAG_WC, q); - else + } else { + blk_queue_flag_clear(QUEUE_FLAG_HW_WC, q); blk_queue_flag_clear(QUEUE_FLAG_WC, q); + } if (fua) blk_queue_flag_set(QUEUE_FLAG_FUA, q); else diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 0cde6598fb2f..63e481262336 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -449,13 +449,16 @@ static ssize_t queue_wc_show(struct request_queue *q, char *page) static ssize_t queue_wc_store(struct request_queue *q, const char *page, size_t count) { - if (!strncmp(page, "write back", 10)) + if (!strncmp(page, "write back", 10)) { + if (!test_bit(QUEUE_FLAG_HW_WC, &q->queue_flags)) + return -EINVAL; blk_queue_flag_set(QUEUE_FLAG_WC, q); - else if (!strncmp(page, "write through", 13) || - !strncmp(page, "none", 4)) + } else if (!strncmp(page, "write through", 13) || + !strncmp(page, "none", 4)) { blk_queue_flag_clear(QUEUE_FLAG_WC, q); - else + } else { return -EINVAL; + } return count; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ed44a997f629..2f5371b8482c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -538,6 +538,7 @@ struct request_queue { #define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ #define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */ #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ +#define QUEUE_FLAG_HW_WC 18 /* Write back caching supported */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ #define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */ #define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ -- cgit v1.2.3 From 660e802c76c89e871c29cd3174c07c8d23e39c35 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 17 Jul 2023 12:00:55 +0800 Subject: blk-mq: use percpu csd to remote complete instead of per-rq csd If request need to be completed remotely, we insert it into percpu llist, and smp_call_function_single_async() if llist is empty previously. We don't need to use per-rq csd, percpu csd is enough. And the size of struct request is decreased by 24 bytes. This way is cleaner, and looks correct, given block softirq is guaranteed to be scheduled to consume the list if one new request is added to this percpu list, either smp_call_function_single_async() returns -EBUSY or 0. Signed-off-by: Chengming Zhou Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230717040058.3993930-2-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 ++++++------ include/linux/blk-mq.h | 5 +---- 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index d50b1d62a3d9..d98654869615 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -43,6 +43,7 @@ #include "blk-ioprio.h" static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); +static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd); static void blk_mq_insert_request(struct request *rq, blk_insert_t flags); static void blk_mq_request_bypass_insert(struct request *rq, @@ -1157,15 +1158,11 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq) static void blk_mq_complete_send_ipi(struct request *rq) { - struct llist_head *list; unsigned int cpu; cpu = rq->mq_ctx->cpu; - list = &per_cpu(blk_cpu_done, cpu); - if (llist_add(&rq->ipi_list, list)) { - INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq); - smp_call_function_single_async(cpu, &rq->csd); - } + if (llist_add(&rq->ipi_list, &per_cpu(blk_cpu_done, cpu))) + smp_call_function_single_async(cpu, &per_cpu(blk_cpu_csd, cpu)); } static void blk_mq_raise_softirq(struct request *rq) @@ -4829,6 +4826,9 @@ static int __init blk_mq_init(void) for_each_possible_cpu(i) init_llist_head(&per_cpu(blk_cpu_done, i)); + for_each_possible_cpu(i) + INIT_CSD(&per_cpu(blk_cpu_csd, i), + __blk_mq_complete_request_remote, NULL); open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b96e00499f9e..67f810857634 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -182,10 +182,7 @@ struct request { rq_end_io_fn *saved_end_io; } flush; - union { - struct __call_single_data csd; - u64 fifo_time; - }; + u64 fifo_time; /* * completion callback. -- cgit v1.2.3 From 28b241237470981a96fbd82077c8044466b61e5f Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 17 Jul 2023 12:00:56 +0800 Subject: blk-flush: fix rq->flush.seq for post-flush requests If the policy == (REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH), it means that the data sequence and post-flush sequence need to be done for this request. The rq->flush.seq should record what sequences have been done (or don't need to be done). So in this case, pre-flush doesn't need to be done, we should init rq->flush.seq to REQ_FSEQ_PREFLUSH not REQ_FSEQ_POSTFLUSH. Fixes: 615939a2ae73 ("blk-mq: defer to the normal submission path for post-flush requests") Signed-off-by: Chengming Zhou Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230717040058.3993930-3-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-flush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 8220517c2d67..fdc489e0ea16 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -443,7 +443,7 @@ bool blk_insert_flush(struct request *rq) * the post flush, and then just pass the command on. */ blk_rq_init_flush(rq); - rq->flush.seq |= REQ_FSEQ_POSTFLUSH; + rq->flush.seq |= REQ_FSEQ_PREFLUSH; spin_lock_irq(&fq->mq_flush_lock); list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); spin_unlock_irq(&fq->mq_flush_lock); -- cgit v1.2.3 From b175c86739d38e41044d3136065f092a6d95aee6 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 17 Jul 2023 12:00:57 +0800 Subject: blk-flush: count inflight flush_data requests The flush state machine use a double list to link all inflight flush_data requests, to avoid issuing separate post-flushes for these flush_data requests which shared PREFLUSH. So we can't reuse rq->queuelist, this is why we need rq->flush.list In preparation of the next patch that reuse rq->queuelist for flush state machine, we change the double linked list to unsigned long counter, which count all inflight flush_data requests. This is ok since we only need to know if there is any inflight flush_data request, so unsigned long counter is good. Signed-off-by: Chengming Zhou Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230717040058.3993930-4-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-flush.c | 9 +++++---- block/blk.h | 5 ++--- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index fdc489e0ea16..fedb39031647 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -187,7 +187,8 @@ static void blk_flush_complete_seq(struct request *rq, break; case REQ_FSEQ_DATA: - list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); + list_del_init(&rq->flush.list); + fq->flush_data_in_flight++; spin_lock(&q->requeue_lock); list_add(&rq->queuelist, &q->requeue_list); spin_unlock(&q->requeue_lock); @@ -299,7 +300,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, return; /* C2 and C3 */ - if (!list_empty(&fq->flush_data_in_flight) && + if (fq->flush_data_in_flight && time_before(jiffies, fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) return; @@ -374,6 +375,7 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, * the comment in flush_end_io(). */ spin_lock_irqsave(&fq->mq_flush_lock, flags); + fq->flush_data_in_flight--; blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error); spin_unlock_irqrestore(&fq->mq_flush_lock, flags); @@ -445,7 +447,7 @@ bool blk_insert_flush(struct request *rq) blk_rq_init_flush(rq); rq->flush.seq |= REQ_FSEQ_PREFLUSH; spin_lock_irq(&fq->mq_flush_lock); - list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); + fq->flush_data_in_flight++; spin_unlock_irq(&fq->mq_flush_lock); return false; default: @@ -496,7 +498,6 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, INIT_LIST_HEAD(&fq->flush_queue[0]); INIT_LIST_HEAD(&fq->flush_queue[1]); - INIT_LIST_HEAD(&fq->flush_data_in_flight); return fq; diff --git a/block/blk.h b/block/blk.h index 608c5dcc516b..686712e13835 100644 --- a/block/blk.h +++ b/block/blk.h @@ -15,15 +15,14 @@ struct elevator_type; extern struct dentry *blk_debugfs_root; struct blk_flush_queue { + spinlock_t mq_flush_lock; unsigned int flush_pending_idx:1; unsigned int flush_running_idx:1; blk_status_t rq_status; unsigned long flush_pending_since; struct list_head flush_queue[2]; - struct list_head flush_data_in_flight; + unsigned long flush_data_in_flight; struct request *flush_rq; - - spinlock_t mq_flush_lock; }; bool is_flush_rq(struct request *req); -- cgit v1.2.3 From 81ada09cc25e4bf2de7d2951925fb409338a545d Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 17 Jul 2023 12:00:58 +0800 Subject: blk-flush: reuse rq queuelist in flush state machine Since we don't need to maintain inflight flush_data requests list anymore, we can reuse rq->queuelist for flush pending list. Note in mq_flush_data_end_io(), we need to re-initialize rq->queuelist before reusing it in the state machine when end, since the rq->rq_next also reuse it, may have corrupted rq->queuelist by the driver. This patch decrease the size of struct request by 16 bytes. Signed-off-by: Chengming Zhou Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230717040058.3993930-5-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-flush.c | 17 ++++++++++------- include/linux/blk-mq.h | 1 - 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index fedb39031647..e73dc22d05c1 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -183,14 +183,13 @@ static void blk_flush_complete_seq(struct request *rq, /* queue for flush */ if (list_empty(pending)) fq->flush_pending_since = jiffies; - list_move_tail(&rq->flush.list, pending); + list_move_tail(&rq->queuelist, pending); break; case REQ_FSEQ_DATA: - list_del_init(&rq->flush.list); fq->flush_data_in_flight++; spin_lock(&q->requeue_lock); - list_add(&rq->queuelist, &q->requeue_list); + list_move(&rq->queuelist, &q->requeue_list); spin_unlock(&q->requeue_lock); blk_mq_kick_requeue_list(q); break; @@ -202,7 +201,7 @@ static void blk_flush_complete_seq(struct request *rq, * flush data request completion path. Restore @rq for * normal completion and end it. */ - list_del_init(&rq->flush.list); + list_del_init(&rq->queuelist); blk_flush_restore_request(rq); blk_mq_end_request(rq, error); break; @@ -258,7 +257,7 @@ static enum rq_end_io_ret flush_end_io(struct request *flush_rq, fq->flush_running_idx ^= 1; /* and push the waiting requests to the next stage */ - list_for_each_entry_safe(rq, n, running, flush.list) { + list_for_each_entry_safe(rq, n, running, queuelist) { unsigned int seq = blk_flush_cur_seq(rq); BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); @@ -292,7 +291,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, { struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; struct request *first_rq = - list_first_entry(pending, struct request, flush.list); + list_first_entry(pending, struct request, queuelist); struct request *flush_rq = fq->flush_rq; /* C1 described at the top of this file */ @@ -376,6 +375,11 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, */ spin_lock_irqsave(&fq->mq_flush_lock, flags); fq->flush_data_in_flight--; + /* + * May have been corrupted by rq->rq_next reuse, we need to + * re-initialize rq->queuelist before reusing it here. + */ + INIT_LIST_HEAD(&rq->queuelist); blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error); spin_unlock_irqrestore(&fq->mq_flush_lock, flags); @@ -386,7 +390,6 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, static void blk_rq_init_flush(struct request *rq) { rq->flush.seq = 0; - INIT_LIST_HEAD(&rq->flush.list); rq->rq_flags |= RQF_FLUSH_SEQ; rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ rq->end_io = mq_flush_data_end_io; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 67f810857634..01e8c31db665 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -178,7 +178,6 @@ struct request { struct { unsigned int seq; - struct list_head list; rq_end_io_fn *saved_end_io; } flush; -- cgit v1.2.3 From 8f63fef5867fb5e8c29d9c14b6d739bfc1869d32 Mon Sep 17 00:00:00 2001 From: Nitesh Shetty Date: Wed, 19 Jul 2023 17:46:08 +0530 Subject: block: refactor to use helper Reduce some code by making use of bio_integrity_bytes(). Signed-off-by: Nitesh Shetty Reviewed-by: "Martin K. Petersen" Link: https://lore.kernel.org/r/20230719121608.32105-1-nj.shetty@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'block') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 4533eb491661..8f0af7ac8573 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -199,7 +199,6 @@ bool bio_integrity_prep(struct bio *bio) unsigned long start, end; unsigned int len, nr_pages; unsigned int bytes, offset, i; - unsigned int intervals; blk_status_t status; if (!bi) @@ -224,10 +223,9 @@ bool bio_integrity_prep(struct bio *bio) !(bi->flags & BLK_INTEGRITY_GENERATE)) return true; } - intervals = bio_integrity_intervals(bi, bio_sectors(bio)); /* Allocate kernel buffer for protection data */ - len = intervals * bi->tuple_size; + len = bio_integrity_bytes(bi, bio_sectors(bio)); buf = kmalloc(len, GFP_NOIO); status = BLK_STS_RESOURCE; if (unlikely(buf == NULL)) { -- cgit v1.2.3 From cd1d83e24e689f25de7e34bea697971750138d5f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:26 -0700 Subject: block: tidy up the bio full checks in bio_add_hw_page bio_add_hw_page already checks if the number of bytes trying to be added even fit into max_hw_sectors limit of the queue. Remove the call to bio_full and just do a check for the smaller of the number of segments in the bio and the queue max segments limit, and do this cheap check before the more expensive gap to previous check. Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Link: https://lore.kernel.org/r/20230724165433.117645-2-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 8672179213b9..72488ecea47a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1014,6 +1014,10 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page)) return len; + if (bio->bi_vcnt >= + min(bio->bi_max_vecs, queue_max_segments(q))) + return 0; + /* * If the queue doesn't support SG gaps and adding this segment * would create a gap, disallow it. @@ -1023,12 +1027,6 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, return 0; } - if (bio_full(bio, len)) - return 0; - - if (bio->bi_vcnt >= queue_max_segments(q)) - return 0; - bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset); bio->bi_vcnt++; bio->bi_iter.bi_size += len; -- cgit v1.2.3 From 6850b2dd5c25f27f7b74414553f047d4c12dd66c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:27 -0700 Subject: block: use SECTOR_SHIFT bio_add_hw_page Use the SECTOR_SHIFT magic constant instead of the magic number. Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230724165433.117645-3-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 72488ecea47a..445be4bdd99b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1007,7 +1007,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; - if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors) + if (((bio->bi_iter.bi_size + len) >> SECTOR_SHIFT) > max_sectors) return 0; if (bio->bi_vcnt > 0) { -- cgit v1.2.3 From 939e1a370330841b2c0292a483d7b38f3ee45f88 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:28 -0700 Subject: block: move the BIO_CLONED checks out of __bio_try_merge_page __bio_try_merge_page is a way too low-level helper to assert that the bio is not cloned. Move the check into bio_add_page and bio_iov_iter_get_pages instead, which are the high level entry points that should enforce this variant. bio_add_hw_page already this check, coverig the third (indirect) caller of __bio_try_merge_page. Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230724165433.117645-4-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 445be4bdd99b..3ac72e60e7f1 100644 --- a/block/bio.c +++ b/block/bio.c @@ -945,9 +945,6 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, static bool __bio_try_merge_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off, bool *same_page) { - if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) - return false; - if (bio->bi_vcnt > 0) { struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; @@ -1127,6 +1124,9 @@ int bio_add_page(struct bio *bio, struct page *page, { bool same_page = false; + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return 0; + if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { if (bio_full(bio, len)) return 0; @@ -1335,6 +1335,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { int ret = 0; + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return -EIO; + if (iov_iter_is_bvec(iter)) { bio_iov_bvec_set(bio, iter); iov_iter_advance(iter, bio->bi_iter.bi_size); -- cgit v1.2.3 From 0eca8b6f97ac705c5806f7d062207379094fb114 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:29 -0700 Subject: block: move the bi_vcnt check out of __bio_try_merge_page Move the bi_vcnt out of __bio_try_merge_page and into the two callers that don't already have it in preparation for additional changes to __bio_try_merge_page. Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230724165433.117645-5-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 45 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 23 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 3ac72e60e7f1..3dcbe98580dc 100644 --- a/block/bio.c +++ b/block/bio.c @@ -945,20 +945,17 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, static bool __bio_try_merge_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off, bool *same_page) { - if (bio->bi_vcnt > 0) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; - - if (page_is_mergeable(bv, page, len, off, same_page)) { - if (bio->bi_iter.bi_size > UINT_MAX - len) { - *same_page = false; - return false; - } - bv->bv_len += len; - bio->bi_iter.bi_size += len; - return true; - } + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (!page_is_mergeable(bv, page, len, off, same_page)) + return false; + if (bio->bi_iter.bi_size > UINT_MAX - len) { + *same_page = false; + return false; } - return false; + bv->bv_len += len; + bio->bi_iter.bi_size += len; + return true; } /* @@ -1127,11 +1124,13 @@ int bio_add_page(struct bio *bio, struct page *page, if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; - if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { - if (bio_full(bio, len)) - return 0; - __bio_add_page(bio, page, len, offset); - } + if (bio->bi_vcnt > 0 && + __bio_try_merge_page(bio, page, len, offset, &same_page)) + return len; + + if (bio_full(bio, len)) + return 0; + __bio_add_page(bio, page, len, offset); return len; } EXPORT_SYMBOL(bio_add_page); @@ -1205,13 +1204,13 @@ static int bio_iov_add_page(struct bio *bio, struct page *page, { bool same_page = false; - if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { - __bio_add_page(bio, page, len, offset); + if (bio->bi_vcnt > 0 && + __bio_try_merge_page(bio, page, len, offset, &same_page)) { + if (same_page) + bio_release_page(bio, page); return 0; } - - if (same_page) - bio_release_page(bio, page); + __bio_add_page(bio, page, len, offset); return 0; } -- cgit v1.2.3 From 613699050a49760f1d70c74f71bd0b013ca3c356 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:30 -0700 Subject: block: move the bi_size overflow check in __bio_try_merge_page Checking for availability in bi_size in a function that attempts to merge into an existing segment is a bit odd, as the limit also applies when adding a new segment. This code works fine as we always call __bio_try_merge_page, but contributes to sub-optimal calling conventions and doesn't lead to clear code. Move it to two of the callers instead, the third one already has a more strict check that includes max_hw_segments anyway. Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230724165433.117645-6-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 3dcbe98580dc..17f57fd2cff2 100644 --- a/block/bio.c +++ b/block/bio.c @@ -949,10 +949,6 @@ static bool __bio_try_merge_page(struct bio *bio, struct page *page, if (!page_is_mergeable(bv, page, len, off, same_page)) return false; - if (bio->bi_iter.bi_size > UINT_MAX - len) { - *same_page = false; - return false; - } bv->bv_len += len; bio->bi_iter.bi_size += len; return true; @@ -1123,6 +1119,8 @@ int bio_add_page(struct bio *bio, struct page *page, if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; + if (bio->bi_iter.bi_size > UINT_MAX - len) + return 0; if (bio->bi_vcnt > 0 && __bio_try_merge_page(bio, page, len, offset, &same_page)) @@ -1204,6 +1202,9 @@ static int bio_iov_add_page(struct bio *bio, struct page *page, { bool same_page = false; + if (WARN_ON_ONCE(bio->bi_iter.bi_size > UINT_MAX - len)) + return -EIO; + if (bio->bi_vcnt > 0 && __bio_try_merge_page(bio, page, len, offset, &same_page)) { if (same_page) -- cgit v1.2.3 From 80232b520314214d846eb0a65faef8b51b702fa7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:31 -0700 Subject: block: downgrade a bio_full call in bio_add_page bio_add_page already checks that there is space in bi_size a little earlier. So after we failed to add to an existing segment, just check that there is another one available instead of duplicating the bi_size check. Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Link: https://lore.kernel.org/r/20230724165433.117645-7-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 17f57fd2cff2..d8e0e8de8cf4 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1126,7 +1126,7 @@ int bio_add_page(struct bio *bio, struct page *page, __bio_try_merge_page(bio, page, len, offset, &same_page)) return len; - if (bio_full(bio, len)) + if (bio->bi_vcnt >= bio->bi_max_vecs) return 0; __bio_add_page(bio, page, len, offset); return len; -- cgit v1.2.3 From 858c708d9efb7e8e5c6320793b778cc17cf8368a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:32 -0700 Subject: block: move the bi_size update out of __bio_try_merge_page The update of bi_size is the only thing in __bio_try_merge_page that needs a bio. Move it to the callers, and merge __bio_try_merge_page and page_is_mergeable into a single bvec_try_merge_page that only takes the current bvec instead of a full bio. This will allow reusing this function for supporting multi-page integrity payload bvecs. Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Link: https://lore.kernel.org/r/20230724165433.117645-8-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 57 ++++++++++++++++++++------------------------------------- 1 file changed, 20 insertions(+), 37 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index d8e0e8de8cf4..23b7a001b500 100644 --- a/block/bio.c +++ b/block/bio.c @@ -903,9 +903,8 @@ static inline bool bio_full(struct bio *bio, unsigned len) return false; } -static inline bool page_is_mergeable(const struct bio_vec *bv, - struct page *page, unsigned int len, unsigned int off, - bool *same_page) +static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, + unsigned int len, unsigned int off, bool *same_page) { size_t bv_end = bv->bv_offset + bv->bv_len; phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1; @@ -919,38 +918,14 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, return false; *same_page = ((vec_end_addr & PAGE_MASK) == page_addr); - if (*same_page) - return true; - else if (IS_ENABLED(CONFIG_KMSAN)) - return false; - return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE); -} - -/** - * __bio_try_merge_page - try appending data to an existing bvec. - * @bio: destination bio - * @page: start page to add - * @len: length of the data to add - * @off: offset of the data relative to @page - * @same_page: return if the segment has been merged inside the same page - * - * Try to add the data at @page + @off to the last bvec of @bio. This is a - * useful optimisation for file systems with a block size smaller than the - * page size. - * - * Warn if (@len, @off) crosses pages in case that @same_page is true. - * - * Return %true on success or %false on failure. - */ -static bool __bio_try_merge_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int off, bool *same_page) -{ - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + if (!*same_page) { + if (IS_ENABLED(CONFIG_KMSAN)) + return false; + if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE) + return false; + } - if (!page_is_mergeable(bv, page, len, off, same_page)) - return false; bv->bv_len += len; - bio->bi_iter.bi_size += len; return true; } @@ -972,7 +947,7 @@ static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio, return false; if (bv->bv_len + len > queue_max_segment_size(q)) return false; - return __bio_try_merge_page(bio, page, len, offset, same_page); + return bvec_try_merge_page(bv, page, len, offset, same_page); } /** @@ -1001,8 +976,11 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, return 0; if (bio->bi_vcnt > 0) { - if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page)) + if (bio_try_merge_hw_seg(q, bio, page, len, offset, + same_page)) { + bio->bi_iter.bi_size += len; return len; + } if (bio->bi_vcnt >= min(bio->bi_max_vecs, queue_max_segments(q))) @@ -1123,8 +1101,11 @@ int bio_add_page(struct bio *bio, struct page *page, return 0; if (bio->bi_vcnt > 0 && - __bio_try_merge_page(bio, page, len, offset, &same_page)) + bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], + page, len, offset, &same_page)) { + bio->bi_iter.bi_size += len; return len; + } if (bio->bi_vcnt >= bio->bi_max_vecs) return 0; @@ -1206,7 +1187,9 @@ static int bio_iov_add_page(struct bio *bio, struct page *page, return -EIO; if (bio->bi_vcnt > 0 && - __bio_try_merge_page(bio, page, len, offset, &same_page)) { + bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], + page, len, offset, &same_page)) { + bio->bi_iter.bi_size += len; if (same_page) bio_release_page(bio, page); return 0; -- cgit v1.2.3 From ae42f0b3bf65912e122fc2e8d5f6d94b51156dba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 09:54:33 -0700 Subject: block: don't pass a bio to bio_try_merge_hw_seg There is no good reason to pass the bio to bio_try_merge_hw_seg. Just pass the current bvec and rename the function to bvec_try_merge_hw_page. This will allow reusing this function for supporting multi-page integrity payload bvecs. Signed-off-by: Christoph Hellwig Reviewed-by: Jinyoung Choi Link: https://lore.kernel.org/r/20230724165433.117645-9-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 23b7a001b500..c92dda962449 100644 --- a/block/bio.c +++ b/block/bio.c @@ -934,11 +934,10 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, * size limit. This is not for normal read/write bios, but for passthrough * or Zone Append operations that we can't split. */ -static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio, - struct page *page, unsigned len, - unsigned offset, bool *same_page) +static bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, + struct page *page, unsigned len, unsigned offset, + bool *same_page) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; unsigned long mask = queue_segment_boundary(q); phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset; phys_addr_t addr2 = page_to_phys(page) + offset + len - 1; @@ -967,8 +966,6 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page) { - struct bio_vec *bvec; - if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; @@ -976,7 +973,9 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, return 0; if (bio->bi_vcnt > 0) { - if (bio_try_merge_hw_seg(q, bio, page, len, offset, + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (bvec_try_merge_hw_page(q, bv, page, len, offset, same_page)) { bio->bi_iter.bi_size += len; return len; @@ -990,8 +989,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, * If the queue doesn't support SG gaps and adding this segment * would create a gap, disallow it. */ - bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; - if (bvec_gap_to_prev(&q->limits, bvec, offset)) + if (bvec_gap_to_prev(&q->limits, bv, offset)) return 0; } -- cgit v1.2.3 From 65a558f66c308251e256317957b75d1e643c33c3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 21 Jul 2023 10:27:30 -0700 Subject: block: Improve performance for BLK_MQ_F_BLOCKING drivers blk_mq_run_queue() runs the queue asynchronously if BLK_MQ_F_BLOCKING has been set. This is suboptimal since running the queue asynchronously is slower than running the queue synchronously. This patch modifies blk_mq_run_queue() as follows if BLK_MQ_F_BLOCKING has been set: - Run the queue synchronously if it is allowed to sleep. - Run the queue asynchronously if it is not allowed to sleep. Additionally, blk_mq_run_hw_queue(hctx, false) calls are modified into blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING) if the caller may be invoked from atomic context. The following caller chains have been reviewed: blk_mq_run_hw_queue(hctx, false) blk_mq_get_tag() /* may sleep, hence the functions it calls may also sleep */ blk_execute_rq() /* may sleep */ blk_mq_run_hw_queues(q, async=false) blk_freeze_queue_start() /* may sleep */ blk_mq_requeue_work() /* may sleep */ scsi_kick_queue() scsi_requeue_run_queue() /* may sleep */ scsi_run_host_queues() scsi_ioctl_reset() /* may sleep */ blk_mq_insert_requests(hctx, ctx, list, run_queue_async=false) blk_mq_dispatch_plug_list(plug, from_sched=false) blk_mq_flush_plug_list(plug, from_schedule=false) __blk_flush_plug(plug, from_schedule=false) blk_add_rq_to_plug() blk_mq_submit_bio() /* may sleep if REQ_NOWAIT has not been set */ blk_mq_plug_issue_direct() blk_mq_flush_plug_list() /* see above */ blk_mq_dispatch_plug_list(plug, from_sched=false) blk_mq_flush_plug_list() /* see above */ blk_mq_try_issue_directly() blk_mq_submit_bio() /* may sleep if REQ_NOWAIT has not been set */ blk_mq_try_issue_list_directly(hctx, list) blk_mq_insert_requests() /* see above */ Cc: Christoph Hellwig Cc: Ming Lei Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20230721172731.955724-4-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 16 ++++++++++------ drivers/scsi/scsi_lib.c | 3 ++- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index d98654869615..687ec3f4f10d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1323,7 +1323,7 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head) } blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); - blk_mq_run_hw_queue(hctx, false); + blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); @@ -2222,6 +2222,8 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) */ WARN_ON_ONCE(!async && in_interrupt()); + might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING); + /* * When queue is quiesced, we may be switching io scheduler, or * updating nr_hw_queues, or other things, and we can't run queue @@ -2237,8 +2239,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) if (!need_run) return; - if (async || (hctx->flags & BLK_MQ_F_BLOCKING) || - !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { + if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { blk_mq_delay_run_hw_queue(hctx, 0); return; } @@ -2373,7 +2374,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) { clear_bit(BLK_MQ_S_STOPPED, &hctx->state); - blk_mq_run_hw_queue(hctx, false); + blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); } EXPORT_SYMBOL(blk_mq_start_hw_queue); @@ -2403,7 +2404,8 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) unsigned long i; queue_for_each_hw_ctx(q, hctx, i) - blk_mq_start_stopped_hw_queue(hctx, async); + blk_mq_start_stopped_hw_queue(hctx, async || + (hctx->flags & BLK_MQ_F_BLOCKING)); } EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); @@ -2461,6 +2463,8 @@ static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, list_for_each_entry(rq, list, queuelist) { BUG_ON(rq->mq_ctx != ctx); trace_block_rq_insert(rq); + if (rq->cmd_flags & REQ_NOWAIT) + run_queue_async = true; } spin_lock(&ctx->lock); @@ -2621,7 +2625,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) { blk_mq_insert_request(rq, 0); - blk_mq_run_hw_queue(hctx, false); + blk_mq_run_hw_queue(hctx, rq->cmd_flags & REQ_NOWAIT); return; } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index d4c514ab9fe8..59176946ab56 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -335,7 +335,8 @@ static void scsi_single_lun_run(struct scsi_device *current_sdev) * but in most cases, we will be first. Ideally, each LU on the * target would get some limited time or requests on the target. */ - blk_mq_run_hw_queues(current_sdev->request_queue, false); + blk_mq_run_hw_queues(current_sdev->request_queue, + shost->queuecommand_may_block); spin_lock_irqsave(shost->host_lock, flags); if (!starget->starget_sdev_user) -- cgit v1.2.3 From 51d74ec9b62f5813767a60226acaf943e26e7d7a Mon Sep 17 00:00:00 2001 From: Jinyoung Choi Date: Tue, 25 Jul 2023 14:18:39 +0900 Subject: block: cleanup bio_integrity_prep If a problem occurs in the process of creating an integrity payload, the status of bio is always BLK_STS_RESOURCE. Reviewed-by: Christoph Hellwig Signed-off-by: Jinyoung Choi Reviewed-by: "Martin K. Petersen" Link: https://lore.kernel.org/r/20230725051839epcms2p8e4d20ad6c51326ad032e8406f59d0aaa@epcms2p8 Signed-off-by: Jens Axboe --- block/bio-integrity.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'block') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 8f0af7ac8573..045553a164e0 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -199,7 +199,6 @@ bool bio_integrity_prep(struct bio *bio) unsigned long start, end; unsigned int len, nr_pages; unsigned int bytes, offset, i; - blk_status_t status; if (!bi) return true; @@ -227,7 +226,6 @@ bool bio_integrity_prep(struct bio *bio) /* Allocate kernel buffer for protection data */ len = bio_integrity_bytes(bi, bio_sectors(bio)); buf = kmalloc(len, GFP_NOIO); - status = BLK_STS_RESOURCE; if (unlikely(buf == NULL)) { printk(KERN_ERR "could not allocate integrity buffer\n"); goto err_end_io; @@ -242,7 +240,6 @@ bool bio_integrity_prep(struct bio *bio) if (IS_ERR(bip)) { printk(KERN_ERR "could not allocate data integrity bioset\n"); kfree(buf); - status = BLK_STS_RESOURCE; goto err_end_io; } @@ -270,7 +267,6 @@ bool bio_integrity_prep(struct bio *bio) if (ret == 0) { printk(KERN_ERR "could not attach integrity payload\n"); - status = BLK_STS_RESOURCE; goto err_end_io; } @@ -292,7 +288,7 @@ bool bio_integrity_prep(struct bio *bio) return true; err_end_io: - bio->bi_status = status; + bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); return false; -- cgit v1.2.3 From 727cfe976758b79f8d2f8051c75a5ccb14539a56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Aug 2023 19:21:58 +0200 Subject: block: open code __generic_file_write_iter for blkdev writes Open code __generic_file_write_iter to remove the indirect call into ->direct_IO and to prepare using the iomap based write code. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Christian Brauner Reviewed-by: Hannes Reinecke Reviewed-by: Luis Chamberlain Link: https://lore.kernel.org/r/20230801172201.1923299-4-hch@lst.de Signed-off-by: Jens Axboe --- block/fops.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/fops.c b/block/fops.c index a286bf3325c5..8a05d99166e3 100644 --- a/block/fops.c +++ b/block/fops.c @@ -533,6 +533,30 @@ static int blkdev_release(struct inode *inode, struct file *filp) return 0; } +static ssize_t +blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from) +{ + size_t count = iov_iter_count(from); + ssize_t written; + + written = kiocb_invalidate_pages(iocb, count); + if (written) { + if (written == -EBUSY) + return 0; + return written; + } + + written = blkdev_direct_IO(iocb, from); + if (written > 0) { + kiocb_invalidate_post_direct_write(iocb, count); + iocb->ki_pos += written; + count -= written; + } + if (written != -EIOCBQUEUED) + iov_iter_revert(from, count - iov_iter_count(from)); + return written; +} + /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. @@ -542,7 +566,8 @@ static int blkdev_release(struct inode *inode, struct file *filp) */ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); + struct file *file = iocb->ki_filp; + struct block_device *bdev = I_BDEV(file->f_mapping->host); struct inode *bd_inode = bdev->bd_inode; loff_t size = bdev_nr_bytes(bdev); size_t shorted = 0; @@ -569,7 +594,23 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) iov_iter_truncate(from, size); } - ret = __generic_file_write_iter(iocb, from); + ret = file_remove_privs(file); + if (ret) + return ret; + + ret = file_update_time(file); + if (ret) + return ret; + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = blkdev_direct_write(iocb, from); + if (ret >= 0 && iov_iter_count(from)) + ret = direct_write_fallback(iocb, from, ret, + generic_perform_write(iocb, from)); + } else { + ret = generic_perform_write(iocb, from); + } + if (ret > 0) ret = generic_write_sync(iocb, ret); iov_iter_reexpand(from, iov_iter_count(from) + shorted); -- cgit v1.2.3 From a05f7bd9578b17521a9a5f3689f3934c082c6390 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Aug 2023 19:21:59 +0200 Subject: block: stop setting ->direct_IO Direct I/O on block devices now nevers goes through aops->direct_IO. Stop setting it and set the FMODE_CAN_ODIRECT in ->open instead. Signed-off-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230801172201.1923299-5-hch@lst.de Signed-off-by: Jens Axboe --- block/fops.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/fops.c b/block/fops.c index 8a05d99166e3..f0b822c28ddf 100644 --- a/block/fops.c +++ b/block/fops.c @@ -428,7 +428,6 @@ const struct address_space_operations def_blk_aops = { .writepage = blkdev_writepage, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, - .direct_IO = blkdev_direct_IO, .migrate_folio = buffer_migrate_folio_norefs, .is_dirty_writeback = buffer_check_dirty_writeback, }; @@ -505,7 +504,7 @@ static int blkdev_open(struct inode *inode, struct file *filp) * during an unstable branch. */ filp->f_flags |= O_LARGEFILE; - filp->f_mode |= FMODE_BUF_RASYNC; + filp->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; /* * Use the file private data to store the holder for exclusive openes. -- cgit v1.2.3 From 487c607df790d366e67a7d6a30adf785cdd98e55 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Aug 2023 19:22:00 +0200 Subject: block: use iomap for writes to block devices Use iomap in buffer_head compat mode to write to block devices. Signed-off-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Reviewed-by: Pankaj Raghav Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20230801172201.1923299-6-hch@lst.de Signed-off-by: Jens Axboe --- block/Kconfig | 1 + block/fops.c | 31 +++++++++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index 86122e459fe0..1a13ef0b1ca1 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -5,6 +5,7 @@ menuconfig BLOCK bool "Enable the block layer" if EXPERT default y + select FS_IOMAP select SBITMAP help Provide block layer support for the kernel. diff --git a/block/fops.c b/block/fops.c index f0b822c28ddf..063ece37d44e 100644 --- a/block/fops.c +++ b/block/fops.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "blk.h" @@ -386,6 +387,27 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); } +static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, struct iomap *srcmap) +{ + struct block_device *bdev = I_BDEV(inode); + loff_t isize = i_size_read(inode); + + iomap->bdev = bdev; + iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); + if (iomap->offset >= isize) + return -EIO; + iomap->type = IOMAP_MAPPED; + iomap->addr = iomap->offset; + iomap->length = isize - iomap->offset; + iomap->flags |= IOMAP_F_BUFFER_HEAD; + return 0; +} + +static const struct iomap_ops blkdev_iomap_ops = { + .iomap_begin = blkdev_iomap_begin, +}; + static int blkdev_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, blkdev_get_block, wbc); @@ -556,6 +578,11 @@ blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from) return written; } +static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from) +{ + return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops); +} + /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. @@ -605,9 +632,9 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = blkdev_direct_write(iocb, from); if (ret >= 0 && iov_iter_count(from)) ret = direct_write_fallback(iocb, from, ret, - generic_perform_write(iocb, from)); + blkdev_buffered_write(iocb, from)); } else { - ret = generic_perform_write(iocb, from); + ret = blkdev_buffered_write(iocb, from); } if (ret > 0) -- cgit v1.2.3 From 925c86a19bacf8ce10eb666328fb3fa5aff7b951 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Aug 2023 19:22:01 +0200 Subject: fs: add CONFIG_BUFFER_HEAD Add a new config option that controls building the buffer_head code, and select it from all file systems and stacking drivers that need it. For the block device nodes and alternative iomap based buffered I/O path is provided when buffer_head support is not enabled, and iomap needs a a small tweak to define the IOMAP_F_BUFFER_HEAD flag to 0 to not call into the buffer_head code when it doesn't exist. Otherwise this is just Kconfig and ifdef changes. Signed-off-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230801172201.1923299-7-hch@lst.de Signed-off-by: Jens Axboe --- block/fops.c | 70 +++++++++++++++++++++++++++++++++++++------- drivers/md/Kconfig | 1 + fs/Kconfig | 4 +++ fs/Makefile | 2 +- fs/adfs/Kconfig | 1 + fs/affs/Kconfig | 1 + fs/befs/Kconfig | 1 + fs/bfs/Kconfig | 1 + fs/efs/Kconfig | 1 + fs/exfat/Kconfig | 1 + fs/ext2/Kconfig | 1 + fs/ext4/Kconfig | 1 + fs/f2fs/Kconfig | 1 + fs/fat/Kconfig | 1 + fs/freevxfs/Kconfig | 1 + fs/gfs2/Kconfig | 1 + fs/hfs/Kconfig | 1 + fs/hfsplus/Kconfig | 1 + fs/hpfs/Kconfig | 1 + fs/isofs/Kconfig | 1 + fs/jfs/Kconfig | 1 + fs/minix/Kconfig | 1 + fs/nilfs2/Kconfig | 1 + fs/ntfs/Kconfig | 1 + fs/ntfs3/Kconfig | 1 + fs/ocfs2/Kconfig | 1 + fs/omfs/Kconfig | 1 + fs/qnx4/Kconfig | 1 + fs/qnx6/Kconfig | 1 + fs/reiserfs/Kconfig | 1 + fs/sysv/Kconfig | 1 + fs/udf/Kconfig | 1 + fs/ufs/Kconfig | 1 + include/linux/buffer_head.h | 32 ++++++++++---------- include/linux/iomap.h | 4 +++ include/trace/events/block.h | 2 ++ mm/migrate.c | 4 +-- 37 files changed, 119 insertions(+), 29 deletions(-) (limited to 'block') diff --git a/block/fops.c b/block/fops.c index 063ece37d44e..eaa98a987213 100644 --- a/block/fops.c +++ b/block/fops.c @@ -24,15 +24,6 @@ static inline struct inode *bdev_file_inode(struct file *file) return file->f_mapping->host; } -static int blkdev_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - bh->b_bdev = I_BDEV(inode); - bh->b_blocknr = iblock; - set_buffer_mapped(bh); - return 0; -} - static blk_opf_t dio_bio_write_op(struct kiocb *iocb) { blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; @@ -400,7 +391,7 @@ static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->type = IOMAP_MAPPED; iomap->addr = iomap->offset; iomap->length = isize - iomap->offset; - iomap->flags |= IOMAP_F_BUFFER_HEAD; + iomap->flags |= IOMAP_F_BUFFER_HEAD; /* noop for !CONFIG_BUFFER_HEAD */ return 0; } @@ -408,6 +399,16 @@ static const struct iomap_ops blkdev_iomap_ops = { .iomap_begin = blkdev_iomap_begin, }; +#ifdef CONFIG_BUFFER_HEAD +static int blkdev_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + bh->b_bdev = I_BDEV(inode); + bh->b_blocknr = iblock; + set_buffer_mapped(bh); + return 0; +} + static int blkdev_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, blkdev_get_block, wbc); @@ -453,6 +454,55 @@ const struct address_space_operations def_blk_aops = { .migrate_folio = buffer_migrate_folio_norefs, .is_dirty_writeback = buffer_check_dirty_writeback, }; +#else /* CONFIG_BUFFER_HEAD */ +static int blkdev_read_folio(struct file *file, struct folio *folio) +{ + return iomap_read_folio(folio, &blkdev_iomap_ops); +} + +static void blkdev_readahead(struct readahead_control *rac) +{ + iomap_readahead(rac, &blkdev_iomap_ops); +} + +static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc, + struct inode *inode, loff_t offset) +{ + loff_t isize = i_size_read(inode); + + if (WARN_ON_ONCE(offset >= isize)) + return -EIO; + if (offset >= wpc->iomap.offset && + offset < wpc->iomap.offset + wpc->iomap.length) + return 0; + return blkdev_iomap_begin(inode, offset, isize - offset, + IOMAP_WRITE, &wpc->iomap, NULL); +} + +static const struct iomap_writeback_ops blkdev_writeback_ops = { + .map_blocks = blkdev_map_blocks, +}; + +static int blkdev_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct iomap_writepage_ctx wpc = { }; + + return iomap_writepages(mapping, wbc, &wpc, &blkdev_writeback_ops); +} + +const struct address_space_operations def_blk_aops = { + .dirty_folio = filemap_dirty_folio, + .release_folio = iomap_release_folio, + .invalidate_folio = iomap_invalidate_folio, + .read_folio = blkdev_read_folio, + .readahead = blkdev_readahead, + .writepages = blkdev_writepages, + .is_partially_uptodate = iomap_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, + .migrate_folio = filemap_migrate_folio, +}; +#endif /* CONFIG_BUFFER_HEAD */ /* * for a block special file file_inode(file)->i_size is zero diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 444517d1a233..2a8b081bce7d 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -15,6 +15,7 @@ if MD config BLK_DEV_MD tristate "RAID support" select BLOCK_HOLDER_DEPRECATED if SYSFS + select BUFFER_HEAD # BLOCK_LEGACY_AUTOLOAD requirement should be removed # after relevant mdadm enhancements - to make "names=yes" # the default - are widely available. diff --git a/fs/Kconfig b/fs/Kconfig index 18d034ec7953..e8b17c81b83a 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -18,8 +18,12 @@ config VALIDATE_FS_PARSER config FS_IOMAP bool +config BUFFER_HEAD + bool + # old blockdev_direct_IO implementation. Use iomap for new code instead config LEGACY_DIRECT_IO + depends on BUFFER_HEAD bool if BLOCK diff --git a/fs/Makefile b/fs/Makefile index e513aaee0603..f9541f40be4e 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -17,7 +17,7 @@ obj-y := open.o read_write.o file_table.o super.o \ fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ kernel_read_file.o mnt_idmapping.o remap_range.o -obj-$(CONFIG_BLOCK) += buffer.o mpage.o +obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o obj-$(CONFIG_PROC_FS) += proc_namespace.o obj-$(CONFIG_LEGACY_DIRECT_IO) += direct-io.o obj-y += notify/ diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig index 44738fed6625..1b97058f0c4a 100644 --- a/fs/adfs/Kconfig +++ b/fs/adfs/Kconfig @@ -2,6 +2,7 @@ config ADFS_FS tristate "ADFS file system support" depends on BLOCK + select BUFFER_HEAD help The Acorn Disc Filing System is the standard file system of the RiscOS operating system which runs on Acorn's ARM-based Risc PC diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig index 962b86374e1c..1ae432d266c3 100644 --- a/fs/affs/Kconfig +++ b/fs/affs/Kconfig @@ -2,6 +2,7 @@ config AFFS_FS tristate "Amiga FFS file system support" depends on BLOCK + select BUFFER_HEAD select LEGACY_DIRECT_IO help The Fast File System (FFS) is the common file system used on hard diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig index 9550b6462b81..5fcfc4024ffe 100644 --- a/fs/befs/Kconfig +++ b/fs/befs/Kconfig @@ -2,6 +2,7 @@ config BEFS_FS tristate "BeOS file system (BeFS) support (read only)" depends on BLOCK + select BUFFER_HEAD select NLS help The BeOS File System (BeFS) is the native file system of Be, Inc's diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig index 3a757805b585..8e7ef866b62a 100644 --- a/fs/bfs/Kconfig +++ b/fs/bfs/Kconfig @@ -2,6 +2,7 @@ config BFS_FS tristate "BFS file system support" depends on BLOCK + select BUFFER_HEAD help Boot File System (BFS) is a file system used under SCO UnixWare to allow the bootloader access to the kernel image and other important diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig index 2df1bac8b375..0833e533df9d 100644 --- a/fs/efs/Kconfig +++ b/fs/efs/Kconfig @@ -2,6 +2,7 @@ config EFS_FS tristate "EFS file system support (read only)" depends on BLOCK + select BUFFER_HEAD help EFS is an older file system used for non-ISO9660 CD-ROMs and hard disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer diff --git a/fs/exfat/Kconfig b/fs/exfat/Kconfig index 147edeb04469..cbeca8e44d9b 100644 --- a/fs/exfat/Kconfig +++ b/fs/exfat/Kconfig @@ -2,6 +2,7 @@ config EXFAT_FS tristate "exFAT filesystem support" + select BUFFER_HEAD select NLS select LEGACY_DIRECT_IO help diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index 77393fda99af..74d98965902e 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config EXT2_FS tristate "Second extended fs support" + select BUFFER_HEAD select FS_IOMAP select LEGACY_DIRECT_IO help diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 86699c8cab28..e20d59221fc0 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -28,6 +28,7 @@ config EXT3_FS_SECURITY config EXT4_FS tristate "The Extended 4 (ext4) filesystem" + select BUFFER_HEAD select JBD2 select CRC16 select CRYPTO diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 03ef087537c7..68a1e23e1557 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -2,6 +2,7 @@ config F2FS_FS tristate "F2FS filesystem support" depends on BLOCK + select BUFFER_HEAD select NLS select CRYPTO select CRYPTO_CRC32 diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig index afe83b4e7172..25fae1c83725 100644 --- a/fs/fat/Kconfig +++ b/fs/fat/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config FAT_FS tristate + select BUFFER_HEAD select NLS select LEGACY_DIRECT_IO help diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig index 0e2fc08f7de4..912107ebea6f 100644 --- a/fs/freevxfs/Kconfig +++ b/fs/freevxfs/Kconfig @@ -2,6 +2,7 @@ config VXFS_FS tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)" depends on BLOCK + select BUFFER_HEAD help FreeVxFS is a file system driver that support the VERITAS VxFS(TM) file system format. VERITAS VxFS(TM) is the standard file system diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index 03c966840422..be7f87a8e11a 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config GFS2_FS tristate "GFS2 file system support" + select BUFFER_HEAD select FS_POSIX_ACL select CRC32 select LIBCRC32C diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig index d985066006d5..5ea5cd8ecea9 100644 --- a/fs/hfs/Kconfig +++ b/fs/hfs/Kconfig @@ -2,6 +2,7 @@ config HFS_FS tristate "Apple Macintosh file system support" depends on BLOCK + select BUFFER_HEAD select NLS select LEGACY_DIRECT_IO help diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig index 8034e7827a69..8ce4a33a9ac7 100644 --- a/fs/hfsplus/Kconfig +++ b/fs/hfsplus/Kconfig @@ -2,6 +2,7 @@ config HFSPLUS_FS tristate "Apple Extended HFS file system support" depends on BLOCK + select BUFFER_HEAD select NLS select NLS_UTF8 select LEGACY_DIRECT_IO diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig index ec975f466877..ac1e9318e65a 100644 --- a/fs/hpfs/Kconfig +++ b/fs/hpfs/Kconfig @@ -2,6 +2,7 @@ config HPFS_FS tristate "OS/2 HPFS file system support" depends on BLOCK + select BUFFER_HEAD select FS_IOMAP help OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS diff --git a/fs/isofs/Kconfig b/fs/isofs/Kconfig index 08ffd37b9bb8..51434f2a471b 100644 --- a/fs/isofs/Kconfig +++ b/fs/isofs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config ISO9660_FS tristate "ISO 9660 CDROM file system support" + select BUFFER_HEAD help This is the standard file system used on CD-ROMs. It was previously known as "High Sierra File System" and is called "hsfs" on other diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig index 51e856f0e4b8..17488440eef1 100644 --- a/fs/jfs/Kconfig +++ b/fs/jfs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config JFS_FS tristate "JFS filesystem support" + select BUFFER_HEAD select NLS select CRC32 select LEGACY_DIRECT_IO diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig index de2003974ff0..90ddfad2a75e 100644 --- a/fs/minix/Kconfig +++ b/fs/minix/Kconfig @@ -2,6 +2,7 @@ config MINIX_FS tristate "Minix file system support" depends on BLOCK + select BUFFER_HEAD help Minix is a simple operating system used in many classes about OS's. The minix file system (method to organize files on a hard disk diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig index 7d59567465e1..7dae168e346e 100644 --- a/fs/nilfs2/Kconfig +++ b/fs/nilfs2/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config NILFS2_FS tristate "NILFS2 file system support" + select BUFFER_HEAD select CRC32 select LEGACY_DIRECT_IO help diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig index f93e69a61283..7b2509741735 100644 --- a/fs/ntfs/Kconfig +++ b/fs/ntfs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config NTFS_FS tristate "NTFS file system support" + select BUFFER_HEAD select NLS help NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003. diff --git a/fs/ntfs3/Kconfig b/fs/ntfs3/Kconfig index 96cc236f7f7b..cdfdf51e55d7 100644 --- a/fs/ntfs3/Kconfig +++ b/fs/ntfs3/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config NTFS3_FS tristate "NTFS Read-Write file system support" + select BUFFER_HEAD select NLS select LEGACY_DIRECT_IO help diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig index 3123da7cfb30..2514d36cbe01 100644 --- a/fs/ocfs2/Kconfig +++ b/fs/ocfs2/Kconfig @@ -2,6 +2,7 @@ config OCFS2_FS tristate "OCFS2 file system support" depends on INET && SYSFS && CONFIGFS_FS + select BUFFER_HEAD select JBD2 select CRC32 select QUOTA diff --git a/fs/omfs/Kconfig b/fs/omfs/Kconfig index 42b2ec35a05b..8470f6c3e64e 100644 --- a/fs/omfs/Kconfig +++ b/fs/omfs/Kconfig @@ -2,6 +2,7 @@ config OMFS_FS tristate "SonicBlue Optimized MPEG File System support" depends on BLOCK + select BUFFER_HEAD select CRC_ITU_T help This is the proprietary file system used by the Rio Karma music diff --git a/fs/qnx4/Kconfig b/fs/qnx4/Kconfig index 45b5b98376c4..a2eb826e76c6 100644 --- a/fs/qnx4/Kconfig +++ b/fs/qnx4/Kconfig @@ -2,6 +2,7 @@ config QNX4FS_FS tristate "QNX4 file system support (read only)" depends on BLOCK + select BUFFER_HEAD help This is the file system used by the real-time operating systems QNX 4 and QNX 6 (the latter is also called QNX RTP). diff --git a/fs/qnx6/Kconfig b/fs/qnx6/Kconfig index 6a9d6bce1586..8e865d72204e 100644 --- a/fs/qnx6/Kconfig +++ b/fs/qnx6/Kconfig @@ -2,6 +2,7 @@ config QNX6FS_FS tristate "QNX6 file system support (read only)" depends on BLOCK && CRC32 + select BUFFER_HEAD help This is the file system used by the real-time operating systems QNX 6 (also called QNX RTP). diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig index 4d22ecfe0fab..0e6fe26458fe 100644 --- a/fs/reiserfs/Kconfig +++ b/fs/reiserfs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config REISERFS_FS tristate "Reiserfs support (deprecated)" + select BUFFER_HEAD select CRC32 select LEGACY_DIRECT_IO help diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig index b4e23e03fbeb..67b3f90afbfd 100644 --- a/fs/sysv/Kconfig +++ b/fs/sysv/Kconfig @@ -2,6 +2,7 @@ config SYSV_FS tristate "System V/Xenix/V7/Coherent file system support" depends on BLOCK + select BUFFER_HEAD help SCO, Xenix and Coherent are commercial Unix systems for Intel machines, and Version 7 was used on the DEC PDP-11. Saying Y diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig index 82e8bfa2dfd9..8f7ce30d47fd 100644 --- a/fs/udf/Kconfig +++ b/fs/udf/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config UDF_FS tristate "UDF file system support" + select BUFFER_HEAD select CRC_ITU_T select NLS select LEGACY_DIRECT_IO diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig index 6d30adb6b890..9301e7ecd092 100644 --- a/fs/ufs/Kconfig +++ b/fs/ufs/Kconfig @@ -2,6 +2,7 @@ config UFS_FS tristate "UFS file system support (read only)" depends on BLOCK + select BUFFER_HEAD help BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD, OpenBSD and NeXTstep) use a file system called UFS. Some System V diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 7002a9ff63a3..c89ef50d5112 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -16,8 +16,6 @@ #include #include -#ifdef CONFIG_BLOCK - enum bh_state_bits { BH_Uptodate, /* Contains valid data */ BH_Dirty, /* Is dirty */ @@ -198,7 +196,6 @@ void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset); void folio_set_bh(struct buffer_head *bh, struct folio *folio, unsigned long offset); -bool try_to_free_buffers(struct folio *); struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, bool retry); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, @@ -213,10 +210,6 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate); /* Things to do with buffers at mapping->private_list */ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); -int inode_has_buffers(struct inode *); -void invalidate_inode_buffers(struct inode *); -int remove_inode_buffers(struct inode *inode); -int sync_mapping_buffers(struct address_space *mapping); int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end, bool datasync); int generic_buffers_fsync(struct file *file, loff_t start, loff_t end, @@ -240,9 +233,6 @@ void __bforget(struct buffer_head *); void __breadahead(struct block_device *, sector_t block, unsigned int size); struct buffer_head *__bread_gfp(struct block_device *, sector_t block, unsigned size, gfp_t gfp); -void invalidate_bh_lrus(void); -void invalidate_bh_lrus_cpu(void); -bool has_bh_in_lru(int cpu, void *dummy); struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); void free_buffer_head(struct buffer_head * bh); void unlock_buffer(struct buffer_head *bh); @@ -258,8 +248,6 @@ int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait); void __bh_read_batch(int nr, struct buffer_head *bhs[], blk_opf_t op_flags, bool force_lock); -extern int buffer_heads_over_limit; - /* * Generic address_space_operations implementations for buffer_head-backed * address_spaces. @@ -304,8 +292,6 @@ extern int buffer_migrate_folio_norefs(struct address_space *, #define buffer_migrate_folio_norefs NULL #endif -void buffer_init(void); - /* * inline definitions */ @@ -465,7 +451,20 @@ __bread(struct block_device *bdev, sector_t block, unsigned size) bool block_dirty_folio(struct address_space *mapping, struct folio *folio); -#else /* CONFIG_BLOCK */ +#ifdef CONFIG_BUFFER_HEAD + +void buffer_init(void); +bool try_to_free_buffers(struct folio *folio); +int inode_has_buffers(struct inode *inode); +void invalidate_inode_buffers(struct inode *inode); +int remove_inode_buffers(struct inode *inode); +int sync_mapping_buffers(struct address_space *mapping); +void invalidate_bh_lrus(void); +void invalidate_bh_lrus_cpu(void); +bool has_bh_in_lru(int cpu, void *dummy); +extern int buffer_heads_over_limit; + +#else /* CONFIG_BUFFER_HEAD */ static inline void buffer_init(void) {} static inline bool try_to_free_buffers(struct folio *folio) { return true; } @@ -473,9 +472,10 @@ static inline int inode_has_buffers(struct inode *inode) { return 0; } static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } +static inline void invalidate_bh_lrus(void) {} static inline void invalidate_bh_lrus_cpu(void) {} static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; } #define buffer_heads_over_limit 0 -#endif /* CONFIG_BLOCK */ +#endif /* CONFIG_BUFFER_HEAD */ #endif /* _LINUX_BUFFER_HEAD_H */ diff --git a/include/linux/iomap.h b/include/linux/iomap.h index e2b836c2e119..54f50d34fd9d 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -58,7 +58,11 @@ struct vm_fault; #define IOMAP_F_DIRTY (1U << 1) #define IOMAP_F_SHARED (1U << 2) #define IOMAP_F_MERGED (1U << 3) +#ifdef CONFIG_BUFFER_HEAD #define IOMAP_F_BUFFER_HEAD (1U << 4) +#else +#define IOMAP_F_BUFFER_HEAD 0 +#endif /* CONFIG_BUFFER_HEAD */ #define IOMAP_F_XATTR (1U << 5) /* diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 40e60c33cc6f..0e128ad51460 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -12,6 +12,7 @@ #define RWBS_LEN 8 +#ifdef CONFIG_BUFFER_HEAD DECLARE_EVENT_CLASS(block_buffer, TP_PROTO(struct buffer_head *bh), @@ -61,6 +62,7 @@ DEFINE_EVENT(block_buffer, block_dirty_buffer, TP_ARGS(bh) ); +#endif /* CONFIG_BUFFER_HEAD */ /** * block_rq_requeue - place block IO request back on a queue diff --git a/mm/migrate.c b/mm/migrate.c index 24baad2571e3..fe6f8d454aff 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -684,7 +684,7 @@ int migrate_folio(struct address_space *mapping, struct folio *dst, } EXPORT_SYMBOL(migrate_folio); -#ifdef CONFIG_BLOCK +#ifdef CONFIG_BUFFER_HEAD /* Returns true if all buffers are successfully locked */ static bool buffer_migrate_lock_buffers(struct buffer_head *head, enum migrate_mode mode) @@ -837,7 +837,7 @@ int buffer_migrate_folio_norefs(struct address_space *mapping, return __buffer_migrate_folio(mapping, dst, src, mode, true); } EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs); -#endif +#endif /* CONFIG_BUFFER_HEAD */ int filemap_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) -- cgit v1.2.3 From d47f9717e5cfd0dd8c0ba2ecfa47c38d140f1bb6 Mon Sep 17 00:00:00 2001 From: Zhiguo Niu Date: Thu, 3 Aug 2023 19:12:42 +0800 Subject: block/mq-deadline: use correct way to throttling write requests The original formula was inaccurate: dd->async_depth = max(1UL, 3 * q->nr_requests / 4); For write requests, when we assign a tags from sched_tags, data->shallow_depth will be passed to sbitmap_find_bit, see the following code: nr = sbitmap_find_bit_in_word(&sb->map[index], min_t (unsigned int, __map_depth(sb, index), depth), alloc_hint, wrap); The smaller of data->shallow_depth and __map_depth(sb, index) will be used as the maximum range when allocating bits. For a mmc device (one hw queue, deadline I/O scheduler): q->nr_requests = sched_tags = 128, so according to the previous calculation method, dd->async_depth = data->shallow_depth = 96, and the platform is 64bits with 8 cpus, sched_tags.bitmap_tags.sb.shift=5, sb.maps[]=32/32/32/32, 32 is smaller than 96, whether it is a read or a write I/O, tags can be allocated to the maximum range each time, which has not throttling effect. In addition, refer to the methods of bfg/kyber I/O scheduler, limit ratiois are calculated base on sched_tags.bitmap_tags.sb.shift. This patch can throttle write requests really. Fixes: 07757588e507 ("block/mq-deadline: Reserve 25% of scheduler tags for synchronous requests") Signed-off-by: Zhiguo Niu Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/1691061162-22898-1-git-send-email-zhiguo.niu@unisoc.com Signed-off-by: Jens Axboe --- block/mq-deadline.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 02a916ba62ee..f958e79277b8 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -646,8 +646,9 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; + unsigned int shift = tags->bitmap_tags.sb.shift; - dd->async_depth = max(1UL, 3 * q->nr_requests / 4); + dd->async_depth = max(1U, 3 * (1U << shift) / 4); sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth); } -- cgit v1.2.3 From 7c8998f75d2d42ddefb172239b0f689392958309 Mon Sep 17 00:00:00 2001 From: Jinyoung Choi Date: Thu, 3 Aug 2023 11:48:27 +0900 Subject: block: make bvec_try_merge_hw_page() non-static This will be used for multi-page configuration for integrity payload. Cc: Christoph Hellwig Cc: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Jinyoung Choi Tested-by: "Martin K. Petersen" Reviewed-by: "Martin K. Petersen" Link: https://lore.kernel.org/r/20230803024827epcms2p838d9e9131492c86a159fff25d195658f@epcms2p8 Signed-off-by: Jens Axboe --- block/bio.c | 2 +- block/blk.h | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index c92dda962449..8d1533af7c60 100644 --- a/block/bio.c +++ b/block/bio.c @@ -934,7 +934,7 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, * size limit. This is not for normal read/write bios, but for passthrough * or Zone Append operations that we can't split. */ -static bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, +bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, struct page *page, unsigned len, unsigned offset, bool *same_page) { diff --git a/block/blk.h b/block/blk.h index 686712e13835..9d22ec3a53bc 100644 --- a/block/blk.h +++ b/block/blk.h @@ -75,6 +75,10 @@ struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, gfp_t gfp_mask); void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs); +bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, + struct page *page, unsigned len, unsigned offset, + bool *same_page); + static inline bool biovec_phys_mergeable(struct request_queue *q, struct bio_vec *vec1, struct bio_vec *vec2) { -- cgit v1.2.3 From 80814b8e359f7207595f52702aea432a7bd61200 Mon Sep 17 00:00:00 2001 From: Jinyoung Choi Date: Thu, 3 Aug 2023 11:49:56 +0900 Subject: bio-integrity: update the payload size in bio_integrity_add_page() Previously, the bip's bi_size has been set before an integrity pages were added. If a problem occurs in the process of adding pages for bip, the bi_size mismatch problem must be dealt with. When the page is successfully added to bvec, the bi_size is updated. The parts affected by the change were also contained in this commit. Cc: Christoph Hellwig Cc: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Jinyoung Choi Tested-by: "Martin K. Petersen" Reviewed-by: "Martin K. Petersen" Link: https://lore.kernel.org/r/20230803024956epcms2p38186a17392706650c582d38ef3dbcd32@epcms2p3 Signed-off-by: Jens Axboe --- block/bio-integrity.c | 2 +- drivers/md/dm-crypt.c | 1 - drivers/nvme/host/ioctl.c | 1 - drivers/nvme/target/io-cmd-bdev.c | 3 +-- drivers/target/target_core_iblock.c | 3 +-- 5 files changed, 3 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 045553a164e0..6220a99977a4 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -137,6 +137,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, bvec_set_page(&bip->bip_vec[bip->bip_vcnt], page, len, offset); bip->bip_vcnt++; + bip->bip_iter.bi_size += len; return len; } @@ -244,7 +245,6 @@ bool bio_integrity_prep(struct bio *bio) } bip->bip_flags |= BIP_BLOCK_INTEGRITY; - bip->bip_iter.bi_size = len; bip_set_seed(bip, bio->bi_iter.bi_sector); if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 1dc6227d353e..f2662c21a6df 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1160,7 +1160,6 @@ static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio) tag_len = io->cc->on_disk_tag_size * (bio_sectors(bio) >> io->cc->sector_shift); - bip->bip_iter.bi_size = tag_len; bip->bip_iter.bi_sector = io->cc->start + io->sector; ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata), diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 5c3250f36ce7..19a5177bc360 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -118,7 +118,6 @@ static void *nvme_add_user_metadata(struct request *req, void __user *ubuf, goto out_free_meta; } - bip->bip_iter.bi_size = len; bip->bip_iter.bi_sector = seed; ret = bio_integrity_add_page(bio, virt_to_page(buf), len, offset_in_page(buf)); diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 2733e0158585..468833675cc9 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -206,12 +206,11 @@ static int nvmet_bdev_alloc_bip(struct nvmet_req *req, struct bio *bio, return PTR_ERR(bip); } - bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); /* virtual start sector must be in integrity interval units */ bip_set_seed(bip, bio->bi_iter.bi_sector >> (bi->interval_exp - SECTOR_SHIFT)); - resid = bip->bip_iter.bi_size; + resid = bio_integrity_bytes(bi, bio_sectors(bio)); while (resid > 0 && sg_miter_next(miter)) { len = min_t(size_t, miter->length, resid); rc = bio_integrity_add_page(bio, miter->page, len, diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 3d1b511ea284..a7050f63b7cc 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -689,7 +689,6 @@ iblock_alloc_bip(struct se_cmd *cmd, struct bio *bio, return PTR_ERR(bip); } - bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); /* virtual start sector must be in integrity interval units */ bip_set_seed(bip, bio->bi_iter.bi_sector >> (bi->interval_exp - SECTOR_SHIFT)); @@ -697,7 +696,7 @@ iblock_alloc_bip(struct se_cmd *cmd, struct bio *bio, pr_debug("IBLOCK BIP Size: %u Sector: %llu\n", bip->bip_iter.bi_size, (unsigned long long)bip->bip_iter.bi_sector); - resid = bip->bip_iter.bi_size; + resid = bio_integrity_bytes(bi, bio_sectors(bio)); while (resid > 0 && sg_miter_next(miter)) { len = min_t(size_t, miter->length, resid); -- cgit v1.2.3 From d1f04c2e23c99258049c6081c3147bae69e5bcb8 Mon Sep 17 00:00:00 2001 From: Jinyoung Choi Date: Thu, 3 Aug 2023 11:50:58 +0900 Subject: bio-integrity: cleanup adding integrity pages to bip's bvec. bio_integrity_add_page() returns the add length if successful, else 0, just as bio_add_page. Simply check return value checking in bio_integrity_prep to not deal with a > 0 but < len case that can't happen. Cc: Christoph Hellwig Cc: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Jinyoung Choi Tested-by: "Martin K. Petersen" Reviewed-by: "Martin K. Petersen" Link: https://lore.kernel.org/r/20230803025058epcms2p5a4d0db5da2ad967668932d463661c633@epcms2p5 Signed-off-by: Jens Axboe --- block/bio-integrity.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 6220a99977a4..c6b3bc86e1f9 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -252,27 +252,18 @@ bool bio_integrity_prep(struct bio *bio) /* Map it */ offset = offset_in_page(buf); - for (i = 0 ; i < nr_pages ; i++) { - int ret; + for (i = 0; i < nr_pages && len > 0; i++) { bytes = PAGE_SIZE - offset; - if (len <= 0) - break; - if (bytes > len) bytes = len; - ret = bio_integrity_add_page(bio, virt_to_page(buf), - bytes, offset); - - if (ret == 0) { + if (bio_integrity_add_page(bio, virt_to_page(buf), + bytes, offset) < bytes) { printk(KERN_ERR "could not attach integrity payload\n"); goto err_end_io; } - if (ret < bytes) - break; - buf += bytes; len -= bytes; offset = 0; @@ -291,7 +282,6 @@ err_end_io: bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); return false; - } EXPORT_SYMBOL(bio_integrity_prep); -- cgit v1.2.3 From 0ece1d649b6dd615925a72bc1824d6b9fa5b998a Mon Sep 17 00:00:00 2001 From: Jinyoung Choi Date: Thu, 3 Aug 2023 11:52:02 +0900 Subject: bio-integrity: create multi-page bvecs in bio_integrity_add_page() In general, the bvec data structure consists of one for physically continuous pages. But, in the bvec configuration for bip, physically continuous integrity pages are composed of each bvec. Allow bio_integrity_add_page() to create multi-page bvecs, just like the bio payloads. This simplifies adding larger payloads, and fixes support for non-tiny workloads with nvme, which stopped using scatterlist for metadata a while ago. Cc: Christoph Hellwig Cc: Martin K. Petersen Fixes: 783b94bd9250 ("nvme-pci: do not build a scatterlist to map metadata") Reviewed-by: Christoph Hellwig Signed-off-by: Jinyoung Choi Tested-by: "Martin K. Petersen" Reviewed-by: "Martin K. Petersen" Link: https://lore.kernel.org/r/20230803025202epcms2p82f57cbfe32195da38c776377b55aed59@epcms2p8 Signed-off-by: Jens Axboe --- block/bio-integrity.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index c6b3bc86e1f9..ec8ac8cf6e1b 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -123,17 +123,34 @@ void bio_integrity_free(struct bio *bio) int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { + struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct bio_integrity_payload *bip = bio_integrity(bio); - if (bip->bip_vcnt >= bip->bip_max_vcnt) { - printk(KERN_ERR "%s: bip_vec full\n", __func__); + if (((bip->bip_iter.bi_size + len) >> SECTOR_SHIFT) > + queue_max_hw_sectors(q)) return 0; - } - if (bip->bip_vcnt && - bvec_gap_to_prev(&bdev_get_queue(bio->bi_bdev)->limits, - &bip->bip_vec[bip->bip_vcnt - 1], offset)) - return 0; + if (bip->bip_vcnt > 0) { + struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1]; + bool same_page = false; + + if (bvec_try_merge_hw_page(q, bv, page, len, offset, + &same_page)) { + bip->bip_iter.bi_size += len; + return len; + } + + if (bip->bip_vcnt >= + min(bip->bip_max_vcnt, queue_max_integrity_segments(q))) + return 0; + + /* + * If the queue doesn't support SG gaps and adding this segment + * would create a gap, disallow it. + */ + if (bvec_gap_to_prev(&q->limits, bv, offset)) + return 0; + } bvec_set_page(&bip->bip_vec[bip->bip_vcnt], page, len, offset); bip->bip_vcnt++; -- cgit v1.2.3 From 4eb44d10766ac0fae5973998fd2a0103df1d3fe1 Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Thu, 10 Aug 2023 11:51:11 +0800 Subject: block: remove init_mutex and open-code blk_iolatency_try_init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit a13696b83da4 ("blk-iolatency: Make initialization lazy") adds a mutex named "init_mutex" in blk_iolatency_try_init for the race condition of initializing RQ_QOS_LATENCY. Now a new lock has been add to struct request_queue by commit a13bd91be223 ("block/rq_qos: protect rq_qos apis with a new lock"). And it has been held in blkg_conf_open_bdev before calling blk_iolatency_init. So it's not necessary to keep init_mutex in blk_iolatency_try_init, just remove it. Since init_mutex has been removed, blk_iolatency_try_init can be open-coded back to iolatency_set_limit() like ioc_qos_write(). Signed-off-by: Li Lingfeng Reviewed-by: Michal Koutný Link: https://lore.kernel.org/r/20230810035111.2236335-1-lilingfeng@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-iolatency.c | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) (limited to 'block') diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index fd5fec989e39..c16aef4be036 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -824,29 +824,6 @@ static void iolatency_clear_scaling(struct blkcg_gq *blkg) } } -static int blk_iolatency_try_init(struct blkg_conf_ctx *ctx) -{ - static DEFINE_MUTEX(init_mutex); - int ret; - - ret = blkg_conf_open_bdev(ctx); - if (ret) - return ret; - - /* - * blk_iolatency_init() may fail after rq_qos_add() succeeds which can - * confuse iolat_rq_qos() test. Make the test and init atomic. - */ - mutex_lock(&init_mutex); - - if (!iolat_rq_qos(ctx->bdev->bd_queue)) - ret = blk_iolatency_init(ctx->bdev->bd_disk); - - mutex_unlock(&init_mutex); - - return ret; -} - static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -861,7 +838,17 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, blkg_conf_init(&ctx, buf); - ret = blk_iolatency_try_init(&ctx); + ret = blkg_conf_open_bdev(&ctx); + if (ret) + goto out; + + /* + * blk_iolatency_init() may fail after rq_qos_add() succeeds which can + * confuse iolat_rq_qos() test. Make the test and init atomic. + */ + lockdep_assert_held(ctx.bdev->bd_queue->rq_qos_mutex); + if (!iolat_rq_qos(ctx.bdev->bd_queue)) + ret = blk_iolatency_init(ctx.bdev->bd_disk); if (ret) goto out; -- cgit v1.2.3 From 18267a0365d6ec8bbe85ba8cbea5af12d9e59610 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Aug 2023 17:24:53 -0600 Subject: block: fix bad lockdep annotation in blk-iolatency A previous commit added a lockdep annotation, but botched it. Use the right type. Fixes: 4eb44d10766a ("block: remove init_mutex and open-code blk_iolatency_try_init") Signed-off-by: Jens Axboe --- block/blk-iolatency.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index c16aef4be036..c1a6aba1d59e 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -846,7 +846,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, * blk_iolatency_init() may fail after rq_qos_add() succeeds which can * confuse iolat_rq_qos() test. Make the test and init atomic. */ - lockdep_assert_held(ctx.bdev->bd_queue->rq_qos_mutex); + lockdep_assert_held(&ctx.bdev->bd_queue->rq_qos_mutex); if (!iolat_rq_qos(ctx.bdev->bd_queue)) ret = blk_iolatency_init(ctx.bdev->bd_disk); if (ret) -- cgit v1.2.3 From 7ba3792718709d410be5d971732b9251cbda67b6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Aug 2023 14:26:34 -0400 Subject: block: Add some exports for bcachefs - bio_set_pages_dirty(), bio_check_pages_dirty() - dio path - blk_status_to_str() - error messages - bio_add_folio() - this should definitely be exported for everyone, it's the modern version of bio_add_page() Signed-off-by: Kent Overstreet Cc: linux-block@vger.kernel.org Cc: Jens Axboe Signed-off-by: Kent Overstreet Link: https://lore.kernel.org/r/20230813182636.2966159-2-kent.overstreet@linux.dev Signed-off-by: Jens Axboe --- block/bio.c | 2 ++ block/blk-core.c | 1 + block/blk.h | 1 - include/linux/blkdev.h | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 8d1533af7c60..bb3ea4e05d4c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1472,6 +1472,7 @@ void bio_set_pages_dirty(struct bio *bio) set_page_dirty_lock(bvec->bv_page); } } +EXPORT_SYMBOL_GPL(bio_set_pages_dirty); /* * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. @@ -1531,6 +1532,7 @@ defer: spin_unlock_irqrestore(&bio_dirty_lock, flags); schedule_work(&bio_dirty_work); } +EXPORT_SYMBOL_GPL(bio_check_pages_dirty); static inline bool bio_remaining_done(struct bio *bio) { diff --git a/block/blk-core.c b/block/blk-core.c index 99d8b9812b18..141e54545cc1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -208,6 +208,7 @@ const char *blk_status_to_str(blk_status_t status) return ""; return blk_errors[idx].name; } +EXPORT_SYMBOL_GPL(blk_status_to_str); /** * blk_sync_queue - cancel any pending callbacks on a queue diff --git a/block/blk.h b/block/blk.h index 9d22ec3a53bc..08a358bc0919 100644 --- a/block/blk.h +++ b/block/blk.h @@ -254,7 +254,6 @@ static inline void bio_integrity_free(struct bio *bio) unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); -const char *blk_status_to_str(blk_status_t status); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2f5371b8482c..4feed1fc141f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -847,6 +847,7 @@ extern const char *blk_op_str(enum req_op op); int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); +const char *blk_status_to_str(blk_status_t status); /* only poll the hardware once, don't continue until a completion was found */ #define BLK_POLL_ONESHOT (1 << 0) -- cgit v1.2.3 From 168145f617d57bf4e474901b7ffa869337a802e6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Aug 2023 14:26:35 -0400 Subject: block: Allow bio_iov_iter_get_pages() with bio->bi_bdev unset bio_iov_iter_get_pages() trims the IO based on the block size of the block device the IO will be issued to. However, bcachefs is a multi device filesystem; when we're creating the bio we don't yet know which block device the bio will be submitted to - we have to handle the alignment checks elsewhere. Thus this is needed to avoid a null ptr deref. Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: linux-block@vger.kernel.org Link: https://lore.kernel.org/r/20230813182636.2966159-3-kent.overstreet@linux.dev Signed-off-by: Jens Axboe --- block/bio.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index bb3ea4e05d4c..be484d87142b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1231,7 +1231,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) struct page **pages = (struct page **)bv; ssize_t size, left; unsigned len, i = 0; - size_t offset, trim; + size_t offset; int ret = 0; /* @@ -1260,10 +1260,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); - trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1); - iov_iter_revert(iter, trim); + if (bio->bi_bdev) { + size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1); + iov_iter_revert(iter, trim); + size -= trim; + } - size -= trim; if (unlikely(!size)) { ret = -EFAULT; goto out; -- cgit v1.2.3 From 649f070e69739d22c57c22dbce0788b72cd93fac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Aug 2023 14:26:36 -0400 Subject: block: Bring back zero_fill_bio_iter This reverts 6f822e1b5d9dda3d20e87365de138046e3baa03a - this helper is used by bcachefs. Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: linux-block@vger.kernel.org Link: https://lore.kernel.org/r/20230813182636.2966159-4-kent.overstreet@linux.dev Signed-off-by: Jens Axboe --- block/bio.c | 6 +++--- include/linux/bio.h | 7 ++++++- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index be484d87142b..816d412c06e9 100644 --- a/block/bio.c +++ b/block/bio.c @@ -606,15 +606,15 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) } EXPORT_SYMBOL(bio_kmalloc); -void zero_fill_bio(struct bio *bio) +void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) { struct bio_vec bv; struct bvec_iter iter; - bio_for_each_segment(bv, bio, iter) + __bio_for_each_segment(bv, bio, iter, start) memzero_bvec(&bv); } -EXPORT_SYMBOL(zero_fill_bio); +EXPORT_SYMBOL(zero_fill_bio_iter); /** * bio_truncate - truncate the bio to small size of @new_size diff --git a/include/linux/bio.h b/include/linux/bio.h index c4f5b5228105..8b99210eb7fb 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -488,7 +488,12 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, extern void bio_copy_data(struct bio *dst, struct bio *src); extern void bio_free_pages(struct bio *bio); void guard_bio_eod(struct bio *bio); -void zero_fill_bio(struct bio *bio); +void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter); + +static inline void zero_fill_bio(struct bio *bio) +{ + zero_fill_bio_iter(bio, bio->bi_iter); +} static inline void bio_release_pages(struct bio *bio, bool mark_dirty) { -- cgit v1.2.3 From ec14a87ee1999b19d8b7ed0fa95fea80644624ae Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Aug 2023 09:56:23 -1000 Subject: blk-cgroup: Fix NULL deref caused by blkg_policy_data being installed before init blk-iocost sometimes causes the following crash: BUG: kernel NULL pointer dereference, address: 00000000000000e0 ... RIP: 0010:_raw_spin_lock+0x17/0x30 Code: be 01 02 00 00 e8 79 38 39 ff 31 d2 89 d0 5d c3 0f 1f 00 0f 1f 44 00 00 55 48 89 e5 65 ff 05 48 d0 34 7e b9 01 00 00 00 31 c0 0f b1 0f 75 02 5d c3 89 c6 e8 ea 04 00 00 5d c3 0f 1f 84 00 00 RSP: 0018:ffffc900023b3d40 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 00000000000000e0 RCX: 0000000000000001 RDX: ffffc900023b3d20 RSI: ffffc900023b3cf0 RDI: 00000000000000e0 RBP: ffffc900023b3d40 R08: ffffc900023b3c10 R09: 0000000000000003 R10: 0000000000000064 R11: 000000000000000a R12: ffff888102337000 R13: fffffffffffffff2 R14: ffff88810af408c8 R15: ffff8881070c3600 FS: 00007faaaf364fc0(0000) GS:ffff88842fdc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000e0 CR3: 00000001097b1000 CR4: 0000000000350ea0 Call Trace: ioc_weight_write+0x13d/0x410 cgroup_file_write+0x7a/0x130 kernfs_fop_write_iter+0xf5/0x170 vfs_write+0x298/0x370 ksys_write+0x5f/0xb0 __x64_sys_write+0x1b/0x20 do_syscall_64+0x3d/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 This happens because iocg->ioc is NULL. The field is initialized by ioc_pd_init() and never cleared. The NULL deref is caused by blkcg_activate_policy() installing blkg_policy_data before initializing it. blkcg_activate_policy() was doing the following: 1. Allocate pd's for all existing blkg's and install them in blkg->pd[]. 2. Initialize all pd's. 3. Online all pd's. blkcg_activate_policy() only grabs the queue_lock and may release and re-acquire the lock as allocation may need to sleep. ioc_weight_write() grabs blkcg->lock and iterates all its blkg's. The two can race and if ioc_weight_write() runs during #1 or between #1 and #2, it can encounter a pd which is not initialized yet, leading to crash. The crash can be reproduced with the following script: #!/bin/bash echo +io > /sys/fs/cgroup/cgroup.subtree_control systemd-run --unit touch-sda --scope dd if=/dev/sda of=/dev/null bs=1M count=1 iflag=direct echo 100 > /sys/fs/cgroup/system.slice/io.weight bash -c "echo '8:0 enable=1' > /sys/fs/cgroup/io.cost.qos" & sleep .2 echo 100 > /sys/fs/cgroup/system.slice/io.weight with the following patch applied: > diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c > index fc49be622e05..38d671d5e10c 100644 > --- a/block/blk-cgroup.c > +++ b/block/blk-cgroup.c > @@ -1553,6 +1553,12 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) > pd->online = false; > } > > + if (system_state == SYSTEM_RUNNING) { > + spin_unlock_irq(&q->queue_lock); > + ssleep(1); > + spin_lock_irq(&q->queue_lock); > + } > + > /* all allocated, init in the same order */ > if (pol->pd_init_fn) > list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) I don't see a reason why all pd's should be allocated, initialized and onlined together. The only ordering requirement is that parent blkgs to be initialized and onlined before children, which is guaranteed from the walking order. Let's fix the bug by allocating, initializing and onlining pd for each blkg and holding blkcg->lock over initialization and onlining. This ensures that an installed blkg is always fully initialized and onlined removing the the race window. Signed-off-by: Tejun Heo Reported-by: Breno Leitao Fixes: 9d179b865449 ("blkcg: Fix multiple bugs in blkcg_activate_policy()") Link: https://lore.kernel.org/r/ZN0p5_W-Q9mAHBVY@slm.duckdns.org Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index fc49be622e05..638400bf049f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1509,7 +1509,7 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) retry: spin_lock_irq(&q->queue_lock); - /* blkg_list is pushed at the head, reverse walk to allocate parents first */ + /* blkg_list is pushed at the head, reverse walk to initialize parents first */ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { struct blkg_policy_data *pd; @@ -1547,21 +1547,20 @@ retry: goto enomem; } - blkg->pd[pol->plid] = pd; + spin_lock(&blkg->blkcg->lock); + pd->blkg = blkg; pd->plid = pol->plid; - pd->online = false; - } + blkg->pd[pol->plid] = pd; - /* all allocated, init in the same order */ - if (pol->pd_init_fn) - list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) - pol->pd_init_fn(blkg->pd[pol->plid]); + if (pol->pd_init_fn) + pol->pd_init_fn(pd); - list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { if (pol->pd_online_fn) - pol->pd_online_fn(blkg->pd[pol->plid]); - blkg->pd[pol->plid]->online = true; + pol->pd_online_fn(pd); + pd->online = true; + + spin_unlock(&blkg->blkcg->lock); } __set_bit(pol->plid, q->blkcg_pols); @@ -1578,14 +1577,19 @@ out: return ret; enomem: - /* alloc failed, nothing's initialized yet, free everything */ + /* alloc failed, take down everything */ spin_lock_irq(&q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; + struct blkg_policy_data *pd; spin_lock(&blkcg->lock); - if (blkg->pd[pol->plid]) { - pol->pd_free_fn(blkg->pd[pol->plid]); + pd = blkg->pd[pol->plid]; + if (pd) { + if (pd->online && pol->pd_offline_fn) + pol->pd_offline_fn(pd); + pd->online = false; + pol->pd_free_fn(pd); blkg->pd[pol->plid] = NULL; } spin_unlock(&blkcg->lock); -- cgit v1.2.3 From e1dd7bc93029024af5688253b0c05181d6e01f8e Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 21 Aug 2023 17:56:00 +0800 Subject: blk-mq: fix tags leak when shrink nr_hw_queues Although we don't need to realloc set->tags[] when shrink nr_hw_queues, we need to free them. Or these tags will be leaked. How to reproduce: 1. mount -t configfs configfs /mnt 2. modprobe null_blk nr_devices=0 submit_queues=8 3. mkdir /mnt/nullb/nullb0 4. echo 1 > /mnt/nullb/nullb0/power 5. echo 4 > /mnt/nullb/nullb0/submit_queues 6. rmdir /mnt/nullb/nullb0 In step 4, will alloc 9 tags (8 submit queues and 1 poll queue), then in step 5, new_nr_hw_queues = 5 (4 submit queues and 1 poll queue). At last in step 6, only these 5 tags are freed, the other 4 tags leaked. Signed-off-by: Chengming Zhou Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230821095602.70742-1-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 687ec3f4f10d..afad6d06eaf7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4379,9 +4379,13 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, int new_nr_hw_queues) { struct blk_mq_tags **new_tags; + int i; - if (set->nr_hw_queues >= new_nr_hw_queues) + if (set->nr_hw_queues >= new_nr_hw_queues) { + for (i = new_nr_hw_queues; i < set->nr_hw_queues; i++) + __blk_mq_free_map_and_rqs(set, i); goto done; + } new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); -- cgit v1.2.3 From 2bc4d7a355a4d617452eaf1b21d6d261194b3667 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 21 Aug 2023 17:56:01 +0800 Subject: blk-mq: delete redundant tagset map update when fallback When we increase nr_hw_queues fail, the fallback path will use blk_mq_update_queue_map() to clear and update all maps. Obviously, this line of update of HCTX_TYPE_DEFAULT only is not needed, so delete it. Signed-off-by: Chengming Zhou Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230821095602.70742-2-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index afad6d06eaf7..22397ba815ca 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4730,7 +4730,6 @@ fallback: __blk_mq_free_map_and_rqs(set, i); set->nr_hw_queues = prev_nr_hw_queues; - blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); goto fallback; } blk_mq_map_swqueue(q); -- cgit v1.2.3 From 7222657e51b5626d10154b3e48ad441c33b5da96 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 21 Aug 2023 17:56:02 +0800 Subject: blk-mq: prealloc tags when increase tagset nr_hw_queues Just like blk_mq_alloc_tag_set(), it's better to prepare all tags before using to map to queue ctxs in blk_mq_map_swqueue(), which now have to consider empty set->tags[]. The good point is that we can fallback easily if increasing nr_hw_queues fail, instead of just mapping to hctx[0] when fail in blk_mq_map_swqueue(). And the fallback path already has tags free & clean handling, so all is good. Signed-off-by: Chengming Zhou Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230821095602.70742-3-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-mq.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 22397ba815ca..84400157c5f4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4397,6 +4397,16 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, sizeof(*set->tags)); kfree(set->tags); set->tags = new_tags; + + for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) { + if (!__blk_mq_alloc_map_and_rqs(set, i)) { + while (--i >= set->nr_hw_queues) + __blk_mq_free_map_and_rqs(set, i); + return -ENOMEM; + } + cond_resched(); + } + done: set->nr_hw_queues = new_nr_hw_queues; return 0; -- cgit v1.2.3 From 9fb10726ecc5145550180aec4fd0adf0a7b1d634 Mon Sep 17 00:00:00 2001 From: Greg Joyce Date: Fri, 21 Jul 2023 16:15:32 -0500 Subject: block: sed-opal: Implement IOC_OPAL_DISCOVERY Add IOC_OPAL_DISCOVERY ioctl to return raw discovery data to a SED Opal application. This allows the application to display drive capabilities and state. Signed-off-by: Greg Joyce Reviewed-by: Christoph Hellwig Reviewed-by: Jonathan Derrick Acked-by: Jarkko Sakkinen Link: https://lore.kernel.org/r/20230721211534.3437070-2-gjoyce@linux.vnet.ibm.com Signed-off-by: Jens Axboe --- block/sed-opal.c | 38 +++++++++++++++++++++++++++++++++++--- include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 6 ++++++ 3 files changed, 42 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/sed-opal.c b/block/sed-opal.c index c18339446ef3..67c6c4f2b4b0 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -463,8 +463,11 @@ out_error: return error; } -static int opal_discovery0_end(struct opal_dev *dev) +static int opal_discovery0_end(struct opal_dev *dev, void *data) { + struct opal_discovery *discv_out = data; /* may be NULL */ + u8 __user *buf_out; + u64 len_out; bool found_com_id = false, supported = true, single_user = false; const struct d0_header *hdr = (struct d0_header *)dev->resp; const u8 *epos = dev->resp, *cpos = dev->resp; @@ -480,6 +483,15 @@ static int opal_discovery0_end(struct opal_dev *dev) return -EFAULT; } + if (discv_out) { + buf_out = (u8 __user *)(uintptr_t)discv_out->data; + len_out = min_t(u64, discv_out->size, hlen); + if (buf_out && copy_to_user(buf_out, dev->resp, len_out)) + return -EFAULT; + + discv_out->size = hlen; /* actual size of data */ + } + epos += hlen; /* end of buffer */ cpos += sizeof(*hdr); /* current position on buffer */ @@ -565,13 +577,13 @@ static int opal_discovery0(struct opal_dev *dev, void *data) if (ret) return ret; - return opal_discovery0_end(dev); + return opal_discovery0_end(dev, data); } static int opal_discovery0_step(struct opal_dev *dev) { const struct opal_step discovery0_step = { - opal_discovery0, + opal_discovery0, NULL }; return execute_step(dev, &discovery0_step, 0); @@ -2435,6 +2447,22 @@ static int opal_secure_erase_locking_range(struct opal_dev *dev, return ret; } +static int opal_get_discv(struct opal_dev *dev, struct opal_discovery *discv) +{ + const struct opal_step discovery0_step = { + opal_discovery0, discv + }; + int ret = 0; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_step(dev, &discovery0_step, 0); + mutex_unlock(&dev->dev_lock); + if (ret) + return ret; + return discv->size; /* modified to actual length of data */ +} + static int opal_erase_locking_range(struct opal_dev *dev, struct opal_session_info *opal_session) { @@ -3056,6 +3084,10 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_GET_GEOMETRY: ret = opal_get_geometry(dev, arg); break; + case IOC_OPAL_DISCOVERY: + ret = opal_get_discv(dev, p); + break; + default: break; } diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index bbae1e52ab4f..ef65f589fbeb 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -47,6 +47,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_GET_STATUS: case IOC_OPAL_GET_LR_STATUS: case IOC_OPAL_GET_GEOMETRY: + case IOC_OPAL_DISCOVERY: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index dc2efd345133..7f5732c5bdc5 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -173,6 +173,11 @@ struct opal_geometry { __u8 __align[3]; }; +struct opal_discovery { + __u64 data; + __u64 size; +}; + #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) #define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) #define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) @@ -192,5 +197,6 @@ struct opal_geometry { #define IOC_OPAL_GET_STATUS _IOR('p', 236, struct opal_status) #define IOC_OPAL_GET_LR_STATUS _IOW('p', 237, struct opal_lr_status) #define IOC_OPAL_GET_GEOMETRY _IOR('p', 238, struct opal_geometry) +#define IOC_OPAL_DISCOVERY _IOW('p', 239, struct opal_discovery) #endif /* _UAPI_SED_OPAL_H */ -- cgit v1.2.3 From 5c82efc1aee8eb0919aa67a0d2559de5a326bd7c Mon Sep 17 00:00:00 2001 From: Greg Joyce Date: Fri, 21 Jul 2023 16:15:33 -0500 Subject: block: sed-opal: Implement IOC_OPAL_REVERT_LSP This is used in conjunction with IOC_OPAL_REVERT_TPR to return a drive to Original Factory State without erasing the data. If IOC_OPAL_REVERT_LSP is called with opal_revert_lsp.options bit OPAL_PRESERVE set prior to calling IOC_OPAL_REVERT_TPR, the drive global locking range will not be erased. Signed-off-by: Greg Joyce Reviewed-by: Christoph Hellwig Reviewed-by: Jonathan Derrick Acked-by: Jarkko Sakkinen Link: https://lore.kernel.org/r/20230721211534.3437070-3-gjoyce@linux.vnet.ibm.com Signed-off-by: Jens Axboe --- block/opal_proto.h | 4 ++++ block/sed-opal.c | 40 ++++++++++++++++++++++++++++++++++++++++ include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 11 +++++++++++ 4 files changed, 56 insertions(+) (limited to 'block') diff --git a/block/opal_proto.h b/block/opal_proto.h index a4e56845dd82..dec7ce3a3edb 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -225,6 +225,10 @@ enum opal_parameter { OPAL_SUM_SET_LIST = 0x060000, }; +enum opal_revertlsp { + OPAL_KEEP_GLOBAL_RANGE_KEY = 0x060000, +}; + /* Packets derived from: * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 * Secion: 3.2.3 ComPackets, Packets & Subpackets diff --git a/block/sed-opal.c b/block/sed-opal.c index 67c6c4f2b4b0..e2aed7f4ebdf 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1769,6 +1769,26 @@ static int internal_activate_user(struct opal_dev *dev, void *data) return finalize_and_send(dev, parse_and_check_status); } +static int revert_lsp(struct opal_dev *dev, void *data) +{ + struct opal_revert_lsp *rev = data; + int err; + + err = cmd_start(dev, opaluid[OPAL_THISSP_UID], + opalmethod[OPAL_REVERTSP]); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u64(&err, dev, OPAL_KEEP_GLOBAL_RANGE_KEY); + add_token_u8(&err, dev, (rev->options & OPAL_PRESERVE) ? + OPAL_TRUE : OPAL_FALSE); + add_token_u8(&err, dev, OPAL_ENDNAME); + if (err) { + pr_debug("Error building REVERT SP command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + static int erase_locking_range(struct opal_dev *dev, void *data) { struct opal_session_info *session = data; @@ -2463,6 +2483,23 @@ static int opal_get_discv(struct opal_dev *dev, struct opal_discovery *discv) return discv->size; /* modified to actual length of data */ } +static int opal_revertlsp(struct opal_dev *dev, struct opal_revert_lsp *rev) +{ + /* controller will terminate session */ + const struct opal_step steps[] = { + { start_admin1LSP_opal_session, &rev->key }, + { revert_lsp, rev } + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, steps, ARRAY_SIZE(steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + static int opal_erase_locking_range(struct opal_dev *dev, struct opal_session_info *opal_session) { @@ -3084,6 +3121,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_GET_GEOMETRY: ret = opal_get_geometry(dev, arg); break; + case IOC_OPAL_REVERT_LSP: + ret = opal_revertlsp(dev, p); + break; case IOC_OPAL_DISCOVERY: ret = opal_get_discv(dev, p); break; diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index ef65f589fbeb..2f189546e133 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -48,6 +48,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_GET_LR_STATUS: case IOC_OPAL_GET_GEOMETRY: case IOC_OPAL_DISCOVERY: + case IOC_OPAL_REVERT_LSP: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 7f5732c5bdc5..4e10675751b4 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -56,6 +56,10 @@ struct opal_key { __u8 key[OPAL_KEY_MAX]; }; +enum opal_revert_lsp_opts { + OPAL_PRESERVE = 0x01, +}; + struct opal_lr_act { struct opal_key key; __u32 sum; @@ -178,6 +182,12 @@ struct opal_discovery { __u64 size; }; +struct opal_revert_lsp { + struct opal_key key; + __u32 options; + __u32 __pad; +}; + #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) #define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) #define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) @@ -198,5 +208,6 @@ struct opal_discovery { #define IOC_OPAL_GET_LR_STATUS _IOW('p', 237, struct opal_lr_status) #define IOC_OPAL_GET_GEOMETRY _IOR('p', 238, struct opal_geometry) #define IOC_OPAL_DISCOVERY _IOW('p', 239, struct opal_discovery) +#define IOC_OPAL_REVERT_LSP _IOW('p', 240, struct opal_revert_lsp) #endif /* _UAPI_SED_OPAL_H */ -- cgit v1.2.3 From 3bfeb61256643281ac4be5b8a57e9d9da3db4335 Mon Sep 17 00:00:00 2001 From: Greg Joyce Date: Fri, 21 Jul 2023 16:15:34 -0500 Subject: block: sed-opal: keyring support for SED keys Extend the SED block driver so it can alternatively obtain a key from a sed-opal kernel keyring. The SED ioctls will indicate the source of the key, either directly in the ioctl data or from the keyring. This allows the use of SED commands in scripts such as udev scripts so that drives may be automatically unlocked as they become available. Signed-off-by: Greg Joyce Reviewed-by: Jonathan Derrick Acked-by: Jarkko Sakkinen Link: https://lore.kernel.org/r/20230721211534.3437070-4-gjoyce@linux.vnet.ibm.com Signed-off-by: Jens Axboe --- block/Kconfig | 2 + block/sed-opal.c | 174 +++++++++++++++++++++++++++++++++++++++++- include/linux/sed-opal.h | 3 + include/uapi/linux/sed-opal.h | 8 +- 4 files changed, 184 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index 1a13ef0b1ca1..f1364d1c0d93 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -184,6 +184,8 @@ config BLK_DEBUG_FS_ZONED config BLK_SED_OPAL bool "Logic for interfacing with Opal enabled SEDs" + depends on KEYS + select PSERIES_PLPKS if PPC_PSERIES help Builds Logic for interfacing with Opal enabled controllers. Enabling this option enables users to setup/unlock/lock diff --git a/block/sed-opal.c b/block/sed-opal.c index e2aed7f4ebdf..6d7f25d1711b 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -20,6 +20,9 @@ #include #include #include +#include +#include +#include #include "opal_proto.h" @@ -29,6 +32,8 @@ /* Number of bytes needed by cmd_finalize. */ #define CMD_FINALIZE_BYTES_NEEDED 7 +static struct key *sed_opal_keyring; + struct opal_step { int (*fn)(struct opal_dev *dev, void *data); void *data; @@ -269,6 +274,101 @@ static void print_buffer(const u8 *ptr, u32 length) #endif } +/* + * Allocate/update a SED Opal key and add it to the SED Opal keyring. + */ +static int update_sed_opal_key(const char *desc, u_char *key_data, int keylen) +{ + key_ref_t kr; + + if (!sed_opal_keyring) + return -ENOKEY; + + kr = key_create_or_update(make_key_ref(sed_opal_keyring, true), "user", + desc, (const void *)key_data, keylen, + KEY_USR_VIEW | KEY_USR_SEARCH | KEY_USR_WRITE, + KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_BUILT_IN | + KEY_ALLOC_BYPASS_RESTRICTION); + if (IS_ERR(kr)) { + pr_err("Error adding SED key (%ld)\n", PTR_ERR(kr)); + return PTR_ERR(kr); + } + + return 0; +} + +/* + * Read a SED Opal key from the SED Opal keyring. + */ +static int read_sed_opal_key(const char *key_name, u_char *buffer, int buflen) +{ + int ret; + key_ref_t kref; + struct key *key; + + if (!sed_opal_keyring) + return -ENOKEY; + + kref = keyring_search(make_key_ref(sed_opal_keyring, true), + &key_type_user, key_name, true); + + if (IS_ERR(kref)) + ret = PTR_ERR(kref); + + key = key_ref_to_ptr(kref); + down_read(&key->sem); + ret = key_validate(key); + if (ret == 0) { + if (buflen > key->datalen) + buflen = key->datalen; + + ret = key->type->read(key, (char *)buffer, buflen); + } + up_read(&key->sem); + + key_ref_put(kref); + + return ret; +} + +static int opal_get_key(struct opal_dev *dev, struct opal_key *key) +{ + int ret = 0; + + switch (key->key_type) { + case OPAL_INCLUDED: + /* the key is ready to use */ + break; + case OPAL_KEYRING: + /* the key is in the keyring */ + ret = read_sed_opal_key(OPAL_AUTH_KEY, key->key, OPAL_KEY_MAX); + if (ret > 0) { + if (ret > U8_MAX) { + ret = -ENOSPC; + goto error; + } + key->key_len = ret; + key->key_type = OPAL_INCLUDED; + } + break; + default: + ret = -EINVAL; + break; + } + if (ret < 0) + goto error; + + /* must have a PEK by now or it's an error */ + if (key->key_type != OPAL_INCLUDED || key->key_len == 0) { + ret = -EINVAL; + goto error; + } + return 0; +error: + pr_debug("Error getting password: %d\n", ret); + return ret; +} + static bool check_tper(const void *data) { const struct d0_tper_features *tper = data; @@ -2459,6 +2559,9 @@ static int opal_secure_erase_locking_range(struct opal_dev *dev, }; int ret; + ret = opal_get_key(dev, &opal_session->opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps)); @@ -2492,6 +2595,9 @@ static int opal_revertlsp(struct opal_dev *dev, struct opal_revert_lsp *rev) }; int ret; + ret = opal_get_key(dev, &rev->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, steps, ARRAY_SIZE(steps)); @@ -2510,6 +2616,9 @@ static int opal_erase_locking_range(struct opal_dev *dev, }; int ret; + ret = opal_get_key(dev, &opal_session->opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps)); @@ -2538,6 +2647,9 @@ static int opal_enable_disable_shadow_mbr(struct opal_dev *dev, opal_mbr->enable_disable != OPAL_MBR_DISABLE) return -EINVAL; + ret = opal_get_key(dev, &opal_mbr->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); @@ -2563,6 +2675,9 @@ static int opal_set_mbr_done(struct opal_dev *dev, mbr_done->done_flag != OPAL_MBR_NOT_DONE) return -EINVAL; + ret = opal_get_key(dev, &mbr_done->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); @@ -2584,6 +2699,9 @@ static int opal_write_shadow_mbr(struct opal_dev *dev, if (info->size == 0) return 0; + ret = opal_get_key(dev, &info->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); @@ -2641,6 +2759,9 @@ static int opal_add_user_to_lr(struct opal_dev *dev, return -EINVAL; } + ret = opal_get_key(dev, &lk_unlk->session.opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, steps, ARRAY_SIZE(steps)); @@ -2663,6 +2784,10 @@ static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal, bool psi int ret; + ret = opal_get_key(dev, opal); + + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); if (psid) @@ -2763,6 +2888,9 @@ static int opal_lock_unlock(struct opal_dev *dev, if (lk_unlk->session.who > OPAL_USER9) return -EINVAL; + ret = opal_get_key(dev, &lk_unlk->session.opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); opal_lock_check_for_saved_key(dev, lk_unlk); ret = __opal_lock_unlock(dev, lk_unlk); @@ -2786,6 +2914,9 @@ static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal) if (!dev) return -ENODEV; + ret = opal_get_key(dev, opal); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, owner_steps, ARRAY_SIZE(owner_steps)); @@ -2808,6 +2939,9 @@ static int opal_activate_lsp(struct opal_dev *dev, if (!opal_lr_act->num_lrs || opal_lr_act->num_lrs > OPAL_MAX_LRS) return -EINVAL; + ret = opal_get_key(dev, &opal_lr_act->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, active_steps, ARRAY_SIZE(active_steps)); @@ -2826,6 +2960,9 @@ static int opal_setup_locking_range(struct opal_dev *dev, }; int ret; + ret = opal_get_key(dev, &opal_lrs->session.opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps)); @@ -2879,6 +3016,14 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) ret = execute_steps(dev, pw_steps, ARRAY_SIZE(pw_steps)); mutex_unlock(&dev->dev_lock); + if (ret) + return ret; + + /* update keyring with new password */ + ret = update_sed_opal_key(OPAL_AUTH_KEY, + opal_pw->new_user_pw.opal_key.key, + opal_pw->new_user_pw.opal_key.key_len); + return ret; } @@ -2899,6 +3044,9 @@ static int opal_activate_user(struct opal_dev *dev, return -EINVAL; } + ret = opal_get_key(dev, &opal_session->opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, act_steps, ARRAY_SIZE(act_steps)); @@ -2985,6 +3133,9 @@ static int opal_generic_read_write_table(struct opal_dev *dev, { int ret, bit_set; + ret = opal_get_key(dev, &rw_tbl->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); @@ -3053,9 +3204,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (!dev) - return -ENOTSUPP; + return -EOPNOTSUPP; if (!(dev->flags & OPAL_FL_SUPPORTED)) - return -ENOTSUPP; + return -EOPNOTSUPP; if (cmd & IOC_IN) { p = memdup_user(arg, _IOC_SIZE(cmd)); @@ -3137,3 +3288,22 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) return ret; } EXPORT_SYMBOL_GPL(sed_ioctl); + +static int __init sed_opal_init(void) +{ + struct key *kr; + + kr = keyring_alloc(".sed_opal", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), + (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | + KEY_USR_READ | KEY_USR_SEARCH | KEY_USR_WRITE, + KEY_ALLOC_NOT_IN_QUOTA, + NULL, NULL); + if (IS_ERR(kr)) + return PTR_ERR(kr); + + sed_opal_keyring = kr; + + return 0; +} +late_initcall(sed_opal_init); diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 2f189546e133..2ac50822554e 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -25,6 +25,9 @@ bool opal_unlock_from_suspend(struct opal_dev *dev); struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv); int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *ioctl_ptr); +#define OPAL_AUTH_KEY "opal-boot-pin" +#define OPAL_AUTH_KEY_PREV "opal-boot-pin-prev" + static inline bool is_sed_ioctl(unsigned int cmd) { switch (cmd) { diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 4e10675751b4..d3994b7716bc 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -49,10 +49,16 @@ enum opal_lock_flags { OPAL_SAVE_FOR_LOCK = 0x01, }; +enum opal_key_type { + OPAL_INCLUDED = 0, /* key[] is the key */ + OPAL_KEYRING, /* key is in keyring */ +}; + struct opal_key { __u8 lr; __u8 key_len; - __u8 __align[6]; + __u8 key_type; + __u8 __align[5]; __u8 key[OPAL_KEY_MAX]; }; -- cgit v1.2.3 From 146afeb235ccec10c17ad8ea26327c0c79dbd968 Mon Sep 17 00:00:00 2001 From: Xu Panda Date: Sat, 3 Dec 2022 14:22:58 +0800 Subject: block: use strscpy() to instead of strncpy() The implementation of strscpy() is more robust and safer. That's now the recommended way to copy NUL terminated strings. Signed-off-by: Xu Panda Signed-off-by: Yang Yang Reviewed-by: Justin Stitt Link: https://lore.kernel.org/r/202212031422587503771@zte.com.cn Signed-off-by: Jens Axboe --- block/partitions/cmdline.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index 1af610f0ba8c..c03bc105e575 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -81,8 +81,7 @@ static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) length = min_t(int, next - partdef, sizeof(new_subpart->name) - 1); - strncpy(new_subpart->name, partdef, length); - new_subpart->name[length] = '\0'; + strscpy(new_subpart->name, partdef, length); partdef = ++next; } else @@ -140,8 +139,7 @@ static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) } length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); - strncpy(newparts->name, bdevdef, length); - newparts->name[length] = '\0'; + strscpy(newparts->name, bdevdef, length); newparts->nr_subparts = 0; next_subpart = &newparts->subpart; @@ -153,8 +151,7 @@ static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) length = (!next) ? (sizeof(buf) - 1) : min_t(int, next - bdevdef, sizeof(buf) - 1); - strncpy(buf, bdevdef, length); - buf[length] = '\0'; + strscpy(buf, bdevdef, length); ret = parse_subpart(next_subpart, buf); if (ret) @@ -267,8 +264,7 @@ static int add_part(int slot, struct cmdline_subpart *subpart, label_min = min_t(int, sizeof(info->volname) - 1, sizeof(subpart->name)); - strncpy(info->volname, subpart->name, label_min); - info->volname[label_min] = '\0'; + strscpy(info->volname, subpart->name, label_min); snprintf(tmp, sizeof(tmp), "(%s)", info->volname); strlcat(state->pp_buf, tmp, PAGE_SIZE); -- cgit v1.2.3