summaryrefslogtreecommitdiffstats
path: root/lib/scatterlist.c
diff options
context:
space:
mode:
authorDavid Howells <dhowells@redhat.com>2024-06-19 01:20:42 +0200
committerChristian Brauner <brauner@kernel.org>2024-09-12 12:20:21 +0200
commitdb0aa2e9566fda2d23dc8f6c102856ead95578a4 (patch)
treed39f09396b198bfe21a67cc96d419cda12c3fa7e /lib/scatterlist.c
parentnetfs: Use bh-disabling spinlocks for rreq->lock (diff)
downloadlinux-db0aa2e9566fda2d23dc8f6c102856ead95578a4.tar.xz
linux-db0aa2e9566fda2d23dc8f6c102856ead95578a4.zip
mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios
Define a data structure, struct folio_queue, to represent a sequence of folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a list of folio_queue structures to be used to provide a buffer to iov_iter-taking functions, such as sendmsg and recvmsg. The folio_queue structure looks like: struct folio_queue { struct folio_batch vec; u8 orders[PAGEVEC_SIZE]; struct folio_queue *next; struct folio_queue *prev; unsigned long marks; unsigned long marks2; }; It does not use a list_head so that next and/or prev can be set to NULL at the ends of the list, allowing iov_iter-handling routines to determine that they *are* the ends without needing to store a head pointer in the iov_iter struct. A folio_batch struct is used to hold the folio pointers, which allows the batch to be passed to batch handling functions. Two mark bits are available per slot. The intention is to use at least one of them to mark folios that need putting, but that might not be ultimately necessary. Accessor functions are used to access the slots to do the masking and an additional accessor function is used to indicate the size of the array. The order of each folio is also stored in the structure to avoid the need for iov_iter_advance() and iov_iter_revert() to have to query each folio to find its size. With careful barriering, this can be used as an extending buffer with new folios inserted and new folio_queue structs added without the need for a lock. Further, provided we always keep at least one struct in the buffer, we can also remove consumed folios and consumed structs from the head end as we go without the need for locks. [Questions/thoughts] (1) To manage this, I need a head pointer, a tail pointer, and a tail slot number (assuming insertion happens at the tail end and the next pointers point from head to tail). Should I put these into a struct of their own, say "folio_queue_head" or "rolling_buffer"? 
I will end up with two of these in netfs_io_request eventually, one keeping track of the pagecache I'm dealing with for buffered I/O and the other to hold a bounce buffer when we need one. (2) Should I make the slots {folio,off,len} or bio_vec? (3) This is intended to replace ITER_XARRAY eventually. Using an xarray in I/O iteration requires the taking of the RCU read lock, doing copying under the RCU read lock, walking the xarray (which may change under us), handling retries and dealing with special values. The advantage of ITER_XARRAY is that when we're dealing with the pagecache directly, we don't need any allocation - but if we're doing encrypted comms, there's a good chance we'd be using a bounce buffer anyway. This will require afs, erofs, cifs, orangefs and fscache to be converted to not use this. afs still uses it for dirs and symlinks; some of erofs usages should be easy to change, but there's one which won't be so easy; ceph's use via fscache can be fixed by porting ceph to netfslib; cifs is using xarray as a bounce buffer - that can be moved to use sheaves instead; and orangefs has a similar problem to erofs - maybe orangefs could use netfslib? Signed-off-by: David Howells <dhowells@redhat.com> cc: Matthew Wilcox <willy@infradead.org> cc: Jeff Layton <jlayton@kernel.org> cc: Steve French <sfrench@samba.org> cc: Ilya Dryomov <idryomov@gmail.com> cc: Gao Xiang <xiang@kernel.org> cc: Mike Marshall <hubcap@omnibond.com> cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org cc: linux-afs@lists.infradead.org cc: linux-cifs@vger.kernel.org cc: ceph-devel@vger.kernel.org cc: linux-erofs@lists.ozlabs.org cc: devel@lists.orangefs.org Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner <brauner@kernel.org>
Diffstat (limited to 'lib/scatterlist.c')
-rw-r--r--lib/scatterlist.c69
1 files changed, 67 insertions, 2 deletions
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index 7bc2220fea80..473b2646f71c 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -11,6 +11,7 @@
#include <linux/kmemleak.h>
#include <linux/bvec.h>
#include <linux/uio.h>
+#include <linux/folio_queue.h>
/**
* sg_next - return the next scatterlist entry in a list
@@ -1262,6 +1263,67 @@ static ssize_t extract_kvec_to_sg(struct iov_iter *iter,
}
/*
+ * Extract up to sg_max folios from a FOLIOQ-type iterator and add them to
+ * the scatterlist.  The pages are not pinned.
+ *
+ * @iter: The iterator to extract from; extraction starts at iter->folioq,
+ *	  iter->folioq_slot and iter->iov_offset.
+ * @maxsize: The maximum number of bytes to add to the scatterlist.
+ * @sgtable: The table to append to; new elements are written starting at
+ *	     sgtable->sgl[sgtable->nents] and ->nents is bumped for each one.
+ * @sg_max: The number of scatterlist elements that may be added.
+ * @extraction_flags: Not used by this extractor.
+ *
+ * Each element added describes a span within a single folio, expressed as
+ * the folio's first page plus a byte offset and length.
+ *
+ * On return the iterator's folioq, folioq_slot and iov_offset are moved past
+ * the extracted data and iter->count is reduced by the amount extracted.
+ *
+ * Returns the number of bytes added to the scatterlist.  0 is returned,
+ * without touching the iterator, if @sg_max is 0 or if the starting slot lies
+ * beyond the current queue segment and there is no following segment.
+ */
+static ssize_t extract_folioq_to_sg(struct iov_iter *iter,
+				    ssize_t maxsize,
+				    struct sg_table *sgtable,
+				    unsigned int sg_max,
+				    iov_iter_extraction_t extraction_flags)
+{
+	const struct folio_queue *folioq = iter->folioq;
+	struct scatterlist *sg = sgtable->sgl + sgtable->nents;
+	unsigned int slot = iter->folioq_slot;
+	ssize_t ret = 0;
+	size_t offset = iter->iov_offset;
+
+	BUG_ON(!folioq);
+
+	/*
+	 * The loop below only tests sg_max after its first pass, so with no
+	 * room available it would write one element beyond the caller's limit
+	 * and then wrap sg_max round to UINT_MAX.  Bail out before that.
+	 */
+	if (sg_max == 0)
+		return 0;
+
+	/* The iterator may be parked just past the last slot of a segment. */
+	if (slot >= folioq_nr_slots(folioq)) {
+		folioq = folioq->next;
+		if (WARN_ON_ONCE(!folioq))
+			return 0;
+		slot = 0;
+	}
+
+	do {
+		struct folio *folio = folioq_folio(folioq, slot);
+		size_t fsize = folioq_folio_size(folioq, slot);
+
+		if (offset < fsize) {
+			/* Take what's left of this folio, capped by maxsize. */
+			size_t part = umin(maxsize - ret, fsize - offset);
+
+			sg_set_page(sg, folio_page(folio, 0), part, offset);
+			sgtable->nents++;
+			sg++;
+			sg_max--;
+			offset += part;
+			ret += part;
+		}
+
+		/* Folio used up: step to the next slot or the next segment. */
+		if (offset >= fsize) {
+			offset = 0;
+			slot++;
+			if (slot >= folioq_nr_slots(folioq)) {
+				if (!folioq->next) {
+					/* Out of folios; complain if count says there's more. */
+					WARN_ON_ONCE(ret < iter->count);
+					break;
+				}
+				folioq = folioq->next;
+				slot = 0;
+			}
+		}
+	} while (sg_max > 0 && ret < maxsize);
+
+	/* Record the new position and account for what was extracted. */
+	iter->folioq = folioq;
+	iter->folioq_slot = slot;
+	iter->iov_offset = offset;
+	iter->count -= ret;
+	return ret;
+}
+
+/*
* Extract up to sg_max folios from an XARRAY-type iterator and add them to
* the scatterlist. The pages are not pinned.
*/
@@ -1323,8 +1385,8 @@ static ssize_t extract_xarray_to_sg(struct iov_iter *iter,
* addition of @sg_max elements.
*
* The pages referred to by UBUF- and IOVEC-type iterators are extracted and
- * pinned; BVEC-, KVEC- and XARRAY-type are extracted but aren't pinned; PIPE-
- * and DISCARD-type are not supported.
+ * pinned; BVEC-, KVEC-, FOLIOQ- and XARRAY-type are extracted but aren't
+ * pinned; DISCARD-type is not supported.
*
* No end mark is placed on the scatterlist; that's left to the caller.
*
@@ -1356,6 +1418,9 @@ ssize_t extract_iter_to_sg(struct iov_iter *iter, size_t maxsize,
case ITER_KVEC:
return extract_kvec_to_sg(iter, maxsize, sgtable, sg_max,
extraction_flags);
+ case ITER_FOLIOQ:
+ return extract_folioq_to_sg(iter, maxsize, sgtable, sg_max,
+ extraction_flags);
case ITER_XARRAY:
return extract_xarray_to_sg(iter, maxsize, sgtable, sg_max,
extraction_flags);