From 8449eedaa1da6a51d67190c905b1b54243e095f6 Mon Sep 17 00:00:00 2001
From: Stefan Bühler <source@stbuehler.de>
Date: Sat, 27 Apr 2019 20:34:19 +0200
Subject: io_uring: fix handling SQEs requesting NOWAIT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Not all request types set REQ_F_FORCE_NONBLOCK when they needed async
punting; reverse logic instead and set REQ_F_NOWAIT if request mustn't
be punted.

Signed-off-by: Stefan Bühler <source@stbuehler.de>

Merged with my previous patch for this.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0e9fb2cb1984..d5e23a6dd6aa 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -221,7 +221,7 @@ struct io_kiocb {
 	struct list_head	list;
 	unsigned int		flags;
 	refcount_t		refs;
-#define REQ_F_FORCE_NONBLOCK	1	/* inline submission attempt */
+#define REQ_F_NOWAIT		1	/* must not punt to workers */
 #define REQ_F_IOPOLL_COMPLETED	2	/* polled IO has completed */
 #define REQ_F_FIXED_FILE	4	/* ctx owns file */
 #define REQ_F_SEQ_PREV		8	/* sequential with previous */
@@ -774,10 +774,14 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
 	if (unlikely(ret))
 		return ret;
-	if (force_nonblock) {
+
+	/* don't allow async punt if RWF_NOWAIT was requested */
+	if (kiocb->ki_flags & IOCB_NOWAIT)
+		req->flags |= REQ_F_NOWAIT;
+
+	if (force_nonblock)
 		kiocb->ki_flags |= IOCB_NOWAIT;
-		req->flags |= REQ_F_FORCE_NONBLOCK;
-	}
+
 	if (ctx->flags & IORING_SETUP_IOPOLL) {
 		if (!(kiocb->ki_flags & IOCB_DIRECT) ||
 		    !kiocb->ki_filp->f_op->iopoll)
@@ -1436,8 +1440,7 @@ restart:
 		struct sqe_submit *s = &req->submit;
 		const struct io_uring_sqe *sqe = s->sqe;
 
-		/* Ensure we clear previously set forced non-block flag */
-		req->flags &= ~REQ_F_FORCE_NONBLOCK;
+		/* Ensure we clear previously set non-block flag */
 		req->rw.ki_flags &= ~IOCB_NOWAIT;
 
 		ret = 0;
@@ -1623,7 +1626,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 		goto out;
 
 	ret = __io_submit_sqe(ctx, req, s, true);
-	if (ret == -EAGAIN) {
+	if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
 		struct io_uring_sqe *sqe_copy;
 
 		sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
-- 
cgit v1.2.3


From 1e84b97b7377bd0198f87b49ad3e396e84bf0458 Mon Sep 17 00:00:00 2001
From: Stefan Bühler <source@stbuehler.de>
Date: Wed, 24 Apr 2019 23:54:16 +0200
Subject: io_uring: fix notes on barriers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The application reading the CQ ring needs a barrier to pair with the
smp_store_release in io_commit_cqring, not the barrier after it.

Also a write barrier *after* writing something (but not *before*
writing anything interesting) doesn't order anything, so an smp_wmb()
after writing SQ tail is not needed.

Additionally consider reading SQ head and writing CQ tail in the notes.

Also add some clarifications how the various other fields in the ring
buffers are used.

Signed-off-by: Stefan Bühler <source@stbuehler.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 110 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d5e23a6dd6aa..7ab93e854eb2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4,15 +4,28 @@
  * supporting fast/efficient IO.
  *
  * A note on the read/write ordering memory barriers that are matched between
- * the application and kernel side. When the application reads the CQ ring
- * tail, it must use an appropriate smp_rmb() to order with the smp_wmb()
- * the kernel uses after writing the tail. Failure to do so could cause a
- * delay in when the application notices that completion events available.
- * This isn't a fatal condition. Likewise, the application must use an
- * appropriate smp_wmb() both before writing the SQ tail, and after writing
- * the SQ tail. The first one orders the sqe writes with the tail write, and
- * the latter is paired with the smp_rmb() the kernel will issue before
- * reading the SQ tail on submission.
+ * the application and kernel side.
+ *
+ * After the application reads the CQ ring tail, it must use an
+ * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
+ * before writing the tail (using smp_load_acquire to read the tail will
+ * do). It also needs a smp_mb() before updating CQ head (ordering the
+ * entry load(s) with the head store), pairing with an implicit barrier
+ * through a control-dependency in io_get_cqring (smp_store_release to
+ * store head will do). Failure to do so could lead to reading invalid
+ * CQ entries.
+ *
+ * Likewise, the application must use an appropriate smp_wmb() before
+ * writing the SQ tail (ordering SQ entry stores with the tail store),
+ * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
+ * to store the tail will do). And it needs a barrier ordering the SQ
+ * head load before writing new SQ entries (smp_load_acquire to read
+ * head will do).
+ *
+ * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
+ * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
+ * updating the SQ tail; a full memory barrier smp_mb() is needed
+ * between.
  *
  * Also see the examples in the liburing library:
  *
@@ -70,20 +83,108 @@ struct io_uring {
 	u32 tail ____cacheline_aligned_in_smp;
 };
 
+/*
+ * This data is shared with the application through the mmap at offset
+ * IORING_OFF_SQ_RING.
+ *
+ * The offsets to the member fields are published through struct
+ * io_sqring_offsets when calling io_uring_setup.
+ */
 struct io_sq_ring {
+	/*
+	 * Head and tail offsets into the ring; the offsets need to be
+	 * masked to get valid indices.
+	 *
+	 * The kernel controls head and the application controls tail.
+	 */
 	struct io_uring		r;
+	/*
+	 * Bitmask to apply to head and tail offsets (constant, equals
+	 * ring_entries - 1)
+	 */
 	u32			ring_mask;
+	/* Ring size (constant, power of 2) */
 	u32			ring_entries;
+	/*
+	 * Number of invalid entries dropped by the kernel due to
+	 * invalid index stored in array
+	 *
+	 * Written by the kernel, shouldn't be modified by the
+	 * application (i.e. get number of "new events" by comparing to
+	 * cached value).
+	 *
+	 * After a new SQ head value was read by the application this
+	 * counter includes all submissions that were dropped reaching
+	 * the new SQ head (and possibly more).
+	 */
 	u32			dropped;
+	/*
+	 * Runtime flags
+	 *
+	 * Written by the kernel, shouldn't be modified by the
+	 * application.
+	 *
+	 * The application needs a full memory barrier before checking
+	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
+	 */
 	u32			flags;
+	/*
+	 * Ring buffer of indices into array of io_uring_sqe, which is
+	 * mmapped by the application using the IORING_OFF_SQES offset.
+	 *
+	 * This indirection could e.g. be used to assign fixed
+	 * io_uring_sqe entries to operations and only submit them to
+	 * the queue when needed.
+	 *
+	 * The kernel modifies neither the indices array nor the entries
+	 * array.
+	 */
 	u32			array[];
 };
 
+/*
+ * This data is shared with the application through the mmap at offset
+ * IORING_OFF_CQ_RING.
+ *
+ * The offsets to the member fields are published through struct
+ * io_cqring_offsets when calling io_uring_setup.
+ */
 struct io_cq_ring {
+	/*
+	 * Head and tail offsets into the ring; the offsets need to be
+	 * masked to get valid indices.
+	 *
+	 * The application controls head and the kernel tail.
+	 */
 	struct io_uring		r;
+	/*
+	 * Bitmask to apply to head and tail offsets (constant, equals
+	 * ring_entries - 1)
+	 */
 	u32			ring_mask;
+	/* Ring size (constant, power of 2) */
 	u32			ring_entries;
+	/*
+	 * Number of completion events lost because the queue was full;
+	 * this should be avoided by the application by making sure
+	 * there are not more requests pending thatn there is space in
+	 * the completion queue.
+	 *
+	 * Written by the kernel, shouldn't be modified by the
+	 * application (i.e. get number of "new events" by comparing to
+	 * cached value).
+	 *
+	 * As completion events come in out of order this counter is not
+	 * ordered with any other data.
+	 */
 	u32			overflow;
+	/*
+	 * Ring buffer of completion events.
+	 *
+	 * The kernel writes completion events fresh every time they are
+	 * produced, so the application is allowed to modify pending
+	 * entries.
+	 */
 	struct io_uring_cqe	cqes[];
 };
 
-- 
cgit v1.2.3


From 4f7067c3fb7f2974363a28c597a41949d971af02 Mon Sep 17 00:00:00 2001
From: Stefan Bühler <source@stbuehler.de>
Date: Wed, 24 Apr 2019 23:54:17 +0200
Subject: io_uring: remove unnecessary barrier before wq_has_sleeper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

wq_has_sleeper has a full barrier internally. The smp_rmb barrier in
io_uring_poll synchronizes with it.

Signed-off-by: Stefan Bühler <source@stbuehler.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7ab93e854eb2..bb71b7f00bb3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -418,12 +418,6 @@ static void io_commit_cqring(struct io_ring_ctx *ctx)
 		/* order cqe stores with ring update */
 		smp_store_release(&ring->r.tail, ctx->cached_cq_tail);
 
-		/*
-		 * Write sider barrier of tail update, app has read side. See
-		 * comment at the top of this file.
-		 */
-		smp_wmb();
-
 		if (wq_has_sleeper(&ctx->cq_wait)) {
 			wake_up_interruptible(&ctx->cq_wait);
 			kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
@@ -2677,7 +2671,10 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
 	__poll_t mask = 0;
 
 	poll_wait(file, &ctx->cq_wait, wait);
-	/* See comment at the top of this file */
+	/*
+	 * synchronizes with barrier from wq_has_sleeper call in
+	 * io_commit_cqring
+	 */
 	smp_rmb();
 	if (READ_ONCE(ctx->sq_ring->r.tail) - ctx->cached_sq_head !=
 	    ctx->sq_ring->ring_entries)
-- 
cgit v1.2.3


From 115e12e58dbc055e98c965e3255aed7b20214f95 Mon Sep 17 00:00:00 2001
From: Stefan Bühler <source@stbuehler.de>
Date: Wed, 24 Apr 2019 23:54:18 +0200
Subject: io_uring: remove unnecessary barrier before reading cq head
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The memory operations before reading cq head are unrelated and we
don't care about their order.

Document that the control dependency in combination with READ_ONCE and
WRITE_ONCE forms a barrier we need.

Signed-off-by: Stefan Bühler <source@stbuehler.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index bb71b7f00bb3..3671a654a146 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -431,8 +431,11 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 	unsigned tail;
 
 	tail = ctx->cached_cq_tail;
-	/* See comment at the top of the file */
-	smp_rmb();
+	/*
+	 * writes to the cq entry need to come after reading head; the
+	 * control dependency is enough as we're using WRITE_ONCE to
+	 * fill the cq entry
+	 */
 	if (tail - READ_ONCE(ring->r.head) == ring->ring_entries)
 		return NULL;
 
-- 
cgit v1.2.3


From 9e4c15a3939448d2ea9b9bf59561183bbe3fdc49 Mon Sep 17 00:00:00 2001
From: Stefan Bühler <source@stbuehler.de>
Date: Wed, 24 Apr 2019 23:54:19 +0200
Subject: io_uring: remove unnecessary barrier after updating SQ head
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is no operation afterwards to order with.

Signed-off-by: Stefan Bühler <source@stbuehler.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3671a654a146..d3c57ee233fe 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1798,12 +1798,6 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
 		 * write new data to them.
 		 */
 		smp_store_release(&ring->r.head, ctx->cached_sq_head);
-
-		/*
-		 * write side barrier of head update, app has read side. See
-		 * comment at the top of this file
-		 */
-		smp_wmb();
 	}
 }
 
-- 
cgit v1.2.3


From 82ab082c0e2f8592c2ff6b2ab99a92d8406c8c2c Mon Sep 17 00:00:00 2001
From: Stefan Bühler <source@stbuehler.de>
Date: Wed, 24 Apr 2019 23:54:20 +0200
Subject: io_uring: remove unnecessary barrier before reading SQ tail
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is no operation before to order with.

Signed-off-by: Stefan Bühler <source@stbuehler.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d3c57ee233fe..662f1c070c8c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1831,8 +1831,6 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
 	 *    though the application is the one updating it.
 	 */
 	head = ctx->cached_sq_head;
-	/* See comment at the top of this file */
-	smp_rmb();
 	/* make sure SQ entry isn't read before tail */
 	if (head == smp_load_acquire(&ring->r.tail))
 		return false;
-- 
cgit v1.2.3


From b841f19524a16cd93a39f9306191f85c549a2bc2 Mon Sep 17 00:00:00 2001
From: Stefan Bühler <source@stbuehler.de>
Date: Wed, 24 Apr 2019 23:54:21 +0200
Subject: io_uring: remove unnecessary barrier after incrementing dropped
 counter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

smp_store_release in io_commit_sqring already orders the store to
dropped before the update to SQ head.

Signed-off-by: Stefan Bühler <source@stbuehler.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 662f1c070c8c..2ebc33cc907b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1846,8 +1846,6 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
 	/* drop invalid entries */
 	ctx->cached_sq_head++;
 	ring->dropped++;
-	/* See comment at the top of this file */
-	smp_wmb();
 	return false;
 }
 
-- 
cgit v1.2.3


From 62977281a6384d3904c02272a638cc3ac3bac54d Mon Sep 17 00:00:00 2001
From: Stefan Bühler <source@stbuehler.de>
Date: Wed, 24 Apr 2019 23:54:22 +0200
Subject: io_uring: remove unnecessary barrier after unsetting
 IORING_SQ_NEED_WAKEUP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is no operation to order with afterwards, and removing the flag is
not critical in any way.

There will always be a "race condition" where the application will
trigger IORING_ENTER_SQ_WAKEUP when it isn't actually needed.

Signed-off-by: Stefan Bühler <source@stbuehler.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2ebc33cc907b..77b247b5d10b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1969,13 +1969,11 @@ static int io_sq_thread(void *data)
 				finish_wait(&ctx->sqo_wait, &wait);
 
 				ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
-				smp_wmb();
 				continue;
 			}
 			finish_wait(&ctx->sqo_wait, &wait);
 
 			ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
-			smp_wmb();
 		}
 
 		i = 0;
-- 
cgit v1.2.3


From 5c8b0b54db22c54f2aec991b388f550d3a927f26 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 30 Apr 2019 10:16:07 -0600
Subject: io_uring: have submission side sqe errors post a cqe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently we only post a cqe if we get an error OUTSIDE of submission.
For submission, we return the error directly through io_uring_enter().
This is a bit awkward for applications, and it makes more sense to
always post a cqe with an error, if the error happens on behalf of an
sqe.

This changes submission behavior a bit. io_uring_enter() returns -ERROR
for an error, and > 0 for number of sqes submitted. Before this change,
if you wanted to submit 8 entries and had an error on the 5th entry,
io_uring_enter() would return 4 (for number of entries successfully
submitted) and rewind the sqring. The application would then have to
peek at the sqring and figure out what was wrong with the head sqe, and
then skip it itself. With this change, we'll return 5 since we did
consume 5 sqes, and the last sqe (with the error) will result in a cqe
being posted with the error.

This makes the logic easier to handle in the application, and it cleans
up the submission part.

Suggested-by: Stefan Bühler <source@stbuehler.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 34 ++++++----------------------------
 1 file changed, 6 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 77b247b5d10b..0a894d7baceb 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1801,14 +1801,6 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
 	}
 }
 
-/*
- * Undo last io_get_sqring()
- */
-static void io_drop_sqring(struct io_ring_ctx *ctx)
-{
-	ctx->cached_sq_head--;
-}
-
 /*
  * Fetch an sqe, if one is available. Note that s->sqe will point to memory
  * that is mapped by userspace. This means that care needs to be taken to
@@ -2018,7 +2010,7 @@ static int io_sq_thread(void *data)
 static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 {
 	struct io_submit_state state, *statep = NULL;
-	int i, ret = 0, submit = 0;
+	int i, submit = 0;
 
 	if (to_submit > IO_PLUG_THRESHOLD) {
 		io_submit_state_start(&state, ctx, to_submit);
@@ -2027,6 +2019,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 
 	for (i = 0; i < to_submit; i++) {
 		struct sqe_submit s;
+		int ret;
 
 		if (!io_get_sqring(ctx, &s))
 			break;
@@ -2034,21 +2027,18 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 		s.has_user = true;
 		s.needs_lock = false;
 		s.needs_fixed_file = false;
+		submit++;
 
 		ret = io_submit_sqe(ctx, &s, statep);
-		if (ret) {
-			io_drop_sqring(ctx);
-			break;
-		}
-
-		submit++;
+		if (ret)
+			io_cqring_add_event(ctx, s.sqe->user_data, ret, 0);
 	}
 	io_commit_sqring(ctx);
 
 	if (statep)
 		io_submit_state_end(statep);
 
-	return submit ? submit : ret;
+	return submit;
 }
 
 static unsigned io_cqring_events(struct io_cq_ring *ring)
@@ -2779,24 +2769,12 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 		mutex_lock(&ctx->uring_lock);
 		submitted = io_ring_submit(ctx, to_submit);
 		mutex_unlock(&ctx->uring_lock);
-
-		if (submitted < 0)
-			goto out_ctx;
 	}
 	if (flags & IORING_ENTER_GETEVENTS) {
 		unsigned nr_events = 0;
 
 		min_complete = min(min_complete, ctx->cq_entries);
 
-		/*
-		 * The application could have included the 'to_submit' count
-		 * in how many events it wanted to wait for. If we failed to
-		 * submit the desired count, we may need to adjust the number
-		 * of events to poll/wait for.
-		 */
-		if (submitted < to_submit)
-			min_complete = min_t(unsigned, submitted, min_complete);
-
 		if (ctx->flags & IORING_SETUP_IOPOLL) {
 			mutex_lock(&ctx->uring_lock);
 			ret = io_iopoll_check(ctx, &nr_events, min_complete);
-- 
cgit v1.2.3


From 975554b03eddc1df73bda3a764a09e18cadd5f1c Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Tue, 30 Apr 2019 13:34:51 +0100
Subject: io_uring: fix SQPOLL cpu validation

In io_sq_offload_start(), we call cpu_possible() on an unbounded cpu
value from userspace. On v5.1-rc7 on arm64 with
CONFIG_DEBUG_PER_CPU_MAPS, this results in a splat:

  WARNING: CPU: 1 PID: 27601 at include/linux/cpumask.h:121 cpu_max_bits_warn include/linux/cpumask.h:121 [inline]

There was an attempt to fix this in commit:

  917257daa0fea7a0 ("io_uring: only test SQPOLL cpu after we've verified it")

... by adding a check after the cpu value had been limited to NR_CPU_IDS
using array_index_nospec(). However, this left an unbound check at the
start of the function, for which the warning still fires.

Let's fix this correctly by checking that the cpu value is bound by
nr_cpu_ids before passing it to cpu_possible(). Note that only
nr_cpu_ids of a cpumask are guaranteed to exist at runtime, and
nr_cpu_ids can be significantly smaller than NR_CPUs. For example, an
arm64 defconfig has NR_CPUS=256, while my test VM has 4 vCPUs.

Following the intent from the commit message for 917257daa0fea7a0, the
check is moved under the SQ_AFF branch, which is the only branch where
the cpu values is consumed. The check is performed before bounding the
value with array_index_nospec() so that we don't silently accept bogus
cpu values from userspace, where array_index_nospec() would force these
values to 0.

I suspect we can remove the array_index_nospec() call entirely, but I've
conservatively left that in place, updated to use nr_cpu_ids to match
the prior check.

Tested on arm64 with the Syzkaller reproducer:

  https://syzkaller.appspot.com/bug?extid=cd714a07c6de2bc34293
  https://syzkaller.appspot.com/x/repro.syz?x=15d8b397200000

Full splat from before this patch:

WARNING: CPU: 1 PID: 27601 at include/linux/cpumask.h:121 cpu_max_bits_warn include/linux/cpumask.h:121 [inline]
WARNING: CPU: 1 PID: 27601 at include/linux/cpumask.h:121 cpumask_check include/linux/cpumask.h:128 [inline]
WARNING: CPU: 1 PID: 27601 at include/linux/cpumask.h:121 cpumask_test_cpu include/linux/cpumask.h:344 [inline]
WARNING: CPU: 1 PID: 27601 at include/linux/cpumask.h:121 io_sq_offload_start fs/io_uring.c:2244 [inline]
WARNING: CPU: 1 PID: 27601 at include/linux/cpumask.h:121 io_uring_create fs/io_uring.c:2864 [inline]
WARNING: CPU: 1 PID: 27601 at include/linux/cpumask.h:121 io_uring_setup+0x1108/0x15a0 fs/io_uring.c:2916
Kernel panic - not syncing: panic_on_warn set ...
CPU: 1 PID: 27601 Comm: syz-executor.0 Not tainted 5.1.0-rc7 #3
Hardware name: linux,dummy-virt (DT)
Call trace:
 dump_backtrace+0x0/0x2f0 include/linux/compiler.h:193
 show_stack+0x20/0x30 arch/arm64/kernel/traps.c:158
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x110/0x190 lib/dump_stack.c:113
 panic+0x384/0x68c kernel/panic.c:214
 __warn+0x2bc/0x2c0 kernel/panic.c:571
 report_bug+0x228/0x2d8 lib/bug.c:186
 bug_handler+0xa0/0x1a0 arch/arm64/kernel/traps.c:956
 call_break_hook arch/arm64/kernel/debug-monitors.c:301 [inline]
 brk_handler+0x1d4/0x388 arch/arm64/kernel/debug-monitors.c:316
 do_debug_exception+0x1a0/0x468 arch/arm64/mm/fault.c:831
 el1_dbg+0x18/0x8c
 cpu_max_bits_warn include/linux/cpumask.h:121 [inline]
 cpumask_check include/linux/cpumask.h:128 [inline]
 cpumask_test_cpu include/linux/cpumask.h:344 [inline]
 io_sq_offload_start fs/io_uring.c:2244 [inline]
 io_uring_create fs/io_uring.c:2864 [inline]
 io_uring_setup+0x1108/0x15a0 fs/io_uring.c:2916
 __do_sys_io_uring_setup fs/io_uring.c:2929 [inline]
 __se_sys_io_uring_setup fs/io_uring.c:2926 [inline]
 __arm64_sys_io_uring_setup+0x50/0x70 fs/io_uring.c:2926
 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline]
 invoke_syscall arch/arm64/kernel/syscall.c:47 [inline]
 el0_svc_common.constprop.0+0x148/0x2e0 arch/arm64/kernel/syscall.c:83
 el0_svc_handler+0xdc/0x100 arch/arm64/kernel/syscall.c:129
 el0_svc+0x8/0xc arch/arm64/kernel/entry.S:948
SMP: stopping secondary CPUs
Dumping ftrace buffer:
   (ftrace buffer empty)
Kernel Offset: disabled
CPU features: 0x002,23000438
Memory Limit: none
Rebooting in 1 seconds..

Fixes: 917257daa0fea7a0 ("io_uring: only test SQPOLL cpu after we've verified it")
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: linux-block@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org

Simplied the logic

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0a894d7baceb..5954047ee96d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2319,10 +2319,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
 	mmgrab(current->mm);
 	ctx->sqo_mm = current->mm;
 
-	ret = -EINVAL;
-	if (!cpu_possible(p->sq_thread_cpu))
-		goto err;
-
 	if (ctx->flags & IORING_SETUP_SQPOLL) {
 		ret = -EPERM;
 		if (!capable(CAP_SYS_ADMIN))
@@ -2333,11 +2329,11 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
 			ctx->sq_thread_idle = HZ;
 
 		if (p->flags & IORING_SETUP_SQ_AFF) {
-			int cpu;
+			int cpu = array_index_nospec(p->sq_thread_cpu,
+							nr_cpu_ids);
 
-			cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS);
 			ret = -EINVAL;
-			if (!cpu_possible(p->sq_thread_cpu))
+			if (!cpu_possible(cpu))
 				goto err;
 
 			ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
-- 
cgit v1.2.3


From 52e04ef4c9d459cba3afd86ec335a411b40b7fd2 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Tue, 30 Apr 2019 17:30:21 +0100
Subject: io_uring: free allocated io_memory once

If io_allocate_scq_urings() fails to allocate an sq_* region, it will
call io_mem_free() for any previously allocated regions, but leave
dangling pointers to these regions in the ctx. Any regions which have
not yet been allocated are left NULL. Note that when returning
-EOVERFLOW, the previously allocated sq_ring is not freed, which appears
to be an unintentional leak.

When io_allocate_scq_urings() fails, io_uring_create() will call
io_ring_ctx_wait_and_kill(), which calls io_mem_free() on all the sq_*
regions, assuming the pointers are valid and not NULL.

This can result in pages being freed multiple times, which has been
observed to corrupt the page state, leading to subsequent fun. This can
also result in virt_to_page() on NULL, resulting in the use of bogus
page addresses, and yet more subsequent fun. The latter can be detected
with CONFIG_DEBUG_VIRTUAL on arm64.

Adding a cleanup path to io_allocate_scq_urings() complicates the logic,
so let's leave it to io_ring_ctx_free() to consistently free these
pointers, and simplify the io_allocate_scq_urings() error paths.

Full splats from before this patch below. Note that the pointer logged
by the DEBUG_VIRTUAL "non-linear address" warning has been hashed, and
is actually NULL.

[   26.098129] page:ffff80000e949a00 count:0 mapcount:-128 mapping:0000000000000000 index:0x0
[   26.102976] flags: 0x63fffc000000()
[   26.104373] raw: 000063fffc000000 ffff80000e86c188 ffff80000ea3df08 0000000000000000
[   26.108917] raw: 0000000000000000 0000000000000001 00000000ffffff7f 0000000000000000
[   26.137235] page dumped because: VM_BUG_ON_PAGE(page_ref_count(page) == 0)
[   26.143960] ------------[ cut here ]------------
[   26.146020] kernel BUG at include/linux/mm.h:547!
[   26.147586] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
[   26.149163] Modules linked in:
[   26.150287] Process syz-executor.21 (pid: 20204, stack limit = 0x000000000e9cefeb)
[   26.153307] CPU: 2 PID: 20204 Comm: syz-executor.21 Not tainted 5.1.0-rc7-00004-g7d30b2ea43d6 #18
[   26.156566] Hardware name: linux,dummy-virt (DT)
[   26.158089] pstate: 40400005 (nZcv daif +PAN -UAO)
[   26.159869] pc : io_mem_free+0x9c/0xa8
[   26.161436] lr : io_mem_free+0x9c/0xa8
[   26.162720] sp : ffff000013003d60
[   26.164048] x29: ffff000013003d60 x28: ffff800025048040
[   26.165804] x27: 0000000000000000 x26: ffff800025048040
[   26.167352] x25: 00000000000000c0 x24: ffff0000112c2820
[   26.169682] x23: 0000000000000000 x22: 0000000020000080
[   26.171899] x21: ffff80002143b418 x20: ffff80002143b400
[   26.174236] x19: ffff80002143b280 x18: 0000000000000000
[   26.176607] x17: 0000000000000000 x16: 0000000000000000
[   26.178997] x15: 0000000000000000 x14: 0000000000000000
[   26.181508] x13: 00009178a5e077b2 x12: 0000000000000001
[   26.183863] x11: 0000000000000000 x10: 0000000000000980
[   26.186437] x9 : ffff000013003a80 x8 : ffff800025048a20
[   26.189006] x7 : ffff8000250481c0 x6 : ffff80002ffe9118
[   26.191359] x5 : ffff80002ffe9118 x4 : 0000000000000000
[   26.193863] x3 : ffff80002ffefe98 x2 : 44c06ddd107d1f00
[   26.196642] x1 : 0000000000000000 x0 : 000000000000003e
[   26.198892] Call trace:
[   26.199893]  io_mem_free+0x9c/0xa8
[   26.201155]  io_ring_ctx_wait_and_kill+0xec/0x180
[   26.202688]  io_uring_setup+0x6c4/0x6f0
[   26.204091]  __arm64_sys_io_uring_setup+0x18/0x20
[   26.205576]  el0_svc_common.constprop.0+0x7c/0xe8
[   26.207186]  el0_svc_handler+0x28/0x78
[   26.208389]  el0_svc+0x8/0xc
[   26.209408] Code: aa0203e0 d0006861 9133a021 97fcdc3c (d4210000)
[   26.211995] ---[ end trace bdb81cd43a21e50d ]---

[   81.770626] ------------[ cut here ]------------
[   81.825015] virt_to_phys used for non-linear address: 000000000d42f2c7 (          (null))
[   81.827860] WARNING: CPU: 1 PID: 30171 at arch/arm64/mm/physaddr.c:15 __virt_to_phys+0x48/0x68
[   81.831202] Modules linked in:
[   81.832212] CPU: 1 PID: 30171 Comm: syz-executor.20 Not tainted 5.1.0-rc7-00004-g7d30b2ea43d6 #19
[   81.835616] Hardware name: linux,dummy-virt (DT)
[   81.836863] pstate: 60400005 (nZCv daif +PAN -UAO)
[   81.838727] pc : __virt_to_phys+0x48/0x68
[   81.840572] lr : __virt_to_phys+0x48/0x68
[   81.842264] sp : ffff80002cf67c70
[   81.843858] x29: ffff80002cf67c70 x28: ffff800014358e18
[   81.846463] x27: 0000000000000000 x26: 0000000020000080
[   81.849148] x25: 0000000000000000 x24: ffff80001bb01f40
[   81.851986] x23: ffff200011db06c8 x22: ffff2000127e3c60
[   81.854351] x21: ffff800014358cc0 x20: ffff800014358d98
[   81.856711] x19: 0000000000000000 x18: 0000000000000000
[   81.859132] x17: 0000000000000000 x16: 0000000000000000
[   81.861586] x15: 0000000000000000 x14: 0000000000000000
[   81.863905] x13: 0000000000000000 x12: ffff1000037603e9
[   81.866226] x11: 1ffff000037603e8 x10: 0000000000000980
[   81.868776] x9 : ffff80002cf67840 x8 : ffff80001bb02920
[   81.873272] x7 : ffff1000037603e9 x6 : ffff80001bb01f47
[   81.875266] x5 : ffff1000037603e9 x4 : dfff200000000000
[   81.876875] x3 : ffff200010087528 x2 : ffff1000059ecf58
[   81.878751] x1 : 44c06ddd107d1f00 x0 : 0000000000000000
[   81.880453] Call trace:
[   81.881164]  __virt_to_phys+0x48/0x68
[   81.882919]  io_mem_free+0x18/0x110
[   81.886585]  io_ring_ctx_wait_and_kill+0x13c/0x1f0
[   81.891212]  io_uring_setup+0xa60/0xad0
[   81.892881]  __arm64_sys_io_uring_setup+0x2c/0x38
[   81.894398]  el0_svc_common.constprop.0+0xac/0x150
[   81.896306]  el0_svc_handler+0x34/0x88
[   81.897744]  el0_svc+0x8/0xc
[   81.898715] ---[ end trace b4a703802243cbba ]---

Fixes: 2b188cc1bb857a9d ("Add io_uring IO interface")
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: linux-block@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5954047ee96d..046fc4e1e155 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2396,8 +2396,12 @@ static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
 
 static void io_mem_free(void *ptr)
 {
-	struct page *page = virt_to_head_page(ptr);
+	struct page *page;
+
+	if (!ptr)
+		return;
 
+	page = virt_to_head_page(ptr);
 	if (put_page_testzero(page))
 		free_compound_page(page);
 }
@@ -2816,17 +2820,12 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 		return -EOVERFLOW;
 
 	ctx->sq_sqes = io_mem_alloc(size);
-	if (!ctx->sq_sqes) {
-		io_mem_free(ctx->sq_ring);
+	if (!ctx->sq_sqes)
 		return -ENOMEM;
-	}
 
 	cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries));
-	if (!cq_ring) {
-		io_mem_free(ctx->sq_ring);
-		io_mem_free(ctx->sq_sqes);
+	if (!cq_ring)
 		return -ENOMEM;
-	}
 
 	ctx->cq_ring = cq_ring;
 	cq_ring->ring_mask = p->cq_entries - 1;
-- 
cgit v1.2.3


From 817869d2519f0cb7be5b3482129dadc806dfb747 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 30 Apr 2019 14:44:05 -0600
Subject: io_uring: drop req submit reference always in async punt

If we don't end up actually calling submit in io_sq_wq_submit_work(),
we still need to drop the submit reference to the request. If we
don't, then we can leak the request. This can happen if we race
with ring shutdown while flushing the workqueue for requests that
require use of the mm_struct.

Fixes: e65ef56db494 ("io_uring: use regular request ref counts")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 046fc4e1e155..18cecb6a0151 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1568,10 +1568,11 @@ restart:
 					break;
 				cond_resched();
 			} while (1);
-
-			/* drop submission reference */
-			io_put_req(req);
 		}
+
+		/* drop submission reference */
+		io_put_req(req);
+
 		if (ret) {
 			io_cqring_add_event(ctx, sqe->user_data, ret, 0);
 			io_put_req(req);
-- 
cgit v1.2.3


From 60a27b906d1a372474669c914c10d6c993858a4a Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 26 Apr 2019 18:45:20 +0800
Subject: block: fix handling for BIO_NO_PAGE_REF

Commit 399254aaf489211 ("block: add BIO_NO_PAGE_REF flag") introduces
BIO_NO_PAGE_REF, and once this flag is set for one bio, all pages
in the bio won't be get/put during IO.

However, if one bio is submitted via __blkdev_direct_IO_simple(),
even though BIO_NO_PAGE_REF is set, pages still may be put.

Fixes this issue by avoiding to put pages if BIO_NO_PAGE_REF is
set.

Fixes: 399254aaf489211 ("block: add BIO_NO_PAGE_REF flag")
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/block_dev.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 24615c76c1d0..bb28e2ead679 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -264,7 +264,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 	bio_for_each_segment_all(bvec, &bio, i, iter_all) {
 		if (should_dirty && !PageCompound(bvec->bv_page))
 			set_page_dirty_lock(bvec->bv_page);
-		put_page(bvec->bv_page);
+		if (!bio_flagged(&bio, BIO_NO_PAGE_REF))
+			put_page(bvec->bv_page);
 	}
 
 	if (unlikely(bio.bi_status))
-- 
cgit v1.2.3


From d4ef647510b1200fe1c996ff1cbf5ac47eb930cc Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 1 May 2019 16:59:16 +0100
Subject: io_uring: avoid page allocation warnings

In io_sqe_buffer_register() we allocate a number of arrays based on the
iov_len from the user-provided iov. While we limit iov_len to SZ_1G,
we can still attempt to allocate arrays exceeding MAX_ORDER.

On a 64-bit system with 4KiB pages, for an iov where iov_base = 0x10 and
iov_len = SZ_1G, we'll calculate that nr_pages = 262145. When we try to
allocate a corresponding array of (16-byte) bio_vecs, requiring 4194320
bytes, which is greater than 4MiB. This results in SLUB warning that
we're trying to allocate greater than MAX_ORDER, and failing the
allocation.

Avoid this by using kvmalloc() for allocations dependent on the
user-provided iov_len. At the same time, fix a leak of imu->bvec when
registration fails.

Full splat from before this patch:

WARNING: CPU: 1 PID: 2314 at mm/page_alloc.c:4595 __alloc_pages_nodemask+0x7ac/0x2938 mm/page_alloc.c:4595
Kernel panic - not syncing: panic_on_warn set ...
CPU: 1 PID: 2314 Comm: syz-executor326 Not tainted 5.1.0-rc7-dirty #4
Hardware name: linux,dummy-virt (DT)
Call trace:
 dump_backtrace+0x0/0x2f0 include/linux/compiler.h:193
 show_stack+0x20/0x30 arch/arm64/kernel/traps.c:158
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x110/0x190 lib/dump_stack.c:113
 panic+0x384/0x68c kernel/panic.c:214
 __warn+0x2bc/0x2c0 kernel/panic.c:571
 report_bug+0x228/0x2d8 lib/bug.c:186
 bug_handler+0xa0/0x1a0 arch/arm64/kernel/traps.c:956
 call_break_hook arch/arm64/kernel/debug-monitors.c:301 [inline]
 brk_handler+0x1d4/0x388 arch/arm64/kernel/debug-monitors.c:316
 do_debug_exception+0x1a0/0x468 arch/arm64/mm/fault.c:831
 el1_dbg+0x18/0x8c
 __alloc_pages_nodemask+0x7ac/0x2938 mm/page_alloc.c:4595
 alloc_pages_current+0x164/0x278 mm/mempolicy.c:2132
 alloc_pages include/linux/gfp.h:509 [inline]
 kmalloc_order+0x20/0x50 mm/slab_common.c:1231
 kmalloc_order_trace+0x30/0x2b0 mm/slab_common.c:1243
 kmalloc_large include/linux/slab.h:480 [inline]
 __kmalloc+0x3dc/0x4f0 mm/slub.c:3791
 kmalloc_array include/linux/slab.h:670 [inline]
 io_sqe_buffer_register fs/io_uring.c:2472 [inline]
 __io_uring_register fs/io_uring.c:2962 [inline]
 __do_sys_io_uring_register fs/io_uring.c:3008 [inline]
 __se_sys_io_uring_register fs/io_uring.c:2990 [inline]
 __arm64_sys_io_uring_register+0x9e0/0x1bc8 fs/io_uring.c:2990
 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline]
 invoke_syscall arch/arm64/kernel/syscall.c:47 [inline]
 el0_svc_common.constprop.0+0x148/0x2e0 arch/arm64/kernel/syscall.c:83
 el0_svc_handler+0xdc/0x100 arch/arm64/kernel/syscall.c:129
 el0_svc+0x8/0xc arch/arm64/kernel/entry.S:948
SMP: stopping secondary CPUs
Dumping ftrace buffer:
   (ftrace buffer empty)
Kernel Offset: disabled
CPU features: 0x002,23000438
Memory Limit: none
Rebooting in 1 seconds..

Fixes: edafccee56ff3167 ("io_uring: add support for pre-mapped user IO buffers")
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-block@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 18cecb6a0151..84efb8956734 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2443,7 +2443,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
 
 		if (ctx->account_mem)
 			io_unaccount_mem(ctx->user, imu->nr_bvecs);
-		kfree(imu->bvec);
+		kvfree(imu->bvec);
 		imu->nr_bvecs = 0;
 	}
 
@@ -2535,9 +2535,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
 		if (!pages || nr_pages > got_pages) {
 			kfree(vmas);
 			kfree(pages);
-			pages = kmalloc_array(nr_pages, sizeof(struct page *),
+			pages = kvmalloc_array(nr_pages, sizeof(struct page *),
 						GFP_KERNEL);
-			vmas = kmalloc_array(nr_pages,
+			vmas = kvmalloc_array(nr_pages,
 					sizeof(struct vm_area_struct *),
 					GFP_KERNEL);
 			if (!pages || !vmas) {
@@ -2549,7 +2549,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
 			got_pages = nr_pages;
 		}
 
-		imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec),
+		imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
 						GFP_KERNEL);
 		ret = -ENOMEM;
 		if (!imu->bvec) {
@@ -2588,6 +2588,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
 			}
 			if (ctx->account_mem)
 				io_unaccount_mem(ctx->user, nr_pages);
+			kvfree(imu->bvec);
 			goto err;
 		}
 
@@ -2610,12 +2611,12 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
 
 		ctx->nr_user_bufs++;
 	}
-	kfree(pages);
-	kfree(vmas);
+	kvfree(pages);
+	kvfree(vmas);
 	return 0;
 err:
-	kfree(pages);
-	kfree(vmas);
+	kvfree(pages);
+	kvfree(vmas);
 	io_sqe_buffer_unregister(ctx);
 	return ret;
 }
-- 
cgit v1.2.3