From e02b66512738db161e83634255e9826c8cb51336 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:36 +0100 Subject: io_uring: initialise msghdr::msg_ubuf Initialise newly added ->msg_ubuf in io_recv() and io_send(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b8f9f263875a4a36e7f26cc5d55ebe315308f57d.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index e61efa31c729..bbc9c603641a 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -294,6 +294,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_namelen = 0; + msg.msg_ubuf = NULL; flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) @@ -783,6 +784,7 @@ retry_multishot: msg.msg_flags = 0; msg.msg_controllen = 0; msg.msg_iocb = NULL; + msg.msg_ubuf = NULL; flags = sr->msg_flags; if (force_nonblock) -- cgit v1.2.3 From e70cb60893ca64b7df06864aa16c1cf6d6c671db Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:37 +0100 Subject: io_uring: export io_put_task() Make io_put_task() available to non-core parts of io_uring, we'll need it for notification infrastructure. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3686807d4c03b72e389947b0e8692d4d44334ef0.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 25 +++++++++++++++++++++++++ io_uring/io_uring.c | 11 +---------- io_uring/io_uring.h | 10 ++++++++++ io_uring/tctx.h | 26 -------------------------- 4 files changed, 36 insertions(+), 36 deletions(-) (limited to 'io_uring') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index d54b8b7e0746..368c34d14b13 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -4,6 +4,7 @@ #include #include #include +#include #include struct io_wq_work_node { @@ -43,6 +44,30 @@ struct io_hash_table { unsigned hash_bits; }; +/* + * Arbitrary limit, can be raised if need be + */ +#define IO_RINGFD_REG_MAX 16 + +struct io_uring_task { + /* submission side */ + int cached_refs; + const struct io_ring_ctx *last; + struct io_wq *io_wq; + struct file *registered_rings[IO_RINGFD_REG_MAX]; + + struct xarray xa; + struct wait_queue_head wait; + atomic_t in_idle; + atomic_t inflight_tracked; + struct percpu_counter inflight; + + struct { /* task_work */ + struct llist_head task_list; + struct callback_head task_work; + } ____cacheline_aligned_in_smp; +}; + struct io_uring { u32 head ____cacheline_aligned_in_smp; u32 tail ____cacheline_aligned_in_smp; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4b3fd645d023..7795cfedf6bf 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -608,7 +608,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) return ret; } -static void __io_put_task(struct task_struct *task, int nr) +void __io_put_task(struct task_struct *task, int nr) { struct io_uring_task *tctx = task->io_uring; @@ -618,15 +618,6 @@ static void __io_put_task(struct task_struct *task, int nr) put_task_struct_many(task, nr); } -/* must to be called somewhat shortly after putting a request */ -static inline void io_put_task(struct task_struct *task, int nr) -{ - if (likely(task == current)) - task->io_uring->cached_refs += nr; - else - __io_put_task(task, nr); -} - static void io_task_refs_refill(struct io_uring_task *tctx) { unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 5db0a60dc04e..b1c0c0a400d8 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -71,6 +71,7 @@ void io_wq_submit_work(struct io_wq_work *work); void io_free_req(struct io_kiocb *req); void io_queue_next(struct io_kiocb *req); +void __io_put_task(struct task_struct *task, int nr); bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all); @@ -258,4 +259,13 @@ static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) __io_commit_cqring_flush(ctx); } +/* must to be called somewhat shortly after putting a request */ +static inline void io_put_task(struct task_struct *task, int nr) +{ + if (likely(task == current)) + task->io_uring->cached_refs += nr; + else + __io_put_task(task, nr); +} + #endif diff --git a/io_uring/tctx.h b/io_uring/tctx.h index 8a33ff6e5d91..25974beed4d6 100644 --- a/io_uring/tctx.h +++ b/io_uring/tctx.h @@ -1,31 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include - -/* - * Arbitrary limit, can be raised if need be - */ -#define IO_RINGFD_REG_MAX 16 - -struct io_uring_task { - /* submission side */ - int cached_refs; - const struct io_ring_ctx *last; - struct io_wq *io_wq; - struct file *registered_rings[IO_RINGFD_REG_MAX]; - - struct xarray xa; - struct wait_queue_head wait; - atomic_t in_idle; - atomic_t inflight_tracked; - struct percpu_counter inflight; - - struct { /* task_work */ - struct llist_head task_list; - struct callback_head task_work; - } ____cacheline_aligned_in_smp; -}; - struct io_tctx_node { struct list_head ctx_node; struct task_struct *task; -- cgit v1.2.3 From eb42cebb2cf24c48f60c32856a4bba93d42659c8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:38 +0100 Subject: io_uring: add zc notification infrastructure Add internal part of send zerocopy notifications. There are two main structures, the first one is struct io_notif, which carries inside struct ubuf_info and maps 1:1 to it. io_uring will be binding a number of zerocopy send requests to it and ask to complete (aka flush) it. When flushed and all attached requests and skbs complete, it'll generate one and only one CQE. There are intended to be passed into the network layer as struct msghdr::msg_ubuf. The second concept is notification slots. The userspace will be able to register an array of slots and subsequently addressing them by the index in the array. Slots are independent of each other. Each slot can have only one notifier at a time (called active notifier) but many notifiers during the lifetime. When active, a notifier not going to post any completion but the userspace can attach requests to it by specifying the corresponding slot while issueing send zc requests. Eventually, the userspace will want to "flush" the notifier losing any way to attach new requests to it, however it can use the next atomatically added notifier of this slot or of any other slot. When the network layer is done with all enqueued skbs attached to a notifier and doesn't need the specified in them user data, the flushed notifier will post a CQE. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3ecf54c31a85762bf679b0a432c9f43ecf7e61cc.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 ++ io_uring/Makefile | 2 +- io_uring/io_uring.c | 8 ++-- io_uring/io_uring.h | 2 + io_uring/notif.c | 102 +++++++++++++++++++++++++++++++++++++++++ io_uring/notif.h | 64 ++++++++++++++++++++++++++ 6 files changed, 179 insertions(+), 4 deletions(-) create mode 100644 io_uring/notif.c create mode 100644 io_uring/notif.h (limited to 'io_uring') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 368c34d14b13..f7fab3758cb9 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -34,6 +34,9 @@ struct io_file_table { unsigned int alloc_hint; }; +struct io_notif; +struct io_notif_slot; + struct io_hash_bucket { spinlock_t lock; struct hlist_head list; @@ -237,6 +240,8 @@ struct io_ring_ctx { unsigned nr_user_files; unsigned nr_user_bufs; struct io_mapped_ubuf **user_bufs; + struct io_notif_slot *notif_slots; + unsigned nr_notif_slots; struct io_submit_state submit_state; diff --git a/io_uring/Makefile b/io_uring/Makefile index 466639c289be..8cc8e5387a75 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -7,5 +7,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ sqpoll.o fdinfo.o tctx.o poll.o \ - cancel.o kbuf.o rsrc.o rw.o opdef.o + cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7795cfedf6bf..65ac407dd74e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -90,6 +90,7 @@ #include "rsrc.h" #include "cancel.h" #include "net.h" +#include "notif.h" #include "timeout.h" #include "poll.h" @@ -732,9 +733,8 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) return &rings->cqes[off]; } -static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, - u64 user_data, s32 res, u32 cflags, - bool allow_overflow) +bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, + bool allow_overflow) { struct io_uring_cqe *cqe; @@ -2491,6 +2491,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) } #endif WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); + WARN_ON_ONCE(ctx->notif_slots || ctx->nr_notif_slots); io_mem_free(ctx->rings); io_mem_free(ctx->sq_sqes); @@ -2667,6 +2668,7 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_unregister_personality(ctx, index); if (ctx->rings) io_poll_remove_all(ctx, NULL, true); + io_notif_unregister(ctx); mutex_unlock(&ctx->uring_lock); /* failed during ring init, it couldn't have issued any requests */ diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index b1c0c0a400d8..66bfd880d07f 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -33,6 +33,8 @@ void io_req_complete_post(struct io_kiocb *req); void __io_req_complete_post(struct io_kiocb *req); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow); +bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, + bool allow_overflow); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); diff --git a/io_uring/notif.c b/io_uring/notif.c new file mode 100644 index 000000000000..6ee948af6a49 --- /dev/null +++ b/io_uring/notif.c @@ -0,0 +1,102 @@ +#include +#include +#include +#include +#include +#include + +#include "io_uring.h" +#include "notif.h" + +static void __io_notif_complete_tw(struct callback_head *cb) +{ + struct io_notif *notif = container_of(cb, struct io_notif, task_work); + struct io_ring_ctx *ctx = notif->ctx; + + io_cq_lock(ctx); + io_fill_cqe_aux(ctx, notif->tag, 0, notif->seq, true); + io_cq_unlock_post(ctx); + + percpu_ref_put(&ctx->refs); + kfree(notif); +} + +static inline void io_notif_complete(struct io_notif *notif) +{ + __io_notif_complete_tw(¬if->task_work); +} + +static void io_notif_complete_wq(struct work_struct *work) +{ + struct io_notif *notif = container_of(work, struct io_notif, commit_work); + + io_notif_complete(notif); +} + +static void io_uring_tx_zerocopy_callback(struct sk_buff *skb, + struct ubuf_info *uarg, + bool success) +{ + struct io_notif *notif = container_of(uarg, struct io_notif, uarg); + + if (!refcount_dec_and_test(&uarg->refcnt)) + return; + INIT_WORK(¬if->commit_work, io_notif_complete_wq); + queue_work(system_unbound_wq, ¬if->commit_work); +} + +struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx, + struct io_notif_slot *slot) + __must_hold(&ctx->uring_lock) +{ + struct io_notif *notif; + + notif = kzalloc(sizeof(*notif), GFP_ATOMIC | __GFP_ACCOUNT); + if (!notif) + return NULL; + + notif->seq = slot->seq++; + notif->tag = slot->tag; + notif->ctx = ctx; + notif->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; + notif->uarg.callback = io_uring_tx_zerocopy_callback; + /* master ref owned by io_notif_slot, will be dropped on flush */ + refcount_set(¬if->uarg.refcnt, 1); + percpu_ref_get(&ctx->refs); + return notif; +} + +static void io_notif_slot_flush(struct io_notif_slot *slot) + __must_hold(&ctx->uring_lock) +{ + struct io_notif *notif = slot->notif; + + slot->notif = NULL; + + if (WARN_ON_ONCE(in_interrupt())) + return; + /* drop slot's master ref */ + if (refcount_dec_and_test(¬if->uarg.refcnt)) + io_notif_complete(notif); +} + +__cold int io_notif_unregister(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + int i; + + if (!ctx->notif_slots) + return -ENXIO; + + for (i = 0; i < ctx->nr_notif_slots; i++) { + struct io_notif_slot *slot = &ctx->notif_slots[i]; + + if (slot->notif) + io_notif_slot_flush(slot); + } + + kvfree(ctx->notif_slots); + ctx->notif_slots = NULL; + ctx->nr_notif_slots = 0; + return 0; +} \ No newline at end of file diff --git a/io_uring/notif.h b/io_uring/notif.h new file mode 100644 index 000000000000..3d7a1d242e17 --- /dev/null +++ b/io_uring/notif.h @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include + +struct io_notif { + struct ubuf_info uarg; + struct io_ring_ctx *ctx; + + /* cqe->user_data, io_notif_slot::tag if not overridden */ + u64 tag; + /* see struct io_notif_slot::seq */ + u32 seq; + + union { + struct callback_head task_work; + struct work_struct commit_work; + }; +}; + +struct io_notif_slot { + /* + * Current/active notifier. A slot holds only one active notifier at a + * time and keeps one reference to it. Flush releases the reference and + * lazily replaces it with a new notifier. + */ + struct io_notif *notif; + + /* + * Default ->user_data for this slot notifiers CQEs + */ + u64 tag; + /* + * Notifiers of a slot live in generations, we create a new notifier + * only after flushing the previous one. Track the sequential number + * for all notifiers and copy it into notifiers's cqe->cflags + */ + u32 seq; +}; + +int io_notif_unregister(struct io_ring_ctx *ctx); + +struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx, + struct io_notif_slot *slot); + +static inline struct io_notif *io_get_notif(struct io_ring_ctx *ctx, + struct io_notif_slot *slot) +{ + if (!slot->notif) + slot->notif = io_alloc_notif(ctx, slot); + return slot->notif; +} + +static inline struct io_notif_slot *io_get_notif_slot(struct io_ring_ctx *ctx, + int idx) + __must_hold(&ctx->uring_lock) +{ + if (idx >= ctx->nr_notif_slots) + return NULL; + idx = array_index_nospec(idx, ctx->nr_notif_slots); + return &ctx->notif_slots[idx]; +} -- cgit v1.2.3 From eb4a299b2f95437af6183946c2a2e850621cefdb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:39 +0100 Subject: io_uring: cache struct io_notif kmalloc'ing struct io_notif is too expensive when done frequently, cache them as many other resources in io_uring. Keep two list, the first one is from where we're getting notifiers, it's protected by ->uring_lock. The second is protected by ->completion_lock, to which we queue released notifiers. Then we splice one list into another when needed. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9dec18f7fcbab9f4bd40b96e5ae158b119945230.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 7 ++++++ io_uring/io_uring.c | 3 +++ io_uring/notif.c | 57 ++++++++++++++++++++++++++++++++++++------ io_uring/notif.h | 5 ++++ 4 files changed, 65 insertions(+), 7 deletions(-) (limited to 'io_uring') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index f7fab3758cb9..144493cbadb5 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -249,6 +249,9 @@ struct io_ring_ctx { struct xarray io_bl_xa; struct list_head io_buffers_cache; + /* struct io_notif cache, protected by uring_lock */ + struct list_head notif_list; + struct io_hash_table cancel_table_locked; struct list_head cq_overflow_list; struct io_alloc_cache apoll_cache; @@ -259,6 +262,10 @@ struct io_ring_ctx { struct io_wq_work_list locked_free_list; unsigned int locked_free_nr; + /* struct io_notif cache protected by completion_lock */ + struct list_head notif_list_locked; + unsigned int notif_locked_nr; + const struct cred *sq_creds; /* cred used for __io_sq_thread() */ struct io_sq_data *sq_data; /* if using sq thread polling */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 65ac407dd74e..20e65d45ca1c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -321,6 +321,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_WQ_LIST(&ctx->locked_free_list); INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); INIT_WQ_LIST(&ctx->submit_state.compl_reqs); + INIT_LIST_HEAD(&ctx->notif_list); + INIT_LIST_HEAD(&ctx->notif_list_locked); return ctx; err: kfree(ctx->dummy_ubuf); @@ -2493,6 +2495,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); WARN_ON_ONCE(ctx->notif_slots || ctx->nr_notif_slots); + io_notif_cache_purge(ctx); io_mem_free(ctx->rings); io_mem_free(ctx->sq_sqes); diff --git a/io_uring/notif.c b/io_uring/notif.c index 6ee948af6a49..b257db2120b4 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -15,10 +15,12 @@ static void __io_notif_complete_tw(struct callback_head *cb) io_cq_lock(ctx); io_fill_cqe_aux(ctx, notif->tag, 0, notif->seq, true); + + list_add(¬if->cache_node, &ctx->notif_list_locked); + ctx->notif_locked_nr++; io_cq_unlock_post(ctx); percpu_ref_put(&ctx->refs); - kfree(notif); } static inline void io_notif_complete(struct io_notif *notif) @@ -45,21 +47,62 @@ static void io_uring_tx_zerocopy_callback(struct sk_buff *skb, queue_work(system_unbound_wq, ¬if->commit_work); } +static void io_notif_splice_cached(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + spin_lock(&ctx->completion_lock); + list_splice_init(&ctx->notif_list_locked, &ctx->notif_list); + ctx->notif_locked_nr = 0; + spin_unlock(&ctx->completion_lock); +} + +void io_notif_cache_purge(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + io_notif_splice_cached(ctx); + + while (!list_empty(&ctx->notif_list)) { + struct io_notif *notif = list_first_entry(&ctx->notif_list, + struct io_notif, cache_node); + + list_del(¬if->cache_node); + kfree(notif); + } +} + +static inline bool io_notif_has_cached(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + if (likely(!list_empty(&ctx->notif_list))) + return true; + if (data_race(READ_ONCE(ctx->notif_locked_nr) <= IO_NOTIF_SPLICE_BATCH)) + return false; + io_notif_splice_cached(ctx); + return !list_empty(&ctx->notif_list); +} + struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx, struct io_notif_slot *slot) __must_hold(&ctx->uring_lock) { struct io_notif *notif; - notif = kzalloc(sizeof(*notif), GFP_ATOMIC | __GFP_ACCOUNT); - if (!notif) - return NULL; + if (likely(io_notif_has_cached(ctx))) { + notif = list_first_entry(&ctx->notif_list, + struct io_notif, cache_node); + list_del(¬if->cache_node); + } else { + notif = kzalloc(sizeof(*notif), GFP_ATOMIC | __GFP_ACCOUNT); + if (!notif) + return NULL; + /* pre-initialise some fields */ + notif->ctx = ctx; + notif->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; + notif->uarg.callback = io_uring_tx_zerocopy_callback; + } notif->seq = slot->seq++; notif->tag = slot->tag; - notif->ctx = ctx; - notif->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; - notif->uarg.callback = io_uring_tx_zerocopy_callback; /* master ref owned by io_notif_slot, will be dropped on flush */ refcount_set(¬if->uarg.refcnt, 1); percpu_ref_get(&ctx->refs); diff --git a/io_uring/notif.h b/io_uring/notif.h index 3d7a1d242e17..b23c9c0515bb 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -5,6 +5,8 @@ #include #include +#define IO_NOTIF_SPLICE_BATCH 32 + struct io_notif { struct ubuf_info uarg; struct io_ring_ctx *ctx; @@ -13,6 +15,8 @@ struct io_notif { u64 tag; /* see struct io_notif_slot::seq */ u32 seq; + /* hook into ctx->notif_list and ctx->notif_list_locked */ + struct list_head cache_node; union { struct callback_head task_work; @@ -41,6 +45,7 @@ struct io_notif_slot { }; int io_notif_unregister(struct io_ring_ctx *ctx); +void io_notif_cache_purge(struct io_ring_ctx *ctx); struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx, struct io_notif_slot *slot); -- cgit v1.2.3 From e58d498e81baa9fd8acf5132d8b2d4f829361f6b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:40 +0100 Subject: io_uring: complete notifiers in tw We need a task context to post CQEs but using wq is too expensive. Try to complete notifiers using task_work and fall back to wq if fails. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/089799ab665b10b78fdc614ae6d59fa7ef0d5f91.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/notif.c | 22 +++++++++++++++++++--- io_uring/notif.h | 3 +++ 2 files changed, 22 insertions(+), 3 deletions(-) (limited to 'io_uring') diff --git a/io_uring/notif.c b/io_uring/notif.c index b257db2120b4..aec74f88fc33 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -13,6 +13,11 @@ static void __io_notif_complete_tw(struct callback_head *cb) struct io_notif *notif = container_of(cb, struct io_notif, task_work); struct io_ring_ctx *ctx = notif->ctx; + if (likely(notif->task)) { + io_put_task(notif->task, 1); + notif->task = NULL; + } + io_cq_lock(ctx); io_fill_cqe_aux(ctx, notif->tag, 0, notif->seq, true); @@ -43,6 +48,14 @@ static void io_uring_tx_zerocopy_callback(struct sk_buff *skb, if (!refcount_dec_and_test(&uarg->refcnt)) return; + + if (likely(notif->task)) { + init_task_work(¬if->task_work, __io_notif_complete_tw); + if (likely(!task_work_add(notif->task, ¬if->task_work, + TWA_SIGNAL))) + return; + } + INIT_WORK(¬if->commit_work, io_notif_complete_wq); queue_work(system_unbound_wq, ¬if->commit_work); } @@ -134,12 +147,15 @@ __cold int io_notif_unregister(struct io_ring_ctx *ctx) for (i = 0; i < ctx->nr_notif_slots; i++) { struct io_notif_slot *slot = &ctx->notif_slots[i]; - if (slot->notif) - io_notif_slot_flush(slot); + if (!slot->notif) + continue; + if (WARN_ON_ONCE(slot->notif->task)) + slot->notif->task = NULL; + io_notif_slot_flush(slot); } kvfree(ctx->notif_slots); ctx->notif_slots = NULL; ctx->nr_notif_slots = 0; return 0; -} \ No newline at end of file +} diff --git a/io_uring/notif.h b/io_uring/notif.h index b23c9c0515bb..23ca7620fff9 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -11,6 +11,9 @@ struct io_notif { struct ubuf_info uarg; struct io_ring_ctx *ctx; + /* complete via tw if ->task is non-NULL, fallback to wq otherwise */ + struct task_struct *task; + /* cqe->user_data, io_notif_slot::tag if not overridden */ u64 tag; /* see struct io_notif_slot::seq */ -- cgit v1.2.3 From 68ef5578efc8893489400b1ec30af66dab4f75ff Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:41 +0100 Subject: io_uring: add rsrc referencing for notifiers In preparation to zerocopy sends with fixed buffers make notifiers to reference the rsrc node to protect the used fixed buffers. We can't just grab it for a send request as notifiers can likely outlive requests that used it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3cd7a01d26837945b6982fa9cf15a63230f2ed4f.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/notif.c | 5 +++++ io_uring/notif.h | 1 + io_uring/rsrc.h | 12 +++++++++--- 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'io_uring') diff --git a/io_uring/notif.c b/io_uring/notif.c index aec74f88fc33..0a2e98bd74f6 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -7,10 +7,12 @@ #include "io_uring.h" #include "notif.h" +#include "rsrc.h" static void __io_notif_complete_tw(struct callback_head *cb) { struct io_notif *notif = container_of(cb, struct io_notif, task_work); + struct io_rsrc_node *rsrc_node = notif->rsrc_node; struct io_ring_ctx *ctx = notif->ctx; if (likely(notif->task)) { @@ -25,6 +27,7 @@ static void __io_notif_complete_tw(struct callback_head *cb) ctx->notif_locked_nr++; io_cq_unlock_post(ctx); + io_rsrc_put_node(rsrc_node, 1); percpu_ref_put(&ctx->refs); } @@ -119,6 +122,8 @@ struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx, /* master ref owned by io_notif_slot, will be dropped on flush */ refcount_set(¬if->uarg.refcnt, 1); percpu_ref_get(&ctx->refs); + notif->rsrc_node = ctx->rsrc_node; + io_charge_rsrc_node(ctx); return notif; } diff --git a/io_uring/notif.h b/io_uring/notif.h index 23ca7620fff9..1dd48efb7744 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -10,6 +10,7 @@ struct io_notif { struct ubuf_info uarg; struct io_ring_ctx *ctx; + struct io_rsrc_node *rsrc_node; /* complete via tw if ->task is non-NULL, fallback to wq otherwise */ struct task_struct *task; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 87f58315b247..af342fd239d0 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -135,6 +135,13 @@ static inline void io_req_put_rsrc_locked(struct io_kiocb *req, } } +static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx) +{ + ctx->rsrc_cached_refs--; + if (unlikely(ctx->rsrc_cached_refs < 0)) + io_rsrc_refs_refill(ctx); +} + static inline void io_req_set_rsrc_node(struct io_kiocb *req, struct io_ring_ctx *ctx, unsigned int issue_flags) @@ -144,9 +151,8 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, if (!(issue_flags & IO_URING_F_UNLOCKED)) { lockdep_assert_held(&ctx->uring_lock); - ctx->rsrc_cached_refs--; - if (unlikely(ctx->rsrc_cached_refs < 0)) - io_rsrc_refs_refill(ctx); + + io_charge_rsrc_node(ctx); } else { percpu_ref_get(&req->rsrc_node->refs); } -- cgit v1.2.3 From bc24d6bd32df0be19df3d30e74be4ba56493c0e2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:42 +0100 Subject: io_uring: add notification slot registration Let the userspace to register and unregister notification slots. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a0aa8161fe3ebb2a4cc6e5dbd0cffb96e6881cf5.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 17 +++++++++++++++++ io_uring/io_uring.c | 9 +++++++++ io_uring/notif.c | 43 +++++++++++++++++++++++++++++++++++++++++++ io_uring/notif.h | 3 +++ 4 files changed, 72 insertions(+) (limited to 'io_uring') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 4c9b11e2e991..dcfc7a0bda0c 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -457,6 +457,10 @@ enum { /* register a range of fixed file slots for automatic slot allocation */ IORING_REGISTER_FILE_ALLOC_RANGE = 25, + /* zerocopy notification API */ + IORING_REGISTER_NOTIFIERS = 26, + IORING_UNREGISTER_NOTIFIERS = 27, + /* this goes last */ IORING_REGISTER_LAST }; @@ -503,6 +507,19 @@ struct io_uring_rsrc_update2 { __u32 resv2; }; +struct io_uring_notification_slot { + __u64 tag; + __u64 resv[3]; +}; + +struct io_uring_notification_register { + __u32 nr_slots; + __u32 resv; + __u64 resv2; + __u64 data; + __u64 resv3; +}; + /* Skip updating fd indexes set to this value in the fd table */ #define IORING_REGISTER_FILES_SKIP (-2) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 20e65d45ca1c..cae11374456e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3870,6 +3870,15 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_file_alloc_range(ctx, arg); break; + case IORING_REGISTER_NOTIFIERS: + ret = io_notif_register(ctx, arg, nr_args); + break; + case IORING_UNREGISTER_NOTIFIERS: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_notif_unregister(ctx); + break; default: ret = -EINVAL; break; diff --git a/io_uring/notif.c b/io_uring/notif.c index 0a2e98bd74f6..e6d98dc208c7 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -162,5 +162,48 @@ __cold int io_notif_unregister(struct io_ring_ctx *ctx) kvfree(ctx->notif_slots); ctx->notif_slots = NULL; ctx->nr_notif_slots = 0; + io_notif_cache_purge(ctx); + return 0; +} + +__cold int io_notif_register(struct io_ring_ctx *ctx, + void __user *arg, unsigned int size) + __must_hold(&ctx->uring_lock) +{ + struct io_uring_notification_slot __user *slots; + struct io_uring_notification_slot slot; + struct io_uring_notification_register reg; + unsigned i; + + if (ctx->nr_notif_slots) + return -EBUSY; + if (size != sizeof(reg)) + return -EINVAL; + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + if (!reg.nr_slots || reg.nr_slots > IORING_MAX_NOTIF_SLOTS) + return -EINVAL; + if (reg.resv || reg.resv2 || reg.resv3) + return -EINVAL; + + slots = u64_to_user_ptr(reg.data); + ctx->notif_slots = kvcalloc(reg.nr_slots, sizeof(ctx->notif_slots[0]), + GFP_KERNEL_ACCOUNT); + if (!ctx->notif_slots) + return -ENOMEM; + + for (i = 0; i < reg.nr_slots; i++, ctx->nr_notif_slots++) { + struct io_notif_slot *notif_slot = &ctx->notif_slots[i]; + + if (copy_from_user(&slot, &slots[i], sizeof(slot))) { + io_notif_unregister(ctx); + return -EFAULT; + } + if (slot.resv[0] | slot.resv[1] | slot.resv[2]) { + io_notif_unregister(ctx); + return -EINVAL; + } + notif_slot->tag = slot.tag; + } return 0; } diff --git a/io_uring/notif.h b/io_uring/notif.h index 1dd48efb7744..00efe164bdc4 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -6,6 +6,7 @@ #include #define IO_NOTIF_SPLICE_BATCH 32 +#define IORING_MAX_NOTIF_SLOTS (1U << 10) struct io_notif { struct ubuf_info uarg; @@ -48,6 +49,8 @@ struct io_notif_slot { u32 seq; }; +int io_notif_register(struct io_ring_ctx *ctx, + void __user *arg, unsigned int size); int io_notif_unregister(struct io_ring_ctx *ctx); void io_notif_cache_purge(struct io_ring_ctx *ctx); -- cgit v1.2.3 From 06a5464be84e4ae48394d34441baf34bf9706827 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:43 +0100 Subject: io_uring: wire send zc request type Add a new io_uring opcode IORING_OP_SENDZC. The main distinction from IORING_OP_SEND is that the user should specify a notification slot index in sqe::notification_idx and the buffers are safe to reuse only when the used notification is flushed and completes. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a80387c6a68ce9cf99b3b6ef6f71068468761fb7.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 5 +++ io_uring/net.c | 94 +++++++++++++++++++++++++++++++++++++++++++ io_uring/net.h | 3 ++ io_uring/opdef.c | 15 +++++++ 4 files changed, 117 insertions(+) (limited to 'io_uring') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index dcfc7a0bda0c..82bf2991e9bd 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -66,6 +66,10 @@ struct io_uring_sqe { union { __s32 splice_fd_in; __u32 file_index; + struct { + __u16 notification_idx; + __u16 __pad; + }; }; union { struct { @@ -197,6 +201,7 @@ enum io_uring_op { IORING_OP_GETXATTR, IORING_OP_SOCKET, IORING_OP_URING_CMD, + IORING_OP_SENDZC_NOTIF, /* this goes last, obviously */ IORING_OP_LAST, diff --git a/io_uring/net.c b/io_uring/net.c index bbc9c603641a..89a8678ce69b 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -14,6 +14,7 @@ #include "kbuf.h" #include "alloc_cache.h" #include "net.h" +#include "notif.h" #if defined(CONFIG_NET) struct io_shutdown { @@ -59,6 +60,15 @@ struct io_sr_msg { unsigned int flags; }; +struct io_sendzc { + struct file *file; + void __user *buf; + size_t len; + u16 slot_idx; + unsigned msg_flags; + unsigned flags; +}; + #define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -834,6 +844,90 @@ out_free: return ret; } +int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sendzc *zc = io_kiocb_to_cmd(req); + + if (READ_ONCE(sqe->addr2) || READ_ONCE(sqe->__pad2[0]) || + READ_ONCE(sqe->addr3)) + return -EINVAL; + + zc->flags = READ_ONCE(sqe->ioprio); + if (zc->flags & ~IORING_RECVSEND_POLL_FIRST) + return -EINVAL; + + zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); + zc->len = READ_ONCE(sqe->len); + zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; + zc->slot_idx = READ_ONCE(sqe->notification_idx); + if (zc->msg_flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + zc->msg_flags |= MSG_CMSG_COMPAT; +#endif + return 0; +} + +int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_sendzc *zc = io_kiocb_to_cmd(req); + struct io_notif_slot *notif_slot; + struct io_notif *notif; + struct msghdr msg; + struct iovec iov; + struct socket *sock; + unsigned msg_flags; + int ret, min_ret = 0; + + if (!(req->flags & REQ_F_POLLED) && + (zc->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + + if (issue_flags & IO_URING_F_UNLOCKED) + return -EAGAIN; + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + notif_slot = io_get_notif_slot(ctx, zc->slot_idx); + if (!notif_slot) + return -EINVAL; + notif = io_get_notif(ctx, notif_slot); + if (!notif) + return -ENOMEM; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + + ret = import_single_range(WRITE, zc->buf, zc->len, &iov, &msg.msg_iter); + if (unlikely(ret)) + return ret; + + msg_flags = zc->msg_flags | MSG_ZEROCOPY; + if (issue_flags & IO_URING_F_NONBLOCK) + msg_flags |= MSG_DONTWAIT; + if (msg_flags & MSG_WAITALL) + min_ret = iov_iter_count(&msg.msg_iter); + + msg.msg_flags = msg_flags; + msg.msg_ubuf = ¬if->uarg; + msg.sg_from_iter = NULL; + ret = sock_sendmsg(sock, &msg); + + if (unlikely(ret < min_ret)) { + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) + return -EAGAIN; + return ret == -ERESTARTSYS ? -EINTR : ret; + } + + io_req_set_res(req, ret, 0); + return IOU_OK; +} + int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_accept *accept = io_kiocb_to_cmd(req); diff --git a/io_uring/net.h b/io_uring/net.h index db20ce9d6546..7c438d39c089 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -52,6 +52,9 @@ int io_connect_prep_async(struct io_kiocb *req); int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_connect(struct io_kiocb *req, unsigned int issue_flags); +int io_sendzc(struct io_kiocb *req, unsigned int issue_flags); +int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); + void io_netmsg_cache_free(struct io_cache_entry *entry); #else static inline void io_netmsg_cache_free(struct io_cache_entry *entry) diff --git a/io_uring/opdef.c b/io_uring/opdef.c index a7b84b43e6c2..7ab19bbf3126 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -470,6 +470,21 @@ const struct io_op_def io_op_defs[] = { .issue = io_uring_cmd, .prep_async = io_uring_cmd_prep_async, }, + [IORING_OP_SENDZC_NOTIF] = { + .name = "SENDZC_NOTIF", + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .audit_skip = 1, + .ioprio = 1, +#if defined(CONFIG_NET) + .prep = io_sendzc_prep, + .issue = io_sendzc, +#else + .prep = io_eopnotsupp_prep, +#endif + + }, }; const char *io_uring_get_opcode(u8 opcode) -- cgit v1.2.3 From e29e3bd4b968d50bfb3bbdcee6bfdc340f7792cf Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:44 +0100 Subject: io_uring: account locked pages for non-fixed zc Fixed buffers are RLIMIT_MEMLOCK accounted, however it doesn't cover iovec based zerocopy sends. Do the accounting on the io_uring side. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/19b6e3975440f59f1f6199c7ee7acf977b4eecdc.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 1 + io_uring/notif.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index 89a8678ce69b..2d04a70b0632 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -906,6 +906,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) ret = import_single_range(WRITE, zc->buf, zc->len, &iov, &msg.msg_iter); if (unlikely(ret)) return ret; + mm_account_pinned_pages(¬if->uarg.mmp, zc->len); msg_flags = zc->msg_flags | MSG_ZEROCOPY; if (issue_flags & IO_URING_F_NONBLOCK) diff --git a/io_uring/notif.c b/io_uring/notif.c index e6d98dc208c7..c5179e5c1cd6 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -14,7 +14,13 @@ static void __io_notif_complete_tw(struct callback_head *cb) struct io_notif *notif = container_of(cb, struct io_notif, task_work); struct io_rsrc_node *rsrc_node = notif->rsrc_node; struct io_ring_ctx *ctx = notif->ctx; + struct mmpin *mmp = ¬if->uarg.mmp; + if (mmp->user) { + atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm); + free_uid(mmp->user); + mmp->user = NULL; + } if (likely(notif->task)) { io_put_task(notif->task, 1); notif->task = NULL; -- cgit v1.2.3 From 092aeedb750a9fad0f0252d6067fc91d76ca44bd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:45 +0100 Subject: io_uring: allow to pass addr into sendzc Allow to specify an address to zerocopy sends making it more like sendto(2). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/70417a8f7c5b51ab454690bae08adc0c187f89e8.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- io_uring/net.c | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'io_uring') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 82bf2991e9bd..0736e2773a5d 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -68,7 +68,7 @@ struct io_uring_sqe { __u32 file_index; struct { __u16 notification_idx; - __u16 __pad; + __u16 addr_len; }; }; union { diff --git a/io_uring/net.c b/io_uring/net.c index 2d04a70b0632..61414d865cd7 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -67,6 +67,8 @@ struct io_sendzc { u16 slot_idx; unsigned msg_flags; unsigned flags; + unsigned addr_len; + void __user *addr; }; #define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) @@ -848,8 +850,7 @@ int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sendzc *zc = io_kiocb_to_cmd(req); - if (READ_ONCE(sqe->addr2) || READ_ONCE(sqe->__pad2[0]) || - READ_ONCE(sqe->addr3)) + if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)) return -EINVAL; zc->flags = READ_ONCE(sqe->ioprio); @@ -862,6 +863,10 @@ int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) zc->slot_idx = READ_ONCE(sqe->notification_idx); if (zc->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; + + zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + zc->addr_len = READ_ONCE(sqe->addr_len); + #ifdef CONFIG_COMPAT if (req->ctx->compat) zc->msg_flags |= MSG_CMSG_COMPAT; @@ -871,6 +876,7 @@ int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) { + struct sockaddr_storage address; struct io_ring_ctx *ctx = req->ctx; struct io_sendzc *zc = io_kiocb_to_cmd(req); struct io_notif_slot *notif_slot; @@ -908,6 +914,14 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) return ret; mm_account_pinned_pages(¬if->uarg.mmp, zc->len); + if (zc->addr) { + ret = move_addr_to_kernel(zc->addr, zc->addr_len, &address); + if (unlikely(ret < 0)) + return ret; + msg.msg_name = (struct sockaddr *)&address; + msg.msg_namelen = zc->addr_len; + } + msg_flags = zc->msg_flags | MSG_ZEROCOPY; if (issue_flags & IO_URING_F_NONBLOCK) msg_flags |= MSG_DONTWAIT; -- cgit v1.2.3 From 10c7d33ecd51619e453cf6aeee8e326f8ba5cfea Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:46 +0100 Subject: io_uring: sendzc with fixed buffers Allow zerocopy sends to use fixed buffers. There is an optimisation for this case, the network layer don't need to reference the pages, see SKBFL_MANAGED_FRAG_REFS, so io_uring have to ensure validity of fixed buffers until the notifier is released. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e1d8bd1b5934e541d90c1824eb4020ae3f5f43f3.1657643355.git.asml.silence@gmail.com [axboe: fold in 32-bit pointer cast warning fix] Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 6 +++++- io_uring/net.c | 29 ++++++++++++++++++++++++----- 2 files changed, 29 insertions(+), 6 deletions(-) (limited to 'io_uring') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 0736e2773a5d..f1a9ff9b9ea7 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -272,9 +272,13 @@ enum io_uring_op { * IORING_RECV_MULTISHOT Multishot recv. Sets IORING_CQE_F_MORE if * the handler will continue to report * CQEs on behalf of the same SQE. + * + * IORING_RECVSEND_FIXED_BUF Use registered buffers, the index is stored in + * the buf_index field. */ #define IORING_RECVSEND_POLL_FIRST (1U << 0) -#define IORING_RECV_MULTISHOT (1U << 1) +#define IORING_RECV_MULTISHOT (1U << 1) +#define IORING_RECVSEND_FIXED_BUF (1U << 2) /* * accept flags stored in sqe->ioprio diff --git a/io_uring/net.c b/io_uring/net.c index 61414d865cd7..ab443c52dcfd 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -15,6 +15,7 @@ #include "alloc_cache.h" #include "net.h" #include "notif.h" +#include "rsrc.h" #if defined(CONFIG_NET) struct io_shutdown { @@ -849,13 +850,23 @@ out_free: int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sendzc *zc = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)) return -EINVAL; zc->flags = READ_ONCE(sqe->ioprio); - if (zc->flags & ~IORING_RECVSEND_POLL_FIRST) + if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)) return -EINVAL; + if (zc->flags & IORING_RECVSEND_FIXED_BUF) { + unsigned idx = READ_ONCE(sqe->buf_index); + + if (unlikely(idx >= ctx->nr_user_bufs)) + return -EFAULT; + idx = array_index_nospec(idx, ctx->nr_user_bufs); + req->imu = READ_ONCE(ctx->user_bufs[idx]); + io_req_set_rsrc_node(req, ctx, 0); + } zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); zc->len = READ_ONCE(sqe->len); @@ -909,10 +920,18 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) msg.msg_controllen = 0; msg.msg_namelen = 0; - ret = import_single_range(WRITE, zc->buf, zc->len, &iov, &msg.msg_iter); - if (unlikely(ret)) - return ret; - mm_account_pinned_pages(¬if->uarg.mmp, zc->len); + if (zc->flags & IORING_RECVSEND_FIXED_BUF) { + ret = io_import_fixed(WRITE, &msg.msg_iter, req->imu, + (u64)(uintptr_t)zc->buf, zc->len); + if (unlikely(ret)) + return ret; + } else { + ret = import_single_range(WRITE, zc->buf, zc->len, &iov, + &msg.msg_iter); + if (unlikely(ret)) + return ret; + mm_account_pinned_pages(¬if->uarg.mmp, zc->len); + } if (zc->addr) { ret = move_addr_to_kernel(zc->addr, zc->addr_len, &address); -- cgit v1.2.3 From 63809137ebb58f0aa2ce359117422686e3304f45 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:47 +0100 Subject: io_uring: flush notifiers after sendzc Allow to flush notifiers as a part of sendzc request by setting IORING_SENDZC_FLUSH flag. When the sendzc request succeedes it will flush the used [active] notifier. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e0b4d9a6797e2fd6092824fe42953db7a519bbc8.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 4 ++++ io_uring/io_uring.c | 11 +---------- io_uring/io_uring.h | 10 ++++++++++ io_uring/net.c | 5 ++++- io_uring/notif.c | 2 +- io_uring/notif.h | 11 +++++++++++ 6 files changed, 31 insertions(+), 12 deletions(-) (limited to 'io_uring') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index f1a9ff9b9ea7..45272eb37d10 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -275,10 +275,14 @@ enum io_uring_op { * * IORING_RECVSEND_FIXED_BUF Use registered buffers, the index is stored in * the buf_index field. + * + * IORING_RECVSEND_NOTIF_FLUSH Flush a notification after a successful + * successful. Only for zerocopy sends. */ #define IORING_RECVSEND_POLL_FIRST (1U << 0) #define IORING_RECV_MULTISHOT (1U << 1) #define IORING_RECVSEND_FIXED_BUF (1U << 2) +#define IORING_RECVSEND_NOTIF_FLUSH (1U << 3) /* * accept flags stored in sqe->ioprio diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index cae11374456e..1d600a63643b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -621,7 +621,7 @@ void __io_put_task(struct task_struct *task, int nr) put_task_struct_many(task, nr); } -static void io_task_refs_refill(struct io_uring_task *tctx) +void io_task_refs_refill(struct io_uring_task *tctx) { unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; @@ -630,15 +630,6 @@ static void io_task_refs_refill(struct io_uring_task *tctx) tctx->cached_refs += refill; } -static inline void io_get_task_refs(int nr) -{ - struct io_uring_task *tctx = current->io_uring; - - tctx->cached_refs -= nr; - if (unlikely(tctx->cached_refs < 0)) - io_task_refs_refill(tctx); -} - static __cold void io_uring_drop_tctx_refs(struct task_struct *task) { struct io_uring_task *tctx = task->io_uring; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 66bfd880d07f..cc81a9d1fd4d 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -74,6 +74,7 @@ void io_wq_submit_work(struct io_wq_work *work); void io_free_req(struct io_kiocb *req); void io_queue_next(struct io_kiocb *req); void __io_put_task(struct task_struct *task, int nr); +void io_task_refs_refill(struct io_uring_task *tctx); bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all); @@ -270,4 +271,13 @@ static inline void io_put_task(struct task_struct *task, int nr) __io_put_task(task, nr); } +static inline void io_get_task_refs(int nr) +{ + struct io_uring_task *tctx = current->io_uring; + + tctx->cached_refs -= nr; + if (unlikely(tctx->cached_refs < 0)) + io_task_refs_refill(tctx); +} + #endif diff --git a/io_uring/net.c b/io_uring/net.c index ab443c52dcfd..9ac2ce37c522 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -856,7 +856,8 @@ int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; zc->flags = READ_ONCE(sqe->ioprio); - if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)) + if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | + IORING_RECVSEND_FIXED_BUF | IORING_RECVSEND_NOTIF_FLUSH)) return -EINVAL; if (zc->flags & IORING_RECVSEND_FIXED_BUF) { unsigned idx = READ_ONCE(sqe->buf_index); @@ -958,6 +959,8 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) return ret == -ERESTARTSYS ? -EINTR : ret; } + if (zc->flags & IORING_RECVSEND_NOTIF_FLUSH) + io_notif_slot_flush_submit(notif_slot, 0); io_req_set_res(req, ret, 0); return IOU_OK; } diff --git a/io_uring/notif.c b/io_uring/notif.c index c5179e5c1cd6..a93887451bbb 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -133,7 +133,7 @@ struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx, return notif; } -static void io_notif_slot_flush(struct io_notif_slot *slot) +void io_notif_slot_flush(struct io_notif_slot *slot) __must_hold(&ctx->uring_lock) { struct io_notif *notif = slot->notif; diff --git a/io_uring/notif.h b/io_uring/notif.h index 00efe164bdc4..6cd73d7b965b 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -54,6 +54,7 @@ int io_notif_register(struct io_ring_ctx *ctx, int io_notif_unregister(struct io_ring_ctx *ctx); void io_notif_cache_purge(struct io_ring_ctx *ctx); +void io_notif_slot_flush(struct io_notif_slot *slot); struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx, struct io_notif_slot *slot); @@ -74,3 +75,13 @@ static inline struct io_notif_slot *io_get_notif_slot(struct io_ring_ctx *ctx, idx = array_index_nospec(idx, ctx->nr_notif_slots); return &ctx->notif_slots[idx]; } + +static inline void io_notif_slot_flush_submit(struct io_notif_slot *slot, + unsigned int issue_flags) +{ + if (!(issue_flags & IO_URING_F_UNLOCKED)) { + slot->notif->task = current; + io_get_task_refs(1); + } + io_notif_slot_flush(slot); +} -- cgit v1.2.3 From 4379d5f15b3fd4224c37841029178aa8082a242e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:48 +0100 Subject: io_uring: rename IORING_OP_FILES_UPDATE IORING_OP_FILES_UPDATE will be a more generic opcode serving different resource types, rename it into IORING_OP_RSRC_UPDATE and add subtype handling. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0a907133907d9af3415a8a7aa1802c6aa97c03c6.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 12 +++++++++++- io_uring/opdef.c | 9 +++++---- io_uring/rsrc.c | 17 +++++++++++++++-- io_uring/rsrc.h | 4 ++-- 4 files changed, 33 insertions(+), 9 deletions(-) (limited to 'io_uring') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 45272eb37d10..210a00ab6301 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -174,7 +174,8 @@ enum io_uring_op { IORING_OP_FALLOCATE, IORING_OP_OPENAT, IORING_OP_CLOSE, - IORING_OP_FILES_UPDATE, + IORING_OP_RSRC_UPDATE, + IORING_OP_FILES_UPDATE = IORING_OP_RSRC_UPDATE, IORING_OP_STATX, IORING_OP_READ, IORING_OP_WRITE, @@ -223,6 +224,7 @@ enum io_uring_op { #define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5) #define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME) #define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE) + /* * sqe->splice_flags * extends splice(2) flags @@ -289,6 +291,14 @@ enum io_uring_op { */ #define IORING_ACCEPT_MULTISHOT (1U << 0) + +/* + * IORING_OP_RSRC_UPDATE flags + */ +enum { + IORING_RSRC_UPDATE_FILES, +}; + /* * IORING_OP_MSG_RING command types, stored in sqe->addr */ diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 7ab19bbf3126..72dd2b2d8a9d 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -246,12 +246,13 @@ const struct io_op_def io_op_defs[] = { .prep = io_close_prep, .issue = io_close, }, - [IORING_OP_FILES_UPDATE] = { + [IORING_OP_RSRC_UPDATE] = { .audit_skip = 1, .iopoll = 1, - .name = "FILES_UPDATE", - .prep = io_files_update_prep, - .issue = io_files_update, + .name = "RSRC_UPDATE", + .prep = io_rsrc_update_prep, + .issue = io_rsrc_update, + .ioprio = 1, }, [IORING_OP_STATX] = { .audit_skip = 1, diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 7f66b0e25674..fc2b337e6c25 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -21,6 +21,7 @@ struct io_rsrc_update { u64 arg; u32 nr_args; u32 offset; + int type; }; static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, @@ -657,7 +658,7 @@ __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; } -int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_rsrc_update *up = io_kiocb_to_cmd(req); @@ -671,6 +672,7 @@ int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!up->nr_args) return -EINVAL; up->arg = READ_ONCE(sqe->addr); + up->type = READ_ONCE(sqe->ioprio); return 0; } @@ -713,7 +715,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req, return ret; } -int io_files_update(struct io_kiocb *req, unsigned int issue_flags) +static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_rsrc_update *up = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; @@ -742,6 +744,17 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } +int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_rsrc_update *up = io_kiocb_to_cmd(req); + + switch (up->type) { + case IORING_RSRC_UPDATE_FILES: + return io_files_update(req, issue_flags); + } + return -EINVAL; +} + int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, void *rsrc) { diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index af342fd239d0..21813a23215f 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -167,6 +167,6 @@ static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) return &data->tags[table_idx][off]; } -int io_files_update(struct io_kiocb *req, unsigned int issue_flags); -int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags); +int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); #endif -- cgit v1.2.3 From 492dddb4f6e3a5839c27d41ff1fecdbe6c3ab851 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:49 +0100 Subject: io_uring: add zc notification flush requests Overlay notification control onto IORING_OP_RSRC_UPDATE (former IORING_OP_FILES_UPDATE). It allows to flush a range of zc notifications from slots with indexes [sqe->off, sqe->off+sqe->len). If sqe->arg is not zero, it also copies sqe->arg as a new tag for all flushed notifications. Note, it doesn't flush a notification of a slot if there was no requests attached to it (since last flush or registration). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/df13e2363400682a73dd9e71c3b990b8d1ff0333.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + io_uring/rsrc.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) (limited to 'io_uring') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 210a00ab6301..1463cfecb56b 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -297,6 +297,7 @@ enum io_uring_op { */ enum { IORING_RSRC_UPDATE_FILES, + IORING_RSRC_UPDATE_NOTIF, }; /* diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index fc2b337e6c25..9165fdf64269 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -15,6 +15,7 @@ #include "io_uring.h" #include "openclose.h" #include "rsrc.h" +#include "notif.h" struct io_rsrc_update { struct file *file; @@ -744,6 +745,41 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } +static int io_notif_update(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_rsrc_update *up = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + unsigned len = up->nr_args; + unsigned idx_end, idx = up->offset; + int ret = 0; + + io_ring_submit_lock(ctx, issue_flags); + if (unlikely(check_add_overflow(idx, len, &idx_end))) { + ret = -EOVERFLOW; + goto out; + } + if (unlikely(idx_end > ctx->nr_notif_slots)) { + ret = -EINVAL; + goto out; + } + + for (; idx < idx_end; idx++) { + struct io_notif_slot *slot = &ctx->notif_slots[idx]; + + if (!slot->notif) + continue; + if (up->arg) + slot->tag = up->arg; + io_notif_slot_flush_submit(slot, issue_flags); + } +out: + io_ring_submit_unlock(ctx, issue_flags); + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_rsrc_update *up = io_kiocb_to_cmd(req); @@ -751,6 +787,8 @@ int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags) switch (up->type) { case IORING_RSRC_UPDATE_FILES: return io_files_update(req, issue_flags); + case IORING_RSRC_UPDATE_NOTIF: + return io_notif_update(req, issue_flags); } return -EINVAL; } -- cgit v1.2.3 From 3ff1a0d395c00e42ee15f561431e0c04097595b9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:50 +0100 Subject: io_uring: enable managed frags with register buffers io_uring's registered buffers infra has a good performant way of pinning pages, so let's use SKBFL_MANAGED_FRAG_REFS when our requests are purely register buffer backed. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/278731d3f20caf346cfc025fbee0b4c9ee4ed751.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index 9ac2ce37c522..62be89837d82 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -886,6 +886,60 @@ int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } +static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb, + struct iov_iter *from, size_t length) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + int frag = shinfo->nr_frags; + int ret = 0; + struct bvec_iter bi; + ssize_t copied = 0; + unsigned long truesize = 0; + + if (!shinfo->nr_frags) + shinfo->flags |= SKBFL_MANAGED_FRAG_REFS; + + if (!skb_zcopy_managed(skb) || !iov_iter_is_bvec(from)) { + skb_zcopy_downgrade_managed(skb); + return __zerocopy_sg_from_iter(NULL, sk, skb, from, length); + } + + bi.bi_size = min(from->count, length); + bi.bi_bvec_done = from->iov_offset; + bi.bi_idx = 0; + + while (bi.bi_size && frag < MAX_SKB_FRAGS) { + struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi); + + copied += v.bv_len; + truesize += PAGE_ALIGN(v.bv_len + v.bv_offset); + __skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page, + v.bv_offset, v.bv_len); + bvec_iter_advance_single(from->bvec, &bi, v.bv_len); + } + if (bi.bi_size) + ret = -EMSGSIZE; + + shinfo->nr_frags = frag; + from->bvec += bi.bi_idx; + from->nr_segs -= bi.bi_idx; + from->count = bi.bi_size; + from->iov_offset = bi.bi_bvec_done; + + skb->data_len += copied; + skb->len += copied; + skb->truesize += truesize; + + if (sk && sk->sk_type == SOCK_STREAM) { + sk_wmem_queued_add(sk, truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_charge(sk, truesize); + } else { + refcount_add(truesize, &skb->sk->sk_wmem_alloc); + } + return ret; +} + int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) { struct sockaddr_storage address; @@ -950,7 +1004,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) msg.msg_flags = msg_flags; msg.msg_ubuf = ¬if->uarg; - msg.sg_from_iter = NULL; + msg.sg_from_iter = io_sg_from_iter; ret = sock_sendmsg(sock, &msg); if (unlikely(ret < min_ret)) { -- cgit v1.2.3 From cb309ae49da7a7c28f0051deea13970291134fac Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 25 Jul 2022 10:52:03 +0100 Subject: io_uring/net: improve io_get_notif_slot types Don't use signed int for slot indexing. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e4d15aefebb5e55729dd9b5ec01ab16b70033343.1658742118.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/notif.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/notif.h b/io_uring/notif.h index 6cd73d7b965b..3e05d2cecb6f 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -67,7 +67,7 @@ static inline struct io_notif *io_get_notif(struct io_ring_ctx *ctx, } static inline struct io_notif_slot *io_get_notif_slot(struct io_ring_ctx *ctx, - int idx) + unsigned idx) __must_hold(&ctx->uring_lock) { if (idx >= ctx->nr_notif_slots) -- cgit v1.2.3 From 2e32ba5607ee2b668baa8831dd74f7cc867a1f7e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 25 Jul 2022 10:52:04 +0100 Subject: io_uring/net: checks errors of zc mem accounting mm_account_pinned_pages() may fail, don't ignore the return value. Fixes: e29e3bd4b968 ("io_uring: account locked pages for non-fixed zc") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/dae0542ed8e6706071bb83ad3e7ad6a70d207fd9.1658742118.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index 62be89837d82..8fb8469c3315 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -985,7 +985,9 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) &msg.msg_iter); if (unlikely(ret)) return ret; - mm_account_pinned_pages(¬if->uarg.mmp, zc->len); + ret = mm_account_pinned_pages(¬if->uarg.mmp, zc->len); + if (unlikely(ret)) + return ret; } if (zc->addr) { -- cgit v1.2.3 From 6a9ce66f4d0872861e0bbc67eee6ce5dca5dd886 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 25 Jul 2022 10:52:05 +0100 Subject: io_uring/net: make page accounting more consistent Make network page accounting more consistent with how buffer registration is working, i.e. account all memory to ctx->user. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4aacfe64bbb81b27f9ecf5d5c219c69a07e5aa56.1658742118.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 2 +- io_uring/notif.c | 9 ++++----- io_uring/notif.h | 19 +++++++++++++++++++ io_uring/rsrc.c | 12 ++++-------- io_uring/rsrc.h | 9 +++++++++ 5 files changed, 37 insertions(+), 14 deletions(-) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index 8fb8469c3315..c13d971c7826 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -985,7 +985,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) &msg.msg_iter); if (unlikely(ret)) return ret; - ret = mm_account_pinned_pages(¬if->uarg.mmp, zc->len); + ret = io_notif_account_mem(notif, zc->len); if (unlikely(ret)) return ret; } diff --git a/io_uring/notif.c b/io_uring/notif.c index a93887451bbb..e986a0ed958c 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -14,12 +14,10 @@ static void __io_notif_complete_tw(struct callback_head *cb) struct io_notif *notif = container_of(cb, struct io_notif, task_work); struct io_rsrc_node *rsrc_node = notif->rsrc_node; struct io_ring_ctx *ctx = notif->ctx; - struct mmpin *mmp = ¬if->uarg.mmp; - if (mmp->user) { - atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm); - free_uid(mmp->user); - mmp->user = NULL; + if (notif->account_pages && ctx->user) { + __io_unaccount_mem(ctx->user, notif->account_pages); + notif->account_pages = 0; } if (likely(notif->task)) { io_put_task(notif->task, 1); @@ -121,6 +119,7 @@ struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx, notif->ctx = ctx; notif->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; notif->uarg.callback = io_uring_tx_zerocopy_callback; + notif->account_pages = 0; } notif->seq = slot->seq++; diff --git a/io_uring/notif.h b/io_uring/notif.h index 3e05d2cecb6f..d6f366b1518b 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -5,6 +5,8 @@ #include #include +#include "rsrc.h" + #define IO_NOTIF_SPLICE_BATCH 32 #define IORING_MAX_NOTIF_SLOTS (1U << 10) @@ -23,6 +25,8 @@ struct io_notif { /* hook into ctx->notif_list and ctx->notif_list_locked */ struct list_head cache_node; + unsigned long account_pages; + union { struct callback_head task_work; struct work_struct commit_work; @@ -85,3 +89,18 @@ static inline void io_notif_slot_flush_submit(struct io_notif_slot *slot, } io_notif_slot_flush(slot); } + +static inline int io_notif_account_mem(struct io_notif *notif, unsigned len) +{ + struct io_ring_ctx *ctx = notif->ctx; + unsigned nr_pages = (len >> PAGE_SHIFT) + 2; + int ret; + + if (ctx->user) { + ret = __io_account_mem(ctx->user, nr_pages); + if (ret) + return ret; + notif->account_pages += nr_pages; + } + return 0; +} diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 9165fdf64269..59704b9ac537 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -44,17 +44,13 @@ void io_rsrc_refs_drop(struct io_ring_ctx *ctx) } } -static inline void __io_unaccount_mem(struct user_struct *user, - unsigned long nr_pages) -{ - atomic_long_sub(nr_pages, &user->locked_vm); -} - -static inline int __io_account_mem(struct user_struct *user, - unsigned long nr_pages) +int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; + if (!nr_pages) + return 0; + /* Don't allow more pages than we can safely lock */ page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 21813a23215f..f3a9a177941f 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -169,4 +169,13 @@ static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags); int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); + +int __io_account_mem(struct user_struct *user, unsigned long nr_pages); + +static inline void __io_unaccount_mem(struct user_struct *user, + unsigned long nr_pages) +{ + atomic_long_sub(nr_pages, &user->locked_vm); +} + #endif -- cgit v1.2.3 From 293402e564a7391f38541c7694e736f5fde20aea Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 25 Jul 2022 10:52:06 +0100 Subject: io_uring/net: use unsigned for flags Use unsigned int type for msg flags. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/5cfaed13d3191337b14b8664ca68b515d9e2d1b4.1658742118.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index c13d971c7826..8276b9537194 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -55,10 +55,10 @@ struct io_sr_msg { struct user_msghdr __user *umsg; void __user *buf; }; - int msg_flags; + unsigned msg_flags; + unsigned flags; size_t len; size_t done_io; - unsigned int flags; }; struct io_sendzc { -- cgit v1.2.3 From bd1a3783dd749012134b142b52e5704f7c142897 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 27 Jul 2022 10:30:40 +0100 Subject: io_uring: export req alloc from core We want to do request allocation out of the core io_uring code, make the allocation functions public for other io_uring parts. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0314fedd3a02a514210ba42d4720332538c65956.1658913593.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 22 +--------------------- io_uring/io_uring.h | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 21 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 1d600a63643b..822819d0f607 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -852,18 +852,13 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, spin_unlock(&ctx->completion_lock); } -static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) -{ - return !ctx->submit_state.free_list.next; -} - /* * A request might get retired back into the request caches even before opcode * handlers and io_issue_sqe() are done with it, e.g. inline completion path. * Because of that, io_alloc_req() should be called only under ->uring_lock * and with extra caution to not get a request that is still worked on. */ -static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) +__cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; @@ -904,21 +899,6 @@ static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) return true; } -static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) -{ - if (unlikely(io_req_cache_empty(ctx))) - return __io_alloc_req_refill(ctx); - return true; -} - -static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) -{ - struct io_wq_work_node *node; - - node = wq_stack_extract(&ctx->submit_state.free_list); - return container_of(node, struct io_kiocb, comp_list); -} - static inline void io_dismantle_req(struct io_kiocb *req) { unsigned int flags = req->flags; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index cc81a9d1fd4d..2f73f83af960 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -75,6 +75,7 @@ void io_free_req(struct io_kiocb *req); void io_queue_next(struct io_kiocb *req); void __io_put_task(struct task_struct *task, int nr); void io_task_refs_refill(struct io_uring_task *tctx); +bool __io_alloc_req_refill(struct io_ring_ctx *ctx); bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all); @@ -280,4 +281,24 @@ static inline void io_get_task_refs(int nr) io_task_refs_refill(tctx); } +static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) +{ + return !ctx->submit_state.free_list.next; +} + +static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) +{ + if (unlikely(io_req_cache_empty(ctx))) + return __io_alloc_req_refill(ctx); + return true; +} + +static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) +{ + struct io_wq_work_node *node; + + node = wq_stack_extract(&ctx->submit_state.free_list); + return container_of(node, struct io_kiocb, comp_list); +} + #endif -- cgit v1.2.3 From 14b146b688ad9593f5eee93d51a34d09a47e50b5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 27 Jul 2022 10:30:41 +0100 Subject: io_uring: notification completion optimisation We want to use all optimisations that we have for io_uring requests like completion batching, memory caching and more but for zc notifications. Fortunately, notification perfectly fit the request model so we can overlay them onto struct io_kiocb and use all the infratructure. Most of the fields of struct io_notif natively fits into io_kiocb, so we replace struct io_notif with struct io_kiocb carrying struct io_notif_data in the cmd cache line. Then we adapt io_alloc_notif() to use io_alloc_req()/io_alloc_req_refill(), and kill leftovers of hand coded caching. __io_notif_complete_tw() is converted to use io_uring's tw infra. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9e010125175e80baf51f0ca63bdc7cc6a4a9fa56.1658913593.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 7 -- io_uring/io_uring.c | 3 - io_uring/net.c | 4 +- io_uring/notif.c | 159 ++++++++++++++--------------------------- io_uring/notif.h | 42 ++++------- 5 files changed, 67 insertions(+), 148 deletions(-) (limited to 'io_uring') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 144493cbadb5..f7fab3758cb9 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -249,9 +249,6 @@ struct io_ring_ctx { struct xarray io_bl_xa; struct list_head io_buffers_cache; - /* struct io_notif cache, protected by uring_lock */ - struct list_head notif_list; - struct io_hash_table cancel_table_locked; struct list_head cq_overflow_list; struct io_alloc_cache apoll_cache; @@ -262,10 +259,6 @@ struct io_ring_ctx { struct io_wq_work_list locked_free_list; unsigned int locked_free_nr; - /* struct io_notif cache protected by completion_lock */ - struct list_head notif_list_locked; - unsigned int notif_locked_nr; - const struct cred *sq_creds; /* cred used for __io_sq_thread() */ struct io_sq_data *sq_data; /* if using sq thread polling */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 822819d0f607..b54218da075c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -321,8 +321,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_WQ_LIST(&ctx->locked_free_list); INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); INIT_WQ_LIST(&ctx->submit_state.compl_reqs); - INIT_LIST_HEAD(&ctx->notif_list); - INIT_LIST_HEAD(&ctx->notif_list_locked); return ctx; err: kfree(ctx->dummy_ubuf); @@ -2466,7 +2464,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); WARN_ON_ONCE(ctx->notif_slots || ctx->nr_notif_slots); - io_notif_cache_purge(ctx); io_mem_free(ctx->rings); io_mem_free(ctx->sq_sqes); diff --git a/io_uring/net.c b/io_uring/net.c index 8276b9537194..32fc3da04e41 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -946,7 +946,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *ctx = req->ctx; struct io_sendzc *zc = io_kiocb_to_cmd(req); struct io_notif_slot *notif_slot; - struct io_notif *notif; + struct io_kiocb *notif; struct msghdr msg; struct iovec iov; struct socket *sock; @@ -1005,7 +1005,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) min_ret = iov_iter_count(&msg.msg_iter); msg.msg_flags = msg_flags; - msg.msg_ubuf = ¬if->uarg; + msg.msg_ubuf = &io_notif_to_data(notif)->uarg; msg.sg_from_iter = io_sg_from_iter; ret = sock_sendmsg(sock, &msg); diff --git a/io_uring/notif.c b/io_uring/notif.c index e986a0ed958c..b5f989dff9de 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -9,140 +9,79 @@ #include "notif.h" #include "rsrc.h" -static void __io_notif_complete_tw(struct callback_head *cb) +static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked) { - struct io_notif *notif = container_of(cb, struct io_notif, task_work); - struct io_rsrc_node *rsrc_node = notif->rsrc_node; + struct io_notif_data *nd = io_notif_to_data(notif); struct io_ring_ctx *ctx = notif->ctx; - if (notif->account_pages && ctx->user) { - __io_unaccount_mem(ctx->user, notif->account_pages); - notif->account_pages = 0; + if (nd->account_pages && ctx->user) { + __io_unaccount_mem(ctx->user, nd->account_pages); + nd->account_pages = 0; } - if (likely(notif->task)) { - io_put_task(notif->task, 1); - notif->task = NULL; - } - - io_cq_lock(ctx); - io_fill_cqe_aux(ctx, notif->tag, 0, notif->seq, true); - - list_add(¬if->cache_node, &ctx->notif_list_locked); - ctx->notif_locked_nr++; - io_cq_unlock_post(ctx); - - io_rsrc_put_node(rsrc_node, 1); - percpu_ref_put(&ctx->refs); + io_req_task_complete(notif, locked); } -static inline void io_notif_complete(struct io_notif *notif) +static inline void io_notif_complete(struct io_kiocb *notif) + __must_hold(¬if->ctx->uring_lock) { - __io_notif_complete_tw(¬if->task_work); -} - -static void io_notif_complete_wq(struct work_struct *work) -{ - struct io_notif *notif = container_of(work, struct io_notif, commit_work); + bool locked = true; - io_notif_complete(notif); + __io_notif_complete_tw(notif, &locked); } static void io_uring_tx_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, bool success) { - struct io_notif *notif = container_of(uarg, struct io_notif, uarg); - - if (!refcount_dec_and_test(&uarg->refcnt)) - return; - - if (likely(notif->task)) { - init_task_work(¬if->task_work, __io_notif_complete_tw); - if (likely(!task_work_add(notif->task, ¬if->task_work, - TWA_SIGNAL))) - return; - } - - INIT_WORK(¬if->commit_work, io_notif_complete_wq); - queue_work(system_unbound_wq, ¬if->commit_work); -} - -static void io_notif_splice_cached(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - spin_lock(&ctx->completion_lock); - list_splice_init(&ctx->notif_list_locked, &ctx->notif_list); - ctx->notif_locked_nr = 0; - spin_unlock(&ctx->completion_lock); -} - -void io_notif_cache_purge(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - io_notif_splice_cached(ctx); + struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); + struct io_kiocb *notif = cmd_to_io_kiocb(nd); - while (!list_empty(&ctx->notif_list)) { - struct io_notif *notif = list_first_entry(&ctx->notif_list, - struct io_notif, cache_node); - - list_del(¬if->cache_node); - kfree(notif); + if (refcount_dec_and_test(&uarg->refcnt)) { + notif->io_task_work.func = __io_notif_complete_tw; + io_req_task_work_add(notif); } } -static inline bool io_notif_has_cached(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - if (likely(!list_empty(&ctx->notif_list))) - return true; - if (data_race(READ_ONCE(ctx->notif_locked_nr) <= IO_NOTIF_SPLICE_BATCH)) - return false; - io_notif_splice_cached(ctx); - return !list_empty(&ctx->notif_list); -} - -struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx, +struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx, struct io_notif_slot *slot) __must_hold(&ctx->uring_lock) { - struct io_notif *notif; - - if (likely(io_notif_has_cached(ctx))) { - notif = list_first_entry(&ctx->notif_list, - struct io_notif, cache_node); - list_del(¬if->cache_node); - } else { - notif = kzalloc(sizeof(*notif), GFP_ATOMIC | __GFP_ACCOUNT); - if (!notif) - return NULL; - /* pre-initialise some fields */ - notif->ctx = ctx; - notif->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; - notif->uarg.callback = io_uring_tx_zerocopy_callback; - notif->account_pages = 0; - } - - notif->seq = slot->seq++; - notif->tag = slot->tag; + struct io_kiocb *notif; + struct io_notif_data *nd; + + if (unlikely(!io_alloc_req_refill(ctx))) + return NULL; + notif = io_alloc_req(ctx); + notif->opcode = IORING_OP_NOP; + notif->flags = 0; + notif->file = NULL; + notif->task = current; + io_get_task_refs(1); + notif->rsrc_node = NULL; + io_req_set_rsrc_node(notif, ctx, 0); + notif->cqe.user_data = slot->tag; + notif->cqe.flags = slot->seq++; + notif->cqe.res = 0; + + nd = io_notif_to_data(notif); + nd->account_pages = 0; + nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; + nd->uarg.callback = io_uring_tx_zerocopy_callback; /* master ref owned by io_notif_slot, will be dropped on flush */ - refcount_set(¬if->uarg.refcnt, 1); - percpu_ref_get(&ctx->refs); - notif->rsrc_node = ctx->rsrc_node; - io_charge_rsrc_node(ctx); + refcount_set(&nd->uarg.refcnt, 1); return notif; } void io_notif_slot_flush(struct io_notif_slot *slot) __must_hold(&ctx->uring_lock) { - struct io_notif *notif = slot->notif; + struct io_kiocb *notif = slot->notif; + struct io_notif_data *nd = io_notif_to_data(notif); slot->notif = NULL; - if (WARN_ON_ONCE(in_interrupt())) - return; /* drop slot's master ref */ - if (refcount_dec_and_test(¬if->uarg.refcnt)) + if (refcount_dec_and_test(&nd->uarg.refcnt)) io_notif_complete(notif); } @@ -156,18 +95,22 @@ __cold int io_notif_unregister(struct io_ring_ctx *ctx) for (i = 0; i < ctx->nr_notif_slots; i++) { struct io_notif_slot *slot = &ctx->notif_slots[i]; + struct io_kiocb *notif = slot->notif; + struct io_notif_data *nd; - if (!slot->notif) + if (!notif) + continue; + nd = io_kiocb_to_cmd(notif); + slot->notif = NULL; + if (!refcount_dec_and_test(&nd->uarg.refcnt)) continue; - if (WARN_ON_ONCE(slot->notif->task)) - slot->notif->task = NULL; - io_notif_slot_flush(slot); + notif->io_task_work.func = __io_notif_complete_tw; + io_req_task_work_add(notif); } kvfree(ctx->notif_slots); ctx->notif_slots = NULL; ctx->nr_notif_slots = 0; - io_notif_cache_purge(ctx); return 0; } @@ -180,6 +123,8 @@ __cold int io_notif_register(struct io_ring_ctx *ctx, struct io_uring_notification_register reg; unsigned i; + BUILD_BUG_ON(sizeof(struct io_notif_data) > 64); + if (ctx->nr_notif_slots) return -EBUSY; if (size != sizeof(reg)) diff --git a/io_uring/notif.h b/io_uring/notif.h index d6f366b1518b..0819304d7e00 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -10,27 +10,10 @@ #define IO_NOTIF_SPLICE_BATCH 32 #define IORING_MAX_NOTIF_SLOTS (1U << 10) -struct io_notif { +struct io_notif_data { + struct file *file; struct ubuf_info uarg; - struct io_ring_ctx *ctx; - struct io_rsrc_node *rsrc_node; - - /* complete via tw if ->task is non-NULL, fallback to wq otherwise */ - struct task_struct *task; - - /* cqe->user_data, io_notif_slot::tag if not overridden */ - u64 tag; - /* see struct io_notif_slot::seq */ - u32 seq; - /* hook into ctx->notif_list and ctx->notif_list_locked */ - struct list_head cache_node; - unsigned long account_pages; - - union { - struct callback_head task_work; - struct work_struct commit_work; - }; }; struct io_notif_slot { @@ -39,7 +22,7 @@ struct io_notif_slot { * time and keeps one reference to it. Flush releases the reference and * lazily replaces it with a new notifier. */ - struct io_notif *notif; + struct io_kiocb *notif; /* * Default ->user_data for this slot notifiers CQEs @@ -56,13 +39,17 @@ struct io_notif_slot { int io_notif_register(struct io_ring_ctx *ctx, void __user *arg, unsigned int size); int io_notif_unregister(struct io_ring_ctx *ctx); -void io_notif_cache_purge(struct io_ring_ctx *ctx); void io_notif_slot_flush(struct io_notif_slot *slot); -struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx, +struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx, struct io_notif_slot *slot); -static inline struct io_notif *io_get_notif(struct io_ring_ctx *ctx, +static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif) +{ + return io_kiocb_to_cmd(notif); +} + +static inline struct io_kiocb *io_get_notif(struct io_ring_ctx *ctx, struct io_notif_slot *slot) { if (!slot->notif) @@ -83,16 +70,13 @@ static inline struct io_notif_slot *io_get_notif_slot(struct io_ring_ctx *ctx, static inline void io_notif_slot_flush_submit(struct io_notif_slot *slot, unsigned int issue_flags) { - if (!(issue_flags & IO_URING_F_UNLOCKED)) { - slot->notif->task = current; - io_get_task_refs(1); - } io_notif_slot_flush(slot); } -static inline int io_notif_account_mem(struct io_notif *notif, unsigned len) +static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len) { struct io_ring_ctx *ctx = notif->ctx; + struct io_notif_data *nd = io_notif_to_data(notif); unsigned nr_pages = (len >> PAGE_SHIFT) + 2; int ret; @@ -100,7 +84,7 @@ static inline int io_notif_account_mem(struct io_notif *notif, unsigned len) ret = __io_account_mem(ctx->user, nr_pages); if (ret) return ret; - notif->account_pages += nr_pages; + nd->account_pages += nr_pages; } return 0; } -- cgit v1.2.3