summaryrefslogtreecommitdiffstats
path: root/fs/io_uring.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--fs/io_uring.c1327
1 files changed, 770 insertions, 557 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0e8902be6b96..22e31050f33e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -245,6 +245,8 @@ struct io_sq_data {
struct task_struct *thread;
struct wait_queue_head wait;
+
+ unsigned sq_thread_idle;
};
struct io_ring_ctx {
@@ -285,7 +287,6 @@ struct io_ring_ctx {
struct list_head timeout_list;
struct list_head cq_overflow_list;
- wait_queue_head_t inflight_wait;
struct io_uring_sqe *sq_sqes;
} ____cacheline_aligned_in_smp;
@@ -310,7 +311,6 @@ struct io_ring_ctx {
struct io_sq_data *sq_data; /* if using sq thread polling */
struct wait_queue_head sqo_sq_wait;
- struct wait_queue_entry sqo_wait_entry;
struct list_head sqd_list;
/*
@@ -395,16 +395,18 @@ struct io_ring_ctx {
*/
struct io_poll_iocb {
struct file *file;
- union {
- struct wait_queue_head *head;
- u64 addr;
- };
+ struct wait_queue_head *head;
__poll_t events;
bool done;
bool canceled;
struct wait_queue_entry wait;
};
+struct io_poll_remove {
+ struct file *file;
+ u64 addr;
+};
+
struct io_close {
struct file *file;
struct file *put_file;
@@ -444,11 +446,17 @@ struct io_timeout {
u32 off;
u32 target_seq;
struct list_head list;
+ /* head of the link, used by linked timeouts only */
+ struct io_kiocb *head;
};
struct io_timeout_rem {
struct file *file;
u64 addr;
+
+ /* timeout update */
+ struct timespec64 ts;
+ u32 flags;
};
struct io_rw {
@@ -541,6 +549,27 @@ struct io_statx {
struct statx __user *buffer;
};
+struct io_shutdown {
+ struct file *file;
+ int how;
+};
+
+struct io_rename {
+ struct file *file;
+ int old_dfd;
+ int new_dfd;
+ struct filename *oldpath;
+ struct filename *newpath;
+ int flags;
+};
+
+struct io_unlink {
+ struct file *file;
+ int dfd;
+ int flags;
+ struct filename *filename;
+};
+
struct io_completion {
struct file *file;
struct list_head list;
@@ -575,7 +604,6 @@ enum {
REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
- REQ_F_LINK_HEAD_BIT,
REQ_F_FAIL_LINK_BIT,
REQ_F_INFLIGHT_BIT,
REQ_F_CUR_POS_BIT,
@@ -607,8 +635,6 @@ enum {
/* IOSQE_BUFFER_SELECT */
REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
- /* head of a link */
- REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT),
/* fail rest of links */
REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
/* on inflight list */
@@ -651,6 +677,7 @@ struct io_kiocb {
struct file *file;
struct io_rw rw;
struct io_poll_iocb poll;
+ struct io_poll_remove poll_remove;
struct io_accept accept;
struct io_sync sync;
struct io_cancel cancel;
@@ -667,6 +694,9 @@ struct io_kiocb {
struct io_splice splice;
struct io_provide_buf pbuf;
struct io_statx statx;
+ struct io_shutdown shutdown;
+ struct io_rename rename;
+ struct io_unlink unlink;
/* use only after cleaning per-op data, see io_clean_op() */
struct io_completion compl;
};
@@ -686,15 +716,14 @@ struct io_kiocb {
struct task_struct *task;
u64 user_data;
- struct list_head link_list;
+ struct io_kiocb *link;
+ struct percpu_ref *fixed_file_refs;
/*
* 1. used with ctx->iopoll_list with reads/writes
* 2. to track reqs with ->files (see io_op_def::file_table)
*/
struct list_head inflight_entry;
-
- struct percpu_ref *fixed_file_refs;
struct callback_head task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
struct hlist_node hash_node;
@@ -725,6 +754,8 @@ struct io_submit_state {
void *reqs[IO_IOPOLL_BATCH];
unsigned int free_reqs;
+ bool plug_started;
+
/*
* Batch completion logic
*/
@@ -735,7 +766,7 @@ struct io_submit_state {
*/
struct file *file;
unsigned int fd;
- unsigned int has_refs;
+ unsigned int file_refs;
unsigned int ios_left;
};
@@ -757,6 +788,8 @@ struct io_op_def {
unsigned buffer_select : 1;
/* must always have async data allocated */
unsigned needs_async_data : 1;
+ /* should block plug */
+ unsigned plug : 1;
/* size of async data needed, if any */
unsigned short async_size;
unsigned work_flags;
@@ -770,6 +803,7 @@ static const struct io_op_def io_op_defs[] = {
.pollin = 1,
.buffer_select = 1,
.needs_async_data = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
@@ -779,6 +813,7 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollout = 1,
.needs_async_data = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
IO_WQ_WORK_FSIZE,
@@ -791,6 +826,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
},
@@ -799,6 +835,7 @@ static const struct io_op_def io_op_defs[] = {
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE |
IO_WQ_WORK_MM,
@@ -818,8 +855,7 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_msghdr),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_RECVMSG] = {
.needs_file = 1,
@@ -828,15 +864,17 @@ static const struct io_op_def io_op_defs[] = {
.buffer_select = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_msghdr),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_TIMEOUT] = {
.needs_async_data = 1,
.async_size = sizeof(struct io_timeout_data),
.work_flags = IO_WQ_WORK_MM,
},
- [IORING_OP_TIMEOUT_REMOVE] = {},
+ [IORING_OP_TIMEOUT_REMOVE] = {
+ /* used by timeout updates' prep() */
+ .work_flags = IO_WQ_WORK_MM,
+ },
[IORING_OP_ACCEPT] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
@@ -863,7 +901,7 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_OPENAT] = {
.work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS,
+ IO_WQ_WORK_FS | IO_WQ_WORK_MM,
},
[IORING_OP_CLOSE] = {
.needs_file = 1,
@@ -882,6 +920,7 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
@@ -889,6 +928,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
IO_WQ_WORK_FSIZE,
@@ -915,7 +955,7 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_OPENAT2] = {
.work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS |
- IO_WQ_WORK_BLKCG,
+ IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
@@ -934,6 +974,17 @@ static const struct io_op_def io_op_defs[] = {
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
},
+ [IORING_OP_SHUTDOWN] = {
+ .needs_file = 1,
+ },
+ [IORING_OP_RENAMEAT] = {
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
+ IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
+ },
+ [IORING_OP_UNLINKAT] = {
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
+ IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
+ },
};
enum io_mem_account {
@@ -983,6 +1034,9 @@ struct sock *io_uring_get_socket(struct file *file)
}
EXPORT_SYMBOL(io_uring_get_socket);
+#define io_for_each_link(pos, head) \
+ for (pos = (head); pos; pos = pos->link)
+
static inline void io_clean_op(struct io_kiocb *req)
{
if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED |
@@ -990,8 +1044,39 @@ static inline void io_clean_op(struct io_kiocb *req)
__io_clean_op(req);
}
-static void io_sq_thread_drop_mm(void)
+static inline void io_set_resource_node(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!req->fixed_file_refs) {
+ req->fixed_file_refs = &ctx->file_data->node->refs;
+ percpu_ref_get(req->fixed_file_refs);
+ }
+}
+
+static bool io_match_task(struct io_kiocb *head,
+ struct task_struct *task,
+ struct files_struct *files)
{
+ struct io_kiocb *req;
+
+ if (task && head->task != task)
+ return false;
+ if (!files)
+ return true;
+
+ io_for_each_link(req, head) {
+ if ((req->flags & REQ_F_WORK_INITIALIZED) &&
+ (req->work.flags & IO_WQ_WORK_FILES) &&
+ req->work.identity->files == files)
+ return true;
+ }
+ return false;
+}
+
+static void io_sq_thread_drop_mm_files(void)
+{
+ struct files_struct *files = current->files;
struct mm_struct *mm = current->mm;
if (mm) {
@@ -999,6 +1084,41 @@ static void io_sq_thread_drop_mm(void)
mmput(mm);
current->mm = NULL;
}
+ if (files) {
+ struct nsproxy *nsproxy = current->nsproxy;
+
+ task_lock(current);
+ current->files = NULL;
+ current->nsproxy = NULL;
+ task_unlock(current);
+ put_files_struct(files);
+ put_nsproxy(nsproxy);
+ }
+}
+
+static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx)
+{
+ if (!current->files) {
+ struct files_struct *files;
+ struct nsproxy *nsproxy;
+
+ task_lock(ctx->sqo_task);
+ files = ctx->sqo_task->files;
+ if (!files) {
+ task_unlock(ctx->sqo_task);
+ return -EOWNERDEAD;
+ }
+ atomic_inc(&files->count);
+ get_nsproxy(ctx->sqo_task->nsproxy);
+ nsproxy = ctx->sqo_task->nsproxy;
+ task_unlock(ctx->sqo_task);
+
+ task_lock(current);
+ current->files = files;
+ current->nsproxy = nsproxy;
+ task_unlock(current);
+ }
+ return 0;
}
static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
@@ -1026,12 +1146,25 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
return -EFAULT;
}
-static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
+static int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
{
- if (!(io_op_defs[req->opcode].work_flags & IO_WQ_WORK_MM))
- return 0;
- return __io_sq_thread_acquire_mm(ctx);
+ const struct io_op_def *def = &io_op_defs[req->opcode];
+ int ret;
+
+ if (def->work_flags & IO_WQ_WORK_MM) {
+ ret = __io_sq_thread_acquire_mm(ctx);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) {
+ ret = __io_sq_thread_acquire_files(ctx);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ return 0;
}
static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx,
@@ -1174,7 +1307,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->iopoll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
- init_waitqueue_head(&ctx->inflight_wait);
spin_lock_init(&ctx->inflight_lock);
INIT_LIST_HEAD(&ctx->inflight_list);
INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
@@ -1416,10 +1548,8 @@ static void io_prep_async_link(struct io_kiocb *req)
{
struct io_kiocb *cur;
- io_prep_async_work(req);
- if (req->flags & REQ_F_LINK_HEAD)
- list_for_each_entry(cur, &req->link_list, link_list)
- io_prep_async_work(cur);
+ io_for_each_link(cur, req)
+ io_prep_async_work(cur);
}
static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
@@ -1460,30 +1590,18 @@ static void io_kill_timeout(struct io_kiocb *req)
}
}
-static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- if (!tsk || req->task == tsk)
- return true;
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- if (ctx->sq_data && req->task == ctx->sq_data->thread)
- return true;
- }
- return false;
-}
-
/*
* Returns true if we found and killed one or more timeouts
*/
-static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk)
+static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
+ struct files_struct *files)
{
struct io_kiocb *req, *tmp;
int canceled = 0;
spin_lock_irq(&ctx->completion_lock);
list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
- if (io_task_match(req, tsk)) {
+ if (io_match_task(req, tsk, files)) {
io_kill_timeout(req);
canceled++;
}
@@ -1594,32 +1712,6 @@ static void io_cqring_mark_overflow(struct io_ring_ctx *ctx)
}
}
-static inline bool __io_match_files(struct io_kiocb *req,
- struct files_struct *files)
-{
- return ((req->flags & REQ_F_WORK_INITIALIZED) &&
- (req->work.flags & IO_WQ_WORK_FILES)) &&
- req->work.identity->files == files;
-}
-
-static bool io_match_files(struct io_kiocb *req,
- struct files_struct *files)
-{
- struct io_kiocb *link;
-
- if (!files)
- return true;
- if (__io_match_files(req, files))
- return true;
- if (req->flags & REQ_F_LINK_HEAD) {
- list_for_each_entry(link, &req->link_list, link_list) {
- if (__io_match_files(link, files))
- return true;
- }
- }
- return false;
-}
-
/* Returns true if there are no backlogged entries after the flush */
static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
struct task_struct *tsk,
@@ -1647,9 +1739,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
cqe = NULL;
list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
- if (tsk && req->task != tsk)
- continue;
- if (!io_match_files(req, files))
+ if (!io_match_task(req, tsk, files))
continue;
cqe = io_get_cqring(ctx);
@@ -1845,9 +1935,7 @@ fallback:
static inline void io_put_file(struct io_kiocb *req, struct file *file,
bool fixed)
{
- if (fixed)
- percpu_ref_put(req->fixed_file_refs);
- else
+ if (!fixed)
fput(file);
}
@@ -1859,7 +1947,8 @@ static void io_dismantle_req(struct io_kiocb *req)
kfree(req->async_data);
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
-
+ if (req->fixed_file_refs)
+ percpu_ref_put(req->fixed_file_refs);
io_req_clean_work(req);
}
@@ -1882,6 +1971,14 @@ static void __io_free_req(struct io_kiocb *req)
percpu_ref_put(&ctx->refs);
}
+static inline void io_remove_next_linked(struct io_kiocb *req)
+{
+ struct io_kiocb *nxt = req->link;
+
+ req->link = nxt->link;
+ nxt->link = NULL;
+}
+
static void io_kill_linked_timeout(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -1890,8 +1987,8 @@ static void io_kill_linked_timeout(struct io_kiocb *req)
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
- link = list_first_entry_or_null(&req->link_list, struct io_kiocb,
- link_list);
+ link = req->link;
+
/*
* Can happen if a linked timeout fired and link had been like
* req -> link t-out -> link t-out [-> ...]
@@ -1900,7 +1997,8 @@ static void io_kill_linked_timeout(struct io_kiocb *req)
struct io_timeout_data *io = link->async_data;
int ret;
- list_del_init(&link->link_list);
+ io_remove_next_linked(req);
+ link->timeout.head = NULL;
ret = hrtimer_try_to_cancel(&io->timer);
if (ret != -1) {
io_cqring_fill_event(link, -ECANCELED);
@@ -1917,41 +2015,22 @@ static void io_kill_linked_timeout(struct io_kiocb *req)
}
}
-static struct io_kiocb *io_req_link_next(struct io_kiocb *req)
-{
- struct io_kiocb *nxt;
- /*
- * The list should never be empty when we are called here. But could
- * potentially happen if the chain is messed up, check to be on the
- * safe side.
- */
- if (unlikely(list_empty(&req->link_list)))
- return NULL;
-
- nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list);
- list_del_init(&req->link_list);
- if (!list_empty(&nxt->link_list))
- nxt->flags |= REQ_F_LINK_HEAD;
- return nxt;
-}
-
-/*
- * Called if REQ_F_LINK_HEAD is set, and we fail the head request
- */
static void io_fail_links(struct io_kiocb *req)
{
+ struct io_kiocb *link, *nxt;
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
- while (!list_empty(&req->link_list)) {
- struct io_kiocb *link = list_first_entry(&req->link_list,
- struct io_kiocb, link_list);
+ link = req->link;
+ req->link = NULL;
- list_del_init(&link->link_list);
- trace_io_uring_fail_link(req, link);
+ while (link) {
+ nxt = link->link;
+ link->link = NULL;
+ trace_io_uring_fail_link(req, link);
io_cqring_fill_event(link, -ECANCELED);
/*
@@ -1963,8 +2042,8 @@ static void io_fail_links(struct io_kiocb *req)
io_put_req_deferred(link, 2);
else
io_double_put_req(link);
+ link = nxt;
}
-
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
@@ -1973,7 +2052,6 @@ static void io_fail_links(struct io_kiocb *req)
static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
{
- req->flags &= ~REQ_F_LINK_HEAD;
if (req->flags & REQ_F_LINK_TIMEOUT)
io_kill_linked_timeout(req);
@@ -1983,15 +2061,19 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
* dependencies to the next request. In case of failure, fail the rest
* of the chain.
*/
- if (likely(!(req->flags & REQ_F_FAIL_LINK)))
- return io_req_link_next(req);
+ if (likely(!(req->flags & REQ_F_FAIL_LINK))) {
+ struct io_kiocb *nxt = req->link;
+
+ req->link = NULL;
+ return nxt;
+ }
io_fail_links(req);
return NULL;
}
-static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
+static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{
- if (likely(!(req->flags & REQ_F_LINK_HEAD)))
+ if (likely(!(req->link) && !(req->flags & REQ_F_LINK_TIMEOUT)))
return NULL;
return __io_req_find_next(req);
}
@@ -2050,7 +2132,8 @@ static void __io_req_task_submit(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- if (!__io_sq_thread_acquire_mm(ctx)) {
+ if (!__io_sq_thread_acquire_mm(ctx) &&
+ !__io_sq_thread_acquire_files(ctx)) {
mutex_lock(&ctx->uring_lock);
__io_queue_sqe(req, NULL);
mutex_unlock(&ctx->uring_lock);
@@ -2086,7 +2169,7 @@ static void io_req_task_queue(struct io_kiocb *req)
}
}
-static void io_queue_next(struct io_kiocb *req)
+static inline void io_queue_next(struct io_kiocb *req)
{
struct io_kiocb *nxt = io_req_find_next(req);
@@ -2143,8 +2226,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
io_free_req(req);
return;
}
- if (req->flags & REQ_F_LINK_HEAD)
- io_queue_next(req);
+ io_queue_next(req);
if (req->task != rb->task) {
if (rb->task) {
@@ -2246,7 +2328,7 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
* we wake up the task, and the next invocation will flush the
* entries. We cannot safely to it from here.
*/
- if (noflush && !list_empty(&ctx->cq_overflow_list))
+ if (noflush)
return -1U;
io_cqring_overflow_flush(ctx, false, NULL, NULL);
@@ -2593,7 +2675,7 @@ static bool io_rw_reissue(struct io_kiocb *req, long res)
if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
return false;
- ret = io_sq_thread_acquire_mm(req->ctx, req);
+ ret = io_sq_thread_acquire_mm_files(req->ctx, req);
if (io_resubmit_prep(req, ret)) {
refcount_inc(&req->refs);
@@ -2641,7 +2723,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
* find it from a io_iopoll_getevents() thread before the issuer is done
* accessing the kiocb cookie.
*/
-static void io_iopoll_req_issued(struct io_kiocb *req)
+static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -2670,21 +2752,25 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
else
list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
- if ((ctx->flags & IORING_SETUP_SQPOLL) &&
+ /*
+ * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread
+ * task context or in io worker task context. If current task context is
+ * sq thread, we don't need to check whether should wake up sq thread.
+ */
+ if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) &&
wq_has_sleeper(&ctx->sq_data->wait))
wake_up(&ctx->sq_data->wait);
}
-static void __io_state_file_put(struct io_submit_state *state)
+static inline void __io_state_file_put(struct io_submit_state *state)
{
- if (state->has_refs)
- fput_many(state->file, state->has_refs);
- state->file = NULL;
+ fput_many(state->file, state->file_refs);
+ state->file_refs = 0;
}
static inline void io_state_file_put(struct io_submit_state *state)
{
- if (state->file)
+ if (state->file_refs)
__io_state_file_put(state);
}
@@ -2698,19 +2784,19 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
if (!state)
return fget(fd);
- if (state->file) {
+ if (state->file_refs) {
if (state->fd == fd) {
- state->has_refs--;
+ state->file_refs--;
return state->file;
}
__io_state_file_put(state);
}
state->file = fget_many(fd, state->ios_left);
- if (!state->file)
+ if (unlikely(!state->file))
return NULL;
state->fd = fd;
- state->has_refs = state->ios_left - 1;
+ state->file_refs = state->ios_left - 1;
return state->file;
}
@@ -3065,7 +3151,7 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
return __io_iov_buffer_select(req, iov, needs_lock);
}
-static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
+static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
struct iovec **iovec, struct iov_iter *iter,
bool needs_lock)
{
@@ -3094,7 +3180,7 @@ static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
*iovec = NULL;
- return ret < 0 ? ret : sqe_len;
+ return ret;
}
if (req->flags & REQ_F_BUFFER_SELECT) {
@@ -3111,18 +3197,6 @@ static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
req->ctx->compat);
}
-static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
- struct iovec **iovec, struct iov_iter *iter,
- bool needs_lock)
-{
- struct io_async_rw *iorw = req->async_data;
-
- if (!iorw)
- return __io_import_iovec(rw, req, iovec, iter, needs_lock);
- *iovec = NULL;
- return iov_iter_count(&iorw->iter);
-}
-
static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
{
return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
@@ -3246,7 +3320,7 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
struct iovec *iov = iorw->fast_iov;
ssize_t ret;
- ret = __io_import_iovec(rw, req, &iov, &iorw->iter, false);
+ ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
if (unlikely(ret < 0))
return ret;
@@ -3379,17 +3453,17 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
struct iov_iter __iter, *iter = &__iter;
struct io_async_rw *rw = req->async_data;
ssize_t io_size, ret, ret2;
- size_t iov_count;
bool no_async;
- if (rw)
+ if (rw) {
iter = &rw->iter;
-
- ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
- if (ret < 0)
- return ret;
- iov_count = iov_iter_count(iter);
- io_size = ret;
+ iovec = NULL;
+ } else {
+ ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
+ if (ret < 0)
+ return ret;
+ }
+ io_size = iov_iter_count(iter);
req->result = io_size;
ret = 0;
@@ -3405,7 +3479,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
if (no_async)
goto copy_iov;
- ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count);
+ ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
if (unlikely(ret))
goto out_free;
@@ -3424,7 +3498,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
if (req->file->f_flags & O_NONBLOCK)
goto done;
/* some cases will consume bytes even on error returns */
- iov_iter_revert(iter, iov_count - iov_iter_count(iter));
+ iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = 0;
goto copy_iov;
} else if (ret < 0) {
@@ -3507,17 +3581,17 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
struct kiocb *kiocb = &req->rw.kiocb;
struct iov_iter __iter, *iter = &__iter;
struct io_async_rw *rw = req->async_data;
- size_t iov_count;
ssize_t ret, ret2, io_size;
- if (rw)
+ if (rw) {
iter = &rw->iter;
-
- ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
- if (ret < 0)
- return ret;
- iov_count = iov_iter_count(iter);
- io_size = ret;
+ iovec = NULL;
+ } else {
+ ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
+ if (ret < 0)
+ return ret;
+ }
+ io_size = iov_iter_count(iter);
req->result = io_size;
/* Ensure we clear previously set non-block flag */
@@ -3535,7 +3609,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
(req->flags & REQ_F_ISREG))
goto copy_iov;
- ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count);
+ ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
if (unlikely(ret))
goto out_free;
@@ -3578,7 +3652,7 @@ done:
} else {
copy_iov:
/* some cases will consume bytes even on error returns */
- iov_iter_revert(iter, iov_count - iov_iter_count(iter));
+ iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
if (!ret)
return -EAGAIN;
@@ -3590,6 +3664,135 @@ out_free:
return ret;
}
+static int io_renameat_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_rename *ren = &req->rename;
+ const char __user *oldf, *newf;
+
+ if (unlikely(req->flags & REQ_F_FIXED_FILE))
+ return -EBADF;
+
+ ren->old_dfd = READ_ONCE(sqe->fd);
+ oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+ ren->new_dfd = READ_ONCE(sqe->len);
+ ren->flags = READ_ONCE(sqe->rename_flags);
+
+ ren->oldpath = getname(oldf);
+ if (IS_ERR(ren->oldpath))
+ return PTR_ERR(ren->oldpath);
+
+ ren->newpath = getname(newf);
+ if (IS_ERR(ren->newpath)) {
+ putname(ren->oldpath);
+ return PTR_ERR(ren->newpath);
+ }
+
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return 0;
+}
+
+static int io_renameat(struct io_kiocb *req, bool force_nonblock)
+{
+ struct io_rename *ren = &req->rename;
+ int ret;
+
+ if (force_nonblock)
+ return -EAGAIN;
+
+ ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
+ ren->newpath, ren->flags);
+
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_req_complete(req, ret);
+ return 0;
+}
+
+static int io_unlinkat_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_unlink *un = &req->unlink;
+ const char __user *fname;
+
+ if (unlikely(req->flags & REQ_F_FIXED_FILE))
+ return -EBADF;
+
+ un->dfd = READ_ONCE(sqe->fd);
+
+ un->flags = READ_ONCE(sqe->unlink_flags);
+ if (un->flags & ~AT_REMOVEDIR)
+ return -EINVAL;
+
+ fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ un->filename = getname(fname);
+ if (IS_ERR(un->filename))
+ return PTR_ERR(un->filename);
+
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return 0;
+}
+
+static int io_unlinkat(struct io_kiocb *req, bool force_nonblock)
+{
+ struct io_unlink *un = &req->unlink;
+ int ret;
+
+ if (force_nonblock)
+ return -EAGAIN;
+
+ if (un->flags & AT_REMOVEDIR)
+ ret = do_rmdir(un->dfd, un->filename);
+ else
+ ret = do_unlinkat(un->dfd, un->filename);
+
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_req_complete(req, ret);
+ return 0;
+}
+
+static int io_shutdown_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+#if defined(CONFIG_NET)
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
+ sqe->buf_index)
+ return -EINVAL;
+
+ req->shutdown.how = READ_ONCE(sqe->len);
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static int io_shutdown(struct io_kiocb *req, bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+ struct socket *sock;
+ int ret;
+
+ if (force_nonblock)
+ return -EAGAIN;
+
+ sock = sock_from_file(req->file);
+ if (unlikely(!sock))
+ return -ENOTSOCK;
+
+ ret = __sys_shutdown_sock(sock, req->shutdown.how);
+ io_req_complete(req, ret);
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
static int __io_splice_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
@@ -3804,7 +4007,7 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
u64 flags, mode;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
mode = READ_ONCE(sqe->len);
flags = READ_ONCE(sqe->open_flags);
@@ -3818,7 +4021,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
size_t len;
int ret;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
len = READ_ONCE(sqe->len);
@@ -3948,11 +4151,17 @@ static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock,
head = idr_find(&ctx->io_buffer_idr, p->bgid);
if (head)
ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
-
- io_ring_submit_lock(ctx, !force_nonblock);
if (ret < 0)
req_set_fail_links(req);
- __io_req_complete(req, ret, 0, cs);
+
+ /* need to hold the lock to complete IOPOLL requests */
+ if (ctx->flags & IORING_SETUP_IOPOLL) {
+ __io_req_complete(req, ret, 0, cs);
+ io_ring_submit_unlock(ctx, !force_nonblock);
+ } else {
+ io_ring_submit_unlock(ctx, !force_nonblock);
+ __io_req_complete(req, ret, 0, cs);
+ }
return 0;
}
@@ -4037,10 +4246,17 @@ static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock,
}
}
out:
- io_ring_submit_unlock(ctx, !force_nonblock);
if (ret < 0)
req_set_fail_links(req);
- __io_req_complete(req, ret, 0, cs);
+
+ /* need to hold the lock to complete IOPOLL requests */
+ if (ctx->flags & IORING_SETUP_IOPOLL) {
+ __io_req_complete(req, ret, 0, cs);
+ io_ring_submit_unlock(ctx, !force_nonblock);
+ } else {
+ io_ring_submit_unlock(ctx, !force_nonblock);
+ __io_req_complete(req, ret, 0, cs);
+ }
return 0;
}
@@ -4212,7 +4428,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
io_req_init_async(req);
req->work.flags |= IO_WQ_WORK_NO_CANCEL;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
sqe->rw_flags || sqe->buf_index)
@@ -4694,7 +4910,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_accept *accept = &req->accept;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->len || sqe->buf_index)
return -EINVAL;
@@ -4735,7 +4951,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_connect *conn = &req->connect;
struct io_async_connect *io = req->async_data;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
return -EINVAL;
@@ -5270,7 +5486,8 @@ static bool io_poll_remove_one(struct io_kiocb *req)
/*
* Returns true if we found and killed one or more poll requests
*/
-static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk)
+static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
+ struct files_struct *files)
{
struct hlist_node *tmp;
struct io_kiocb *req;
@@ -5282,7 +5499,7 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk)
list = &ctx->cancel_hash[i];
hlist_for_each_entry_safe(req, tmp, list, hash_node) {
- if (io_task_match(req, tsk))
+ if (io_match_task(req, tsk, files))
posted += io_poll_remove_one(req);
}
}
@@ -5320,7 +5537,7 @@ static int io_poll_remove_prep(struct io_kiocb *req,
sqe->poll_events)
return -EINVAL;
- req->poll.addr = READ_ONCE(sqe->addr);
+ req->poll_remove.addr = READ_ONCE(sqe->addr);
return 0;
}
@@ -5331,12 +5548,10 @@ static int io_poll_remove_prep(struct io_kiocb *req,
static int io_poll_remove(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- u64 addr;
int ret;
- addr = req->poll.addr;
spin_lock_irq(&ctx->completion_lock);
- ret = io_poll_cancel(ctx, addr);
+ ret = io_poll_cancel(ctx, req->poll_remove.addr);
spin_unlock_irq(&ctx->completion_lock);
if (ret < 0)
@@ -5429,15 +5644,37 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static int __io_timeout_cancel(struct io_kiocb *req)
+static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
+ __u64 user_data)
{
- struct io_timeout_data *io = req->async_data;
- int ret;
+ struct io_timeout_data *io;
+ struct io_kiocb *req;
+ int ret = -ENOENT;
+ list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
+ if (user_data == req->user_data) {
+ ret = 0;
+ break;
+ }
+ }
+
+ if (ret == -ENOENT)
+ return ERR_PTR(ret);
+
+ io = req->async_data;
ret = hrtimer_try_to_cancel(&io->timer);
if (ret == -1)
- return -EALREADY;
+ return ERR_PTR(-EALREADY);
list_del_init(&req->timeout.list);
+ return req;
+}
+
+static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
+{
+ struct io_kiocb *req = io_timeout_extract(ctx, user_data);
+
+ if (IS_ERR(req))
+ return PTR_ERR(req);
req_set_fail_links(req);
io_cqring_fill_event(req, -ECANCELED);
@@ -5445,35 +5682,48 @@ static int __io_timeout_cancel(struct io_kiocb *req)
return 0;
}
-static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
+static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
+ struct timespec64 *ts, enum hrtimer_mode mode)
{
- struct io_kiocb *req;
- int ret = -ENOENT;
-
- list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
- if (user_data == req->user_data) {
- ret = 0;
- break;
- }
- }
+ struct io_kiocb *req = io_timeout_extract(ctx, user_data);
+ struct io_timeout_data *data;
- if (ret == -ENOENT)
- return ret;
+ if (IS_ERR(req))
+ return PTR_ERR(req);
- return __io_timeout_cancel(req);
+ req->timeout.off = 0; /* noseq */
+ data = req->async_data;
+ list_add_tail(&req->timeout.list, &ctx->timeout_list);
+ hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode);
+ data->timer.function = io_timeout_fn;
+ hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
+ return 0;
}
static int io_timeout_remove_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
+ struct io_timeout_rem *tr = &req->timeout_rem;
+
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->timeout_flags)
+ if (sqe->ioprio || sqe->buf_index || sqe->len)
return -EINVAL;
- req->timeout_rem.addr = READ_ONCE(sqe->addr);
+ tr->addr = READ_ONCE(sqe->addr);
+ tr->flags = READ_ONCE(sqe->timeout_flags);
+ if (tr->flags & IORING_TIMEOUT_UPDATE) {
+ if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS))
+ return -EINVAL;
+ if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
+ return -EFAULT;
+ } else if (tr->flags) {
+ /* timeout removal doesn't support flags */
+ return -EINVAL;
+ }
+
return 0;
}
@@ -5482,11 +5732,19 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
*/
static int io_timeout_remove(struct io_kiocb *req)
{
+ struct io_timeout_rem *tr = &req->timeout_rem;
struct io_ring_ctx *ctx = req->ctx;
int ret;
spin_lock_irq(&ctx->completion_lock);
- ret = io_timeout_cancel(ctx, req->timeout_rem.addr);
+ if (req->timeout_rem.flags & IORING_TIMEOUT_UPDATE) {
+ enum hrtimer_mode mode = (tr->flags & IORING_TIMEOUT_ABS)
+ ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
+
+ ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
+ } else {
+ ret = io_timeout_cancel(ctx, tr->addr);
+ }
io_cqring_fill_event(req, ret);
io_commit_cqring(ctx);
@@ -5766,6 +6024,12 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return io_remove_buffers_prep(req, sqe);
case IORING_OP_TEE:
return io_tee_prep(req, sqe);
+ case IORING_OP_SHUTDOWN:
+ return io_shutdown_prep(req, sqe);
+ case IORING_OP_RENAMEAT:
+ return io_renameat_prep(req, sqe);
+ case IORING_OP_UNLINKAT:
+ return io_unlinkat_prep(req, sqe);
}
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
@@ -5787,11 +6051,10 @@ static u32 io_get_sequence(struct io_kiocb *req)
{
struct io_kiocb *pos;
struct io_ring_ctx *ctx = req->ctx;
- u32 total_submitted, nr_reqs = 1;
+ u32 total_submitted, nr_reqs = 0;
- if (req->flags & REQ_F_LINK_HEAD)
- list_for_each_entry(pos, &req->link_list, link_list)
- nr_reqs++;
+ io_for_each_link(pos, req)
+ nr_reqs++;
total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
return total_submitted - nr_reqs;
@@ -5843,12 +6106,13 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
static void io_req_drop_files(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ struct io_uring_task *tctx = req->task->io_uring;
unsigned long flags;
spin_lock_irqsave(&ctx->inflight_lock, flags);
list_del(&req->inflight_entry);
- if (waitqueue_active(&ctx->inflight_wait))
- wake_up(&ctx->inflight_wait);
+ if (atomic_read(&tctx->in_idle))
+ wake_up(&tctx->wait);
spin_unlock_irqrestore(&ctx->inflight_lock, flags);
req->flags &= ~REQ_F_INFLIGHT;
put_files_struct(req->work.identity->files);
@@ -5903,6 +6167,13 @@ static void __io_clean_op(struct io_kiocb *req)
if (req->open.filename)
putname(req->open.filename);
break;
+ case IORING_OP_RENAMEAT:
+ putname(req->rename.oldpath);
+ putname(req->rename.newpath);
+ break;
+ case IORING_OP_UNLINKAT:
+ putname(req->unlink.filename);
+ break;
}
req->flags &= ~REQ_F_NEED_CLEANUP;
}
@@ -6009,6 +6280,15 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
case IORING_OP_TEE:
ret = io_tee(req, force_nonblock);
break;
+ case IORING_OP_SHUTDOWN:
+ ret = io_shutdown(req, force_nonblock);
+ break;
+ case IORING_OP_RENAMEAT:
+ ret = io_renameat(req, force_nonblock);
+ break;
+ case IORING_OP_UNLINKAT:
+ ret = io_unlinkat(req, force_nonblock);
+ break;
default:
ret = -EINVAL;
break;
@@ -6025,7 +6305,7 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
if (in_async)
mutex_lock(&ctx->uring_lock);
- io_iopoll_req_issued(req);
+ io_iopoll_req_issued(req, in_async);
if (in_async)
mutex_unlock(&ctx->uring_lock);
@@ -6065,8 +6345,19 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
}
if (ret) {
- req_set_fail_links(req);
- io_req_complete(req, ret);
+ /*
+ * io_iopoll_complete() does not hold completion_lock to complete
+ * polled io, so here for polled io, just mark it done and still let
+ * io_iopoll_complete() complete it.
+ */
+ if (req->ctx->flags & IORING_SETUP_IOPOLL) {
+ struct kiocb *kiocb = &req->rw.kiocb;
+
+ kiocb_done(kiocb, ret, NULL);
+ } else {
+ req_set_fail_links(req);
+ io_req_complete(req, ret);
+ }
}
return io_steal_work(req);
@@ -6092,10 +6383,7 @@ static struct file *io_file_get(struct io_submit_state *state,
return NULL;
fd = array_index_nospec(fd, ctx->nr_user_files);
file = io_file_from_index(ctx, fd);
- if (file) {
- req->fixed_file_refs = &ctx->file_data->node->refs;
- percpu_ref_get(req->fixed_file_refs);
- }
+ io_set_resource_node(req);
} else {
trace_io_uring_file_get(ctx, fd);
file = __io_file_get(state, fd);
@@ -6104,45 +6392,26 @@ static struct file *io_file_get(struct io_submit_state *state,
return file;
}
-static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
- int fd)
-{
- bool fixed;
-
- fixed = (req->flags & REQ_F_FIXED_FILE) != 0;
- if (unlikely(!fixed && io_async_submit(req->ctx)))
- return -EBADF;
-
- req->file = io_file_get(state, req, fd, fixed);
- if (req->file || io_op_defs[req->opcode].needs_file_no_error)
- return 0;
- return -EBADF;
-}
-
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
{
struct io_timeout_data *data = container_of(timer,
struct io_timeout_data, timer);
- struct io_kiocb *req = data->req;
+ struct io_kiocb *prev, *req = data->req;
struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *prev = NULL;
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
+ prev = req->timeout.head;
+ req->timeout.head = NULL;
/*
* We don't expect the list to be empty, that will only happen if we
* race with the completion of the linked work.
*/
- if (!list_empty(&req->link_list)) {
- prev = list_entry(req->link_list.prev, struct io_kiocb,
- link_list);
- if (refcount_inc_not_zero(&prev->refs))
- list_del_init(&req->link_list);
- else
- prev = NULL;
- }
-
+ if (prev && refcount_inc_not_zero(&prev->refs))
+ io_remove_next_linked(prev);
+ else
+ prev = NULL;
spin_unlock_irqrestore(&ctx->completion_lock, flags);
if (prev) {
@@ -6158,10 +6427,10 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
static void __io_queue_linked_timeout(struct io_kiocb *req)
{
/*
- * If the list is now empty, then our linked request finished before
- * we got a chance to setup the timer
+ * If the back reference is NULL, then our linked request finished
+ * before we got a chance to setup the timer
*/
- if (!list_empty(&req->link_list)) {
+ if (req->timeout.head) {
struct io_timeout_data *data = req->async_data;
data->timer.function = io_link_timeout_fn;
@@ -6184,18 +6453,13 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
- struct io_kiocb *nxt;
+ struct io_kiocb *nxt = req->link;
- if (!(req->flags & REQ_F_LINK_HEAD))
- return NULL;
- if (req->flags & REQ_F_LINK_TIMEOUT)
- return NULL;
-
- nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
- link_list);
- if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
+ if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) ||
+ nxt->opcode != IORING_OP_LINK_TIMEOUT)
return NULL;
+ nxt->timeout.head = req;
nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
req->flags |= REQ_F_LINK_TIMEOUT;
return nxt;
@@ -6301,8 +6565,13 @@ static inline void io_queue_link_head(struct io_kiocb *req,
io_queue_sqe(req, NULL, cs);
}
+struct io_submit_link {
+ struct io_kiocb *head;
+ struct io_kiocb *last;
+};
+
static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_kiocb **link, struct io_comp_state *cs)
+ struct io_submit_link *link, struct io_comp_state *cs)
{
struct io_ring_ctx *ctx = req->ctx;
int ret;
@@ -6314,8 +6583,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
* submitted sync once the chain is complete. If none of those
* conditions are true (normal request), then just queue it.
*/
- if (*link) {
- struct io_kiocb *head = *link;
+ if (link->head) {
+ struct io_kiocb *head = link->head;
/*
* Taking sequential execution of a link, draining both sides
@@ -6335,12 +6604,13 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return ret;
}
trace_io_uring_link(ctx, req, head);
- list_add_tail(&req->link_list, &head->link_list);
+ link->last->link = req;
+ link->last = req;
/* last request of a link, enqueue the link */
if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
io_queue_link_head(head, cs);
- *link = NULL;
+ link->head = NULL;
}
} else {
if (unlikely(ctx->drain_next)) {
@@ -6348,13 +6618,11 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
ctx->drain_next = 0;
}
if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
- req->flags |= REQ_F_LINK_HEAD;
- INIT_LIST_HEAD(&req->link_list);
-
ret = io_req_defer_prep(req, sqe);
if (unlikely(ret))
req->flags |= REQ_F_FAIL_LINK;
- *link = req;
+ link->head = req;
+ link->last = req;
} else {
io_queue_sqe(req, sqe, cs);
}
@@ -6370,7 +6638,8 @@ static void io_submit_state_end(struct io_submit_state *state)
{
if (!list_empty(&state->comp.list))
io_submit_flush_completions(&state->comp);
- blk_finish_plug(&state->plug);
+ if (state->plug_started)
+ blk_finish_plug(&state->plug);
io_state_file_put(state);
if (state->free_reqs)
kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
@@ -6382,12 +6651,12 @@ static void io_submit_state_end(struct io_submit_state *state)
static void io_submit_state_start(struct io_submit_state *state,
struct io_ring_ctx *ctx, unsigned int max_ios)
{
- blk_start_plug(&state->plug);
+ state->plug_started = false;
state->comp.nr = 0;
INIT_LIST_HEAD(&state->comp.list);
state->comp.ctx = ctx;
state->free_reqs = 0;
- state->file = NULL;
+ state->file_refs = 0;
state->ios_left = max_ios;
}
@@ -6482,6 +6751,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->file = NULL;
req->ctx = ctx;
req->flags = 0;
+ req->link = NULL;
+ req->fixed_file_refs = NULL;
/* one is dropped after submission, the other at completion */
refcount_set(&req->refs, 2);
req->task = current;
@@ -6490,7 +6761,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
if (unlikely(req->opcode >= IORING_OP_LAST))
return -EINVAL;
- if (unlikely(io_sq_thread_acquire_mm(ctx, req)))
+ if (unlikely(io_sq_thread_acquire_mm_files(ctx, req)))
return -EFAULT;
sqe_flags = READ_ONCE(sqe->flags);
@@ -6523,10 +6794,26 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
/* same numerical values with corresponding REQ_F_*, safe to copy */
req->flags |= sqe_flags;
- if (!io_op_defs[req->opcode].needs_file)
- return 0;
+ /*
+ * Plug now if we have more than 1 IO left after this, and the target
+ * is potentially a read/write to block based storage.
+ */
+ if (!state->plug_started && state->ios_left > 1 &&
+ io_op_defs[req->opcode].plug) {
+ blk_start_plug(&state->plug);
+ state->plug_started = true;
+ }
+
+ ret = 0;
+ if (io_op_defs[req->opcode].needs_file) {
+ bool fixed = req->flags & REQ_F_FIXED_FILE;
+
+ req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
+ if (unlikely(!req->file &&
+ !io_op_defs[req->opcode].needs_file_no_error))
+ ret = -EBADF;
+ }
- ret = io_req_set_file(state, req, READ_ONCE(sqe->fd));
state->ios_left--;
return ret;
}
@@ -6534,7 +6821,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
{
struct io_submit_state state;
- struct io_kiocb *link = NULL;
+ struct io_submit_link link;
int i, submitted = 0;
/* if we have a backlog and couldn't flush it all, return BUSY */
@@ -6554,6 +6841,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
refcount_add(nr, &current->usage);
io_submit_state_start(&state, ctx, nr);
+ link.head = NULL;
for (i = 0; i < nr; i++) {
const struct io_uring_sqe *sqe;
@@ -6599,8 +6887,8 @@ fail_req:
percpu_counter_sub(&tctx->inflight, unused);
put_task_struct_many(current, unused);
}
- if (link)
- io_queue_link_head(link, &state.comp);
+ if (link.head)
+ io_queue_link_head(link.head, &state.comp);
io_submit_state_end(&state);
/* Commit SQ ring head once we've consumed and submitted all SQEs */
@@ -6624,111 +6912,45 @@ static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
spin_unlock_irq(&ctx->completion_lock);
}
-static int io_sq_wake_function(struct wait_queue_entry *wqe, unsigned mode,
- int sync, void *key)
+static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
{
- struct io_ring_ctx *ctx = container_of(wqe, struct io_ring_ctx, sqo_wait_entry);
- int ret;
-
- ret = autoremove_wake_function(wqe, mode, sync, key);
- if (ret) {
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->completion_lock, flags);
- ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
- }
- return ret;
-}
-
-enum sq_ret {
- SQT_IDLE = 1,
- SQT_SPIN = 2,
- SQT_DID_WORK = 4,
-};
-
-static enum sq_ret __io_sq_thread(struct io_ring_ctx *ctx,
- unsigned long start_jiffies, bool cap_entries)
-{
- unsigned long timeout = start_jiffies + ctx->sq_thread_idle;
- struct io_sq_data *sqd = ctx->sq_data;
unsigned int to_submit;
int ret = 0;
-again:
- if (!list_empty(&ctx->iopoll_list)) {
+ to_submit = io_sqring_entries(ctx);
+ /* if we're handling multiple rings, cap submit size for fairness */
+ if (cap_entries && to_submit > 8)
+ to_submit = 8;
+
+ if (!list_empty(&ctx->iopoll_list) || to_submit) {
unsigned nr_events = 0;
mutex_lock(&ctx->uring_lock);
- if (!list_empty(&ctx->iopoll_list) && !need_resched())
+ if (!list_empty(&ctx->iopoll_list))
io_do_iopoll(ctx, &nr_events, 0);
+
+ if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)))
+ ret = io_submit_sqes(ctx, to_submit);
mutex_unlock(&ctx->uring_lock);
}
- to_submit = io_sqring_entries(ctx);
-
- /*
- * If submit got -EBUSY, flag us as needing the application
- * to enter the kernel to reap and flush events.
- */
- if (!to_submit || ret == -EBUSY || need_resched()) {
- /*
- * Drop cur_mm before scheduling, we can't hold it for
- * long periods (or over schedule()). Do this before
- * adding ourselves to the waitqueue, as the unuse/drop
- * may sleep.
- */
- io_sq_thread_drop_mm();
-
- /*
- * We're polling. If we're within the defined idle
- * period, then let us spin without work before going
- * to sleep. The exception is if we got EBUSY doing
- * more IO, we should wait for the application to
- * reap events and wake us up.
- */
- if (!list_empty(&ctx->iopoll_list) || need_resched() ||
- (!time_after(jiffies, timeout) && ret != -EBUSY &&
- !percpu_ref_is_dying(&ctx->refs)))
- return SQT_SPIN;
+ if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
+ wake_up(&ctx->sqo_sq_wait);
- prepare_to_wait(&sqd->wait, &ctx->sqo_wait_entry,
- TASK_INTERRUPTIBLE);
+ return ret;
+}
- /*
- * While doing polled IO, before going to sleep, we need
- * to check if there are new reqs added to iopoll_list,
- * it is because reqs may have been punted to io worker
- * and will be added to iopoll_list later, hence check
- * the iopoll_list again.
- */
- if ((ctx->flags & IORING_SETUP_IOPOLL) &&
- !list_empty_careful(&ctx->iopoll_list)) {
- finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
- goto again;
- }
+static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
+{
+ struct io_ring_ctx *ctx;
+ unsigned sq_thread_idle = 0;
- to_submit = io_sqring_entries(ctx);
- if (!to_submit || ret == -EBUSY)
- return SQT_IDLE;
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+ if (sq_thread_idle < ctx->sq_thread_idle)
+ sq_thread_idle = ctx->sq_thread_idle;
}
- finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
- io_ring_clear_wakeup_flag(ctx);
-
- /* if we're handling multiple rings, cap submit size for fairness */
- if (cap_entries && to_submit > 8)
- to_submit = 8;
-
- mutex_lock(&ctx->uring_lock);
- if (likely(!percpu_ref_is_dying(&ctx->refs)))
- ret = io_submit_sqes(ctx, to_submit);
- mutex_unlock(&ctx->uring_lock);
-
- if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
- wake_up(&ctx->sqo_sq_wait);
-
- return SQT_DID_WORK;
+ sqd->sq_thread_idle = sq_thread_idle;
}
static void io_sqd_init_new(struct io_sq_data *sqd)
@@ -6737,39 +6959,56 @@ static void io_sqd_init_new(struct io_sq_data *sqd)
while (!list_empty(&sqd->ctx_new_list)) {
ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list);
- init_wait(&ctx->sqo_wait_entry);
- ctx->sqo_wait_entry.func = io_sq_wake_function;
list_move_tail(&ctx->sqd_list, &sqd->ctx_list);
complete(&ctx->sq_thread_comp);
}
+
+ io_sqd_update_thread_idle(sqd);
}
static int io_sq_thread(void *data)
{
struct cgroup_subsys_state *cur_css = NULL;
+ struct files_struct *old_files = current->files;
+ struct nsproxy *old_nsproxy = current->nsproxy;
const struct cred *old_cred = NULL;
struct io_sq_data *sqd = data;
struct io_ring_ctx *ctx;
- unsigned long start_jiffies;
+ unsigned long timeout = 0;
+ DEFINE_WAIT(wait);
+
+ task_lock(current);
+ current->files = NULL;
+ current->nsproxy = NULL;
+ task_unlock(current);
- start_jiffies = jiffies;
while (!kthread_should_stop()) {
- enum sq_ret ret = 0;
- bool cap_entries;
+ int ret;
+ bool cap_entries, sqt_spin, needs_sched;
/*
* Any changes to the sqd lists are synchronized through the
* kthread parking. This synchronizes the thread vs users,
* the users are synchronized on the sqd->ctx_lock.
*/
- if (kthread_should_park())
+ if (kthread_should_park()) {
kthread_parkme();
+ /*
+ * When sq thread is unparked, in case the previous park operation
+ * comes from io_put_sq_data(), which means that sq thread is going
+ * to be stopped, so here needs to have a check.
+ */
+ if (kthread_should_stop())
+ break;
+ }
- if (unlikely(!list_empty(&sqd->ctx_new_list)))
+ if (unlikely(!list_empty(&sqd->ctx_new_list))) {
io_sqd_init_new(sqd);
+ timeout = jiffies + sqd->sq_thread_idle;
+ }
+ sqt_spin = false;
cap_entries = !list_is_singular(&sqd->ctx_list);
-
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
if (current->cred != ctx->creds) {
if (old_cred)
@@ -6782,24 +7021,49 @@ static int io_sq_thread(void *data)
current->sessionid = ctx->sessionid;
#endif
- ret |= __io_sq_thread(ctx, start_jiffies, cap_entries);
+ ret = __io_sq_thread(ctx, cap_entries);
+ if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
+ sqt_spin = true;
- io_sq_thread_drop_mm();
+ io_sq_thread_drop_mm_files();
}
- if (ret & SQT_SPIN) {
+ if (sqt_spin || !time_after(jiffies, timeout)) {
io_run_task_work();
cond_resched();
- } else if (ret == SQT_IDLE) {
- if (kthread_should_park())
- continue;
+ if (sqt_spin)
+ timeout = jiffies + sqd->sq_thread_idle;
+ continue;
+ }
+
+ if (kthread_should_park())
+ continue;
+
+ needs_sched = true;
+ prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+ if ((ctx->flags & IORING_SETUP_IOPOLL) &&
+ !list_empty_careful(&ctx->iopoll_list)) {
+ needs_sched = false;
+ break;
+ }
+ if (io_sqring_entries(ctx)) {
+ needs_sched = false;
+ break;
+ }
+ }
+
+ if (needs_sched) {
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_set_wakeup_flag(ctx);
+
schedule();
- start_jiffies = jiffies;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_clear_wakeup_flag(ctx);
}
+
+ finish_wait(&sqd->wait, &wait);
+ timeout = jiffies + sqd->sq_thread_idle;
}
io_run_task_work();
@@ -6809,6 +7073,11 @@ static int io_sq_thread(void *data)
if (old_cred)
revert_creds(old_cred);
+ task_lock(current);
+ current->files = old_files;
+ current->nsproxy = old_nsproxy;
+ task_unlock(current);
+
kthread_parkme();
return 0;
@@ -6863,7 +7132,8 @@ static int io_run_task_work_sig(void)
* application must reap them itself, as they reside on the shared cq ring.
*/
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
- const sigset_t __user *sig, size_t sigsz)
+ const sigset_t __user *sig, size_t sigsz,
+ struct __kernel_timespec __user *uts)
{
struct io_wait_queue iowq = {
.wq = {
@@ -6875,6 +7145,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
.to_wait = min_events,
};
struct io_rings *rings = ctx->rings;
+ struct timespec64 ts;
+ signed long timeout = 0;
int ret = 0;
do {
@@ -6897,6 +7169,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return ret;
}
+ if (uts) {
+ if (get_timespec64(&ts, uts))
+ return -EFAULT;
+ timeout = timespec64_to_jiffies(&ts);
+ }
+
iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
trace_io_uring_cqring_wait(ctx, min_events);
do {
@@ -6910,7 +7188,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
break;
if (io_should_wake(&iowq, false))
break;
- schedule();
+ if (uts) {
+ timeout = schedule_timeout(timeout);
+ if (timeout == 0) {
+ ret = -ETIME;
+ break;
+ }
+ } else {
+ schedule();
+ }
} while (1);
finish_wait(&ctx->wait, &iowq.wq);
@@ -6959,9 +7245,9 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
if (!data)
return -ENXIO;
- spin_lock(&data->lock);
+ spin_lock_bh(&data->lock);
ref_node = data->node;
- spin_unlock(&data->lock);
+ spin_unlock_bh(&data->lock);
if (ref_node)
percpu_ref_kill(&ref_node->refs);
@@ -7084,12 +7370,11 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx)
mutex_lock(&sqd->ctx_lock);
list_del(&ctx->sqd_list);
+ io_sqd_update_thread_idle(sqd);
mutex_unlock(&sqd->ctx_lock);
- if (sqd->thread) {
- finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
+ if (sqd->thread)
io_sq_thread_unpark(sqd);
- }
io_put_sq_data(sqd);
ctx->sq_data = NULL;
@@ -7344,7 +7629,7 @@ static void io_file_data_ref_zero(struct percpu_ref *ref)
data = ref_node->file_data;
ctx = data->ctx;
- spin_lock(&data->lock);
+ spin_lock_bh(&data->lock);
ref_node->done = true;
while (!list_empty(&data->ref_list)) {
@@ -7356,7 +7641,7 @@ static void io_file_data_ref_zero(struct percpu_ref *ref)
list_del(&ref_node->node);
first_add |= llist_add(&ref_node->llist, &ctx->file_put_llist);
}
- spin_unlock(&data->lock);
+ spin_unlock_bh(&data->lock);
if (percpu_ref_is_dying(&data->refs))
delay = 0;
@@ -7479,9 +7764,9 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
}
file_data->node = ref_node;
- spin_lock(&file_data->lock);
+ spin_lock_bh(&file_data->lock);
list_add_tail(&ref_node->node, &file_data->ref_list);
- spin_unlock(&file_data->lock);
+ spin_unlock_bh(&file_data->lock);
percpu_ref_get(&file_data->refs);
return ret;
out_fput:
@@ -7638,10 +7923,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
if (needs_switch) {
percpu_ref_kill(&data->node->refs);
- spin_lock(&data->lock);
+ spin_lock_bh(&data->lock);
list_add_tail(&ref_node->node, &data->ref_list);
data->node = ref_node;
- spin_unlock(&data->lock);
+ spin_unlock_bh(&data->lock);
percpu_ref_get(&ctx->file_data->refs);
} else
destroy_fixed_file_ref_node(ref_node);
@@ -7769,7 +8054,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
struct io_sq_data *sqd;
ret = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
goto err;
sqd = io_get_sq_data(p);
@@ -8355,8 +8640,6 @@ static void io_ring_exit_work(struct work_struct *work)
* as nobody else will be looking for them.
*/
do {
- if (ctx->rings)
- io_cqring_overflow_flush(ctx, true, NULL, NULL);
io_iopoll_try_reap_events(ctx);
} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
io_ring_ctx_free(ctx);
@@ -8366,17 +8649,17 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
mutex_lock(&ctx->uring_lock);
percpu_ref_kill(&ctx->refs);
+ if (ctx->rings)
+ io_cqring_overflow_flush(ctx, true, NULL, NULL);
mutex_unlock(&ctx->uring_lock);
- io_kill_timeouts(ctx, NULL);
- io_poll_remove_all(ctx, NULL);
+ io_kill_timeouts(ctx, NULL, NULL);
+ io_poll_remove_all(ctx, NULL, NULL);
if (ctx->io_wq)
io_wq_cancel_all(ctx->io_wq);
/* if we failed setting up the ctx, we might not have any rings */
- if (ctx->rings)
- io_cqring_overflow_flush(ctx, true, NULL, NULL);
io_iopoll_try_reap_events(ctx);
idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
@@ -8407,120 +8690,31 @@ static int io_uring_release(struct inode *inode, struct file *file)
return 0;
}
-static bool io_wq_files_match(struct io_wq_work *work, void *data)
-{
- struct files_struct *files = data;
-
- return !files || ((work->flags & IO_WQ_WORK_FILES) &&
- work->identity->files == files);
-}
-
-/*
- * Returns true if 'preq' is the link parent of 'req'
- */
-static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req)
-{
- struct io_kiocb *link;
-
- if (!(preq->flags & REQ_F_LINK_HEAD))
- return false;
-
- list_for_each_entry(link, &preq->link_list, link_list) {
- if (link == req)
- return true;
- }
-
- return false;
-}
-
-/*
- * We're looking to cancel 'req' because it's holding on to our files, but
- * 'req' could be a link to another request. See if it is, and cancel that
- * parent request if so.
- */
-static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req)
-{
- struct hlist_node *tmp;
- struct io_kiocb *preq;
- bool found = false;
- int i;
-
- spin_lock_irq(&ctx->completion_lock);
- for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
- struct hlist_head *list;
-
- list = &ctx->cancel_hash[i];
- hlist_for_each_entry_safe(preq, tmp, list, hash_node) {
- found = io_match_link(preq, req);
- if (found) {
- io_poll_remove_one(preq);
- break;
- }
- }
- }
- spin_unlock_irq(&ctx->completion_lock);
- return found;
-}
-
-static bool io_timeout_remove_link(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
-{
- struct io_kiocb *preq;
- bool found = false;
-
- spin_lock_irq(&ctx->completion_lock);
- list_for_each_entry(preq, &ctx->timeout_list, timeout.list) {
- found = io_match_link(preq, req);
- if (found) {
- __io_timeout_cancel(preq);
- break;
- }
- }
- spin_unlock_irq(&ctx->completion_lock);
- return found;
-}
+struct io_task_cancel {
+ struct task_struct *task;
+ struct files_struct *files;
+};
-static bool io_cancel_link_cb(struct io_wq_work *work, void *data)
+static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ struct io_task_cancel *cancel = data;
bool ret;
- if (req->flags & REQ_F_LINK_TIMEOUT) {
+ if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) {
unsigned long flags;
struct io_ring_ctx *ctx = req->ctx;
/* protect against races with linked timeouts */
spin_lock_irqsave(&ctx->completion_lock, flags);
- ret = io_match_link(req, data);
+ ret = io_match_task(req, cancel->task, cancel->files);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
} else {
- ret = io_match_link(req, data);
+ ret = io_match_task(req, cancel->task, cancel->files);
}
return ret;
}
-static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
-{
- enum io_wq_cancel cret;
-
- /* cancel this particular work, if it's running */
- cret = io_wq_cancel_work(ctx->io_wq, &req->work);
- if (cret != IO_WQ_CANCEL_NOTFOUND)
- return;
-
- /* find links that hold this pending, cancel those */
- cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true);
- if (cret != IO_WQ_CANCEL_NOTFOUND)
- return;
-
- /* if we have a poll link holding this pending, cancel that */
- if (io_poll_remove_link(ctx, req))
- return;
-
- /* final option, timeout link is holding this req pending */
- io_timeout_remove_link(ctx, req);
-}
-
static void io_cancel_defer_files(struct io_ring_ctx *ctx,
struct task_struct *task,
struct files_struct *files)
@@ -8530,8 +8724,7 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx,
spin_lock_irq(&ctx->completion_lock);
list_for_each_entry_reverse(de, &ctx->defer_list, list) {
- if (io_task_match(de->req, task) &&
- io_match_files(de->req, files)) {
+ if (io_match_task(de->req, task, files)) {
list_cut_position(&list, &ctx->defer_list, &de->list);
break;
}
@@ -8548,72 +8741,52 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx,
}
}
-/*
- * Returns true if we found and killed one or more files pinning requests
- */
-static bool io_uring_cancel_files(struct io_ring_ctx *ctx,
+static void io_uring_cancel_files(struct io_ring_ctx *ctx,
+ struct task_struct *task,
struct files_struct *files)
{
- if (list_empty_careful(&ctx->inflight_list))
- return false;
-
- /* cancel all at once, should be faster than doing it one by one*/
- io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true);
-
while (!list_empty_careful(&ctx->inflight_list)) {
- struct io_kiocb *cancel_req = NULL, *req;
+ struct io_task_cancel cancel = { .task = task, .files = files };
+ struct io_kiocb *req;
DEFINE_WAIT(wait);
+ bool found = false;
spin_lock_irq(&ctx->inflight_lock);
list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
- if (files && (req->work.flags & IO_WQ_WORK_FILES) &&
+ if (req->task != task ||
req->work.identity->files != files)
continue;
- /* req is being completed, ignore */
- if (!refcount_inc_not_zero(&req->refs))
- continue;
- cancel_req = req;
+ found = true;
break;
}
- if (cancel_req)
- prepare_to_wait(&ctx->inflight_wait, &wait,
- TASK_UNINTERRUPTIBLE);
+ if (found)
+ prepare_to_wait(&task->io_uring->wait, &wait,
+ TASK_UNINTERRUPTIBLE);
spin_unlock_irq(&ctx->inflight_lock);
/* We need to keep going until we don't find a matching req */
- if (!cancel_req)
+ if (!found)
break;
- /* cancel this request, or head link requests */
- io_attempt_cancel(ctx, cancel_req);
- io_put_req(cancel_req);
+
+ io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true);
+ io_poll_remove_all(ctx, task, files);
+ io_kill_timeouts(ctx, task, files);
/* cancellations _may_ trigger task work */
io_run_task_work();
schedule();
- finish_wait(&ctx->inflight_wait, &wait);
+ finish_wait(&task->io_uring->wait, &wait);
}
-
- return true;
}
-static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
+static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
+ struct task_struct *task)
{
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
- struct task_struct *task = data;
-
- return io_task_match(req, task);
-}
-
-static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
- struct task_struct *task,
- struct files_struct *files)
-{
- bool ret;
-
- ret = io_uring_cancel_files(ctx, files);
- if (!files) {
+ while (1) {
+ struct io_task_cancel cancel = { .task = task, .files = NULL, };
enum io_wq_cancel cret;
+ bool ret = false;
- cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true);
+ cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true);
if (cret != IO_WQ_CANCEL_NOTFOUND)
ret = true;
@@ -8625,11 +8798,13 @@ static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
}
}
- ret |= io_poll_remove_all(ctx, task);
- ret |= io_kill_timeouts(ctx, task);
+ ret |= io_poll_remove_all(ctx, task, NULL);
+ ret |= io_kill_timeouts(ctx, task, NULL);
+ if (!ret)
+ break;
+ io_run_task_work();
+ cond_resched();
}
-
- return ret;
}
/*
@@ -8648,17 +8823,15 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
io_sq_thread_park(ctx->sq_data);
}
- if (files)
- io_cancel_defer_files(ctx, NULL, files);
- else
- io_cancel_defer_files(ctx, task, NULL);
-
+ io_cancel_defer_files(ctx, task, files);
+ io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
io_cqring_overflow_flush(ctx, true, task, files);
+ io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
- while (__io_uring_cancel_task_requests(ctx, task, files)) {
- io_run_task_work();
- cond_resched();
- }
+ if (!files)
+ __io_uring_cancel_task_requests(ctx, task);
+ else
+ io_uring_cancel_files(ctx, task, files);
if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
atomic_dec(&task->io_uring->in_idle);
@@ -8916,9 +9089,39 @@ static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
finish_wait(&ctx->sqo_sq_wait, &wait);
}
+static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
+ struct __kernel_timespec __user **ts,
+ const sigset_t __user **sig)
+{
+ struct io_uring_getevents_arg arg;
+
+ /*
+ * If EXT_ARG isn't set, then we have no timespec and the argp pointer
+ * is just a pointer to the sigset_t.
+ */
+ if (!(flags & IORING_ENTER_EXT_ARG)) {
+ *sig = (const sigset_t __user *) argp;
+ *ts = NULL;
+ return 0;
+ }
+
+ /*
+ * EXT_ARG is set - ensure we agree on the size of it and copy in our
+ * timespec and sigset_t pointers if good.
+ */
+ if (*argsz != sizeof(arg))
+ return -EINVAL;
+ if (copy_from_user(&arg, argp, sizeof(arg)))
+ return -EFAULT;
+ *sig = u64_to_user_ptr(arg.sigmask);
+ *argsz = arg.sigmask_sz;
+ *ts = u64_to_user_ptr(arg.ts);
+ return 0;
+}
+
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
- u32, min_complete, u32, flags, const sigset_t __user *, sig,
- size_t, sigsz)
+ u32, min_complete, u32, flags, const void __user *, argp,
+ size_t, argsz)
{
struct io_ring_ctx *ctx;
long ret = -EBADF;
@@ -8928,7 +9131,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
io_run_task_work();
if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
- IORING_ENTER_SQ_WAIT))
+ IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))
return -EINVAL;
f = fdget(fd);
@@ -8955,8 +9158,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
*/
ret = 0;
if (ctx->flags & IORING_SETUP_SQPOLL) {
+ io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
if (!list_empty_careful(&ctx->cq_overflow_list))
io_cqring_overflow_flush(ctx, false, NULL, NULL);
+ io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
if (flags & IORING_ENTER_SQ_WAKEUP)
wake_up(&ctx->sq_data->wait);
if (flags & IORING_ENTER_SQ_WAIT)
@@ -8974,6 +9179,13 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
goto out;
}
if (flags & IORING_ENTER_GETEVENTS) {
+ const sigset_t __user *sig;
+ struct __kernel_timespec __user *ts;
+
+ ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
+ if (unlikely(ret))
+ goto out;
+
min_complete = min(min_complete, ctx->cq_entries);
/*
@@ -8986,7 +9198,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
!(ctx->flags & IORING_SETUP_SQPOLL)) {
ret = io_iopoll_check(ctx, min_complete);
} else {
- ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
+ ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
}
}
@@ -9354,7 +9566,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
- IORING_FEAT_POLL_32BITS;
+ IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
+ IORING_FEAT_EXT_ARG;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;