diff options
Diffstat (limited to 'io_uring/rsrc.c')
-rw-r--r-- | io_uring/rsrc.c | 350 |
1 files changed, 116 insertions, 234 deletions
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 7a43aed8e395..ddee7adb4006 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -23,25 +23,16 @@ struct io_rsrc_update { u32 offset; }; +static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); +static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct io_mapped_ubuf **pimu, struct page **last_hpage); -#define IO_RSRC_REF_BATCH 100 - /* only define max */ #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) -void io_rsrc_refs_drop(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - if (ctx->rsrc_cached_refs) { - io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs); - ctx->rsrc_cached_refs = 0; - } -} - int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; @@ -151,216 +142,129 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo *slot = NULL; } -void io_rsrc_refs_refill(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH; - percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH); -} - -static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) -{ - struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; - struct io_ring_ctx *ctx = rsrc_data->ctx; - struct io_rsrc_put *prsrc, *tmp; - - list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) { - list_del(&prsrc->list); - - if (prsrc->tag) { - if (ctx->flags & IORING_SETUP_IOPOLL) { - mutex_lock(&ctx->uring_lock); - io_post_aux_cqe(ctx, prsrc->tag, 0, 0); - mutex_unlock(&ctx->uring_lock); - } else { - io_post_aux_cqe(ctx, prsrc->tag, 0, 0); - } - } - - rsrc_data->do_put(ctx, prsrc); - kfree(prsrc); - } - - io_rsrc_node_destroy(ref_node); - if (atomic_dec_and_test(&rsrc_data->refs)) - complete(&rsrc_data->done); -} - -void io_rsrc_put_work(struct work_struct *work) +static void io_rsrc_put_work(struct io_rsrc_node *node) { - struct io_ring_ctx *ctx; - struct llist_node *node; - - ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work); - node = llist_del_all(&ctx->rsrc_put_llist); + struct io_rsrc_put *prsrc = &node->item; - while (node) { - struct io_rsrc_node *ref_node; - struct llist_node *next = node->next; + if (prsrc->tag) + io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0); - ref_node = llist_entry(node, struct io_rsrc_node, llist); - __io_rsrc_put_work(ref_node); - node = next; + switch (node->type) { + case IORING_RSRC_FILE: + io_rsrc_file_put(node->ctx, prsrc); + break; + case IORING_RSRC_BUFFER: + io_rsrc_buf_put(node->ctx, prsrc); + break; + default: + WARN_ON_ONCE(1); + break; } } -void io_rsrc_put_tw(struct callback_head *cb) +void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { - struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx, - rsrc_put_tw); - - io_rsrc_put_work(&ctx->rsrc_put_work.work); + if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache)) + kfree(node); } -void io_wait_rsrc_data(struct io_rsrc_data *data) +void io_rsrc_node_ref_zero(struct io_rsrc_node *node) + __must_hold(&node->ctx->uring_lock) { - if (data && !atomic_dec_and_test(&data->refs)) - wait_for_completion(&data->done); -} - -void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) -{ - percpu_ref_exit(&ref_node->refs); - kfree(ref_node); -} - -static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref) -{ - struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); - struct io_ring_ctx *ctx = node->rsrc_data->ctx; - unsigned long flags; - bool first_add = false; - unsigned long delay = HZ; - - spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); - node->done = true; - - /* if we are mid-quiesce then do not delay */ - if (node->rsrc_data->quiesce) - delay = 0; + struct io_ring_ctx *ctx = node->ctx; while (!list_empty(&ctx->rsrc_ref_list)) { node = list_first_entry(&ctx->rsrc_ref_list, struct io_rsrc_node, node); /* recycle ref nodes in order */ - if (!node->done) + if (node->refs) break; list_del(&node->node); - first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); - } - spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); - if (!first_add) - return; - - if (ctx->submitter_task) { - if (!task_work_add(ctx->submitter_task, &ctx->rsrc_put_tw, - ctx->notify_method)) - return; + if (likely(!node->empty)) + io_rsrc_put_work(node); + io_rsrc_node_destroy(ctx, node); } - mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay); + if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce)) + wake_up_all(&ctx->rsrc_quiesce_wq); } -static struct io_rsrc_node *io_rsrc_node_alloc(void) +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) { struct io_rsrc_node *ref_node; + struct io_cache_entry *entry; - ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); - if (!ref_node) - return NULL; - - if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, - 0, GFP_KERNEL)) { - kfree(ref_node); - return NULL; - } - INIT_LIST_HEAD(&ref_node->node); - INIT_LIST_HEAD(&ref_node->rsrc_list); - ref_node->done = false; - return ref_node; -} - -void io_rsrc_node_switch(struct io_ring_ctx *ctx, - struct io_rsrc_data *data_to_kill) - __must_hold(&ctx->uring_lock) -{ - WARN_ON_ONCE(!ctx->rsrc_backup_node); - WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); - - io_rsrc_refs_drop(ctx); - - if (data_to_kill) { - struct io_rsrc_node *rsrc_node = ctx->rsrc_node; - - rsrc_node->rsrc_data = data_to_kill; - spin_lock_irq(&ctx->rsrc_ref_lock); - list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); - spin_unlock_irq(&ctx->rsrc_ref_lock); - - atomic_inc(&data_to_kill->refs); - percpu_ref_kill(&rsrc_node->refs); - ctx->rsrc_node = NULL; - } - - if (!ctx->rsrc_node) { - ctx->rsrc_node = ctx->rsrc_backup_node; - ctx->rsrc_backup_node = NULL; + entry = io_alloc_cache_get(&ctx->rsrc_node_cache); + if (entry) { + ref_node = container_of(entry, struct io_rsrc_node, cache); + } else { + ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); + if (!ref_node) + return NULL; } -} -int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) -{ - if (ctx->rsrc_backup_node) - return 0; - ctx->rsrc_backup_node = io_rsrc_node_alloc(); - return ctx->rsrc_backup_node ? 0 : -ENOMEM; + ref_node->ctx = ctx; + ref_node->empty = 0; + ref_node->refs = 1; + return ref_node; } __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx) { + struct io_rsrc_node *backup; + DEFINE_WAIT(we); int ret; - /* As we may drop ->uring_lock, other task may have started quiesce */ + /* As We may drop ->uring_lock, other task may have started quiesce */ if (data->quiesce) return -ENXIO; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; - io_rsrc_node_switch(ctx, data); - /* kill initial ref, already quiesced if zero */ - if (atomic_dec_and_test(&data->refs)) + backup = io_rsrc_node_alloc(ctx); + if (!backup) + return -ENOMEM; + ctx->rsrc_node->empty = true; + ctx->rsrc_node->type = -1; + list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list); + io_put_rsrc_node(ctx, ctx->rsrc_node); + ctx->rsrc_node = backup; + + if (list_empty(&ctx->rsrc_ref_list)) return 0; + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, 1); + smp_mb(); + } + + ctx->rsrc_quiesce++; data->quiesce = true; - mutex_unlock(&ctx->uring_lock); do { + prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE); + mutex_unlock(&ctx->uring_lock); + ret = io_run_task_work_sig(ctx); if (ret < 0) { - atomic_inc(&data->refs); - /* wait for all works potentially completing data->done */ - flush_delayed_work(&ctx->rsrc_put_work); - reinit_completion(&data->done); mutex_lock(&ctx->uring_lock); + if (list_empty(&ctx->rsrc_ref_list)) + ret = 0; break; } - flush_delayed_work(&ctx->rsrc_put_work); - ret = wait_for_completion_interruptible(&data->done); - if (!ret) { - mutex_lock(&ctx->uring_lock); - if (atomic_read(&data->refs) <= 0) - break; - /* - * it has been revived by another thread while - * we were unlocked - */ - mutex_unlock(&ctx->uring_lock); - } - } while (1); + schedule(); + __set_current_state(TASK_RUNNING); + mutex_lock(&ctx->uring_lock); + ret = 0; + } while (!list_empty(&ctx->rsrc_ref_list)); + + finish_wait(&ctx->rsrc_quiesce_wq, &we); data->quiesce = false; + ctx->rsrc_quiesce--; + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, 0); + smp_mb(); + } return ret; } @@ -405,8 +309,8 @@ static __cold void **io_alloc_page_table(size_t size) return table; } -__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, - rsrc_put_fn *do_put, u64 __user *utags, +__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type, + u64 __user *utags, unsigned nr, struct io_rsrc_data **pdata) { struct io_rsrc_data *data; @@ -424,7 +328,7 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, data->nr = nr; data->ctx = ctx; - data->do_put = do_put; + data->rsrc_type = type; if (utags) { ret = -EFAULT; for (i = 0; i < nr; i++) { @@ -435,9 +339,6 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, goto fail; } } - - atomic_set(&data->refs, 1); - init_completion(&data->done); *pdata = data; return 0; fail: @@ -456,7 +357,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct file *file; int fd, i, err = 0; unsigned int done; - bool needs_switch = false; if (!ctx->file_data) return -ENXIO; @@ -483,12 +383,11 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (file_slot->file_ptr) { file = (struct file *)(file_slot->file_ptr & FFS_MASK); - err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file); + err = io_queue_rsrc_removal(data, i, file); if (err) break; file_slot->file_ptr = 0; io_file_bitmap_clear(&ctx->file_table, i); - needs_switch = true; } if (fd != -1) { file = fget(fd); @@ -519,9 +418,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, io_file_bitmap_set(&ctx->file_table, i); } } - - if (needs_switch) - io_rsrc_node_switch(ctx, data); return done ? done : err; } @@ -532,7 +428,6 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, u64 __user *tags = u64_to_user_ptr(up->tags); struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); struct page *last_hpage = NULL; - bool needs_switch = false; __u32 done; int i, err; @@ -543,7 +438,6 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, for (done = 0; done < nr_args; done++) { struct io_mapped_ubuf *imu; - int offset = up->offset + done; u64 tag = 0; err = io_copy_iov(ctx, &iov, iovs, done); @@ -564,24 +458,20 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, if (err) break; - i = array_index_nospec(offset, ctx->nr_user_bufs); + i = array_index_nospec(up->offset + done, ctx->nr_user_bufs); if (ctx->user_bufs[i] != ctx->dummy_ubuf) { err = io_queue_rsrc_removal(ctx->buf_data, i, - ctx->rsrc_node, ctx->user_bufs[i]); + ctx->user_bufs[i]); if (unlikely(err)) { io_buffer_unmap(ctx, &imu); break; } ctx->user_bufs[i] = ctx->dummy_ubuf; - needs_switch = true; } ctx->user_bufs[i] = imu; - *io_get_tag_slot(ctx->buf_data, offset) = tag; + *io_get_tag_slot(ctx->buf_data, i) = tag; } - - if (needs_switch) - io_rsrc_node_switch(ctx, ctx->buf_data); return done ? done : err; } @@ -590,13 +480,11 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, unsigned nr_args) { __u32 tmp; - int err; + + lockdep_assert_held(&ctx->uring_lock); if (check_add_overflow(up->offset, nr_args, &tmp)) return -EOVERFLOW; - err = io_rsrc_node_switch_start(ctx); - if (err) - return err; switch (type) { case IORING_RSRC_FILE: @@ -753,20 +641,24 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, - struct io_rsrc_node *node, void *rsrc) +int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc) { + struct io_ring_ctx *ctx = data->ctx; + struct io_rsrc_node *node = ctx->rsrc_node; u64 *tag_slot = io_get_tag_slot(data, idx); - struct io_rsrc_put *prsrc; - prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); - if (!prsrc) + ctx->rsrc_node = io_rsrc_node_alloc(ctx); + if (unlikely(!ctx->rsrc_node)) { + ctx->rsrc_node = node; return -ENOMEM; + } - prsrc->tag = *tag_slot; + node->item.rsrc = rsrc; + node->type = data->rsrc_type; + node->item.tag = *tag_slot; *tag_slot = 0; - prsrc->rsrc = rsrc; - list_add(&prsrc->list, &node->rsrc_list); + list_add_tail(&node->node, &ctx->rsrc_ref_list); + io_put_rsrc_node(ctx, node); return 0; } @@ -882,20 +774,14 @@ int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file) return 0; } -static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) +static __cold void io_rsrc_file_scm_put(struct io_ring_ctx *ctx, struct file *file) { - struct file *file = prsrc->file; #if defined(CONFIG_UNIX) struct sock *sock = ctx->ring_sock->sk; struct sk_buff_head list, *head = &sock->sk_receive_queue; struct sk_buff *skb; int i; - if (!io_file_need_scm(file)) { - fput(file); - return; - } - __skb_queue_head_init(&list); /* @@ -945,11 +831,19 @@ static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) __skb_queue_tail(head, skb); spin_unlock_irq(&head->lock); } -#else - fput(file); #endif } +static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) +{ + struct file *file = prsrc->file; + + if (likely(!io_file_need_scm(file))) + fput(file); + else + io_rsrc_file_scm_put(ctx, file); +} + int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args, u64 __user *tags) { @@ -966,10 +860,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -EMFILE; if (nr_args > rlimit(RLIMIT_NOFILE)) return -EMFILE; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; - ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args, + ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args, &ctx->file_data); if (ret) return ret; @@ -1023,7 +914,6 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, /* default it to the whole table */ io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files); - io_rsrc_node_switch(ctx, NULL); return 0; fail: __io_sqe_files_unregister(ctx); @@ -1163,17 +1053,14 @@ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages, vmas); if (pret == nr_pages) { - struct file *file = vmas[0]->vm_file; - /* don't support file backed memory */ for (i = 0; i < nr_pages; i++) { - if (vmas[i]->vm_file != file) { - ret = -EINVAL; - break; - } - if (!file) + struct vm_area_struct *vma = vmas[i]; + + if (vma_is_shmem(vma)) continue; - if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) { + if (vma->vm_file && + !is_file_hugepages(vma->vm_file)) { ret = -EOPNOTSUPP; break; } @@ -1305,10 +1192,7 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, return -EBUSY; if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) return -EINVAL; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; - ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data); + ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data); if (ret) return ret; ret = io_buffers_map_alloc(ctx, nr_args); @@ -1345,8 +1229,6 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, ctx->buf_data = data; if (ret) __io_sqe_buffers_unregister(ctx); - else - io_rsrc_node_switch(ctx, NULL); return ret; } |