From e297cd54b3f81d652456ae6cb93941fc6b5c6683 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 10 Mar 2023 16:03:30 -0600 Subject: vhost_task: Allow vhost layer to use copy_process Qemu will create vhost devices in the kernel which perform network, SCSI, etc IO and management operations from worker threads created by the kthread API. Because the kthread API does a copy_process on the kthreadd thread, the vhost layer has to use kthread_use_mm to access the Qemu thread's memory and cgroup_attach_task_all to add itself to the Qemu thread's cgroups, and it bypasses the RLIMIT_NPROC limit which can result in VMs creating more threads than the admin expected. This patch adds a new struct vhost_task which can be used instead of kthreads. They allow the vhost layer to use copy_process and inherit the userspace process's mm and cgroups, the task is accounted for under the userspace's nproc count and can be seen in its process tree, and other features like namespaces work and are inherited by default. Signed-off-by: Mike Christie Acked-by: Michael S. Tsirkin Signed-off-by: Christian Brauner (Microsoft) Signed-off-by: Christian Brauner --- drivers/vhost/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/vhost') diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 587fbae06182..b455d9ab6f3d 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -13,9 +13,14 @@ config VHOST_RING This option is selected by any driver which needs to access the host side of a virtio ring. +config VHOST_TASK + bool + default n + config VHOST tristate select VHOST_IOTLB + select VHOST_TASK help This option is selected by any driver which needs to access the core of vhost. -- cgit v1.2.3 From 1a5f8090c6de99306f4212dc7adfc6189a616eb9 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 10 Mar 2023 16:03:31 -0600 Subject: vhost: move worker thread fields to new struct This is just a prep patch. It moves the worker related fields to a new vhost_worker struct and moves the code around to create some helpers that will be used in the next patch. Signed-off-by: Mike Christie Acked-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner (Microsoft) Signed-off-by: Christian Brauner --- drivers/vhost/vhost.c | 98 +++++++++++++++++++++++++++++++++------------------ drivers/vhost/vhost.h | 11 ++++-- 2 files changed, 72 insertions(+), 37 deletions(-) (limited to 'drivers/vhost') diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index f11bdbe4c2c5..dc7f8236ec11 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -255,8 +255,8 @@ void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) * sure it was not in the list. * test_and_set_bit() implies a memory barrier. */ - llist_add(&work->node, &dev->work_list); - wake_up_process(dev->worker); + llist_add(&work->node, &dev->worker->work_list); + wake_up_process(dev->worker->task); } } EXPORT_SYMBOL_GPL(vhost_work_queue); @@ -264,7 +264,7 @@ EXPORT_SYMBOL_GPL(vhost_work_queue); /* A lockless hint for busy polling code to exit the loop */ bool vhost_has_work(struct vhost_dev *dev) { - return !llist_empty(&dev->work_list); + return dev->worker && !llist_empty(&dev->worker->work_list); } EXPORT_SYMBOL_GPL(vhost_has_work); @@ -335,7 +335,8 @@ static void vhost_vq_reset(struct vhost_dev *dev, static int vhost_worker(void *data) { - struct vhost_dev *dev = data; + struct vhost_worker *worker = data; + struct vhost_dev *dev = worker->dev; struct vhost_work *work, *work_next; struct llist_node *node; @@ -350,7 +351,7 @@ static int vhost_worker(void *data) break; } - node = llist_del_all(&dev->work_list); + node = llist_del_all(&worker->work_list); if (!node) schedule(); @@ -360,7 +361,7 @@ static int vhost_worker(void *data) llist_for_each_entry_safe(work, work_next, node, node) { clear_bit(VHOST_WORK_QUEUED, &work->flags); __set_current_state(TASK_RUNNING); - kcov_remote_start_common(dev->kcov_handle); + kcov_remote_start_common(worker->kcov_handle); work->fn(work); kcov_remote_stop(); if (need_resched()) @@ -479,7 +480,6 @@ void vhost_dev_init(struct vhost_dev *dev, dev->byte_weight = byte_weight; dev->use_worker = use_worker; dev->msg_handler = msg_handler; - init_llist_head(&dev->work_list); init_waitqueue_head(&dev->wait); INIT_LIST_HEAD(&dev->read_list); INIT_LIST_HEAD(&dev->pending_list); @@ -571,10 +571,60 @@ static void vhost_detach_mm(struct vhost_dev *dev) dev->mm = NULL; } +static void vhost_worker_free(struct vhost_dev *dev) +{ + struct vhost_worker *worker = dev->worker; + + if (!worker) + return; + + dev->worker = NULL; + WARN_ON(!llist_empty(&worker->work_list)); + kthread_stop(worker->task); + kfree(worker); +} + +static int vhost_worker_create(struct vhost_dev *dev) +{ + struct vhost_worker *worker; + struct task_struct *task; + int ret; + + worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT); + if (!worker) + return -ENOMEM; + + dev->worker = worker; + worker->dev = dev; + worker->kcov_handle = kcov_common_handle(); + init_llist_head(&worker->work_list); + + task = kthread_create(vhost_worker, worker, "vhost-%d", current->pid); + if (IS_ERR(task)) { + ret = PTR_ERR(task); + goto free_worker; + } + + worker->task = task; + wake_up_process(task); /* avoid contributing to loadavg */ + + ret = vhost_attach_cgroups(dev); + if (ret) + goto stop_worker; + + return 0; + +stop_worker: + kthread_stop(worker->task); +free_worker: + kfree(worker); + dev->worker = NULL; + return ret; +} + /* Caller should have device mutex */ long vhost_dev_set_owner(struct vhost_dev *dev) { - struct task_struct *worker; int err; /* Is there an owner already? */ @@ -585,36 +635,21 @@ long vhost_dev_set_owner(struct vhost_dev *dev) vhost_attach_mm(dev); - dev->kcov_handle = kcov_common_handle(); if (dev->use_worker) { - worker = kthread_create(vhost_worker, dev, - "vhost-%d", current->pid); - if (IS_ERR(worker)) { - err = PTR_ERR(worker); - goto err_worker; - } - - dev->worker = worker; - wake_up_process(worker); /* avoid contributing to loadavg */ - - err = vhost_attach_cgroups(dev); + err = vhost_worker_create(dev); if (err) - goto err_cgroup; + goto err_worker; } err = vhost_dev_alloc_iovecs(dev); if (err) - goto err_cgroup; + goto err_iovecs; return 0; -err_cgroup: - if (dev->worker) { - kthread_stop(dev->worker); - dev->worker = NULL; - } +err_iovecs: + vhost_worker_free(dev); err_worker: vhost_detach_mm(dev); - dev->kcov_handle = 0; err_mm: return err; } @@ -705,12 +740,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev) dev->iotlb = NULL; vhost_clear_msg(dev); wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM); - WARN_ON(!llist_empty(&dev->work_list)); - if (dev->worker) { - kthread_stop(dev->worker); - dev->worker = NULL; - dev->kcov_handle = 0; - } + vhost_worker_free(dev); vhost_detach_mm(dev); } EXPORT_SYMBOL_GPL(vhost_dev_cleanup); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 1647b750169c..67d614b5453d 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -25,6 +25,13 @@ struct vhost_work { unsigned long flags; }; +struct vhost_worker { + struct task_struct *task; + struct llist_head work_list; + struct vhost_dev *dev; + u64 kcov_handle; +}; + /* Poll a file (eventfd or socket) */ /* Note: there's nothing vhost specific about this structure. */ struct vhost_poll { @@ -147,8 +154,7 @@ struct vhost_dev { struct vhost_virtqueue **vqs; int nvqs; struct eventfd_ctx *log_ctx; - struct llist_head work_list; - struct task_struct *worker; + struct vhost_worker *worker; struct vhost_iotlb *umem; struct vhost_iotlb *iotlb; spinlock_t iotlb_lock; @@ -158,7 +164,6 @@ struct vhost_dev { int iov_limit; int weight; int byte_weight; - u64 kcov_handle; bool use_worker; int (*msg_handler)(struct vhost_dev *dev, u32 asid, struct vhost_iotlb_msg *msg); -- cgit v1.2.3 From 6e890c5d5021ca7e69bbe203fde42447874d9a82 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 10 Mar 2023 16:03:32 -0600 Subject: vhost: use vhost_tasks for worker threads For vhost workers we use the kthread API which inherit's its values from and checks against the kthreadd thread. This results in the wrong RLIMITs being checked, so while tools like libvirt try to control the number of threads based on the nproc rlimit setting we can end up creating more threads than the user wanted. This patch has us use the vhost_task helpers which will inherit its values/checks from the thread that owns the device similar to if we did a clone in userspace. The vhost threads will now be counted in the nproc rlimits. And we get features like cgroups and mm sharing automatically, so we can remove those calls. Signed-off-by: Mike Christie Acked-by: Michael S. Tsirkin Signed-off-by: Christian Brauner (Microsoft) Signed-off-by: Christian Brauner --- drivers/vhost/vhost.c | 60 +++++++++++---------------------------------------- drivers/vhost/vhost.h | 4 ++-- 2 files changed, 15 insertions(+), 49 deletions(-) (limited to 'drivers/vhost') diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index dc7f8236ec11..518329c32d5d 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -22,11 +22,11 @@ #include #include #include -#include #include #include #include #include +#include #include #include #include @@ -256,7 +256,7 @@ void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) * test_and_set_bit() implies a memory barrier. */ llist_add(&work->node, &dev->worker->work_list); - wake_up_process(dev->worker->task); + wake_up_process(dev->worker->vtsk->task); } } EXPORT_SYMBOL_GPL(vhost_work_queue); @@ -336,17 +336,14 @@ static void vhost_vq_reset(struct vhost_dev *dev, static int vhost_worker(void *data) { struct vhost_worker *worker = data; - struct vhost_dev *dev = worker->dev; struct vhost_work *work, *work_next; struct llist_node *node; - kthread_use_mm(dev->mm); - for (;;) { /* mb paired w/ kthread_stop */ set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) { + if (vhost_task_should_stop(worker->vtsk)) { __set_current_state(TASK_RUNNING); break; } @@ -368,7 +365,7 @@ static int vhost_worker(void *data) schedule(); } } - kthread_unuse_mm(dev->mm); + return 0; } @@ -509,31 +506,6 @@ long vhost_dev_check_owner(struct vhost_dev *dev) } EXPORT_SYMBOL_GPL(vhost_dev_check_owner); -struct vhost_attach_cgroups_struct { - struct vhost_work work; - struct task_struct *owner; - int ret; -}; - -static void vhost_attach_cgroups_work(struct vhost_work *work) -{ - struct vhost_attach_cgroups_struct *s; - - s = container_of(work, struct vhost_attach_cgroups_struct, work); - s->ret = cgroup_attach_task_all(s->owner, current); -} - -static int vhost_attach_cgroups(struct vhost_dev *dev) -{ - struct vhost_attach_cgroups_struct attach; - - attach.owner = current; - vhost_work_init(&attach.work, vhost_attach_cgroups_work); - vhost_work_queue(dev, &attach.work); - vhost_dev_flush(dev); - return attach.ret; -} - /* Caller should have device mutex */ bool vhost_dev_has_owner(struct vhost_dev *dev) { @@ -580,14 +552,15 @@ static void vhost_worker_free(struct vhost_dev *dev) dev->worker = NULL; WARN_ON(!llist_empty(&worker->work_list)); - kthread_stop(worker->task); + vhost_task_stop(worker->vtsk); kfree(worker); } static int vhost_worker_create(struct vhost_dev *dev) { struct vhost_worker *worker; - struct task_struct *task; + struct vhost_task *vtsk; + char name[TASK_COMM_LEN]; int ret; worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT); @@ -595,27 +568,20 @@ static int vhost_worker_create(struct vhost_dev *dev) return -ENOMEM; dev->worker = worker; - worker->dev = dev; worker->kcov_handle = kcov_common_handle(); init_llist_head(&worker->work_list); + snprintf(name, sizeof(name), "vhost-%d", current->pid); - task = kthread_create(vhost_worker, worker, "vhost-%d", current->pid); - if (IS_ERR(task)) { - ret = PTR_ERR(task); + vtsk = vhost_task_create(vhost_worker, worker, name); + if (!vtsk) { + ret = -ENOMEM; goto free_worker; } - worker->task = task; - wake_up_process(task); /* avoid contributing to loadavg */ - - ret = vhost_attach_cgroups(dev); - if (ret) - goto stop_worker; - + worker->vtsk = vtsk; + vhost_task_start(vtsk); return 0; -stop_worker: - kthread_stop(worker->task); free_worker: kfree(worker); dev->worker = NULL; diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 67d614b5453d..0308638cdeee 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -16,6 +16,7 @@ #include struct vhost_work; +struct vhost_task; typedef void (*vhost_work_fn_t)(struct vhost_work *work); #define VHOST_WORK_QUEUED 1 @@ -26,9 +27,8 @@ struct vhost_work { }; struct vhost_worker { - struct task_struct *task; + struct vhost_task *vtsk; struct llist_head work_list; - struct vhost_dev *dev; u64 kcov_handle; }; -- cgit v1.2.3