summaryrefslogtreecommitdiffstats
path: root/kernel/exit.c
diff options
context:
space:
mode:
authorJim Newsome <jnewsome@torproject.org>2021-05-07 03:04:22 +0200
committerLinus Torvalds <torvalds@linux-foundation.org>2021-05-07 04:24:13 +0200
commit5449162ac001a926ad8884882b071601df5edb44 (patch)
tree773a4bf79f56ef4bfa6745095bed8c24416e38eb /kernel/exit.c
parenthpfs: replace one-element array with flexible-array member (diff)
downloadlinux-5449162ac001a926ad8884882b071601df5edb44.tar.xz
linux-5449162ac001a926ad8884882b071601df5edb44.zip
do_wait: make PIDTYPE_PID case O(1) instead of O(n)
Add a special-case when waiting on a pid (via waitpid, waitid, wait4, etc) to avoid doing an O(n) scan of children and tracees, and instead do an O(1) lookup. This improves performance when waiting on a pid from a thread group with many children and/or tracees. Time to fork and then call waitpid on the child, from a task that already has N children [1]: N | Before | After -----|---------|------ 1 | 74 us | 74 us 20 | 72 us | 75 us 100 | 83 us | 77 us 500 | 99 us | 74 us 1000 | 179 us | 75 us 5000 | 804 us | 79 us 8000 | 1268 us | 78 us [1]: https://lkml.org/lkml/2021/3/12/1567 This can make a substantial performance improvement for applications with a thread that has many children or tracees and frequently needs to wait on them. Tools that use ptrace to intercept syscalls for a large number of processes are likely to fall into this category. In particular this patch was developed while building a ptrace-based second generation of the Shadow emulator [2], for which it allows us to avoid quadratic scaling (without having to use a workaround that introduces a ~40% performance penalty) [3]. Other examples of tools that fall into this category which this patch may help include User Mode Linux [4] and DetTrace [5]. [2]: https://shadow.github.io/ [3]: https://github.com/shadow/shadow/issues/1134#issuecomment-798992292 [4]: https://en.wikipedia.org/wiki/User-mode_Linux [5]: https://github.com/dettrace/dettrace Link: https://lkml.kernel.org/r/20210314231544.9379-1-jnewsome@torproject.org Signed-off-by: James Newsome <jnewsome@torproject.org> Reviewed-by: Oleg Nesterov <oleg@redhat.com> Cc: "Eric W . Biederman" <ebiederm@xmission.com> Cc: Christian Brauner <christian@brauner.io> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel/exit.c')
-rw-r--r--kernel/exit.c67
1 files changed, 57 insertions, 10 deletions
diff --git a/kernel/exit.c b/kernel/exit.c
index 0596526ed9ea..fd1c04193e18 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1440,9 +1440,48 @@ void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
TASK_INTERRUPTIBLE, p);
}
+static bool is_effectively_child(struct wait_opts *wo, bool ptrace,
+ struct task_struct *target)
+{
+ struct task_struct *parent =
+ !ptrace ? target->real_parent : target->parent;
+
+ return current == parent || (!(wo->wo_flags & __WNOTHREAD) &&
+ same_thread_group(current, parent));
+}
+
+/*
+ * Optimization for waiting on PIDTYPE_PID. No need to iterate through child
+ * and tracee lists to find the target task.
+ */
+static int do_wait_pid(struct wait_opts *wo)
+{
+ bool ptrace;
+ struct task_struct *target;
+ int retval;
+
+ ptrace = false;
+ target = pid_task(wo->wo_pid, PIDTYPE_TGID);
+ if (target && is_effectively_child(wo, ptrace, target)) {
+ retval = wait_consider_task(wo, ptrace, target);
+ if (retval)
+ return retval;
+ }
+
+ ptrace = true;
+ target = pid_task(wo->wo_pid, PIDTYPE_PID);
+ if (target && target->ptrace &&
+ is_effectively_child(wo, ptrace, target)) {
+ retval = wait_consider_task(wo, ptrace, target);
+ if (retval)
+ return retval;
+ }
+
+ return 0;
+}
+
static long do_wait(struct wait_opts *wo)
{
- struct task_struct *tsk;
int retval;
trace_sched_process_wait(wo->wo_pid);
@@ -1464,19 +1503,27 @@ repeat:
set_current_state(TASK_INTERRUPTIBLE);
read_lock(&tasklist_lock);
- tsk = current;
- do {
- retval = do_wait_thread(wo, tsk);
- if (retval)
- goto end;
- retval = ptrace_do_wait(wo, tsk);
+ if (wo->wo_type == PIDTYPE_PID) {
+ retval = do_wait_pid(wo);
if (retval)
goto end;
+ } else {
+ struct task_struct *tsk = current;
+
+ do {
+ retval = do_wait_thread(wo, tsk);
+ if (retval)
+ goto end;
- if (wo->wo_flags & __WNOTHREAD)
- break;
- } while_each_thread(current, tsk);
+ retval = ptrace_do_wait(wo, tsk);
+ if (retval)
+ goto end;
+
+ if (wo->wo_flags & __WNOTHREAD)
+ break;
+ } while_each_thread(current, tsk);
+ }
read_unlock(&tasklist_lock);
notask: