From 864c2357ca898c6171fe5284f5ecc795c8ce27a8 Mon Sep 17 00:00:00 2001
From: David Carrillo-Cisneros
Date: Tue, 1 Nov 2016 11:52:58 -0700
Subject: perf/core: Do not set cpuctx->cgrp for unscheduled cgroups

Commit:

  db4a835601b7 ("perf/core: Set cgroup in CPU contexts for new cgroup events")

failed to verify that event->cgrp is actually the scheduled cgroup in a
CPU before setting cpuctx->cgrp. This patch fixes that.

Now that there is a different path for scheduled and unscheduled cgroups,
add a warning to catch when cpuctx->cgrp is still set after the last
cgroup event has been unscheduled.

To verify the bug:

  # Create 2 cgroups.
  mkdir /dev/cgroups/devices/g1
  mkdir /dev/cgroups/devices/g2

  # launch a task, bind it to a cpu and move it to g1
  CPU=2
  while :; do : ; done &
  P=$!

  taskset -pc $CPU $P
  echo $P > /dev/cgroups/devices/g1/tasks

  # monitor g2 (it runs no tasks) and observe output
  perf stat -e cycles -I 1000 -C $CPU -G g2

  #           time             counts unit events
       1.000091408          7,579,527      cycles                    g2
       2.000350111      <not counted>      cycles                    g2
       3.000589181      <not counted>      cycles                    g2
       4.000771428      <not counted>      cycles                    g2

  # note first line that displays that a task ran in g2, despite
  # g2 having no tasks. This is because cpuctx->cgrp was wrongly
  # set when the context of the new event was installed.

  # After applying the fix we obtain the right output:

  perf stat -e cycles -I 1000 -C $CPU -G g2
  #           time             counts unit events
       1.000119615      <not counted>      cycles                    g2
       2.000389430      <not counted>      cycles                    g2
       3.000590962      <not counted>      cycles                    g2

Signed-off-by: David Carrillo-Cisneros
Signed-off-by: Peter Zijlstra (Intel)
Cc: Alexander Shishkin
Cc: Arnaldo Carvalho de Melo
Cc: Jiri Olsa
Cc: Kan Liang
Cc: Linus Torvalds
Cc: Nilay Vaish
Cc: Paul Turner
Cc: Peter Zijlstra
Cc: Stephane Eranian
Cc: Thomas Gleixner
Cc: Vegard Nossum
Link: http://lkml.kernel.org/r/1478026378-86083-1-git-send-email-davidcc@google.com
Signed-off-by: Ingo Molnar
---
 kernel/events/core.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0e292132efac..ff230bb4a02e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -902,6 +902,17 @@ list_update_cgroup_event(struct perf_event *event,
	 * this will always be called from the right CPU.
	 */
	cpuctx = __get_cpu_context(ctx);
+
+	/* Only set/clear cpuctx->cgrp if current task uses event->cgrp. */
+	if (perf_cgroup_from_task(current, ctx) != event->cgrp) {
+		/*
+		 * We are removing the last cpu event in this context.
+		 * If that event is not active in this cpu, cpuctx->cgrp
+		 * should've been cleared by perf_cgroup_switch.
+		 */
+		WARN_ON_ONCE(!add && cpuctx->cgrp);
+		return;
+	}
	cpuctx->cgrp = add ? event->cgrp : NULL;
 }

--
cgit v1.2.3


From e96271f3ed7e702fa36dd0605c0c5b5f065af816 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin
Date: Fri, 18 Nov 2016 13:38:43 +0200
Subject: perf/core: Fix address filter parser

The token table passed into match_token() must be null-terminated, which
it currently is not in perf's address filter string parser, as caught by
Vince's perf_fuzzer and KASAN.

It doesn't blow up otherwise because of the alignment padding of the table
to the next element in the .rodata, which is luck.

Fix this by adding a null terminator to the token table.
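For context, the requirement is easy to see in a stripped-down user-space
model of such a token-table lookup (plain strcmp() instead of the kernel's
pattern matching; the token values and names here are made up):

	#include <stdio.h>
	#include <string.h>

	struct tok { int token; const char *pattern; };

	/* The table must end in a NULL pattern, otherwise the scan
	 * below runs off the end of the array. */
	static const struct tok tokens[] = {
		{ 1, "filter" },
		{ 2, "start"  },
		{ 3, "stop"   },
		{ -1, NULL },		/* terminator, like IF_ACT_NONE below */
	};

	static int lookup(const char *s)
	{
		const struct tok *p;

		for (p = tokens; p->pattern; p++)	/* stops only at the NULL entry */
			if (!strcmp(s, p->pattern))
				return p->token;

		return -1;	/* no match */
	}

	int main(void)
	{
		printf("%d\n", lookup("start"));	/* 2  */
		printf("%d\n", lookup("bogus"));	/* -1, not a wild read */
		return 0;
	}

The { IF_ACT_NONE, NULL } entry added below plays exactly the role of the
terminator in this sketch.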
Reported-by: Vince Weaver
Tested-by: Vince Weaver
Signed-off-by: Alexander Shishkin
Acked-by: Peter Zijlstra (Intel)
Cc: Arnaldo Carvalho de Melo
Cc: Linus Torvalds
Cc: Thomas Gleixner
Cc: dvyukov@google.com
Cc: stable@vger.kernel.org # v4.7+
Fixes: 375637bc524 ("perf/core: Introduce address range filtering")
Link: http://lkml.kernel.org/r/877f81f264.fsf@ashishki-desk.ger.corp.intel.com
Signed-off-by: Ingo Molnar
---
 kernel/events/core.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index ff230bb4a02e..6ee1febdf6ff 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8029,6 +8029,7 @@ restart:
  * if <size> is not specified, the range is treated as a single address.
  */
 enum {
+	IF_ACT_NONE = -1,
	IF_ACT_FILTER,
	IF_ACT_START,
	IF_ACT_STOP,
@@ -8052,6 +8053,7 @@ static const match_table_t if_tokens = {
	{ IF_SRC_KERNEL,	"%u/%u" },
	{ IF_SRC_FILEADDR,	"%u@%s" },
	{ IF_SRC_KERNELADDR,	"%u" },
+	{ IF_ACT_NONE,		NULL },
 };

 /*
--
cgit v1.2.3


From 18f649ef344127ef6de23a5a4272dbe2fdb73dde Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Mon, 14 Nov 2016 19:46:09 +0100
Subject: sched/autogroup: Fix autogroup_move_group() to never skip sched_move_task()

The PF_EXITING check in task_wants_autogroup() is no longer needed.
Remove it, but see the next patch.

However, the comment is correct in that autogroup_move_group() must
always change task_group() for every thread, so the sysctl_ check is
very wrong; we can race with cgroups, and even sys_setsid() is not safe
because a task running with task_group() == ag->tg must participate in
refcounting:

	int main(void)
	{
		int sctl = open("/proc/sys/kernel/sched_autogroup_enabled", O_WRONLY);

		assert(sctl > 0);
		if (fork()) {
			wait(NULL); // destroy the child's ag/tg
			pause();
		}

		assert(pwrite(sctl, "1\n", 2, 0) == 2);
		assert(setsid() > 0);
		if (fork())
			pause();

		kill(getppid(), SIGKILL);
		sleep(1);

		// The child has gone, the grandchild runs with kref == 1
		assert(pwrite(sctl, "0\n", 2, 0) == 2);
		assert(setsid() > 0);

		// runs with the freed ag/tg
		for (;;)
			sleep(1);

		return 0;
	}

crashes the kernel. It doesn't really need sleep(1), it doesn't matter
if autogroup_move_group() actually frees the task_group or this happens
later.

Reported-by: Vern Lovejoy
Signed-off-by: Oleg Nesterov
Signed-off-by: Peter Zijlstra (Intel)
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: hartsjc@redhat.com
Cc: vbendel@redhat.com
Link: http://lkml.kernel.org/r/20161114184609.GA15965@redhat.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/auto_group.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index a5d966cb8891..ad2b19ad6ca0 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -111,14 +111,11 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
 {
	if (tg != &root_task_group)
		return false;
-
	/*
-	 * We can only assume the task group can't go away on us if
-	 * autogroup_move_group() can see us on ->thread_group list.
+	 * If we race with autogroup_move_group() the caller can use the old
+	 * value of signal->autogroup but in this case sched_move_task() will
+	 * be called again before autogroup_kref_put().
	 */
-	if (p->flags & PF_EXITING)
-		return false;
-
	return true;
 }

@@ -138,13 +135,17 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
	}

	p->signal->autogroup = autogroup_kref_get(ag);
-
-	if (!READ_ONCE(sysctl_sched_autogroup_enabled))
-		goto out;
-
+	/*
+	 * We can't avoid sched_move_task() after we changed signal->autogroup,
+	 * this process can already run with task_group() == prev->tg or we can
+	 * race with cgroup code which can read autogroup = prev under rq->lock.
+	 * In the latter case for_each_thread() can not miss a migrating thread,
+	 * cpu_cgroup_attach() must not be possible after cgroup_exit() and it
+	 * can't be removed from thread list, we hold ->siglock.
+	 */
	for_each_thread(p, t)
		sched_move_task(t);
-out:
+
	unlock_task_sighand(p, &flags);
	autogroup_kref_put(prev);
 }
--
cgit v1.2.3


From 8e5bfa8c1f8471aa4a2d30be631ef2b50e10abaf Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Mon, 14 Nov 2016 19:46:12 +0100
Subject: sched/autogroup: Do not use autogroup->tg in zombie threads

Exactly because for_each_thread() in autogroup_move_group() can't see it
and update its ->sched_task_group before _put() and possibly free().

So the exiting task needs another sched_move_task() before exit_notify()
and we need to re-introduce the PF_EXITING (or similar) check removed by
the previous change for another reason.

Signed-off-by: Oleg Nesterov
Signed-off-by: Peter Zijlstra (Intel)
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: hartsjc@redhat.com
Cc: vbendel@redhat.com
Cc: vlovejoy@redhat.com
Link: http://lkml.kernel.org/r/20161114184612.GA15968@redhat.com
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h     |  2 ++
 kernel/exit.c             |  1 +
 kernel/sched/auto_group.c | 19 +++++++++++++++++++
 3 files changed, 22 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 348f51b0ec92..e9c009dc3a4a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2567,6 +2567,7 @@ extern void sched_autogroup_create_attach(struct task_struct *p);
 extern void sched_autogroup_detach(struct task_struct *p);
 extern void sched_autogroup_fork(struct signal_struct *sig);
 extern void sched_autogroup_exit(struct signal_struct *sig);
+extern void sched_autogroup_exit_task(struct task_struct *p);
 #ifdef CONFIG_PROC_FS
 extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m);
 extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice);
@@ -2576,6 +2577,7 @@ static inline void sched_autogroup_create_attach(struct task_struct *p) { }
 static inline void sched_autogroup_detach(struct task_struct *p) { }
 static inline void sched_autogroup_fork(struct signal_struct *sig) { }
 static inline void sched_autogroup_exit(struct signal_struct *sig) { }
+static inline void sched_autogroup_exit_task(struct task_struct *p) { }
 #endif

 extern int yield_to(struct task_struct *p, bool preempt);
diff --git a/kernel/exit.c b/kernel/exit.c
index 9d68c45ebbe3..3076f3089919 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -836,6 +836,7 @@ void __noreturn do_exit(long code)
	 */
	perf_event_exit_task(tsk);

+	sched_autogroup_exit_task(tsk);
	cgroup_exit(tsk);

	/*
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index ad2b19ad6ca0..f1c8fd566246 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -115,10 +115,26 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
	 * If we race with autogroup_move_group() the caller can use the old
	 * value of signal->autogroup but in this case sched_move_task() will
	 * be called again before autogroup_kref_put().
+	 *
+	 * However, there is no way sched_autogroup_exit_task() could tell us
+	 * to avoid autogroup->tg, so we abuse PF_EXITING flag for this case.
	 */
+	if (p->flags & PF_EXITING)
+		return false;
+
	return true;
 }

+void sched_autogroup_exit_task(struct task_struct *p)
+{
+	/*
+	 * We are going to call exit_notify() and autogroup_move_group() can't
+	 * see this thread after that: we can no longer use signal->autogroup.
+	 * See the PF_EXITING check in task_wants_autogroup().
+	 */
+	sched_move_task(p);
+}
+
 static void
 autogroup_move_group(struct task_struct *p, struct autogroup *ag)
 {
@@ -142,6 +158,9 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
	 * In the latter case for_each_thread() can not miss a migrating thread,
	 * cpu_cgroup_attach() must not be possible after cgroup_exit() and it
	 * can't be removed from thread list, we hold ->siglock.
+	 *
+	 * If an exiting thread was already removed from thread list we rely on
+	 * sched_autogroup_exit_task().
	 */
	for_each_thread(p, t)
		sched_move_task(t);
--
cgit v1.2.3


From dbb26055defd03d59f678cb5f2c992abe05b064a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 30 Nov 2016 21:04:41 +0000
Subject: locking/rtmutex: Prevent dequeue vs. unlock race

David reported a futex/rtmutex state corruption. It's caused by the
following problem:

CPU0		CPU1		CPU2

l->owner=T1
		rt_mutex_lock(l)
		lock(l->wait_lock)
		l->owner = T1 | HAS_WAITERS;
		enqueue(T2)
		boost()
		  unlock(l->wait_lock)
		schedule()

				rt_mutex_lock(l)
				lock(l->wait_lock)
				l->owner = T1 | HAS_WAITERS;
				enqueue(T3)
				boost()
				  unlock(l->wait_lock)
				schedule()

		signal(->T2)	signal(->T3)
		lock(l->wait_lock)
		dequeue(T2)
		deboost()
		  unlock(l->wait_lock)

				lock(l->wait_lock)
				dequeue(T3)
				  ===> wait list is now empty
				deboost()
				  unlock(l->wait_lock)

		lock(l->wait_lock)
		fixup_rt_mutex_waiters()
		  if (wait_list_empty(l)) {
		    owner = l->owner & ~HAS_WAITERS;
		    l->owner = owner
		     ==> l->owner = T1
		  }

				lock(l->wait_lock)
rt_mutex_unlock(l)		fixup_rt_mutex_waiters()
				  if (wait_list_empty(l)) {
				    owner = l->owner & ~HAS_WAITERS;
cmpxchg(l->owner, T1, NULL)
 ===> Success (l->owner = NULL)
				    l->owner = owner
				     ==> l->owner = T1
				  }

That means the problem is caused by fixup_rt_mutex_waiters(), which does
the RMW to clear the waiters bit unconditionally when there are no
waiters in the rtmutex's rbtree.

This can be fatal: A concurrent unlock can release the rtmutex in the
fastpath because the waiters bit is not set. If the cmpxchg() gets in the
middle of the RMW operation, then the previous owner, which just unlocked
the rtmutex, is set as the owner again when the write takes place after
the successful cmpxchg().

The solution is rather trivial: verify that the owner member of the
rtmutex has the waiters bit set before clearing it. This does not require
a cmpxchg() or other atomic operations because the waiters bit can only
be set and cleared with the rtmutex wait_lock held. It's also safe
against the fast path unlock attempt. The unlock attempt via cmpxchg()
will either see the bit set and take the slowpath, or see the bit cleared
and release it atomically in the fastpath.

It's remarkable that the test program provided by David triggers on ARM64
and MIPS64 really quickly, but it refuses to reproduce on x86-64, while
the problem exists there as well. That refusal might explain why this was
not discovered earlier despite the bug existing from day one of the
rtmutex implementation, more than 10 years ago.
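To restate the fix in isolation, here is a user-space sketch (plain C11
atomics, shortened field and constant names, not the kernel's rtmutex
types) contrasting the unconditional RMW with the conditional clear
described above:

	#include <stdatomic.h>
	#include <assert.h>

	#define HAS_WAITERS	1UL

	struct lock { atomic_ulong owner; };	/* task pointer | HAS_WAITERS */

	/* Broken variant: unconditional write-back of a previously loaded
	 * value. A fast-path unlock that sets owner to NULL between the
	 * load and the store gets overwritten with the stale owner. */
	static void fixup_broken(struct lock *l, unsigned long owner_snapshot)
	{
		atomic_store(&l->owner, owner_snapshot & ~HAS_WAITERS);
	}

	/* Fixed variant: clear the bit only if it is still set. Setting and
	 * clearing both happen under the wait_lock, and the fast-path unlock
	 * cmpxchg() fails while the bit is set, so the plain store is safe. */
	static void fixup_fixed(struct lock *l)
	{
		unsigned long owner = atomic_load(&l->owner);

		if (owner & HAS_WAITERS)
			atomic_store(&l->owner, owner & ~HAS_WAITERS);
	}

	int main(void)
	{
		struct lock l;
		unsigned long t1 = 0x1000;	/* stand-in for task pointer T1 */
		unsigned long exp = t1;

		/* State after CPU1's fixup in the trace above: owner is T1,
		 * waiters bit already clear, wait list empty. */
		atomic_init(&l.owner, t1);

		unsigned long snap = atomic_load(&l.owner);		/* CPU2 loads T1 */
		atomic_compare_exchange_strong(&l.owner, &exp, 0);	/* CPU0 unlocks  */
		fixup_broken(&l, snap);					/* CPU2 writes back */

		assert(atomic_load(&l.owner) == t1);	/* lock leaked to a stale owner */

		atomic_init(&l.owner, t1);
		fixup_fixed(&l);	/* bit already clear: no store, unlock survives */
		return 0;
	}

In the kernel the load/store pair is READ_ONCE()/WRITE_ONCE() and the
serialization comes from wait_lock, as the comment added by the patch
below spells out.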
Thanks to David for meticulously instrumenting the code and providing the
information which allowed us to decode this subtle problem.

Reported-by: David Daney
Tested-by: David Daney
Signed-off-by: Thomas Gleixner
Reviewed-by: Steven Rostedt
Acked-by: Peter Zijlstra (Intel)
Cc: Linus Torvalds
Cc: Mark Rutland
Cc: Peter Zijlstra
Cc: Sebastian Siewior
Cc: Will Deacon
Cc: stable@vger.kernel.org
Fixes: 23f78d4a03c5 ("[PATCH] pi-futex: rt mutex core")
Link: http://lkml.kernel.org/r/20161130210030.351136722@linutronix.de
Signed-off-by: Ingo Molnar
---
 kernel/locking/rtmutex.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 66 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 1ec0f48962b3..2c49d76f96c3 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -65,8 +65,72 @@ static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)

 static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
 {
-	if (!rt_mutex_has_waiters(lock))
-		clear_rt_mutex_waiters(lock);
+	unsigned long owner, *p = (unsigned long *) &lock->owner;
+
+	if (rt_mutex_has_waiters(lock))
+		return;
+
+	/*
+	 * The rbtree has no waiters enqueued, now make sure that the
+	 * lock->owner still has the waiters bit set, otherwise the
+	 * following can happen:
+	 *
+	 * CPU 0	CPU 1		CPU2
+	 * l->owner=T1
+	 *		rt_mutex_lock(l)
+	 *		lock(l->lock)
+	 *		l->owner = T1 | HAS_WAITERS;
+	 *		enqueue(T2)
+	 *		boost()
+	 *		  unlock(l->lock)
+	 *		block()
+	 *
+	 *				rt_mutex_lock(l)
+	 *				lock(l->lock)
+	 *				l->owner = T1 | HAS_WAITERS;
+	 *				enqueue(T3)
+	 *				boost()
+	 *				  unlock(l->lock)
+	 *				block()
+	 *		signal(->T2)	signal(->T3)
+	 *		lock(l->lock)
+	 *		dequeue(T2)
+	 *		deboost()
+	 *		  unlock(l->lock)
+	 *				lock(l->lock)
+	 *				dequeue(T3)
+	 *				 ==> wait list is empty
+	 *				deboost()
+	 *				 unlock(l->lock)
+	 *		lock(l->lock)
+	 *		fixup_rt_mutex_waiters()
+	 *		  if (wait_list_empty(l) {
+	 *		    l->owner = owner
+	 *		    owner = l->owner & ~HAS_WAITERS;
+	 *		      ==> l->owner = T1
+	 *		  }
+	 *				lock(l->lock)
+	 * rt_mutex_unlock(l)		fixup_rt_mutex_waiters()
+	 *				  if (wait_list_empty(l) {
+	 *				    owner = l->owner & ~HAS_WAITERS;
+	 * cmpxchg(l->owner, T1, NULL)
+	 *  ===> Success (l->owner = NULL)
+	 *
+	 *				    l->owner = owner
+	 *				      ==> l->owner = T1
+	 *				  }
+	 *
+	 * With the check for the waiter bit in place T3 on CPU2 will not
+	 * overwrite. All tasks fiddling with the waiters bit are
+	 * serialized by l->lock, so nothing else can modify the waiters
+	 * bit. If the bit is set then nothing can change l->owner either
+	 * so the simple RMW is safe. The cmpxchg() will simply fail if it
+	 * happens in the middle of the RMW because the waiters bit is
+	 * still set.
+	 */
+	owner = READ_ONCE(*p);
+	if (owner & RT_MUTEX_HAS_WAITERS)
+		WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
 }

 /*
--
cgit v1.2.3


From 1be5d4fa0af34fb7bafa205aeb59f5c7cc7a089d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 30 Nov 2016 21:04:42 +0000
Subject: locking/rtmutex: Use READ_ONCE() in rt_mutex_owner()

While debugging the rtmutex unlock vs. dequeue race, Will suggested to
use READ_ONCE() in rt_mutex_owner() as it might race against the
cmpxchg_release() in unlock_rt_mutex_safe().

Will: "It's a minor thing which will most likely not matter in practice"

Careful search did not unearth an actual problem in today's code, but
it's better to be safe than surprised.
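For readers unfamiliar with the primitive: READ_ONCE() essentially boils
down to a volatile access that forces the compiler to emit exactly one
load. A rough user-space approximation (using the typeof extension; this
is not the kernel's compiler.h definition, which has extra type handling):

	#define MY_READ_ONCE(x)		(*(const volatile typeof(x) *)&(x))

	#define OWNER_MASKALL		3UL

	struct task;	/* opaque, only used as a pointer type */

	static struct task *owner_of(unsigned long *owner_word)
	{
		/* One load of *owner_word; the compiler may neither reload it
		 * nor merge it with other accesses, which is what matters when
		 * another CPU can concurrently cmpxchg() the same word. */
		unsigned long owner = MY_READ_ONCE(*owner_word);

		return (struct task *)(owner & ~OWNER_MASKALL);
	}

	int main(void)
	{
		unsigned long word = 0x2001;	/* fake owner with a low bit set */

		return owner_of(&word) == (struct task *)0x2000 ? 0 : 1;
	}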
Suggested-by: Will Deacon
Signed-off-by: Thomas Gleixner
Acked-by: Peter Zijlstra (Intel)
Cc: David Daney
Cc: Linus Torvalds
Cc: Mark Rutland
Cc: Peter Zijlstra
Cc: Sebastian Siewior
Cc: Steven Rostedt
Cc:
Link: http://lkml.kernel.org/r/20161130210030.431379999@linutronix.de
Signed-off-by: Ingo Molnar
---
 kernel/locking/rtmutex_common.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 4f5f83c7d2d3..e317e1cbb3eb 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -75,8 +75,9 @@ task_top_pi_waiter(struct task_struct *p)

 static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
 {
-	return (struct task_struct *)
-		((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
+	unsigned long owner = (unsigned long) READ_ONCE(lock->owner);
+
+	return (struct task_struct *) (owner & ~RT_MUTEX_OWNER_MASKALL);
 }

 /*
--
cgit v1.2.3